Repository: Shenyi-Z/ToCa
Branch: main
Commit: e84096ffd85a
Files: 713
Total size: 6.6 MB

Directory structure:
gitextract_nz69m5ai/

├── COCO_caption_prompts_30k.txt
├── DiT-ToCa/
│   ├── cache_functions/
│   │   ├── __init__.py
│   │   ├── attention.py
│   │   ├── cache_cutfresh.py
│   │   ├── cache_init.py
│   │   ├── cal_type.py
│   │   ├── force_init.py
│   │   ├── force_scheduler.py
│   │   ├── fresh_ratio_scheduler.py
│   │   ├── global_force_fresh.py
│   │   ├── score_evaluate.py
│   │   ├── scores.py
│   │   ├── token_merge.py
│   │   └── update_cache.py
│   ├── diffusion/
│   │   ├── __init__.py
│   │   ├── diffusion_utils.py
│   │   ├── gaussian_diffusion.py
│   │   ├── respace.py
│   │   └── timestep_sampler.py
│   ├── download.py
│   ├── environment-dit.yml
│   ├── models.py
│   ├── sample.py
│   ├── sample_ddp.py
│   └── train.py
├── DrawBench200.txt
├── LICENSE
├── Open-Sora/
│   ├── Dockerfile
│   ├── LICENSE
│   ├── README.md
│   ├── assets/
│   │   └── texts/
│   │       ├── VBench/
│   │       │   ├── all_category.txt
│   │       │   ├── all_dimension.txt
│   │       │   ├── all_i2v.txt
│   │       │   ├── prompts_per_category/
│   │       │   │   ├── animal.txt
│   │       │   │   ├── architecture.txt
│   │       │   │   ├── food.txt
│   │       │   │   ├── human.txt
│   │       │   │   ├── lifestyle.txt
│   │       │   │   ├── plant.txt
│   │       │   │   ├── scenery.txt
│   │       │   │   └── vehicles.txt
│   │       │   └── prompts_per_dimension/
│   │       │       ├── appearance_style.txt
│   │       │       ├── color.txt
│   │       │       ├── human_action.txt
│   │       │       ├── multiple_objects.txt
│   │       │       ├── object_class.txt
│   │       │       ├── overall_consistency.txt
│   │       │       ├── scene.txt
│   │       │       ├── spatial_relationship.txt
│   │       │       ├── subject_consistency.txt
│   │       │       ├── temporal_flickering.txt
│   │       │       └── temporal_style.txt
│   │       ├── imagenet_id.txt
│   │       ├── imagenet_labels.txt
│   │       ├── rand_types.txt
│   │       ├── t2i_samples.txt
│   │       ├── t2i_sigma.txt
│   │       ├── t2v_car.txt
│   │       ├── t2v_latte.txt
│   │       ├── t2v_pllava.txt
│   │       ├── t2v_ref.txt
│   │       ├── t2v_samples.txt
│   │       ├── t2v_short.txt
│   │       ├── t2v_sora.txt
│   │       ├── ucf101_id.txt
│   │       └── ucf101_labels.txt
│   ├── build/
│   │   └── lib/
│   │       ├── opensora/
│   │       │   ├── acceleration/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── checkpoint.py
│   │       │   │   ├── communications.py
│   │       │   │   ├── parallel_states.py
│   │       │   │   ├── plugin.py
│   │       │   │   └── shardformer/
│   │       │   │       ├── __init__.py
│   │       │   │       ├── modeling/
│   │       │   │       │   ├── __init__.py
│   │       │   │       │   └── t5.py
│   │       │   │       └── policy/
│   │       │   │           ├── __init__.py
│   │       │   │           └── t5_encoder.py
│   │       │   ├── datasets/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── aspect.py
│   │       │   │   ├── bucket.py
│   │       │   │   ├── dataloader.py
│   │       │   │   ├── datasets.py
│   │       │   │   ├── read_video.py
│   │       │   │   ├── sampler.py
│   │       │   │   ├── utils.py
│   │       │   │   └── video_transforms.py
│   │       │   └── models/
│   │       │       ├── cache_functions/
│   │       │       │   ├── __init__.py
│   │       │       │   ├── attention.py
│   │       │       │   ├── cache_cutfresh.py
│   │       │       │   ├── cache_init.py
│   │       │       │   ├── force_init.py
│   │       │       │   ├── force_scheduler.py
│   │       │       │   ├── fresh_ratio_scheduler.py
│   │       │       │   ├── global_force_fresh.py
│   │       │       │   ├── score_evaluate.py
│   │       │       │   ├── scores.py
│   │       │       │   ├── token_merge.py
│   │       │       │   └── update_cache.py
│   │       │       ├── dit/
│   │       │       │   ├── __init__.py
│   │       │       │   └── dit.py
│   │       │       ├── latte/
│   │       │       │   ├── __init__.py
│   │       │       │   └── latte.py
│   │       │       ├── layers/
│   │       │       │   ├── __init__.py
│   │       │       │   └── blocks.py
│   │       │       ├── pixart/
│   │       │       │   ├── __init__.py
│   │       │       │   ├── pixart.py
│   │       │       │   └── pixart_sigma.py
│   │       │       ├── stdit/
│   │       │       │   ├── __init__.py
│   │       │       │   ├── stdit.py
│   │       │       │   ├── stdit2.py
│   │       │       │   ├── stdit3 copy.py
│   │       │       │   └── stdit3.py
│   │       │       └── text_encoder/
│   │       │           ├── __init__.py
│   │       │           ├── classes.py
│   │       │           ├── clip.py
│   │       │           └── t5.py
│   │       ├── tools/
│   │       │   ├── caption/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── acceleration/
│   │       │   │   │   ├── __init__.py
│   │       │   │   │   └── llava/
│   │       │   │   │       ├── __init__.py
│   │       │   │   │       └── policies/
│   │       │   │   │           ├── __init__.py
│   │       │   │   │           ├── llama.py
│   │       │   │   │           └── mistral.py
│   │       │   │   ├── camera_motion/
│   │       │   │   │   ├── __init__.py
│   │       │   │   │   ├── camera_motion.py
│   │       │   │   │   ├── detect.py
│   │       │   │   │   ├── utils.py
│   │       │   │   │   └── visualizer.py
│   │       │   │   ├── camera_motion_detect.py
│   │       │   │   ├── caption_gpt4.py
│   │       │   │   ├── caption_llama3.py
│   │       │   │   ├── caption_llava.py
│   │       │   │   └── utils.py
│   │       │   ├── datasets/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── analyze.py
│   │       │   │   ├── convert.py
│   │       │   │   ├── datautil.py
│   │       │   │   ├── filter_panda10m.py
│   │       │   │   ├── split.py
│   │       │   │   ├── transform.py
│   │       │   │   └── utils.py
│   │       │   ├── frame_interpolation/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── interpolation.py
│   │       │   │   ├── networks/
│   │       │   │   │   ├── __init__.py
│   │       │   │   │   ├── amt_g.py
│   │       │   │   │   └── blocks/
│   │       │   │   │       ├── __init__.py
│   │       │   │   │       ├── feat_enc.py
│   │       │   │   │       ├── ifrnet.py
│   │       │   │   │       ├── multi_flow.py
│   │       │   │   │       └── raft.py
│   │       │   │   └── utils/
│   │       │   │       ├── __init__.py
│   │       │   │       ├── dist_utils.py
│   │       │   │       ├── flow_utils.py
│   │       │   │       └── utils.py
│   │       │   ├── scene_cut/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── convert_id_to_path.py
│   │       │   │   ├── cut.py
│   │       │   │   └── scene_detect.py
│   │       │   └── scoring/
│   │       │       ├── aesthetic/
│   │       │       │   ├── __init__.py
│   │       │       │   └── inference.py
│   │       │       └── matching/
│   │       │           ├── __init__.py
│   │       │           └── inference.py
│   │       ├── vbench/
│   │       │   ├── __init__.py
│   │       │   ├── aesthetic_quality.py
│   │       │   ├── appearance_style.py
│   │       │   ├── background_consistency.py
│   │       │   ├── cli/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── evaluate.py
│   │       │   │   ├── static_filter.py
│   │       │   │   └── vbench.py
│   │       │   ├── color.py
│   │       │   ├── dynamic_degree.py
│   │       │   ├── human_action.py
│   │       │   ├── imaging_quality.py
│   │       │   ├── motion_smoothness.py
│   │       │   ├── multiple_objects.py
│   │       │   ├── object_class.py
│   │       │   ├── overall_consistency.py
│   │       │   ├── scene.py
│   │       │   ├── spatial_relationship.py
│   │       │   ├── subject_consistency.py
│   │       │   ├── temporal_flickering.py
│   │       │   ├── temporal_style.py
│   │       │   ├── third_pary/
│   │       │   │   ├── 0.txt
│   │       │   │   ├── RAFT/
│   │       │   │   │   ├── __init__.py
│   │       │   │   │   └── core/
│   │       │   │   │       ├── __init__.py
│   │       │   │   │       ├── corr.py
│   │       │   │   │       ├── datasets.py
│   │       │   │   │       ├── extractor.py
│   │       │   │   │       ├── raft.py
│   │       │   │   │       ├── update.py
│   │       │   │   │       └── utils_core/
│   │       │   │   │           ├── __init__.py
│   │       │   │   │           ├── augmentor.py
│   │       │   │   │           ├── flow_viz.py
│   │       │   │   │           ├── frame_utils.py
│   │       │   │   │           └── utils.py
│   │       │   │   ├── ViCLIP/
│   │       │   │   │   ├── __init__.py
│   │       │   │   │   ├── simple_tokenizer.py
│   │       │   │   │   ├── viclip.py
│   │       │   │   │   ├── viclip_text.py
│   │       │   │   │   └── viclip_vision.py
│   │       │   │   ├── __init__.py
│   │       │   │   ├── amt/
│   │       │   │   │   ├── benchmarks/
│   │       │   │   │   │   ├── __init__.py
│   │       │   │   │   │   ├── adobe240.py
│   │       │   │   │   │   ├── gopro.py
│   │       │   │   │   │   ├── snu_film.py
│   │       │   │   │   │   ├── speed_parameters.py
│   │       │   │   │   │   ├── ucf101.py
│   │       │   │   │   │   ├── vimeo90k.py
│   │       │   │   │   │   ├── vimeo90k_tta.py
│   │       │   │   │   │   └── xiph.py
│   │       │   │   │   ├── datasets/
│   │       │   │   │   │   ├── __init__.py
│   │       │   │   │   │   ├── adobe_datasets.py
│   │       │   │   │   │   ├── gopro_datasets.py
│   │       │   │   │   │   └── vimeo_datasets.py
│   │       │   │   │   ├── flow_generation/
│   │       │   │   │   │   ├── __init__.py
│   │       │   │   │   │   ├── gen_flow.py
│   │       │   │   │   │   └── liteflownet/
│   │       │   │   │   │       ├── __init__.py
│   │       │   │   │   │       └── run.py
│   │       │   │   │   ├── losses/
│   │       │   │   │   │   ├── __init__.py
│   │       │   │   │   │   └── loss.py
│   │       │   │   │   ├── metrics/
│   │       │   │   │   │   ├── __init__.py
│   │       │   │   │   │   └── psnr_ssim.py
│   │       │   │   │   └── networks/
│   │       │   │   │       ├── AMT-G.py
│   │       │   │   │       ├── AMT-L.py
│   │       │   │   │       ├── AMT-S.py
│   │       │   │   │       └── blocks/
│   │       │   │   │           ├── __init__.py
│   │       │   │   │           ├── feat_enc.py
│   │       │   │   │           ├── ifrnet.py
│   │       │   │   │           ├── multi_flow.py
│   │       │   │   │           └── raft.py
│   │       │   │   ├── grit_model.py
│   │       │   │   ├── grit_src/
│   │       │   │   │   └── centernet2/
│   │       │   │   │       └── centernet/
│   │       │   │   │           ├── __init__.py
│   │       │   │   │           ├── config.py
│   │       │   │   │           └── modeling/
│   │       │   │   │               ├── __init__.py
│   │       │   │   │               ├── backbone/
│   │       │   │   │               │   ├── __init__.py
│   │       │   │   │               │   ├── bifpn.py
│   │       │   │   │               │   ├── bifpn_fcos.py
│   │       │   │   │               │   ├── dla.py
│   │       │   │   │               │   ├── dlafpn.py
│   │       │   │   │               │   ├── fpn_p5.py
│   │       │   │   │               │   └── res2net.py
│   │       │   │   │               ├── debug.py
│   │       │   │   │               ├── dense_heads/
│   │       │   │   │               │   ├── __init__.py
│   │       │   │   │               │   ├── centernet.py
│   │       │   │   │               │   ├── centernet_head.py
│   │       │   │   │               │   └── utils.py
│   │       │   │   │               ├── layers/
│   │       │   │   │               │   ├── __init__.py
│   │       │   │   │               │   ├── deform_conv.py
│   │       │   │   │               │   ├── heatmap_focal_loss.py
│   │       │   │   │               │   ├── iou_loss.py
│   │       │   │   │               │   └── ml_nms.py
│   │       │   │   │               ├── meta_arch/
│   │       │   │   │               │   ├── __init__.py
│   │       │   │   │               │   └── centernet_detector.py
│   │       │   │   │               └── roi_heads/
│   │       │   │   │                   ├── __init__.py
│   │       │   │   │                   ├── custom_fast_rcnn.py
│   │       │   │   │                   ├── custom_roi_heads.py
│   │       │   │   │                   └── fed_loss.py
│   │       │   │   ├── tag2Text/
│   │       │   │   │   ├── __init__.py
│   │       │   │   │   ├── med.py
│   │       │   │   │   ├── swin_transformer.py
│   │       │   │   │   ├── tag2text.py
│   │       │   │   │   ├── tag_class.py
│   │       │   │   │   └── vit.py
│   │       │   │   └── umt/
│   │       │   │       ├── __init__.py
│   │       │   │       ├── datasets/
│   │       │   │       │   ├── __init__.py
│   │       │   │       │   ├── build.py
│   │       │   │       │   ├── kinetics.py
│   │       │   │       │   ├── kinetics_sparse.py
│   │       │   │       │   ├── mae.py
│   │       │   │       │   ├── masking_generator.py
│   │       │   │       │   ├── mixup.py
│   │       │   │       │   ├── rand_augment.py
│   │       │   │       │   ├── random_erasing.py
│   │       │   │       │   ├── ssv2.py
│   │       │   │       │   ├── transforms.py
│   │       │   │       │   ├── video_transforms.py
│   │       │   │       │   └── volume_transforms.py
│   │       │   │       ├── functional.py
│   │       │   │       └── models/
│   │       │   │           ├── __init__.py
│   │       │   │           ├── clip.py
│   │       │   │           ├── modeling_finetune.py
│   │       │   │           ├── modeling_pretrain.py
│   │       │   │           └── modeling_pretrain_umt.py
│   │       │   └── utils.py
│   │       └── vbench2_beta_i2v/
│   │           ├── __init__.py
│   │           ├── camera_motion.py
│   │           ├── crop_to_diff_ratio.py
│   │           ├── i2v_background.py
│   │           ├── i2v_subject.py
│   │           └── utils.py
│   ├── configs/
│   │   ├── dit/
│   │   │   ├── inference/
│   │   │   │   ├── 16x256x256.py
│   │   │   │   ├── 1x256x256-class.py
│   │   │   │   └── 1x256x256.py
│   │   │   └── train/
│   │   │       ├── 16x256x256.py
│   │   │       └── 1x256x256.py
│   │   ├── latte/
│   │   │   ├── inference/
│   │   │   │   ├── 16x256x256-class.py
│   │   │   │   └── 16x256x256.py
│   │   │   └── train/
│   │   │       └── 16x256x256.py
│   │   ├── opensora/
│   │   │   ├── inference/
│   │   │   │   ├── 16x256x256.py
│   │   │   │   ├── 16x512x512-rflow.py
│   │   │   │   ├── 16x512x512.py
│   │   │   │   └── 64x512x512.py
│   │   │   └── train/
│   │   │       ├── 16x256x256-mask.py
│   │   │       ├── 16x256x256-spee-rflow.py
│   │   │       ├── 16x256x256-spee.py
│   │   │       ├── 16x256x256.py
│   │   │       ├── 16x512x512.py
│   │   │       ├── 360x512x512.py
│   │   │       ├── 64x512x512-sp.py
│   │   │       └── 64x512x512.py
│   │   ├── opensora-v1-1/
│   │   │   ├── inference/
│   │   │   │   ├── sample-ref.py
│   │   │   │   └── sample.py
│   │   │   └── train/
│   │   │       ├── benchmark.py
│   │   │       ├── image.py
│   │   │       ├── image_rflow.py
│   │   │       ├── stage1.py
│   │   │       ├── stage2.py
│   │   │       ├── stage3.py
│   │   │       └── video.py
│   │   └── opensora-v1-2/
│   │       └── inference/
│   │           └── sample.py
│   ├── docs/
│   │   ├── acceleration.md
│   │   ├── commands.md
│   │   ├── config.md
│   │   ├── data_processing.md
│   │   ├── datasets.md
│   │   ├── installation.md
│   │   ├── report_01.md
│   │   ├── report_02.md
│   │   ├── report_03.md
│   │   ├── structure.md
│   │   ├── vae.md
│   │   └── zh_CN/
│   │       ├── README.md
│   │       ├── READMEv1.1.md
│   │       ├── acceleration.md
│   │       ├── commands.md
│   │       ├── datasets.md
│   │       ├── report_v1.md
│   │       ├── report_v2.md
│   │       ├── report_v3.md
│   │       ├── structure.md
│   │       └── vae.md
│   ├── environment-opensora.yml
│   ├── eval/
│   │   ├── README.md
│   │   ├── human_eval/
│   │   │   ├── generate.sh
│   │   │   └── launch.sh
│   │   ├── loss/
│   │   │   ├── eval_loss.py
│   │   │   ├── launch.sh
│   │   │   └── tabulate_rl_loss.py
│   │   ├── sample.sh
│   │   ├── vae/
│   │   │   ├── cal_flolpips.py
│   │   │   ├── cal_lpips.py
│   │   │   ├── cal_psnr.py
│   │   │   ├── cal_ssim.py
│   │   │   ├── eval_common_metric.py
│   │   │   ├── flolpips/
│   │   │   │   ├── correlation/
│   │   │   │   │   └── correlation.py
│   │   │   │   ├── flolpips.py
│   │   │   │   ├── pretrained_networks.py
│   │   │   │   ├── pwcnet.py
│   │   │   │   └── utils.py
│   │   │   └── script/
│   │   │       └── eval.sh
│   │   ├── vbench/
│   │   │   ├── VBench_full_info.json
│   │   │   ├── calc_vbench.py
│   │   │   ├── launch.sh
│   │   │   ├── launch_calc.sh
│   │   │   └── tabulate_vbench_scores.py
│   │   └── vbench_i2v/
│   │       ├── calc_vbench_i2v.py
│   │       ├── json_to_txt.py
│   │       ├── launch.sh
│   │       └── launch_calc.sh
│   ├── gradio/
│   │   ├── README.md
│   │   ├── app.py
│   │   └── requirements.txt
│   ├── notebooks/
│   │   ├── inference.ipynb
│   │   └── launch.ipynb
│   ├── opensora/
│   │   ├── __init__.py
│   │   ├── acceleration/
│   │   │   ├── __init__.py
│   │   │   ├── checkpoint.py
│   │   │   ├── communications.py
│   │   │   ├── parallel_states.py
│   │   │   ├── plugin.py
│   │   │   └── shardformer/
│   │   │       ├── __init__.py
│   │   │       ├── modeling/
│   │   │       │   ├── __init__.py
│   │   │       │   └── t5.py
│   │   │       └── policy/
│   │   │           ├── __init__.py
│   │   │           └── t5_encoder.py
│   │   ├── datasets/
│   │   │   ├── __init__.py
│   │   │   ├── aspect.py
│   │   │   ├── bucket.py
│   │   │   ├── dataloader.py
│   │   │   ├── datasets.py
│   │   │   ├── read_video.py
│   │   │   ├── sampler.py
│   │   │   ├── utils.py
│   │   │   └── video_transforms.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   ├── cache_functions/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── attention.py
│   │   │   │   ├── cache_cutfresh.py
│   │   │   │   ├── cache_init.py
│   │   │   │   ├── force_init.py
│   │   │   │   ├── force_scheduler.py
│   │   │   │   ├── fresh_ratio_scheduler.py
│   │   │   │   ├── global_force_fresh.py
│   │   │   │   ├── score_evaluate.py
│   │   │   │   ├── scores.py
│   │   │   │   ├── token_merge.py
│   │   │   │   └── update_cache.py
│   │   │   ├── dit/
│   │   │   │   ├── __init__.py
│   │   │   │   └── dit.py
│   │   │   ├── latte/
│   │   │   │   ├── __init__.py
│   │   │   │   └── latte.py
│   │   │   ├── layers/
│   │   │   │   ├── __init__.py
│   │   │   │   └── blocks.py
│   │   │   ├── pixart/
│   │   │   │   └── pixart.py
│   │   │   ├── stdit/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── stdit.py
│   │   │   │   ├── stdit2.py
│   │   │   │   └── stdit3.py
│   │   │   ├── text_encoder/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── classes.py
│   │   │   │   ├── clip.py
│   │   │   │   └── t5.py
│   │   │   └── vae/
│   │   │       ├── __init__.py
│   │   │       ├── discriminator.py
│   │   │       ├── losses.py
│   │   │       ├── lpips.py
│   │   │       ├── utils.py
│   │   │       ├── vae.py
│   │   │       ├── vae_temporal.py
│   │   │       └── video_sdxl/
│   │   │           └── blocks.py
│   │   ├── registry.py
│   │   ├── schedulers/
│   │   │   ├── __init__.py
│   │   │   ├── dpms/
│   │   │   │   ├── __init__.py
│   │   │   │   └── dpm_solver.py
│   │   │   ├── iddpm/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── diffusion_utils.py
│   │   │   │   ├── gaussian_diffusion.py
│   │   │   │   ├── respace.py
│   │   │   │   ├── speed.py
│   │   │   │   └── timestep_sampler.py
│   │   │   └── rf/
│   │   │       ├── __init__.py
│   │   │       └── rectified_flow.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── ckpt_utils.py
│   │       ├── config_utils.py
│   │       ├── inference_utils.py
│   │       ├── lr_scheduler.py
│   │       ├── misc.py
│   │       └── train_utils.py
│   ├── opensora.egg-info/
│   │   ├── PKG-INFO
│   │   ├── SOURCES.txt
│   │   ├── dependency_links.txt
│   │   ├── requires.txt
│   │   └── top_level.txt
│   ├── pyproject.toml
│   ├── requirements/
│   │   ├── requirements-cu121.txt
│   │   ├── requirements-data.txt
│   │   ├── requirements-eval.txt
│   │   ├── requirements-pllava.txt
│   │   ├── requirements-vae.txt
│   │   └── requirements.txt
│   ├── scripts/
│   │   ├── inference.py
│   │   ├── inference_vae.py
│   │   └── misc/
│   │       ├── extract_feat.py
│   │       └── launch_extract_feat.sh
│   ├── setup.py
│   ├── tests/
│   │   ├── test_attn.py
│   │   └── test_lr_scheduler.py
│   └── tools/
│       ├── __init__.py
│       ├── caption/
│       │   ├── README.md
│       │   ├── __init__.py
│       │   ├── acceleration/
│       │   │   ├── __init__.py
│       │   │   └── llava/
│       │   │       ├── __init__.py
│       │   │       └── policies/
│       │   │           ├── __init__.py
│       │   │           ├── llama.py
│       │   │           └── mistral.py
│       │   ├── camera_motion/
│       │   │   ├── __init__.py
│       │   │   ├── camera_motion.py
│       │   │   ├── detect.py
│       │   │   ├── requirements.txt
│       │   │   ├── utils.py
│       │   │   └── visualizer.py
│       │   ├── camera_motion_detect.py
│       │   ├── caption_gpt4.py
│       │   ├── caption_llama3.py
│       │   ├── caption_llava.py
│       │   ├── pllava_dir/
│       │   │   └── caption_pllava.py
│       │   └── utils.py
│       ├── datasets/
│       │   ├── README.md
│       │   ├── __init__.py
│       │   ├── analyze.py
│       │   ├── convert.py
│       │   ├── datautil.py
│       │   ├── filter_panda10m.py
│       │   ├── split.py
│       │   ├── transform.py
│       │   └── utils.py
│       ├── frame_interpolation/
│       │   ├── README.md
│       │   ├── __init__.py
│       │   ├── interpolation.py
│       │   ├── networks/
│       │   │   ├── __init__.py
│       │   │   ├── amt_g.py
│       │   │   └── blocks/
│       │   │       ├── __init__.py
│       │   │       ├── feat_enc.py
│       │   │       ├── ifrnet.py
│       │   │       ├── multi_flow.py
│       │   │       └── raft.py
│       │   └── utils/
│       │       ├── __init__.py
│       │       ├── dist_utils.py
│       │       ├── flow_utils.py
│       │       └── utils.py
│       ├── scene_cut/
│       │   ├── README.md
│       │   ├── __init__.py
│       │   ├── convert_id_to_path.py
│       │   ├── cut.py
│       │   └── scene_detect.py
│       └── scoring/
│           ├── README.md
│           ├── __init__.py
│           ├── aesthetic/
│           │   ├── __init__.py
│           │   └── inference.py
│           ├── matching/
│           │   ├── __init__.py
│           │   └── inference.py
│           ├── ocr/
│           │   ├── __init__.py
│           │   ├── dbnetpp.py
│           │   └── inference.py
│           └── optical_flow/
│               ├── __init__.py
│               ├── inference.py
│               └── unimatch/
│                   ├── __init__.py
│                   ├── attention.py
│                   ├── backbone.py
│                   ├── geometry.py
│                   ├── matching.py
│                   ├── position.py
│                   ├── reg_refine.py
│                   ├── transformer.py
│                   ├── trident_conv.py
│                   ├── unimatch.py
│                   └── utils.py
├── PixArt-alpha-ToCa/
│   ├── Dockerfile
│   ├── README(PixArt-alpha).md
│   ├── app/
│   │   ├── app.py
│   │   ├── app_512.py
│   │   ├── app_controlnet.py
│   │   ├── app_lcm.py
│   │   ├── style.css
│   │   └── style_controlnet.css
│   ├── asset/
│   │   ├── docs/
│   │   │   ├── pixart-dreambooth.md
│   │   │   ├── pixart.md
│   │   │   ├── pixart_comfyui.md
│   │   │   ├── pixart_controlnet.md
│   │   │   ├── pixart_inpaint.md
│   │   │   ├── pixart_lcm.md
│   │   │   └── sasolver.md
│   │   ├── examples.py
│   │   └── samples.txt
│   ├── configs/
│   │   ├── PixArt_xl2_internal.py
│   │   ├── PixArt_xl2_sam.py
│   │   ├── pixart_app_config/
│   │   │   ├── PixArt_xl2_img1024_controlHed.py
│   │   │   ├── PixArt_xl2_img1024_dreambooth.py
│   │   │   └── PixArt_xl2_img512_controlHed.py
│   │   └── pixart_config/
│   │       ├── PixArt_xl2_img1024_internal.py
│   │       ├── PixArt_xl2_img1024_internalms.py
│   │       ├── PixArt_xl2_img1024_lcm.py
│   │       ├── PixArt_xl2_img256_SAM.py
│   │       ├── PixArt_xl2_img256_internal.py
│   │       ├── PixArt_xl2_img512_internal.py
│   │       └── PixArt_xl2_img512_internalms.py
│   ├── diffusion/
│   │   ├── __init__.py
│   │   ├── data/
│   │   │   ├── __init__.py
│   │   │   ├── builder.py
│   │   │   ├── datasets/
│   │   │   │   ├── Dreambooth.py
│   │   │   │   ├── InternalData.py
│   │   │   │   ├── InternalData_ms.py
│   │   │   │   ├── SA.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── pixart_control.py
│   │   │   │   └── utils.py
│   │   │   └── transforms.py
│   │   ├── dpm_solver.py
│   │   ├── iddpm.py
│   │   ├── lcm_scheduler.py
│   │   ├── model/
│   │   │   ├── __init__.py
│   │   │   ├── builder.py
│   │   │   ├── cache_functions/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── attention.py
│   │   │   │   ├── cache_cutfresh.py
│   │   │   │   ├── cache_init.py
│   │   │   │   ├── force_init.py
│   │   │   │   ├── force_scheduler.py
│   │   │   │   ├── fresh_ratio_scheduler.py
│   │   │   │   ├── global_force_fresh.py
│   │   │   │   ├── score_evaluate.py
│   │   │   │   ├── scores.py
│   │   │   │   ├── token_merge.py
│   │   │   │   └── update_cache.py
│   │   │   ├── diffusion_utils.py
│   │   │   ├── dpm_solver.py
│   │   │   ├── edm_sample.py
│   │   │   ├── gaussian_diffusion.py
│   │   │   ├── hed.py
│   │   │   ├── llava/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── llava_mpt.py
│   │   │   │   └── mpt/
│   │   │   │       ├── attention.py
│   │   │   │       ├── blocks.py
│   │   │   │       ├── configuration_mpt.py
│   │   │   │       ├── modeling_mpt.py
│   │   │   │       ├── norm.py
│   │   │   │       └── param_init_fns.py
│   │   │   ├── nets/
│   │   │   │   ├── PixArt.py
│   │   │   │   ├── PixArtMS.py
│   │   │   │   ├── PixArt_blocks.py
│   │   │   │   ├── __init__.py
│   │   │   │   └── pixart_controlnet.py
│   │   │   ├── respace.py
│   │   │   ├── sa_solver.py
│   │   │   ├── t5.py
│   │   │   ├── timestep_sampler.py
│   │   │   └── utils.py
│   │   ├── sa_sampler.py
│   │   ├── sa_solver_diffusers.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── checkpoint.py
│   │       ├── data_sampler.py
│   │       ├── dist_utils.py
│   │       ├── logger.py
│   │       ├── lr_scheduler.py
│   │       ├── misc.py
│   │       └── optimizer.py
│   ├── docker-compose.yml
│   ├── docker-entrypoint.sh
│   ├── docker-readme.md
│   ├── environment-pixart.yml
│   ├── environment.yml
│   ├── notebooks/
│   │   ├── PixArt_xl2_img512_internal_for_pokemon_sample_training.py
│   │   ├── convert-checkpoint-to-diffusers.ipynb
│   │   ├── infer.ipynb
│   │   └── train.ipynb
│   ├── requirements.txt
│   ├── scripts/
│   │   ├── infer_pixart_8_bits.py
│   │   ├── inference.py
│   │   ├── inference_ddp.py
│   │   ├── inference_lcm.py
│   │   ├── interface.py
│   │   ├── interface_controlnet.py
│   │   ├── pipeline_pixart_inpaint.py
│   │   └── pipeline_pixart_reference.py
│   ├── timing_analysis.py
│   ├── timing_info.json
│   ├── tools/
│   │   ├── VLM_caption_lightning.py
│   │   ├── convert_pixart_alpha_to_diffusers.py
│   │   ├── download.py
│   │   └── extract_features.py
│   ├── train.sh
│   ├── train_latents.py
│   └── train_scripts/
│       ├── train.py
│       ├── train_controlnet.py
│       ├── train_diffusers.py
│       ├── train_dreambooth.py
│       ├── train_pixart_lcm.py
│       ├── train_pixart_lcm_lora.py
│       └── train_pixart_lora_hf.py
├── PixArt-alpha-ToCa-tools/
│   └── clip_score.py
├── README.md
└── flux-ToCa/
    ├── .gitignore
    ├── LICENSE
    ├── README.md
    ├── demo_gr.py
    ├── demo_st.py
    ├── demo_st_fill.py
    ├── docs/
    │   ├── fill.md
    │   ├── image-variation.md
    │   ├── structural-conditioning.md
    │   └── text-to-image.md
    ├── model_cards/
    │   ├── FLUX.1-dev.md
    │   └── FLUX.1-schnell.md
    ├── model_licenses/
    │   ├── LICENSE-FLUX1-dev
    │   └── LICENSE-FLUX1-schnell
    ├── pyproject.toml
    ├── setup.py
    └── src/
        ├── flux/
        │   ├── __init__.py
        │   ├── __main__.py
        │   ├── _version.py
        │   ├── api.py
        │   ├── cli.py
        │   ├── cli_control.py
        │   ├── cli_fill.py
        │   ├── cli_redux.py
        │   ├── ideas/
        │   │   ├── __init__.py
        │   │   └── cache_denoise.py
        │   ├── math.py
        │   ├── model.py
        │   ├── modules/
        │   │   ├── autoencoder.py
        │   │   ├── cache_functions/
        │   │   │   ├── __init__.py
        │   │   │   ├── attention.py
        │   │   │   ├── cache_cutfresh.py
        │   │   │   ├── cache_init.py
        │   │   │   ├── cal_type.py
        │   │   │   ├── force_init.py
        │   │   │   ├── force_scheduler.py
        │   │   │   ├── fresh_ratio_scheduler.py
        │   │   │   ├── global_force_fresh.py
        │   │   │   ├── score_evaluate.py
        │   │   │   ├── scores.py
        │   │   │   ├── support_set_selection.py
        │   │   │   ├── token_merge.py
        │   │   │   └── update_cache.py
        │   │   ├── conditioner.py
        │   │   ├── image_embedders.py
        │   │   ├── layers.py
        │   │   └── lora.py
        │   ├── sampling.py
        │   └── util.py
        ├── geneval_flux.py
        └── sample.py

================================================
FILE CONTENTS
================================================

================================================
FILE: COCO_caption_prompts_30k.txt
================================================
A man about to return a serve with his tennis racket
A horse drawn carriage in a historic city.
Two gray fire hydrants sitting next to each other at a park.
A goat with horns is standing in a grassy field.
Several buses parked under a carport in a parking lot.
A bowl of soup with bread and a cup of coffee.
Three polar bears walk across a snowy field.
A glass mosaic vase filled with colorful flowers.
A bunch of apples in large trays on top of wooden crates.
A girl does a skateboard trick in the air.
A dog rests in the grass next to a fire hydrant.
A giraffe standing in the shade of a nearby tree.
a plate of chilies with carrots and peas
A motorcycle parked in front of a red brick wall.
A tour bus stopped near a mountain while people gather nearby
A young girl and her dad play with kites in a park.
a man who appears to be herding sheep is closing two big fence doors
A man with glasses holding a glass of wine.
Group of skiers and different colored outfits on Ace Eastlynn.
A fortune note on a tea bag next to a bagel.
a man walks along the beach with a surfboard
The view of a clean toilet surrounded by marble tile.
a couple of people that are skateboarding down the road
A couple of cars are riding down the street from a window view.
A pet crate and a lot of tools and wires.
a man holding a kite and a dog in a field.
Four women with snowboards and gear are posing for a photograph near some snowy mounds.
The boy and his dog are posing for the camera.
The streets and the double decker bus are lit up in the night.
A small vase has a good luck plant in it.
a man riding on the back of a bike on dirt ground.
A group of people standing on the street.
The girl is riding her skateboard while using her cell phone.
A person doing a trick on a skateboard in the road
A lone zebra is walking in tall green grass.
A fuzzy image of some people on skate boards.
A man throws a frisbee to another man with two children.
a number of people in a body of water with a small boat
A complete train set, with tracks, buildings, and three piece train.
A couple of buses parked in front of a building.
A girl wearing a wet suit surfing in the ocean.
A picture of something and it appears like sustenance.
A man on a skateboard on a concrete lip.
A person walking next to a horse at a horse show.
A man with a beard looks pensive and wears a tie.
A blue vase filled with colorful flowers sitting on the ground.
A street name on a sign built into a curb.
Two zebras are behind a fence on green grass.
Young person on the street skateboarding wearing a helmet.
A toilet with a red seat in a small bathroom with red tiles.
Woman lying in an unmowed field with a frisbee.
Cars and a bus driving down a busy road.
A blurry image of some car lights on a dark night.
A smiling young man stands beneath an Obama street sign.
this is a train riding under a bridge
A group of giraffes eating bark off trees.
A horse is looking in the living room window of a farmhouse.
A man holding a frisbee on a beach next to another man.
a mixture of black and white sheep in a dried out field
A chocolate caked covered in strawberries sitting next to a knife.
Two people standing next to each other on a snow covered slope.
A young boy is flying a kite in the park.
Baseball pitcher in the process of pitching in a baseball game.
Two men are in the water on a boat.
A fighting plane turning sideways in a cloudy sky.
Several red roosters together in a small area.
A few small boats sail down a waterway.
A dog and a cat are looking at the snowy front yard through a glass door
A small child sitting on a shelf with teddy bears.
One giraffe standing behind a dead tree branch.
A bright green kite with a scary monster face flying high
A man holding a tennis racket and staring at the camera with pride.
This is a thing that is straightforward and plain.
A large open field with small bushes and trees, and a giraffe standing in the middle of the field.
Two women who are sitting at a table together.
A small dog eating out of a bowl on the floor.
a cow that is standing up eating a pan
Two people smile as they ride on an elephant.
some city workers work on a car crash
A giraffe walking past a tree on a dirt landscape.
A tow truck vehicle on a street in a city area.
Professional baseball player winding up to pitch the ball.
A dog is wearing a baseball hat over it's eyes.
A bearded man standing in front of bookcases
A horse drawn carriage traveling away fro ma very large cathedral.
a man is standing inside of a food truck
There is a stop sign with two road signs on top of it.
A girl looking a a beautiful view of the Rockies.
A very big high ceiling room with a yellow fan.
A man in an office chair looking at a laptop next to a glass of wine.
a brown desk a keyboard a computer and a monitor and speakers
A woman talking on her cell phone while walking.
An abandoned train with lots of graffiti painted on it.
a person bending down cutting another persons hair
A plane at an airport with a truck driving past
there are many men sitting at a small table
A girl looking inside a living cartoon refrigerator.
Two sheep are in a dirt outdoor enclosure.
People look on as a ball heads towards a batter.
Several glazed donuts are lined up on a tray.
A guy standing on a snowboard in the snow.
There is now image here to provide a caption for.
A man practicing baseball on a field.
A bedroom that has a large computer desk in it.
A close up of a pizza with spinach on it.
a living room with a couch a tv and a table
A man checks his cell phone as he walks to his car in the parking lot.
Several people on skis in the snow outside of a lodge.
A small pink beanie hat next to a cell phone.
A mother handing her son a piece of cake on a  paper plate.
a man in a tuxedo sits at a table and uses a laptop
A young man holding a Nintendo Wii game controller.
A wrap of some sort on a plate with potatoes
A motor boat next to a beach and others in the background
A fruit bowl sitting on a table with bananas and apples.
Three people running around in grass playing Frisbee.
A  person sits down to their meal of a sandwich on a croissant with a side of french fries.
A man is holding up an old cellphone
a couple of animals standing in a field
a cup on a table next to a tv
A large brick building with a tall tower containing a clock near the top
A figurine of a little boy riding a snow board in yellow pants.
A memorial set on a fence by the ocean with flowers and teddy bears.
A group of white sheep eating from blue bowls
A couple of kids laying in a bed with an umbrella.
A small table set with pastries and tea
A rainbow lorikeet parrot eats sun flower seeds.
A white plate topped with a sandwich and a salad.
A woman getting ready to hit a tennis ball.
A tennis player jumps into the air and swings his racket.
A mustached man is standing in front of a larger mustache.
Street signs showing the intersection of Eight Mile and Shadyside
A white desk has a computer, keyboard, globe and green phone.
A man with bandaged hands lying in bed.
A baseball player is trying to hit the ball.
a woman gets ready to pet a big horse
A herd of black cows grazing in a field.
A very pretty girl looking at her cell phone.
A man rides a bicycle across a wet intersection.
a man with a bat walks as other look on
Two zebras are running through some high grass.
A baseball stadium full of fans while two teams play ball.
there is two pictures of a female tennis player
There are flowers that are in a vase filled with rocks.
Fruit and vegetables are hanging in a metal basket.
A boat that is sitting in the water.
A living room with a couch, TV, and fireplace.
some people walking up a snowy hill with skis
A woman is eating spam off of a plate with a camera next to her.
A person that is playing in a tennis game.
A red truck in street next to wall and buildings.
A light with multiple bulbs is on a tall post.
Grey toned elephant head closeup with grass and hill background.
A man in striped shirt sitting on a fire hydrant.
A person on a skateboard and bike at a skate park.
a close up of a red tie and a white and blue shirt
A kid about to ride his skateboard down a pool.
LIVING ROOM WITH COUCH, TABLE, END TABLE, LAMP, PHONE, AND MIRROR
A happy boy is waiting outside with his suitcase.
Two men stand together using their cell phones.
A group of men cutting into a celebratory cake
A man eating a donut wrapped in tissue paper.
A bedroom with a bed with blue cover and blue curtains, and a pair of shoes on the wooden floor.
A penguin is standing and pecking at a teddy bear left in the snow.
Two brown bears sitting on top of a black and white checkered bed.
Two lambs with black heads look out from a gate.
This person is holding a cell phone while standing on the sidewalk.
There are a group of people snow boarding on the hill
A jet on a runway near other jets.
A baseball player is getting ready to go to bat.
Man on a black motorcycle wearing a helmet.
Horse standing in dead grass area near fenced field.
someone is skiing through the trees by themself.
The zebras are grazing in the field together.
The cut sandwich has meat, lettuce and tomato.
A small dog sitting on a stuffed animal teddy bear
A person holding a slice of pizza in their hands.
A man wearing a hat eating a hotdog at a sporting event.
A painting of a man sitting next to a woman near the ocean.
People are standing in front of a small store
a train on tracks at the front of the train depot
Elephants moving along on a very open field of some sort.
A cat sits curiously perched in an empty cup
Large collection of scissors attached with price tags.
An older man and two kids sitting on a bench.
Two motorcycle riders are riding a motorcycle bike.
People watching motor cross bike riders racing on a field
Two small children stand together scrubbing an elephant.
a close up of a shirtless man wearing a neck tie
The view of a distant mountain taken from an airplane window.
A train on tracks in a city with high rises.
PEOPLE BOARDING A BUS PARKED ON A STREET.
A woman holds a decorative umbrella and walks with a man.
A couple of cows standing on top of a grass covered field.
A crossed eyed man holding a remote in his mouth.
A single young giraffe stands and looks forward.
two giraffes standing next to several huge rocks
A person is flying a kite on the beach.
Two giraffes are standing next to a tall fence.
there is a person holding up a nokia lumia phone
A boy sitting at a table while he puts something in his mouth.
A man sitting in front of a laptop computer in an office.
A woman is on the side of a mountain in ski gear.
A view of a bed from across the room, it has a TV tray on it.
View of traffic signal against a dark sky that looks like rain.
A boy being affectionate with a baby on a bed.
A man with a top hat on and a carrot in his mouth.
There is a stove and a sink in a narrow kitchen.
A hand picking up a bunch of bananas from a display.
A cow and a person on a horse in the dirt.
Many doughnuts on a display in a store.
umbrellas, trees and a hut line a sandy beach
A man eating a hot dog on a tray
Some stacked with much sublime sustenance ready to eat.
Seven people smile as they pose with tennis rackets.
A school bus is parked by a street sign.
A train coming out of an enclosure under a snowy mountain.
A woman holding a surfboard walking into the ocean toward a dog.
Three people on snowboards riding down a snowy slope.
A black dog on a leash holding a frisbee in his mouth.
A woman water boarding in a lake near land.
A red stop light on a street at night.
A kitchen with wood cabinets and white countertops.
A bathroom decorated in pink ceramic tile and wallpaper..
A platter on a table that has pizza on it.
A snowboarding is doing tricks on a ramp.
A desk with a computer, a keyboard, a mouse, a bobble head, speakers and a lava lamp on it.
A man in a crowd balancing a skateboard on top of his head
A bathroom with a paper dispenser, toilet roll and garbage bin.
A seagull majestically flying through the air over the ocean.
A bed topped with a colorful blanket and lots of papers.
A person looking into a convex mirror on the front of a school bus.
wild animals graze in fields in front of a lake and snow-covered mountains
The side dish of the meal consists of a macaroni salad.
A plated meal on a table with flowers.
THERE ARE TWO ZEBRAS THAT ARE STANDING BY EACH OTHER
a cross walk sign in a busy city as light up the walk symbol
A small cooked pizza on a dining table.
a close up of a bunch of bananas and a container of garlic
a couple of people sit on a horse pulled cart
A wooden park bench sitting in front of a window.
Two white vases on a shelf next to a window.
Wearing shorts, a man holds up a snowboard while standing in the snow.
Two people playing tennis in a neighborhood park.
An outdoor area that has a glass top table with a plate on it and a blue vase with flowers in it.
An empty bathroom with toilet and pictures on the wall.
Two yellow bowls of food containing broccoli and potatoes.
Group of people paddling boats on water in front of a city.
A child holds an object while someone else cuts it
A man approaching a water ski jump holding on to a wire.
a toilet with a shower near by with tiled walls
A skateboarder performs a trick while being photographed.
A white fire hydrant near the address 700 Jones Street.
A bed with red sheets on it and messy blanket and a lap top.
A yellow school bus traveling a dark road.
A boy wearing a helmet and using a skateboard on the sidewalk.
A woman holding a knife over an unconscious woman.
A close-up of a man brushing his teeth.
a man sits next to a child as he uses a computer
a kitchen view of cabinets a stove microwave adn refridgerator
A dog and a railroad official and one person in train yard.
A motorcycle is parked inside of a building.
Two elephants that are walking in the dirt.
A baseball player throws a pitch while others watch from the dugout.
A group of people walking around an area together.
people on a small boat in a body of water
a train station and a train removing much smoke
There is a elephant in the grass, there are also trees in the background.
A small dog sitting on top of a couch cushion.
A security officer using a segway as a footrest
The mostly eaten pizza slice is next to olive pieces.
A baseball player starting his run for first base.
A man standing on top of a tennis court holding a racquet.
A post clock is positioned on the sidewalk with flags in the background.
Cars drive down a multi-lane road and pass businesses.
Inside of a living room with a sofa and several tables.
Two small suitcase is sitting in front of a white sheet.
a living room with couches and a table
Elephant with a brown eye hyper focused in the camera.
A man in a white shirt has his hands on another man's shirt collar.
A picture of five african american's sitting on a bench and chair.
A counter holds tomatoes, bananas, pineapples and other fruits.
A girl and a woman watching a candle being lit up on top of a cake.
Pair of giraffes walking on grassy area in enclosure.
A red fire hydrant next to some stones.
a yellow and red apple and some bananas
A green and yellow rain on tracks with building in background.
A large elephant is standing in a fenced in enclosure.
A boy on a boogie board in the snow.
Photos of sports memorabilia including shirts, caps, and baseball bats.
A young child mutton busting at a rodeo event
The bald man leaning against the tree holds his face in place with one gloved hand.
A young man with a neck tie untied around his neck
Sheep are running across a green field of grass.
a simple, normal toilet with the lid closed
A cat look through a window at a dog.
Busy city street with red signs on the traffic lights.
several cows one lying and one standing in a dirt field
A plastic container sitting on top of a table.
An old fashioned television and a newer electronic gadget sitting on top.
A white pickup truck sitting in front of two wooden scaffolding.
A child biting into a piece of pizza
The people are over by the cows in the water
A couple of cows standing on a lush green field.
A view along the transept of an older style church.
The bed in the room has been made with a large purple blanket.
A table that has people sitting around it with food in front of them.
a group of small dogs are staring out of the window
Two people sitting on a park bench near trees.
A table has a bowl, candle, and Christmas decorations on it.
A Skyteam Delta airline passenger jet taking off from an airport.
a black white yellow blue green and red kite and a person
A STREET SIGN POINTING IN WITH A TREE AND BUILDING IN THE BACKGROUND
Two different slices of pizza with tissue paper under it on a paper plate.
People riding a small boat under a very large bridge.
A group of people injured and covered in blood.
Three fire hydrants that are green stands near a parking sign at night.
A striped giraffe is grazing in the grass.
three chickens some water a fence and trees
Four men play tennis together on a sunny day.
A counter filled with coffee, cookies, and bagels.
a baseball player with a bat on a field
A green truck with a canvas tarp over the bed.
A man stands in a large kitchen holding a coffee mug.
An old motorcycle standing in a grassy field.
A table with a white plate of food that includes salad and sausage.
A cat on top of the counter sitting next to vegetables.
A small green leafy plant in the ground.
Two people eating hot dogs on a busy sidewalk.
A bird perched on the limb of a tree
a person riding a snow board on a snowy surface
A boy doing a trick on a skateboard off a rail.
Two woman are standing behind a large teddy bear.
A couple of people standing in the water under a kite.
A close up of the front of an old locomotive.
The man in the hat is carrying an umbrella.
A red brick building next to a green door.
A dog looking at a book called "The Marriage of True Minds" by Stephen Evans.
This is an image of an Air Canada plane flying.
A bird perched on a plant in the middle of a forest.
A living room filled with brown furniture on top of hard wood flooring.
A black computer keyboard in a dim room
The traffic lights are clearly visible for us to see.
A couple of birds sitting on top of a rocky beach cliff.
A couple of people throwing a Frisbee in a field.
A sandwich with a dipping sauce served on a plate
Road sign for the corner of Jackson and Montgomery
A large group of giraffes roaming around in an enclosure.
Stainless steel industrial stove sitting in a white and black kitchen.
A little girl is posing for a picture and holding an umbrella.
A couple of people with surfboards on a beach.
a woman is standing in front of a giraffe
A man in a blue shirt looking at his cell phone.
A man on a skate board jumps high in the air for a trick.
A woman sitting on the grass behind a pile of stuffed animals.
The barbecue sandwich is on a plate near a glass of wine.
Wrapped utensils are a part of a sterile and healthy meal.
A metallic refrigerator freezer sitting in a kitchen.
The child is putting the tooth brush in his mouth.
A white comforter with a toy, book and child shirt on the top of it.
A picture of a room in a house.
The wildflower is sitting in the glass of water,
A girl showing parrots to a group of children
Two zebras fighting outside in an opening near some trees.
Two surfers walk down the beach holding their boards.
A group of people in a courtyard next to a pavilion.
A renovated kitchen with wooden cabinets and white refridgerator
Some soldiers are standing in line for food
a man on a surfboard surfing a wave
Two giraffes are standing together in the wild.
The boat and the truck are parked by the dock.
Oranges and bananas sitting in a stack together.
An intersection with traffic lights and a street sign.
Three horses standing close but in an open field.
A person is riding a motorcycle in the mud.
A statue of a giraffe is in a Children 's Hospital.
A man is standing in a room with something yellow.
A mother elephant and her tiny calf walk through the trees.
A breakfast plate with eggs and meats, served with a gourmet coffee.
a black and white photo of a person holding a skate board
A boy swinging a baseball bat at a ball.
A meter maid car is by a fire hydrant.
A person flying a kite over another person on a roof.
A  car parked behind a wooden bench.
A young skateboarder rides down the street alone.
A row of bikes parked along a sidewalk beside some cars
The bathroom in the home was just cleaned.
A circus act with five elephants and some women put on a show.
There is a muffin with white frosting and walnut bits on it.
A curious giraffe leaning over into a car at a zoo field.
Two signs one with the speed limit and one telling what freeway is which way.
a lady that is on a tennis court with a racket
A man sitting on his couch using his laptop
A statue of a cowboy on a horse in the middle of building.
There are street signs that show a direction of travel
A couple of small beds and mirror in a room.
A couple of large jetliner sitting on top of an airport tarmac.
The room is decorated in terra cotta tile.
A white plate topped with a pizza next to a bowl of salad.
Many suit cases are stacked on top one another
A gray and white cat sitting in front of a mirror.
A dog riding on the back of a motorcycle down a street.
A coffe and plate of bread sit next to a pillar.
a red and white tail of a large plane
Several pilots walking as a group across a street.
A man riding a skateboard on the side of a ramp.
a woman walking by a display with teddy beas and bottles
A street sign on a light pole near on a city street.
A male standing behind eight pieces of luggage.
a vintage black and white picture of a train
Two colorful umbrellas open against a blue sky.
A small bookshelf is filled with books and decorative items.
Several people walking in the snow, some carrying skis.
a bottle of whiskey and a bottle in a brown bag on top of a fridge
Women waiting for luggage at an airport luggage carousel.
Two sheep standing side by side at a petting zoo.
A pizza pie sitting on a board on a table
A couple of airplanes that are on a runway.
a bathroom with white walls and brown tile
A woman walks down the water with a surfboard.
this is an image of a train with black smoke.
A lady is on the entrance of a train holding her luggage.
A man is catching a frisbee while playing a game.
A skateboarder with his skateboard is sitting on the side of a ramp.
On a bright day, a young elephant in partial shade near a tree.
Young people stand near a bus with a large amount of luggage.
A bear costume cutting some cake with a Park ranger.
a large clock reading 5 54 on the side of a building
An unfurnished room contains a sleeping bag on the carpet.
A little girl that is standing next to a horse.
Two white bowls with vegetables, meats and herbs and chopsticks nearby.
A kitchen with a stove, refrigerator and cabinets.
a small child is sitting on a bench outside
Brown, white, and black rams eating on a hill.
A skateboarder performing in front of a crowd riding a rail.
This is an image of scooters and bicycles.
A cell phone peeks out of a crocheted cell phone holder.
A kitchen that has white cabinets and black counters.
Man on cellphone behind curtain while art displayed in front.
Two men in bucket hats taking frisbees out of a frisbee golf bucket.
A snowboarder grabs his board while high up in the air.
A woman's eyes are hidden by the cast of a shadow.
A black and white cat sitting in a bathroom sink.
A stop sign in the grass beside an old farm silo.
An Asian family that is eating pizza together.
A yellow train traveling through the green countryside.
A man and woman are standing beside each other playing a video game.
Two cats lounging on the back of a couch.
This picture shows sand, water, and some type of silver and red pole equipment.
A cut dog in a basket with orange ears.
An adult and child are skiing in the snow.
Two children playing with the knob on money meters.
Flowers sitting in a glass vase on a desk.
A laptop computer is seen sitting next to a television.
A small bathroom has a vanity, mirror, toilet and bathtub.
a black and white photo of people eating
a group of zebra standing in the sand in a fenced area
A man standing by a kitchen counter doing something
A bird is sitting on a silver truck
A mother and son sitting in a bed with two cats
A large piece of meat surrounded by vegetables.
White oatmeal sitting next to toast, coffee, and orange juice.
A bathroom with a shower combination tub and sink.
A white and brown cat sitting on the shelf in a cabinet.
A man is driving a small train with children.
A man and a woman cutting a sheet cake with a knife.
a man cooking some hot dogs on a grill
some kind of chicken, rice, and vegetable dish on a pizza tray being served to a man.
One man stands on top of the train while another man stands on the platform.
A piece of cake is served on a plate.
a close up of a child on a skate board
Fruit juice is spilled all over a counter next to a knife and two pastries.
A junk pile that looks to be piled with old bathroom sinks.
Two black and yellow circular clocks affixed to an office building.
There is a single bed in an old room with a window.
An a kitchen is being cleaned and decluttered.
Colorado Rockies' pitcher about to release ball from mound.
A bird perched on top of a tree branch under a light blue sky.
There's a desk with a laptop, phone charging, and other various electronics.
The two men are standing outside by the tail of the airplane.
A couple of brown horses pulling people on a wagon.
A small plane flying over an ocean with waves
The large SUV drives along a busy street.
A row of motorcycles parked on a city street.
a kitchen decorated with a couple american flags
A room with a chair, a piano, and a laptop.
A cat sitting on the side of a car door window.
Two pelicans on the sidewalk in the foreground with several more in the water in the background
A black and white photo of a man and woman sitting on a bench.
A picture of a thick crust pizza and a bottle of wine, setting on a table.
A MAN WEARING A SUIT AND A TIE STARING
Male surfer riding a large wave with sun low in the sky
A snowboarder hitting a trick on a trail, jumping over a person.
The man is feeding the elephant with milk
A bathroom with white fixtures and blue accessories
two brown horses in a field gazing around
Several people that are playing video games together.
A plate with beans, broccoli, small sausages, fork and a small container.
A black and grey double decker bus next to a building.
A smoking women in a scarf makes a phone call.
A person is traveling down the road on a motorcycle.
A group of men in colorful jackets skiing down a hill
A man and two others skiing across a snow covered field.
A man is standing over a black motorcycle.
A KITHCEH WITH A MICROWAVE SINK AND REFG
A jar of peanuts and a cell phone sit on a laptop computer on a cluttered table top.
A woman holding her child so she can see her birthday cake.
there are two giraffes that seem to be embracing each other
A table with wine glasses and people on the counter
A wooden bench sitting on top of a green grass covered ground.
A long couch with many pillows, a table and some seat cushions around it.
Six people are paddle boarding in the ocean.
Two young men play a game of soccer on a field.
A black and white image of tennis players.
This is an aerial view of a tennis player hitting the ball.
An item is capture here in the photo.
A cow grazing in a field next to a fence.
An individual is taken in this very picture.
People are outside flying kites in the sun.
a person riding a skate board on a street
There is a flower display in the corner of a room
A man pulling a sled behind him while using ski poles.
A person playing a game of tennis and other people watching.
A decorative congratulations cake for a graduating student.
A group of young people standing next to each other on top of a field.
a semi truck loaded to the top with sheep
A white truck crosses an intersection behind a traffic light.
Two street signs sitting on top of a metal pole.
The man in the red shirt is going to hit the ball with his racket.
A shop called Pendulum with a clock out front.
a traffic light on the side walk of a city street
Two signs above a blue pole under a blue sky.
A skateboarder is getting ready to skate down a ramp.
A man wearing a black vest and black glasses.
A bicyclist stopped beside a fence feeding or petting sheep.
An orange and white bus crossing under a blue footbridge.
A flock of birds landing in a field of grass
A man reaching into a bucket near an elephant while another elephant stands near a pond in the back.
A cat standing on the keyboard of a laptop.
An empty and open silver metallic refrigerator in a kitchen
Couple of people about to share kiss in front of wooden building
A person para-sailing in the water with mountains in the background.
a yellow taxi riding down a street that has a building with clock
a snall toilet and a sink in a bathroom
A zebra standing on a grassy pasture in the daytime.
A group of people playing a game with remote controllers.
A fighter jet with two streams of smoke coming out the back.
A bike parked on the side of a city street.
A glass plate topped with sliced apples and caramel.
A player swings at the ball during a baseball game.
A dog and man sit on rocks by water.
A bunch of food on a tortia in some foil
A woman tennis player is in a cropped photo.
A plate of chicken, rice, and some vegetables.
A man is staring at the viewer while a man plays a guitar and a woman sticks her thumb up sitting on a busy sidewalk.
A large jetliner sitting on top of a runway.
A person presenting a birthday cake to another.
A woman with a child on skis go down the snow.
A fridge in the middle of some cabinets
Kitchen knives and scissors are stored in a wooden holder.
A man speaks to some children on a farm.
Two zebras are walking along a path outdoors.
A woman lying in a bed looking at a laptop.
A lot of motorcycle people that are on the road.
A dog sits on a rug with its eyes closed.
A pick up truck parked near a strange house
A deformed orange sitting on top of an orange tree.
peeled banana sits on a table uneaten and ripe
A clock hangs from the wall of a beat up room
A woman sitting on a chair blow drying her hair.
A white plane with two people standing in front of it
A young girl with a cape holds onto a kite.
The puppy is eating food from the tiny bowl.
A hummingbird is floating next to the feeder.
A man riding a surfboard on a wave in the ocean.
A werid skirt like outfit on a person.
A security officer is setting up traffic guiding signs
A tropical beach with a banana tree in the forefront.
Military officer in dress uniform with many medals.
A boy doing a trick on a skateboard on a ramp.
A young boy is eating a meal in his pajamas.
This is a vintage photo with four men in it.
A dog is laying on the bed like a person.
Players react to the ball being hit at a baseball game.
A toilet with its lid raised in a stall.
Young boy and his plastic skateboard at home
Set of toy animals sitting in front of a red wooden wagon.
A silly brown dog wears sunglasses as it sits in a car
Broccoli is on a cutting board and is being cut in to smaller pieces.
an image of a tennis racket and tennis ball
A snowboarder sitting down with his snowboard on his feet.
A tennis player pauses during a game in a public tennis court.
A group of people on skis and snowboards outside.
a couple of trains parked on some tracks under a closed roof
Red double decker buses on a city roundabout.
there is a game that is ging on at thte gym and people aer looking
The man is wearing a tee shirt and a tie-dyed tie.
A group of motor cycles parked on the street
A set of two pictures showing a group of young people standing under a gazebo and next to surfboards.
A man is shown feeding an baby elephant.
There is no image here to provide a caption for.
Three zebras are shown in a black and white photo.
A Chinese public train waiting at the station.
A man walking along the shore with a surf board.
Two giraffes looking at a photographer inside of a barn.
A dog looks up at a flying disk.
A group of men standing on a city street.
A dog stares intently off to the left in front of a glaring TV.
A small pizza sitting in a frying pan of food.
A mirror that is on a tiled wall.
A black and white photo of people waiting at a boat ramp
Modern jet airplanes lined up on the runway ready for take off
Two white cows sitting in a farm area.
A stop sigm at an intersection with some graffitti on it.
A seagull at the beach with food in its mouth
A passenger bus that is driving down the street.
An umbrella strapped to the cross bar of a bicycle
View off the wingtip of a passenger airliner on a taxiway.
A cute little girl smiles for the camera
A city with traffic lights, cars and buses.
A man skate boarding in a pool with another man looking on.
a young woman cuts up some food on a trey
Two men skateboarding with a light and a camera.
A group of drinking glasses sit along a bar, with two people nearby.
A person on a motorcycle on a track near another person.
A food item is shown on a napkin.
two giraffes are standing in the open field.
A man riding the back of a brown horse.
A bunch of statues that are in the grass.
A man riding a snowboard down a snow covered slope.
A man swinging a baseball bat at a ball.
A cat sticking its head out of a cement wall looking up.
a few cowboys stand watching some animals outside
The woman is riding the horse on the course.
a woman sitting on a wooden park bench smiling at the camera
A cat sitting on a wooden chair in a room.
The animals are grazing on the wheat grain
Two men playing frisbee in a park
Two people standing in a market by a fruit stand.
Several cars are seen going down a city street.
A grown elephant and a young elephant roam freely together in an open field.
Woman in sunglasses hugging a red fire hydrant.
a teen standing on a skateboard while riding part of the wall
a close up of a cat laying in a luggage bag
A big family pose for a picture with a surfboard
two motorcycles line up as they lean against some seats
Two puffins sitting in some grass on a mountain.
Many young men pose for a picture.
a person showing cellphones on sale in a shop
A YOUNG GIRL ON A SKATEBOARD IN A PARKING LOT
A man is posing excitedly on a surfboard.
A very nice boat on the water with a dog on it.
A young man holding a Wii mote plays a video game
A stack of suitcases stacked in a front lawn.
A gang of bikers riding down a street.
Little girls play soccer on a field on a sunny day.
A living room and dining area with hard wood floors.
A bunch of ripe oranges are stacked neatly on top of each other.
Someone is making a sandwich consisting of carrots and alfalfa sprouts.
A brown cardboard box filled with bananas, apples, oranges and kiwis.
The young kids are playing a game of soccer.
The kitchen bar is near a dining room table.
A food container with five sections filled with various items.
A woman sits on a brick wall, holding her umbrella, looking out at the city.
Woman talking on cellphone in a dining room.
A small industrial machine car on train tracks.
A desk with a telephone, laptop, cell phone and a book on it.
a man in glasses is playing with a white controller
A BIG  BOX OF TOMATO AND BASIL PIZZA.
A large group of people on a field playing soccer.
A yellow and black train is on a train track.
A black and white cat curled up on a brown checked sofa.
A man is taking a picture of a bus.
Two horses standing near each other in  a field
A lone swan swims in a river near a bridge.
Woman playing tennis with bleachers in the background.
A vintage photograph of a war plane flying
Two food items are displayed on separate plates.
A brown and gold fire hydrant in front of a brick building.
THIS IS A PICTURE OF A FEW ZEBRAS GRAZING IN A LARGE ENCLOSURE
an image of umbrellas lined up with tables
A group of sheep walk along a dirt path.
A desk with a pc, monitor, laptop, mouse, and stuffed animal
A gray and white cat laying on its back with its head looking up in an open drawer.
A man walking a bike near a train station.
Photo of a living room with a Christmas tree in the corner.
A toilet and some buckets in a small room.
Large group of ships tied together at a pier.
An empty road with a red stoplight that spans over the road.
A black dog playing in the ocean while barking.
A dark gray bird flying towards a palm tree
A motor home parked along side an outdoor flea market.
A closeup of a wine glass and a wine bottle
A boy jumping up over a bench on a skateboard.
A huge, captive fish gapes his mouth open at a woman taking a photograph in an aquarium.
a person reading a book and cooking food on a stove
Onlookers watch as a skateboarder performs a jump.
Some wooden benches are in the middle of the forest.
A man standing on top of a sidewalk holding a skateboard.
A laptop sitting on a living space table with a spacious desert view.
four colorful vases of different types are sitting on a shelf.
A close up of a stop light positioned against a high rise building.
A room features two identical beds with stools at the end.
A computer screen showing photos on it while a smokestack is visible out the window
Night shot of skateboarders in wide open area with lights above.
A couple of giraffe standing next to a zebra near a rock wall.
A table holding a white plate with bananas and a brown glass.
A bus that is travelling on a road in a town that has many houses and buildings.
A pair of giraffes grazing on hay by a fence.
A couple of plates of sausages, broccoli and purple food.
A family of giraffe on a wild field next to zebras.
A large cow standing in a grassy field near other cows.
A tennis player is jumping and reaching to hit the ball.
A couple of people on a wall playing with a Frisbee.
a laptop on the ground near a turn table
An old clock with a flower design in a small room.
a kitchen with a refrigerator near a sink
Men loading luggage from a train onto a cart.
A fan closely watches the professional baseball batter
Little boy and girl sitting on the porch eating their meal.
A man standing behind a display case filled with jewelry.
A black and white photo of a man fixing another's tie.
a person preparing an authentic pizza on a wooden spoon
A street sign next to a traffic sign next to tall buildings.
Herd of wild cattle walking along the beach
A city street with people out and two large buses
a small dog with some glasses over its eyes
a dormitory consisting of many beds lined up along the wall
A kitchen has a stove,microwave, and wooden cabinets.
An electric commuter train on the tracks under a cloudy sky
The person is wearing black clothes, shoes, and hose.
There is a child that is walking in the grass
Hundreds of bicycle enthusiasts embark on a race on a city street.
A man in the air skateboarding at the park
A close up of a Harley Davidson parked on the road.
A man holding video game controllers and playing.
Gray and white bird with red crest using bird feeder.
A white car passing a person in a black jacket.
a large train is going down the tracks outside
Two ponies together standing on a mountainous terrain.
A small bathroom with a shower, sink, and wooden medicine cabinet.
Small train coming out of a tunnel on an overpass.
A group of people who are standing in the dirt.
bread with banana milk and nutella on a table
A clean and tidy kitchen with a stove, dishwasher, microwave, window and a door.
A cat sleeping on top of an open laptop computer.
Two motorcycle riders talking on the side of the road.
A bunch of people is watching something and a man in a brown and blue striped shirt has his fingers in his ears.
A ukelele is passed over a table with cake and lots of food.
A giraffe is standing near its fenced area observing.
A pasture with sheep in front of a large home
A large group of birds sitting on metal pipes in the water.
A group of people sit down at a table to share a meal.
A man pinning a number to a child's shirt.
A dog sleeping on bed against the wall.
a room with wood flooring filled with furniture.
Two zebras stand close to each other in a field.
a person on a bicycle a bus a truck and a child
a vandalized stop sign in the dark with a sky background
a red and yellow trains engine pulling its cars and some tracks
A woman sitting on a bed with a laptop.
Two people with surfboards are standing in a sandy parking lot.
A PERSON JUMPING FROM A SLOPE ON A SNOWBOARD
Long bamboo poles with umbrella tops in front of the sky
A stove is away from the wall in a kitchen area.
A trolley driving down a street lined with tall buildings.
A giraffe looking concerned on a grass field.
Two metal lamps are placed beside a window.
A close-up photo of a pool table with a man playing.
A group of different parking meters displayed together.
A boy is doing a trick on a skateboard.
A black and white photo of a city street with old cars and people on it.
a number of people standing around a large group of luggage bags
piece of cake with a plate and fork
Large giraffe roams in the lush green vegetation.
a blue and pink kite with streamers flying in a clear blue sky
A man pushes a brightly smiling little girl on a swing.
Two stuffed bears that are next to each other.
A car that is parked in some snow.
A zebra standing in some brush without leaves.
A bunch of people walking across the air field to get to their plane.
A man hitting a tennis ball with a racquet.
a close up of stuffed animal with metal pieces on his chest
A person is on skis in a very snowy place.
airport coming in to dock at the airport
a person on a city street operating a cell phone
Two men standing near each other in a park.
A man is eating a hot dog and talking to a young girl.
A person bending over to adjust a child's skis.
A woman in a boat eating a sandwich.
A person standing on a mountain top with some skis.
A group of cattle walking across a lush green field.
two giraffes standing under a tree to get some shade
An owner plays tug-of-war with their Golden Retriever
A dog looking up and running to catch a frisbee.
an open toilet on the side walk of a street
A man with a very bright orange hat sitting in a car.
A traffic light next to a busy street in front of a brick highrise building.
a person standing near a bush near an elephant
A woman that is standing on a sidewalk.
Three park benches are in a garden type setting.
a bathroom with a toilet and a bath tub
A flower vase in the center of the kitchen table.
A bull walks up to a pile of wood and a teddy bear.
a couple of coaches in a cluttered living room
THREE BOYS RIDING THEIR BICYCLES ON A STREET.
there is a male baseball player about to throw the ball
A couple of people standing around holding snowboards.
A photograph of a highly decorated cake on a table.
A passenger bus that is driving down the street.
A large building with a railroad crossing near it
A bike is covered and parked on a street.
A small bird in a tree with red fruit on the  tree
A cat looking out a window at a bird.
High school girls soccer game action shot of green versus red team.
A person with a lighter lighting several sticks.
Reflection of a school bus in its own side view mirror.
two people in costume pose for a photo
brown bathroom with white toilet and white sink
A pretty young lady eating a hot dog on a bun.
An elderly man blowing out birthday candles on his cake.
A plate filled with breakfast foods sits on top of a wooden table.
This clean bathroom has a tile floor and a brown toilet lid
A woman with a painted face is on a phone.
A green bird bath decorated with various jewels
A fancy clock graces the corner of this old building.
A cat sitting in front of a television watching a hockey game.
A chair and a blue umbrella are attached to a wheel.
People sitting around an oval table in a restaurant posing for a photo.
A street scene with a truck and trailer in the foreground.
A locomotive on train tracks in a wooded countryside.
A train riding past a platform and buildings.
a stuffed sandwich with meat, cheese and pickles
A man riding on top of a board on a wave.
A man and woman making a cut into a wedding cake
Some cows stand beneath the shade of some trees.
Pink lunch box with compartments for all types of food
two white birds flying over the sea water
The lobby has a few people in it but for the most part it isn't very busy.
A double oven with one side completely full and the other empty
Men are standing near a couch holding Wii controllers.
A pizza on a board with a pizza cutter
A BABY EATING A MEAL WITH HIS TOY DRUMMER BEAR
A tan dog laying next to a park bench.
A skateboarder performing a jump off the edge of a stone wall.
a cat sits on the floor looking at the camera
THERE IS A CITY BUS ON THE STREET
a couple of people that are sitting on a bench
A person sitting in bed with a dog on his lap.
A seaplane is docked near a residential area.
An umbrella and rain boots in a corner
A laptop and a desktop computer sitting on a table.
A zebra and a giraffe walking in opposite directions.
Two children staring out a window while on public transportation.
Giraffe and zebra grazing in a field next to plants.
A stop sign is shown on the side of a corner.
A modernist kitchen, with a white and aluminum color theme.
a large room that has a big kitchen table in it
A baseball player hitting  a baseball with a bat.
A baseball player running to catch a baseball during a game.
A large white bus on a city street.
A woman putting post it notes on a wall in a room.
The baseball player is practicing his swing for his favorite game.
Assorted pastries and tongs have been arranged above stacks of plates.
A cellphone and a remote control sitting on top of a book.
A woman is skiing down a high mountain.
A man standing on a tennis court holding a racquet.
A herd of elephants with birds at sunset.
A white plate topped with broccoli and meat covered in sauce.
A woman is smiling and holding a monkey.
A black and white photo shows workers working on a road.
A small girl holding skis in the snow
A person is running with a kite in the air.
People watching two elephants from behind a cement platform.
A man flips a skateboard while doing a trick.
The toothbrushes have a holder on the bathroom sink.
a train station with a train sitting parked in it
A guy on a skateboard at the top of a concrete bowl.
Many stuffed animals hanging and sticking to a tree.
Several "One Way" Signs are placed near an "All Way" Stop sign.
A man riding a surfboard on a wave in the ocean.
Two young boys eating carrots while sitting on a bed.
a long train is going down the rail road outside
Five benches in the park in an area surrounded by trees
some people are pushing a truck in a lot
A group of small children having a birthday party.
A dish contains carrots, onions and other vegetables.
The two baseball players are walking on the sidewalk.
A room with decorations on a shelf and a painting on the wall.
a close up of a laptop on a desk
A laptop computer is sitting on a table top.
a couple of people on skate boards do a trick
A closeup photo of a bulldog wearing an Army style hat.
A sprinkled doughnut with pink icing sitting on a plate.
A kitchen area with refrigerator in the background and a sink and stovetop oven on the side surrounded with wooden cabinets.
two zebras are in their pen at the zoo
A baseball player holding a bat on top of a field.
A couple of skateboards, two sitting on the sidewalk, the other on the board.
A red stop sign sitting above a traffic light.
A desk with two computers, phone, and other accoutrements.
A teenager in wild clothing playing a video game
Blue, pink, purple, and yellow flowers are in a red vase.
A hot dog with a large amount of cheese.
A city street filled with traffic at night
The box of a dozen donuts has two different flavors.
A living room setting with furniture and lamp
The three men are dressed in costumes.
A red double-decker bus driving down the road.
two giraffes and one is eating some food
An iPod and a laptop computer on a desk
an orange and white cat and its orange play toy
The closeup of a clock on the face of a tower.
Large poster on wall behind white commode in dark tiled bathroom.
A dish with shrimp and cucumbers and lettuce.
A man with a skateboard that is up in the air.
Children are looking at a zebra in an enclosure.
People are sitting on a motorcycle with a woman standing behind them
Oriental woman preparing to put a toothbrush into her mouth.
A table with a laptop, bag of coffee and cellphone on it.
A girl excited about a cake at her table.
Dog laying down on a grey and yellow striped couch.
The top of a desk with a keyboard, computer and phone.
A coal fired train with passengers behind a split rail fence.
A collection of fine furniture is displayed in a room.
A transit train badly in need of a paint job
A plate with a roasted carrot and broccoli on it.
A woman holding a tennis racquet prepares to play tennis.
A person gets ready to swing their racket.
a giraffe looking around by some people
A girl stands on a bed and appears to be crying.
A very large building with a tower near some water.
a close up of person sitting with a laptop
a cream colored dog lying on a brown carpet.
White toilet with a shower with a tree on it beside it.
A catcher has his mitt out as a baseball batter swings his bat and hits the ball.
A man who is sitting at a table with a plate in front of him.
a lady covering herself from rain with an umbrella
A man wearing a gas mask and a suit and yellow striped tie.
A glass bowl filled with noodle salad on top of a table.
Two elephants are standing on the grass near some trees.
Three boys sitting in chairs with game controllers.
A city intersection with a sign redirecting traffic.
Nice looking front room with brown furniture to decorate it with.
A train with a red caboose sitting on tracks.
Military looking truck parked in an old warehouse
The living room actually features several different colors.
Small pizza sits on a plate on a restaurant table.
A man holding a bat next to a catcher and umpire.
The teddy bear was left on the empty bench.
very many pizzas in a plate in a kitchen
A black microwave on a cabinet in a hotel room.
Man flying a tailed kite high into the sky
A man that has a gold tie on.
The outdoor furniture with a table umbrella is made out of wood.
A family sitting at a booth in a restaurant looking up.
Two women standing on tree stumps with a boy and a teddy bear
A group of men let their horses drink from a fountain.
The girl is eating her pizza with a fork.
a grey dog seated on a chair of a vehicle
A brown cat lying at the back of a car
A group of kids standing beside an opened fire hydrant.
A herd of giraffe running across a field.
A person rides a horse in front of a large group of people.
A woman sitting with her legs crossed on a bench in a green field.
A man riding a skateboard on top of a cement park.
BRIGHT RED FIRE HYDRANT WITH A SIGN NEXT TO IT
The cat is laying partially in the light with its eyes closed.
A couple of men playing soccer against each other on a field.
Two people work on a shed while standing on a tractor.
there are two grizzly bears walking down the gravel road
Two men running and playing baseball with plate and grass
A couple women with remotes in a room.
A red trolley passing by a group of people under umbrellas.
A young girl standing on top of a tennis court holding a racquet.
A large open concept living room leads into the dining room.
A person with a red umbrella walking towards a bike chained to a lamppost.
A bunch of broccoli spread out on a table
A closeup view of food on a plate
a dried out tree with fruit hanging on it
A small and a large teddy bear sitting in plants
View from the stands of sparsely attended tennis match
A man that is sitting down next to a cops motorcycle.
a brown table with a toaster a plate and a black microwave
A snowy road with snow covered trees on which a skier is traveling.
two guys are outside moving a refrigerator
An Asian woman in front of a body of water with two umbrellas
A man standing on top of a blue tennis court.
Signs at a city intersection indicate no turning is allowed.
Two elephants walking in water next to grassy area.
A man on a skateboard doing a trick.
Woman in a white uniform holding a pencil to wall.
A small boat with several flags on it moving across the water.
a big building with a clock built inside of it
A market has many fruits and vegetables on display for sale
there is a large truck and a yellow truck behind it
a store front has many stuffed animals on display
a tennis getting ready to hit the tennis ball
A group of sheep walking down a path with a few stopping to eat grass along the side.
A small cat laying on a couch in a room.
A fire hydrant behind a gate on the sidewalk.
Giraffes eating leaves from bushes near logs on sand.
A man sitting in a chair looking at someone's food.
A zebra with its mouth open and lip in the air.
A man holding skis and poles and walking up a snow covered mountain slope.
A man is cross country skiing on a bright day.
a woman in white is cutting into a cake
A person surfing a white water rapid.
A glass vase with yellow flowers in it
A rear view mirror on the side of a car reflecting a mountain range.
Two sheep are in a barn standing next to each other.
A double Decker bus is traveling down a street.
An old steam locomotive waits at a country station.
A snow covered parking lot meter in front of a building
London double decker bus in motion on street
The horse is grazing in the fenced corral.
a zebra is laying down in some dirt
A man in his skiing gear is on his board looking on.
a guy on a surfboard with a kite attached to it.
A sheep with its new born lamb in a field.
A toy ship made out of Legos is attached to the side of the refrigerator.
A person up in the air on a skateboard.
Elderly man sitting on a bench facing the beach.
A bathroom has pictures hanging on the wall.
A small hot pink bathroom with a few touches of royal blue on the toilet is shown.
A father and son in a kitchen preparing a meal.
The parking meters are posted beside a cement wall.
these men are playing a sport in a field
A large city bus making a turn at a crosswalk with a clock tower behind it.
Two women play Wii video game with wireless controllers.
A square of cheesecake on a marble cutting board with a two-pronged fork.
A plant in a blue cup on a windowsill.
A bright red bench sitting in front of a decorated store front.
A man has some food hanging out of his mouth.
A group of people standing around each other near a street.
A child at a table sitting in front of a birthday cake.
two giraffes standing next to some trees
Many people are standing next to a very large plane with its bottom doors open
A man in black shirt and white shorts playing tennis.
Two different transit trains can be seen in this photograph.
A group of people around a table with a blue tablecloth
A bedroom with two beds and a table with a lamp.
A bear jumping into a pool of water.
a man with a napkin at his neck eating a dangling food
An elephant walking by a group of ATVs.
Two people that are sitting on a table.
Two photos of a young man in a suit and tie
A young boy with a helmet rolling down the street on his skateboard
Commercial airplane flying in the air on a cloudy day.
The neat bedroom has a large window in it.
Two cows on a grass covered hillside on a sunny day.
bathroom with white toilet and white sink beside each other
A bowl filled with yellow bananas and green apples.
An aircraft soars over a beach near a city.
a person sitting on a bench while the rest look somewhere else
A lot of people  that are on a sidewalk.
The sun is blocked by a statue holding a round object.
A man with an umbrella hat stands next to another man.
Four horses walking across grass on a lake with mountains in the background.
Outside view of the MGM Grand in Las Vegas with people sitting and walking.
a brown and white cat is looking in a mirror with glowing yellow eyes
A fork sticks out of a parking meter.
A man in a bathroom on an airplane.
Bunches of bananas in yellow and green hang from a ceiling.
A white and red double decker bus on street next to car.
ACCIDENT SCENE WITH FIRE TRUCK, AMBULANCE, VEHICLES, AND PEOPLE
View of jet airliner taking off over tree top.
A train sits on the tracks while people stand near by.
A professional baseball player about to hit the ball
A full view of an outdoor space with many things to see.
a red motorcycle with a windshield parked on the sidewalk
A dog with a tiara on and his head rested on an armrest.
Two guys standing on the right hand side of a motorcycle.
A bath tub in the center of a large bathroom.
A stop sign and two fire hydrants set up in the woods.
A condiment filled hotdog is in a red basket next to an iced beverage.
Two slices of pizza on a table with one beverage.
a store sits in front of a fire hydrant
A set of three cow statues siting above a crowded walkway.
A large plate of doughnuts on a table.
a bunch of sheep in a field eating
A child on a blanket with an apple.
A woman leads a race horse down a cobblestone path.
A man drinking a beverage with his sandwich.
A person is squatting by a banana tree.
a small group of fish about to be cleaned
Two men standing in a parking lot dressed in business attire
An arrangement of fruits and vegetables are laid out on a counter.
A kitchen with a large wooden table and clutter on its counter.
A slightly dirty room that has green items on the floor
A cat is laying on the lap of a man playing video games.
People sitting around a long table using laptops
A bear is next to a body of water outside.
Girl on phone looking up a statue of Ronald McDonald.
White living room furniture looks very modern and clean.
A hose sitting next to a fire hydrant on the street.
A large clock stands on a post on a city street.
Two giraffes standing in a rocky area by a river.
A group of women are gathered by a long table of food.
A stop sign that has been covered with graffiti.
A dog with its mouth open about to eat pizza crust.
A shelf with donuts being sold six for five dollars.
A muscular man surfing on a vast blue ocean
Herd of elephants crossing a water hole next to another herd of elephants.
A collaboration  of people in different pictures doing things
Airplanes on the tarmac in the rain at an island airport
A person jumping a skateboard at a skateboard competition.
there is a man pointing up standing on a building
A boy and dog sitting on a recliner the boy looking at a laptop.
Men on horses herding a group of cows down a road.
A skier looks up to the camera above her.
A row of urinals with air freshener boxes on a wall.
a person standing in a door way and a horse in the foreground
A baseball batter readies himself waiting for the pitch.
kids out on the field playing soccer together
A man flying through the air while riding a surfboard on a wave.
Man in red shirt standing in front of a man holding a frisbee.
Two mountain peaks rise above a large meadow.
A bike in view in a living room with a Christmas Tree in the background.
a group of horses standing next to a tree in an open field
A player sliding onto base while an opposing player tries to grab the ball at a baseball game.
a big bear that is staring at a camera
Two people sit on benches in the rain, holding umbrellas.
a person in a kitchen with a pizza
A mans toilet attached to a black pole.
Someone holding a stuffed teddy bear in their arm.
A crowded city street is full of big umbrellas.
A naked baby is on a bench in a backyard.
people standing outside of a building with a fire truck
A meat filled pizza sitting on a pan on a table.
A rainbow colored kite caught in the branches of a tree.
A young boy riding a skateboard up a ramp.
She is eating a sandwich and having a drink.
A kitchen with marble counter tops and black appliances
a stop sign graffiti written on the front
a train yard with several stopped trains waiting to go
Street signs, including a stop sign, where someone wrote "Don't stop believin!"
Two pillows sitting on the ground next to furniture.
A person is holding something donut shaped in their hands.
A woman with glasses and a scarf skateboards along Hollywood's Walk of Fame.
A glider gliding in the sky over the ocean.
a group of people sit around a table
The pastry has a substance in the middle of it
A group of baseball player congratulating each other.
A man strikes a tennis ball during game.
a number of people on horses playing polo
some baseball players on some grass and some trees
Two people climbing a mountain on their skis
A view of a oven with the food flipped over in it.
A woman sitting at a table while using a laptop.
A counter topped with lots of pizza and sandwiches.
Several zebras walking in the shade near some trees.
A kitchen has light wood and shiny floors.
Plate of food including chicken, pasta  and vegetables.
Girl competing in a horse competition at the county fair.
A white train sitting in a train station next to a Bologna sign.
A zebra standing on top of a grass field.
a motor bike sits parked on a cracked street
A number of food items and two beverage atop a wooden table.
Vases of flowers sit among plates of pastries.
An airplane is on a snowy runway at an airport.
A dark colored dog sitting on blanket and looking up.
A bunch of different fruits sitting in baskets and on a table.
Vegetables and fruits are on a brown cutting board.
A low angle view of a church clock.
A man and woman sitting by a pile of bananas
a person sitting in a boat with a dalmation
A group of people near a table full of bananas.
A large flock of sheep are in a grassy meadow.
A person sitting on a machine with wheels in the middle above a pedal.
This lamp is standing near a wall that is painted red.
A toddler laying in a bed with pink sheets.
A person on skateboard in a parking ramp looking area.
An old, dilapidated toilet with a broken seat
A smiling clothed man sitting on a toilet.
A huge white and blue airplane sits on the runway.
Two men shaking hands and one being presented with a key.
Stuffed animals on a shelf with some books.
All of the donuts each have a different flavor of icing.
A close of a bobble head doll with a computer in the background.
A bathroom with toilet glass sink, mirror and extra toilet paper.
A very cute orange cat laying with some shoes.
A clock tower sitting behind an illuminated star display on a tree.
A paper plate topped with a slice of cake next to a spoon.
a shop with a bunch of signs sitting out front
A room with wooden floors and wooden walls
A school bus that is made by Chevrolet has a few bumper stickers.
A suitcase full of random assorted food items
A living room with couches, a table, and a fireplace.
A plane takes off from a runway while a large building stands in the background.
A trolley with people on tracks in a rural area.
The batter prepares to hit the ball, while the fans watch from the side.
Young snowboarder spending time on slope in ski area.
a sink with a microwave oven on top of it
A delta airlines jet sitting next to a  truck on a runway.
Two people standing in the reflection of a mirror.
Two children on a soccer field kicking a soccer ball during a game.
Group of motorcycle riders looking over traffic on the street
A man holds an enormous sandwich in front of his face.
Group of motorcyclist riding motorcycles down a highway.
A man skiing down  a snow covered ski slope with two ski poles.
Small boy in green shirt touching a yellow fire hydrant.
A small herd of cows near a water bank.
Decorated coffee cup with spoon next to miniature bicycle.
A man standing next to a woman in a kitchen preparing food.
two street signs on a pole on a sidewalk next to a street.
A man walks out of a colorful train onto a platform.
A pudgy man holds a huge hot dog and chips.
Donuts and a cell phone laying on a table.
there is a man with tattoos talking on the phone
A man with a racket goes to hit a tennis ball.
A high speed train pulls into a platform while people watch.
A bed in a room with two windows.
two plates side by side, one with a roll and jam.
A young zebra stands away from the zebra in the light.
There is a white stove with pans on top of it and next to it, a refrigerator.
A lady sitting at a kitchen table alone.
A tennis player serving a tennis ball on the court.
A typical living room with couch, glass coffee table, television and water dispenser.
A cop leading a gang of bikers down a street.
A couple of giraffe standing next to each other in a  forest.
Laptop computer sitting on a table with a sticky note on it.
An animal grazing under a wide, gnarled tree.
A woman laying on a surf board is riding a wave.
a group of people next to a train with a sky background
A cat staring at a another cat hidden in a travel bag
Several people gathered around a table that has a cake on it.
A coffee cup sitting on a pad of paper next to a keyboard.
A white plate is filled with a variety of doughnuts.
An aisle in a store that is selling holiday items.
The player is hitting the ball with strength.
The man is holding a glass pan full of liquid mix.
There is a row of cows with baby cows next to them.
The black cat is turning away from the large computer screen.
The window to the store has graphics on it.
A plate of food and a cup of coffee.
a girl with a microphone talking about a cow
Carrots and cucumber on wooden cutting board near knives.
2 buses and numerous cars move down the street.
a male in a brown shirt sitting on a bench with a laptop
A painting of several flowers in a vase sitting on a shiny surface.
A large yellow dump truck driving on top of a sandy beach.
A person is riding a horse inside an obstacle course.
A man is standing near a computer giving a presentation.
A black cat with green eyes rests on colorful blankets.
PERSON GOING FOR THE RETURN ON A TENNIS COURT
a view of a keyboard, remotes sitting on a desk
A man riding a skateboard down the side of a ramp.
the sandwich is on the plate and has been cut in two
A man is standing at the base of a ski hill.
A refrigerator and a stove in a kitchen.
The bottom of a large airplane flying overhead.
a man wearing a yellow snow jacket and black snow pants snow boarding.
A piece of chocolate cake is in a plastic container.
A small airplane flying over a body of water.
A child holding the hand of an adult while moving on skis.
People are walking on cobblestone with umbrellas and shadows.
A child is under the covers reading a book.
a plate with some food in it on a table.
A man is riding on skis down a snowy mountain.
A batter, catcher and umpire in a baseball game.
A line of black and white cows are lined up and grazing.
a woman is working at a pastry shop
A cat is sitting on top of an entertainment system
A woman sitting on a bench with a mean look on her face
A large group of people on a grass field.
A person sitting in front of a laptop computer.
a desk with a bag and a bunch of other things sitting on the floor
A dog runs alongside a skateboard with one paw on.
A skateboarder balances on his skateboard, then balances on the board at the edge of a low wall.
A man riding skis down a snow covered ski slope.
a close up of a plate of food with broccoli
A man wearing a wet suit riding a wave on a surfboard.
a police on a big white horse in front of a retail store
A small cat standing by a mirror on the ground.
A bedroom with a white bed on a frame next to a window.
A white table topped with lots of plates and food.
A suitcase surrounded by some items on a floor
A man holding a Wii game controller while standing in a living room.
A fried dish is pictured on a plate.
A tall clock tower sitting on at the end of a street.
A herd of wild elephants walking along a dry grass filled hillside.
A group of people sitting at a table with plates and soda.
four women standing next to each other with bike helmets on and holding bananas
A woman walks in the road shimmering with rain past the city lights.
A man making a cut into a celebratory cake
There are cars parked along the side of the snowy street.
A red crafted bird is pasted to a parking sign.
close up of a cow standing on the other side of a barbed wire fence
A small boat rests on wooden planks by the water.
Man on skateboard on top of wall in factory.
A plate of food featuring burger patties, potatoes and carrots.
A girl lying in bed and playing a handheld game.
Two clocks on post next to building in street.
Looking down at cookies baking in a home oven
Colorful flags hanging lined up in a row.
A black and white image of an older air plane.
A stuffed animal dog birching out in front of people at the beach
A box is full of old items as a tribute to Forrest Gump.
A vase that has flowers in it on the table.
A dog on a bed looking at something.
A man on a surfboard that just caught a wave
A bird stands next to many black benches.
A woman with makeup bruises is in a suitcase.
a man lifting the lid of a square shaped toilet
A man is pulled by an unseen boat while water skiing.
Small cup of baked brownies being scooped out into small snack sized dishes.
a group of people sitting at a table to eat at the beach
a couple of people that are in a kitchen
The train can be seen through a chain link fence.
Birds stand on a side walk under the large trees.
A view of a great room consisting of a living room and dining room.
a street light with street signs in front of trees
A photo of some cows standing in a field.
many luggage bags near each other on the ground
A line of people in suits holding roses.
A BATHROOM WITH A TOILET AND A SINK
That collage of nude women probably means this bathroom belongs to a guy.
A man standing in a room holding a drink and a game controller.
A white toilet and sink in a room.
A child showing a banana to the camera.
A close up of a cat's profile is shown.
A man is standing next to another man who is laying on the floor
A man is looking at a bus stop sign
A baseball player holding a bat standing near home plate.
A man holding a laptop sitting beside a woman with a small child.
a toilet with a remote control mounted on the side
A couple of women riding on the back of a horse drawn carriage.
A man at a campground eating a sandwich.
A man sitting at a table with food and beverages in front of him.
A bride, groom, and minister at a wedding ceremony.
A TV sitting on top of a stand in a living room.
Four people riding on horses along the beach shore line
A train engine carrying carts across a bridge over water.
A black train sits on the tracks as people stop to admire it.
A kitchen with two stoves, an island, and appliances.
a woman walking down a crosswalk next to woman riding a skateboard
A double length metro bus drives down a city street.
A tray covered in tin foil on top of a counter.
A clock sitting on top of a sidewalk.
A park bench by a body of water.
Two cats laying next to a cup of coffee.
A hairy man is holding a frisbee on the beach.
Two people hold their colorful pastries next to each other.
Many skiers are going up a snow hill
A picture of a baseball game being played in a stadium.
A collage of pastries, and a box of donuts.
A triple layer cake sitting on top of a table.
A red and yellow traffic sign sitting on the side of a road.
A bowl with red and green apples and an orange.
A living room with furniture, television thrown rug and a window.
a small child standing in a living room eating something
a dog moving towards the horses at the mountains
man with skull decorated surfboard eyeing the ocean
A young man standing on top of a snow board in the snow.
A cyclist passes a bus while it picks up passengers.
a big living room with stained glass windows leading to a piano
Men's doubles tennis players shaking hands on the court
A white cat is sitting on a white sofa.
A woman wearing a white shirt and apron standing by a man in front of a traffic light.
A white plane sitting on top of a runway near a building.
An old black and white photo of a man holding skis.
Cows in pasture within a fence on a field.
a broken toilet bowl base overturned in a shrubbery next to dirt and rocks.
Bathroom with a toilet, glass sink and a mirror.
A couple of animals grazing on a dry grass field.
Electronic and personal items from a back pack laid out neatly
Female flying a kite in an open field.
a woman posing on a bench in front of stony ruins
four giraffe stand at a tree all with their noses stuck into some kind of nest
A group of elephants walking through the street with a pepsi stand in the background.
A man holding white surf board on the beach
A walk in shower next to a tub in a bathroom.
A kids baseball game with a runner sliding into home
A black, blond and white cat crouches on the side of a table with a cake on it .
A man on a bike balancing quite a bit on his head.
A group of female soccer players at the pitch playing
Horse jumping over an obstacle on a course with a rider.
Man cooking marshmallows over an electric stove with fancy tongs.
A plane on a runway drives off to the air
A group of people playing Wii in a family room.
Flowers in a vase full of water next to a window.
Many different piece of luggage that are open on the floor.
two people with two dogs on a surf board and one dog swimming
a plate filled with assorted veggies and cheese
A train engine is sitting at a train station.
A couple of people on skis in the snow.
A bathroom with a toilet, towel rack and a tub in it.
A man in wet suit surfing wave on a surfboard.
A cat is curled up on a bed beside a remote control.
A couch with a cat and toy teddy bears on it.
a number of horses standing near one another
A child in snow gear and skis on a ski slope.
An old truck with no passenger door with tires and body painted in different colors.
A close up of a plant center surrounded by leaves
A couple of kids are skate boarding down a street.
A yellow train is stopped against a barrier on the tracks.
The soccer player is kicking the ball while a crowd watches.
A silver pan filled with food on top of a stove top.
A man riding a dirt bike on top of a sandy beach.
A cat is stretched out on a couch under a window.
Close up images of bikes parked next to the highway.
a table with some glasses of beer and some pizzas on it
Two horses giving each other a loving nose kiss.
A monkey with a banana sitting in the dirt.
A table that has a plate of food and a glass of wine on it.
A bowl filled with ice cream, sprinkles, cherries and other toppings.
half a dozen giraffe in a wooded area
Two skiiers jump down a snowy slope towards a ski lodge.
A full, black and white coffee cup held in front of a computer keyboard.
This kitchen has white cabinets and counters and silver appliances
a man that is walking down a sidewalk
A beautiful young woman riding a pink skateboard.
A commuter train makes a left-hand track change to change direction.
A beautiful Asian girl with a white rose in her black hair. She is holding an open blue umbrella over her head.
A man riding a motorcycle on the street.
a group of people holding wooden utensils and smiling at the camera
A bear made out of gummy bears in a candy store.
This aerial shot shows several people using a cross walk while holding umbrellas.
A group of people sitting on a couch in front of a cluttered table.
A white bed topped with pillows sitting next to a wooden night stand.
A boy is playing tennis with other people in the background
Many people on the beach with large colorful kites flying in the air.
A woman is holding an ID and holding a pair of scissors to it.
The elephant is walking outside by himself along the wall.
A woman with nice legs laying next to a purple umbrella.
A group of teddy bears all dressed in Pilgrim and Halloween outfits.
A man in light clothing stands near a boy with sunglasses and jeans and they are both by a white glider.
Display of ornamental vases and figurines with oranges stacked on stands.
Athlete in motion during attended competition on gray and blue court.
A large jetliner flying over a body of water.
A kitchen with stove, refrigerator, and cabinets in it.
Two people riding bicycles alongside the river on a sunny day
An older gentleman flies a kite on the beach.
A variety of items are spread out on the bed.
a bathroom with a sink right next to the shower
Two girls in pink robes standing in front of a television.
Trays of party food lined up on a table.
A copious amount of food are served up in the kitchen wares.
A white horse is out eating in a field
Three donuts piled together on a small plate.
An airplane on the runway either just landed or ready to take off.
The cat was laying in the sun on top of the zippered bag.
A train with two cars is on a railroad track that splits into several directions.
A desk nook area has a desk, a chair and a book shelf.
a man is arranging a set of appetizers on a tray
A train is parked at a depot on the tracks.
A man hitting a tennis ball with a racquet on a court.
a person riding a skate board at a skate park
A silver truck driving past a giant arch from a mcdonalds.
A man in green shirt riding on an elephant.
A man is wearing a robe and a tie.
A man is standing in front of a grill with an umbrella.
A young boy aims his video game controller as a man watches.
A group of cows standing around in an open field.
Orange and white cat laying down and chewing on some cups.
Woman on tennis court grasping racket with both hands.
A piece of pizza being held in a persons hands.
a red fire hydrant at the corner of a street
A green and red semi trailer truck front without a load.
A couple of men standing on a lush green park playing a game of frisbee.
the duck is looking over the side walk
A group of men standing next to each other.
A zebra in an open ground near a bench.
A toddler brushing his teeth and gums at the sink.
A group of zebras that are standing in the grass.
Man mid swing playing Tennis on tennis court
Boxed hotdog, fries and a drink are set out for daytime reading.
Wooden mantle holding two vases of flowers and a picture.
Tilted pic of a mountain road with a street sign.
A couple of people kneeling over a pile of snow.
A bird with outstretched blue wings is sitting on some bird feeder.
a kitchen with a table a stove and an oven
A red stop sign sitting on the side of a road.
A bed sitting in bedroom under a picture.
a living room with big couches and a ceiling fan
A person with glasses on the phone in a restaurant.
An elephant is standing in a grassy field in front of trees.
Four boxes of donuts of various descriptions on a table
The baseball pitcher has wound up his arm to pitch that ball.
A little girl is holding a Minnie Mouse umbrella above her head.
A man is looking at hanging fruit arrangements.
There is plenty of clutter by the computer on the desk.
A cat in a bathroom stands on the rim of the toilet.
a red fire hydrant with two nozzles on it
A giraffe is standing in a field with a group of zebras.
Someone who is holding a hot dog in front of a box of teddy bears.
a man in a uniform is cutting a cake
A man on cellphone and woman walking by building.
a teen girl sitting at a table with some pizza in front of her
The ingredients represented in the meal might include pineapple.
A small group of men playing with a frisbee.
Several people are swimming in a blue lake
A train emits thick steam as it moves on the rails through a flowing plains.
An orange tabby cat stands in a doorway with a bookshelf in the background.
A group of skateboarders standing around while another skates.
A family of elephants stand close to each other.
some yellow signs attached to a building wall
A variety of boats are shown in the water.
A woman speaks into her microphone while looking at the cow.
A green umbrella over some chairs and tables
The pitcher is winding up to make the pitch.
A very cute girl holding up some scissors.
A male and two females jumping to catch a Frisbee.
There is a train moving along a railroad track.
A couple of people that are sitting on a bench.
The man is watching hockey on his computer.
Man on a boat carrying large quantities of cabbages.
a parking meter that has been drawn on
A group of people standing around a white cake.
A white and black potted plant with a mirror behind it.
some kids are standing outside with an umbrella
A train car traveling on a bridge over water.
A boy with two marks on his back stands on a skateboard.
a surfer runs into the waves on a beach with his surfboard
a tattooed man with a skateboard thinking about doing a trick
Two cats who are laying down on a bed.
A plate of food, that appears to be a small omelet and other pieces of meat.
A dog that is sitting in a window.
A cake shaped like an elephant squishing a horse.
A WOMAN SITTING ON A BENCH EATING PIZZA WITH A LITTLE BOY
a pepperoni pizza sitting on an oven done cooking
A pitcher standing on a mound on top of a baseball field.
A man riding a snowboard down a snow covered slope.
A man kneeling down next to a little girl.
A dog with a frisbee in its mouth is jumping over a man lying on the ground.
A giraffe looking over the corral fence in his zoo habitat.
A horse and foal are standing in the meadow.
This is a picture of a black furry cat on a laptop.
The man is riding down the ramp on his skateboard.
A couple of people walking in a parking lot by several motorcycles.
A man holding an umbrella light on a beach.
A man in a red and white baseball uniform holds out a bat toward a baseball on a baseball field.
A man standing in a kitchen in front of a stove top white oven.
A small bathroom with a toilet and flushing system.
A living room with a fireplace and contemporary furnishings.
A pizza that is sitting on a table.
A cat in a chair peeking above the table's edge at a drink.
Leaves sitting on a street next to a parking meter.
Young man gliding along rail on his skateboard.
A mother and child carry kites through a park.
A blender with something in it to blend
A batter is getting ready to take a swing.
A man flying through the air while swinging from a pole.
A small green boat at a dock in the water.
A dessert topped with whipped cream is sitting on a plate.
Two women who are riding in a horse drawn carriage.
A vase sitting on top of a roof top.
Pizza sitting on top of a table next to a couple of wine glasses.
a kitchen with a refrigerator a sink and a stove
Blue and purple vase sitting on the side of a white wall.
A boy eating a doughnut in a diner.
A line of traffic beside a metro bullet style train indoors.
A woman is on snow skis on top of a mountain.
A large black bear walking through a forest.
a bench dedicated to someone with a weird edge
She is eating a slice and watching the small countertop TV.
A river with rocks in the middle and a train trestle in the background.
A boy in a yellow shirt is riding the edge of a half-pipe on his skateboard.
A plane flying over the beach with a mountain in the background.
the man has returned a server of a tennis ball
A large Japan Airlines jet landing on a runway.
The back legs of a cat dangling over a keyboard.
there is a large wooden platform bed in this room
Residential bathroom with wooden cabinet and mirror next to shower.
A dog plays with a frisbee in a pile of snow
A man is standing up, taking a shot of the water, while a pigeon looks on.
A man wearing a bandana, holding a skateboard.
A woman is riding the waves on a surfboard.
A giraffe is in a field of grass eating leaves off a tree.
A black and a white horse are grazing in a green pasture.
Kites being flown from the water in the ocean
A man is riding a bike while using a cell phone.
a group of people are at a market
A room with a table, chairs and a doll in it.
The woman is eating breakfast in the kitchen.
This shows an innovative Apple device and a keyboard.
a small bus sits parked as a kid runs across the street
a building with some windows next to a street
A man surfing a nice wave on a bright day with a ship in the background.
A wet bear stands in the river looking for fish to eat
A white table topped with a flower surrounded by chairs.
a woman skiing down a ski slope in the slope
A smiling man stirring something in a kitchen.
a male with a beard a book and a child in bed
A bird sitting next to a dried cob of corn.
Man walking up the side of a mountain with his skis on.
an oven outfitted with several Christmas lights
A girl on a bicycle is stopped before crossing traffic.
Man sitting on chair in kitchen with baked pizza on table.
A little girl buying a small teddy bear.
four men in an office working on their computers
A skateboarder is riding the green ramp.
A skateboarder performs a trick in a skate park.
Several people are talking next to a yellow plane in a hangar.
A person points a remote control at the television.
A brown and white dog laying on top of a green field.
A red and black motorcycle with people in the background.
A boy that is on top of a skateboard.
Three giraffes tower above trees and brush as they feed.
There are small trees with oranges growing on them.
Older black and white photo of a woman playing baseball and swinging a bat.
A white frisbee laying on top of a dirt field.
Group of giraffes standing by a pile of wood in an exhibit.
Two brown bears walking on an unpaved forest road.
A bear walking on a fallen tree in the woods.
Two horses that are pulling a piece of farm equipment.
People on a tarmac board a Qantas airplane.
Up close to a giraffe in its natural habitat.
A red truck is parked on the lawn of this house
A man's feet resting on a skateboard
A child standing between two luggage carts behind a car.
People sitting around a table as someone puts stuff in a blender.
A city intersection displays a clock on a long tall stand.
Four giraffes encircle the palm tree within the fence.
a stop sign with some graffiti on it
A lit birthday cake has some penguin candles.
A building displaying a clock showing the time to be 6 oclock.
Some very pretty white bowls with some food in them.
A man is eating a peanut butter and jelly sandwich.
A black and white cat sitting on top of cabinets.
An elephant stands in weeds with trees in the background.
A man and woman sitting on a motorcycle.
A shirtless man reading a book and eating.
Park bench near tree during fall in open area.
Two people standing on the beach with a kite.
A man making a surprised face is getting a hair cut.
A young boy learns the meaning of the word strike.
There is a batch of doughnuts being made
a man sitting at a table with a plate full of food
a white horse is standing near a train
The bus  is parked at the bus stop.
a man taking a picture of a truck parked next to a building
A group of young people standing next to each other on a beach.
A train traveling down tracks next to a power grid.
Three giraffes in a field with an Egyptian theme in the background.
Someone on a snowboard holding the bottom of it in mid air.
A gourmet style pizza with a variety of vegetables.
Cows graze in a field in front of a lake.
A room with a sunny window contains a bed and a desk.
A toddler holding an electric toothbrush to his mouth.
a close up of a clock on a pole with a wind tool
A white refrigerator and cabinets in a grey kitchen.
A partially eaten plate of eggs, bacon and toast.
Garbage and police trucks on a city street
A small gray goat standing on large rocks.
An orange and yellow flower sitting in a see through humming bird.
A yellow and orange fire hydrant in front of a building.
A group of snowboarders snowboarding down a mountain.
Man stands up on his bike and looks up next to a parked car.
A herd of animals standing on top of rocks.
A bus is traveling down a street near a building.
A few birds are on the roof of a house.
A woman sitting on a rail next to skis
A kitchen with a microwave, cabinets, stove and dishes on the counter.
A child skier standing at the bottom of a slope.
A teddy bear sitting on a tricycle on a sidewalk next to a flower bed.
A man standing next to a woman in ski equipment.
An acrobatic dog catching a frisbee mid air.
A blender is full of food being prepared to puree.
Girls walking in a park talking and taking pictures.
A group of people sitting around a living room together.
a couple of people that have tennis rackets in hand
A group of people sitting outside at a restaurant table.
A woman standing over a stove cooking food.
A little league pitcher standing in a field holding a catchers mitt.
A colorful doll-house bedroom with one girl doll occupant.
People walk across a footbridge that stretches over a river.
A person holding a dog who is looking at itself in a mirror.
Overripe bananas on plates with breakfast food packages.
Two double decker buses passing each other on the street
There are several people riding mopeds and motorcycles traveling down the street.
A man holding a flying disc in a park.
A large man is holding a black suitcase.
An elephant throws dirt on his back with his trunk
Two giraffe, two zebra, a monkey, and two flamingo are searching for food.
a young boy wearing ski equipment in the snow.
A surfer is on the water and is waiting for a wave.
A herd of elephants walking through a lush green field.
A woman with a tennis racket tosses a tennis ball.
Police car parked behind a car illegally parked at fire hydrant.
A herd of elephants splashing and playing in a  waterfall.
A group of people sitting at a table with beverages in front of a window with ocean view.
A woman with a pen is writing while a man in a tie is watching.
A bathroom scene with the sink and shower.
A cheesecake on a plate with a croissant behind it.
a woman reaching up her arm as she looks at tennis ball
Round mirrors above clean sinks in a public bathroom.
A man playing tennis, ready with racket in hand.
A photo taken in a mirror showing the side of a truck.
a man in a surfer suit walks down a street
A gray airplane with metal petals on the wings takes off from an airport.
A European tour bus with luggage on top on a brick city street.
A close up of a glass bowl full of small oranges.
A very beautiful woman wearing a black hat, black shirt and tie.
An ipod plugged into a dock inside of a kitchen.
A man sitting on a bench waiting to get a ride from a bus.
an elephant behind a fence at the zoo
A cat sleeping on a bed with its head on a teddy bear.
A person viewing a picture on their cellphone.
A meal with two plates full of broccoli and other items.
A refrigerator adorned with several magnets and clippings.
A few people are getting to know one another in affection.
A street light turned green on a dark street.
Small bird sitting on a skateboard posed in front of dark blue background cloth.
A man leading a horse around the town.
A sign that has a camel on it.
Color fruit is on a stand including pears and apples.
A giraffe standing alone next to some trees.
A man is holding a military medal in a bar.
A silver and green train stopped at a train station near kids.
A couple of knitting books sit on a couch.
Single zebra standing in a field of semi dried grass.
some blue and orange surfboards on the sand water and rocks
Small child in baseball uniform standing next to players.
A boy swinging a bat at a ball on a field.
The man is on a horse pointing his finger.
A small elephant lawn decoration near a plant.
A child dressed in random clothing standing barefoot in the kitchen.
A woman in short shorts standing next to a young man.
some people some buildings and some are flying kites
The back end of three zebras walking in a group.
A man in a red shirt motions toward his cell phone.
a corner of a building with the name of the street on it.
A man is checking his cell phone while snowboarding.
A sandwich in a basket accompanied by a beer and a lollipop.
The ships are all docked on the beach by the water.
Skiers on a snowy slope are high above a small town.
A dirty nasty urinal in a very dark rest room.
A family gathered at the table eating breakfast
a black and silver trains engine and a car and grass
A steer walking down a busy market street.
A pizza with several toppings sliced and ready to eat.
A child dressed in a skeleton Halloween costume.
Teenage boy about to catch the flying Frisbee.
Soldiers with guns in the back of trucks in a parade.
A boat that has been beached on the shore.
A couple of hot dogs sitting next to a basket of fries.
Four meals have been placed on a table with beverages.
A photo of a couple singing karaoke.
Person in black surfing a wave near the beach.
there are two women that are walking in the street under an umbrella
A black and white photograph of a skater performing a trick.
A herd of sheep grazing on a lush green field.
A group of people riding bikes down a street.
two high school soccer teams play against each other.
A silver fire hydrant stands in the grass next to shrubbery.
Two people with umbrellas stand at the fence looking over the water.
A woman stands in line at an airport.
A big vase with flowers near a cup and window.
A person in a park playing with a frisbee.
A television sits above a fireplace in a living room.
A single tall flower in a green glass vase sitting on a windowsill.
a close up of a bench surrounded by plant life
A black stereo speaker near a computer monitor and mouse.
A woman standing with a donut and a candy apple in her hands.
A red train sits on the rail road tracks.
a young woman sitting at  a table resting her elbow on the table
An elephant in dirt area next to a booth.
Two giraffes eating leaves off the trees in the woods
A variety of fruits and vegetables sit on a table.
Young male baseball player in full uniform and glove alone posing.
a really sad picture of some men with guns sitting next to a dead zebra.
A small plane with the cockpit open and landing gear down
A woman is standing next to a display of giraffes.
A living room filled with lots of furniture and a TV.
The woman in red sunglasses is walking in snow with ski poles.
a close up of a zebra in a field of wheat
A group of people riding horses through a small village.
A sandwich on toast with potato chips on the side.
A brown bear is grazing in the grass.
A multi-colored umbrella that is blocking out the sun
There is a laptop on a crowded desk.
A tennis player is on one foot hitting a tennis ball.
A zebra grazing on dry grass in a field.
Variety of meat and produce displayed for meal preparation.
A red stop sign on the side of a building.
A black and red train engine with train cars behind it.
The lady on the bicycle is waiting for the light to change.
a person with one foot in a snowboard
a bike shop with various bikes in it
A woman is holding an umbrella over her head
A couple of guys playing video games inside
This is a meal made for two people.
a small little plate that has some fruit on it
A large pair of scissors on display next to plaques.
Delicious looking meal of vegetables, cheese and meat on bread.
A public bus near a curb on a wet day.
A red traffic light hanging on a street pole.
a shirtless male surfer is carrying a white board
A man in yellow vest on motorcycle next to a building.
A large tree sitting on top of green grass.
a person is sitting on a couch while on a laptop
A white bathroom with corner shower and tiled floor.
A sausage sits on a takeout plate with spicy carrots.
A bunch of books that are on a bed.
A large red truck visible through the rear view mirror of a car.
A young girls soccer team posing for a picture.
Two lines of bicycles parked on a brick surface.
A snowboarder mid-air above a ramp outside in the snow.
A polar bear goes bobbing for fish at the zoo
A cat lies in a crib next to a small child.
Man pushing a cart loaded with luggage in an airport check in line.
A red and yellow high speed passenger train rolling along the track.
A large, ancient looking clock tower rises above a neighboring structure.
A laptop on a table with a white cloth at an art auction in a hotel ballroom.
A white bird with wings spread under a cloudy sky
many people sitting on the ground with a big container in front of them
Stainless refrigerator and microwave on the counter of a kitchen.
A guy with headphones does a trick with a skate board.
A picture of people and horses in the water.
A baseball player holding a bat over home plate
A Christmas tree sitting inside of a living room.
A woman sitting on a bed talking on the phone.
A subway train with the doors wide open next to a bench and pole.
The clock face on the exterior of a building.
Some animals that are hanging out in the dirt.
A lady holding a camera up near a big black dog.
a small giraffe that is next to some rocks
A dog playing with a toy in the snow
A half full glass of red wine on a table.
A group photo of men and boys from the Goodmayes Boys School dated April 1929.
A white and brown dog laying on carpet under a desk.
I can see one tennis player but I cannot see the other.
A couple holding wine glasses and holding up a tag reading USQ.
Two soccer teams playing a soccer match in a stadium.
a group of people playing frisbee in a field
a cat laying down stretched out near a laptop
An orange kitten is hiding under a blue blanket.
Eight dishes on a platter, each with a different food item
A room with two side by side beds, one of the nightstand lamps are on.
An oven with fire in it and ashes around it.
a small child standing above a skateboard on a tiled patio
a black cat sleeping on some bags of carrots
A cat is looking at a cluttered computer desk.
Adult elephants crossing roadway with young in native land.
A young guy is surfing in the ocean.
Cars and a truck lined up across a train car
A man poses in front of some green wood panels.
A donut frying in oil along a conveyor belt.
Some hands are coming from the closet and reaching for a sleeping woman.
A pizza slice is being removed from a pie.
The umbrella is ready to be installed at the restaurant.
Man with ponytail digging out condiment for sandwich in hand
A plate full of food sitting on the table next to a fork, orange, cup and salt and pepper shakers.
The student is trying to relax on the floor.
there is a man sleeping on a mattress outside
A packet of ramen, remote control, cigarettes and a lighter on table.
A deser plate with cake ice cream and fruit on it.
A table topped with a toothbrush and other items next to a wall.
Cars and buses seen through the reflection of a window
Green cake with a pair of pink pigs next to it.
A young woman is eating a piece of pizza.
A clock tower at seven forty three in the afternoon.
Young boys walking on wet pavement with umbrellas.
A clear tube containing a flower sits on the floor.
Two men sitting on a couch one who is holding a remote.
Three colored beached chairs, yellow, red and blue by the ocean
Antique warplane surrounded by safety cones near person.
A couple of boats floating along a river.
a man is holding a baby  and playing with a laptop
a street sign outside near a flag pole
a rusty flatbed truck sitting by a building
A black fire hydrant that has two exits.
An appliance is standing next to cabinets in a kitchen.
The red clock is displayed for the people can see
a close up of a slice of cake on a plate
A hot dog covered in cheese on top of a plate.
A cat laying on top of a pair of shoes.
A laptop on top of a box on a table
A picture of some food and some coffee.
A room with chairs and a couch next to a fireplace
A very dimly lit dining area with some pretty flowers.
A tennis player wearing all white reaches hi racket up to a ball.
Kites laying on the beach on a sunny day
a small candle lit beside a placemat and some glasses
A woman standing alone holding an open umbrella over her head.
There is a place of food on a white table.
A cat and some people on a grass field.
a train stopped at a train station with people near by
two cows in a body of water near a field
A cat sitting on a dresser with a person in the mirror behind it
A pallet holds a display of fresh vegetables.
Two geese and their babies stand together outside.
A military plane is flying upward in the sky.
People standing next to sheep and feeding them.
Rows of green bananas on a tree with big green leaves.
A young woman riding a horse holding a flag
Two boys sitting on a bed playing a video game.
A meal that looks like falafel and hummous.
Three sheep are grazing freely in the open field
A vehicle near a stop sign with a poster.
A living room minimal furniture and a large window.
A view of a bus stop from across the street.
a person dressed in ski gear in the snow coming down a mountain side
Dual digital parking meters are in place and waiting for a visitor.
A female soccer player sits in the bleachers holding her ball.
a black and white picture of a white man singing a song
A man and woman smile while standing beside each other.
There are several holiday teddy bears in a shop window.
Wall with tools hanging on hooks and two litter boxes under alcove
A girl sitting at a table full of bananas.
A man brushes his teeth with a toothbrush.
A tablet PC decorated with a picture of a girl and three baby pandas.
PIZZA, SPOON, BOWL, COFFEE POT ON TOP OF STOVE
A dog in a grass field with a Frisbee.
A person on the beach flying a kite.
An elephant is being taken down a road in the back of a truck
A black cat staring out the window behind a computer
A man with a black tie smiling and holding a white box.
A puzzle picture of a baseball player batting a ball
A cardboard cutout of two boys kicking a soccer ball
A pizza sitting on top of a white plate on a table.
An old fashioned refrigerator in a kitchen next to an old fashioned stove.
A mom and a baby who is holding a teddy bear
A white horse looking through the window of a tall brick building.
A big orange truck driving down a street.
A wooden bench written 'CITY OF LONDON' at the park
A man smiling while slicing into a cake.
A bed with white sheets and a night stand.
A stop sign at the intersection of fifth avenue and fifth street.
The brown dog is riding a wave on a blue surfboard.
A boy and a group of sheep walking away in dirt field with trucks in background.
A picture of a very nice clean living room.
A person with glasses holds a Frisbee standing in the grass.
Locomotive pulling cars on tracks in outdoor area.
a cute happy bright yellow and red bird sitting on a tree branch
A team of horses hitched and ready to pull a wagon.
A man riding a skateboard while a child sits on the front of it
a newly shaved sheep walks away from it shaven fur
there are any kites that are being flown in the sky
Two elephants are in front of a muddy waterway trampling in the wet dirt.
Zebras are grazing on grass by a car.
a row of three ambulances with white and yello paint
this kitchen is all white and all white appliances
A person sitting in a bed with a laptop before them
Someone is wind sailing out at the beach
A man is sitting on a park bench speaking on his cellphone.
A laptop in front of a computer on a desk and a blue chair with a colorful blanket on top of it.
Two men standing and holding video game controllers.
A 18 wheeler truck on a highway carrying a large over-sized covered load.
A WOODEN HAND MADE KEYBOARD WITH A MOUSE
A view of an airplane traveling across the bright sky.
A dog laying in the grass next to the sidewalk.
A baseball player is swinging high in front of the readied umpire and catcher.
a couple of people on the beach playing with their kites
A homemade pizza with toppings served on a plate
Two giraffes in a grassy field with trees in the background.
A flock of sheep are crossing the street next to the cars.
A couple is sleeping in a bed with red sheets.
A young girl smiles for a picture at the beach.
A parking sign and a fire hydrant.
A red stop sign sitting under a green street sign.
a close up of a bird on a beach near water
a girl dressed in red shirt and black pants playing tennis
A girl with glasses curled up under a colorful, crocheted blanket
Some women are talking next to some sheep.
A laptop computer and mouse on top of a desk.
The refrigerator and the kitchen is being cleaned.
Different kinds of food rest on a plate.
A hot dog lays on a white paper next to a can of juice.
The living room has two couches and an easy chair.
A woman on the beach has a pink hat and umbrella.
Horses and carriages are lined up along a walkway awaiting customers.
A piece of wood with bananas and forks on it
Man in white shirt and scarf throwing a frisbee.
a large field full of sheep out in the outdoors
There is a man about to fall off his skateboard
A group of people standing together for a gathering.
A long train yard full of different equipment
A man with a bunch of plates in front of him by a red house with an open door.
A tile bathroom with a large mirror on the back wall.
a person sliding in to home plate when the guy didn't catch the ball
Several potty pieces with a white background and blue design painted on and one is adorned with feathers.
A big city bus parked right beside a building.
Young black cat lying on desk with head on keyboard.
a close up of a baseball player with a glove
a man that has a wii remote in his hand
An applegate hot dog is placed in a bun.
A row of matching planters are arranged on an outside colorful wall.
Group of people walking through a city at night.
an image of a tray of food on a table
a case in a bakery full of doughnuts of different flavors
A vegetable succotash has cashews, broccoli and sauce.
a image of a dessert on a plate with toppings
A man with lots of tattoos sitting in front of a bowl of food.
Adult wearing white shirt and tie holding baby in outdoor scene.
a pair of gray scissors hanging on a nail and another black item
A horse is standing in the green mountainside grass.
A person is flying a kite on a beach.
a toilet in a wooden themed bathr oom is open
A clock on a tall brick and white tower.
A series of images of skateboarders skating and jumping.
a couch in the living room near some stairs
Trio of large birds sitting next to each other on wooden perch.
A painting of a woman sitting in a chair with a laptop computer.
A couple of small birds on a wooden pole.
This is a collection of different kinds of hot dogs and french fries.
Person wearing all white leaned up against a wall with a yellow sign.
A woman at a table in a restaurant
A black and white image of a young woman sitting on a grassy knoll using her lap top.
A person that is playing a tennis game.
A baseball game is being played before a crowd.
A mini refrigerator stocked with bottles and cans of alcohol and soft drinks.
A young boy holds a kite in a grassy park.
The young man races toward the yellow frisbee.
An orange cat licking a blue pair of shoes.
A close-up of a desert with cookies, ice cream and a cherry.
Two semi cabs are parked neatly beside one another in a park area.
a dog begging for food off a table
A living room with low lights, a couch and a tv.
A television stand has a television and vases on it.
A young boy holding a tennis racquet near a house.
A clock, bird with missile, american flag at an area that looks like a flea market.
Three women at a party posing for a photo.
A display of a man with striped tie and a bird on his shoulder utilizing two Instamatic photos
Three inset pictures including bottled water, small pizza, and cup of coffee.
A man riding a motorcycle down a street next to a train car.
Older man rides on a carriage pulled by two horses
Several men are unloading trunks from a Model T.
A tray with a hot dog, fries, ketchup and mustard on it.
A piece of cake is seen on a clean, white plate.
Small boy holding a kite over his head waiting.
A firetruck without emergency lights on cruising through an intersection.
A TV sitting on top of a brown couch next to a pool.
A black and white photo of a person surfing. The picture is from underneath the water.
A train with a red engine in the countryside.
A man riding a skateboard down a street in front of a red car.
A trio of men throwing a Frisbee in a field.
A motorcycle stood up in a forest with melting snow.
a green fire hydrant siting by a yellow pole
A pile of garbage sitting on the curb in front of a wall.
a few pieces of pizza on a pan
Skiers pause for a photo before hitting the slopes.
A woman plays tennis in a tennis court
A zebra standing by a log and container eating grass.
two stuffed teddy bears sitting in a chair
A cat stands alert on a park bench.
Two pedestrian walk signals are lit up at night.
A cow licking its side in an enclosure
Two female tennis players walking in opposite directions on the tennis court.
A man stands in front of a Jamaican food truck in a city.
An airplane flying a in clear sky above a light.
a man is skating around a cement skate park
A orange and yellow freight train traveling down the tracks.
A stew pot holding carrots, celery, and squash.
A blender has some sort of liquid inside.
A lady dressed with a pink hat and unique clothing snow boarding.
A green airplane flying over a lush green field.
Two children at a skateboard park under a blue sky.
A lamp that is on in the corner of a living room.
A man holding a tennis racquet on top of a tennis court.
A man dressed like a zombie with other zombies around him.
An older man and a younger boy play a video game.
A clock is shown on top of a building.
A small baby is biting into a banana.
A very tall clock tower towering above a city at night.
A man in glasses wearing a suit and vest.
a group of women gathered together side by side in front of a table with pastries on it
A man standing on a  tennis court holding a racquet and a ball.
Three people riding ponies and horses in a residential area.
a woman and horse walking behind a giant pickup truck
An image of half a bathroom and half stairs.
A calico cat is laying on a laptop computer.
a man and woman are outside taking a picture together
An elephant and it's trainers interact with each other.
An arial vierw of a building with a clock tower.
A woman and two men are having a conversation.
A far off picture of birds flying above a field.
A man and two women with Wii video game controllers.
A couch that is in a living room with pillows on it.
There are three adult giraffees that are walking in the park.
many brown and black sheep bushes grass rocks and trees
Several women sit at a table tasting wine
A cat lays down around some stuffed animals.
a person on a motor bike drives down a street
Young man wearing a suit and tie standing inside a building.
there is a tall sign that is on the side of a building
A busy city intersection with people and cars
A close up of a clock reading 1028 and 54 seconds.
A train traveling past two cars on a road in a rural area.
a person on a surf board rides in the water
Two elephants bathing in a man made environment.
Various tools are sitting on the table together
A small bike laying beside a fire hydrant.
White sheep are grazing in a green pasture.
Male surfer on a red and blue surf board.
A mother elephant and her baby are standing alongside a dry water pool.
The view of green mountains and a valley from a cockpit.
A man and a young boy riding on a donkey while people move behind them.
A  ORANGE WITH A WINE BOTTLE ON THE COUNTER
so many elephants moving near some waters in the forest
An image of a baseball player getting ready to take a swing at the ball.
A long river runs alongside the train tracks.
A street sign points in the direction of the road.
Several signs can be read at a pillar in the fence.
A large bear is sitting near a rock in an enclosure.
three people walking a dog in the snow
a guy standing by a fench with his skateboard
A mom and two smaller sheep in a large green field.
A women reading a red book in her bed.
A man riding a motorcycle with another person during a sunny day.
A table in a restaurant covered in plates and mugs.
The man is playing tennis at a very high level.
A fire hydrant sits in a small grassy island near the sidewalk.
An orange kitten laying in a chair with a stuffed bear.
A train is making its way around a snow dusted corner track.
Blueberry stuffed beanie teddy bear sitting on a table.
A black and white zebra stands next to a tree.
Assortment of laptop computers displayed on table with backpacks full of electronic cords.
Two pictures of a stoplight, one is green and one is red.
A large orange bus stopped next to another bus.
a couple of different types of signs on the outside
A group of beautiful woman walking down a street in bathing suits.
a large air plane flying in the sky
Several toy SUV's alongside a toy bus on a highway.
Two parents are helping a baby put on a hat.
Three horses are pulling a wagon full of hay.
A kitchen knife on a cutting board with vegetables and spices beside it.
A boy and a girl posing for a picture.
A variety of Domino's pizzas and a business man selecting a piece.
Two dogs playing tug of war over a frisbee
A gentleman laying on the couch while talking on the phone.
This bathroom is all white and has a framed mirror on the wall
Attractive landscape with picture frames and large white vase.
There is a man cutting something up over what looks like a homemade pizza.
a bunch of bananas hanging on a wall
A female sitting at a table cutting a cake.
A woman sitting at a table with a plate of food.
Large black towel sitting any Penwith hey with people looking at it.
A van is driving through an alley way.
A stop sign is standing on the side of the road in front of houses.
A white fire hydrant is in front of an old couch sitting on a sidewalk in front of a house.
A few mack trucks in a parking lot.
A person wearing all black does a one handed hand stand as he holds a skateboard on his feet.
a close up of a cat laying on a desk
A flooded street with the water up to the traffic lights.
A baseball player waits at the plate for the pitch.
A large green bus transporting passengers through a city
A kitchen with lots of counter space and a black oven stove top.
A table with a stack of orange cups by orange scissors.
A dog is standing on a tile floor.
A large red bus parked in a stationary position.
A boy skate boarding down some steps .
An angled photograph of people flying kites at the beach on a sunny day.
Two dogs are laying next to a bike.
A long silver train traveling through a wooded area.
A light red fire hydrant on the corner of a street.
A person crouching next to a pair of motorcycles
Busy stadium with many people outside near vendor trucks.
Two urinals in a restroom with multicolored tile.
A person watching a sheepdog chase a white disc across a green field with mist covered mountains in the background.
Several people holding umbrellas are lined up near a fence.
A stoplight and street signs beside old buildings
A tennis player getting ready to swing her racket.
a close up of a drink on a table near a laptop
Several pieces of ancient pottery and stoneware on display in an exhibit.
A woman standing in a room holding a Wii game controller.
a group of people with surf board standing on some snow
the ice cream vendor is talking on his cell phone
A pile of TVs sitting next to a brick building.
an image of street signs on a residential
A guy on a skateboard in front of a water fountain.
A giant cake decorated with round discs on a table
a woman on skis is standing in snow with her dog
A woman casually reaches up to hit a tennis ball.
A fat kid enthusiastically enjoying a pizza from a big pan.
An old rusty fire hydrant sitting in the grass near a picnic table.
A new kitchen that has just been built.
A sleek, modern toilet has a backlight and granite counter for storage.
A teddy bear posed sitting holding a book
A train running on train tracks through the wilderness.
A white toilet sits in a bathroom, with the lid open.
a bathroom with dark tiling in iit and a pink bathtub
A machine is on a folding table in a small kitchen.
A person riding a skateboard down a street.
A man kiteboarding over a large body of water.
A man working on a laptop computer at a desk.
Two people pulling a luggage cart down a sidewalk.
A white plate topped with meat and vegetables.
Skiers come down a snowy hill in a row
two glasses of juiced carrots and apples on a white cutting board
A man is seen in a mirror in a bathroom.
A person is holdiing a kite in a field.
A sepia colored room shows vintage furniture with a tendency to the frilly, including a bed with a curtained balcony and a chair, both in matching floral pattern,  and a dress form.
a little tourist train pulling three cars of passengers
A mirror is shown with a man driving in it.
A moped parked in front of a yellow wall and traffic sign.
A flock of sheep sitting in the middle of a field.
A skier kicking up a spray of snow.
Yellow and blue fire hydrant in front of a movie theater.
A teenager does not make any expression as he rides a skate board.
A man holding two small green birds in his right hand.
There is a baby elephant with its parent
a truck parked on a beach near water
The neatly made bed is beside an open window.
a big grizzly bear looks toward the camersa
A renovated propeller airplane flying in a blue sky
A giraffe that is eating a piece of food near another giraffe.
A group photo has smiling people and one dog.
Brick houses with brown stairs stand near a wide sidewalk by a line of trees.
A large building with windows and cars parked below
An asian woman holds a sub sandwich near her mouth.
A pizza cut up into many pieces on a white plate.
A red truck with a trailer attached, is parked near a red house.
A plate of food with eggs, meat, salad and a fruit cup on it.
A small table with cups and saucers and a clock on it.
A boy flies a kite on a beach near colorful tents.
A very cute cat laying on a desk.
Several Billabong surfboards make up a nice display.
A cat looking at the television with flowers on the screen.
A group of people in  a park playing frisbee.
An airplane hanging from the ceiling of a building.
A table topped with lots of different types of fruit.
A small child's bed sitting next to a window.
Tennis player with white outfit holding a racket.
A person wind sailing next to a person para sailing.
A man is prepared to get on a wake board
A horse pulled carriage on a open street.
a couple of bowls of food sitting on a table
A refrigerator is shut with black duct tape.
a man standing on a surfboard in the water
A large clock rests on the side of a brick building.
A picture of a vase with colorful flowers in it.
A giraffe walks the grasslands by himself at sunset.
A dog heading into the water near a horse.
A woman rared back at a tennis ball with a racquet.
a person at a table with a plate of food
A small child on a bed looking at a lap top computer.
A group of people are looking at something or someonr
A bride waits for something while holding her bouquet.
A white cloth with scissors, a needle, thread, and measuring tape resting on top.
A tennis player gets ready to hit the ball as a crown watches from the bleachers.
A white truck has a vision sign on it.
A clock on a steeple of a tall building.
a shop with some wine bottles sitting on a counter
The kitten is nesting inside the empty bowl.
A gondola like boat crossing over a bridge
A large clock tower with a roman numeral clock on it's side.
A woman on a court with a tennis racket.
There are three dishes and a vase with two roses on the table.
A boat sailing on a massive lake surrounded by mountains.
A woman in a long dress talks on her cell phone.
A white plate topped with a hamburger next to fries.
A man riding a surfboard in the ocean on a wave.
A LADY FEEDING HER CAT WITH A SPOON.
a man standing at the beach with a surfboard and a paddle
a white building and some people flying a kite
A guy standing in the grass is ready to throw something.
grandma watching two kids playing a video game
A bowl of soup and a sandwich plate on a table.
A photo taken from a boat with a long bridge in the background.
An average hotel room with twin occupancy capabilities.
A woman is eating a pita on the street.
A baseball player holding a baseball bat in a game.
A serving dish has meat and greens in it.
two boys with painted faces laying in a bed
Two men sitting at a table with plates in front of them.
Wooly horse and sheep dog face each other down
A white pitcher filled with orange and purple flowers.
A lady bent over with her tennis racket while another girl looks down court.
A grey and red train next to a train station.
A person standing in a bathroom next to a white toilet.
A counter with various baking ingredients that include bananas, butter and oats.
A small plane is dwarfed by the larger ones in the background.
A couple of dogs standing outside of a wrecked car.
Motocross rider displaying aerial tricks on nice day.
A male child swinging his bat at a ball, another child behind him as the catcher.
a man holds parts of a broken television
A lone bench sits atop a hill looking over the river.
Donuts are going through the mechanical glaze machine.
A white boat sitting next to a  dock near a white building.
People sitting on the side of a street next to suitcases.
A group of baseball player standing on top of a baseball field.
A man wearing a brown suit and brown tie.
A train on some tracks with power lines above it.
A boy in a blue and white shirt playing tennis on a brown tennis court.
A toy town with a train on the tracks passing a signal.
A couple of sheep standing on top of a grass hillside.
A man on a stage with ski poles in his hands.
Chef stirring large pot on top of stove.
A young boy is playing tennis at the tennis courts.
A boy riding a skateboard and doing a trick.
Two girls compete in a game involving a frisbee.
The dog is sitting in a chair beside a bright window.
A close up of someone's feet on a skateboard.
A white dog is on top of a bed looking into a box.
Brightly colored oranges, pear and apple in a colander.
Suitcase containing many compact clothes for just one person
a person on a tennis court holding a rackett
Three men on a field playing a sports game.
A bunch of people walking on wet sidewalk by buildings.
A woman holding a tray of food in a kitchen.
The sink counter of the small bathroom is made of wood.
The girl is standing with her laptop in her hand
Three people on snowboards on the slope of a mountain.
A man standing next to his guitar case talking on his cell phone.
a small boy in a black shirt a brown and black dog and a bed
There is an airplane flying by a mountain.
A toilet stall with green marble walls and a painting.
a toilet a tub  some pipes and a window
A black and white photo of a train pulling into the station
A lady is running with a tennis racket on a tennis court.
A meal of hot dogs and stuffed vegetables
a person cutting a cake on a table
A woman is sitting down with the light turned down low to take a picture of herself with her cell phone.
A couple of white horses walking along side a rocky hillside.
The motorcyclist is traveling down the busy street.
Park bench on snowy elevated viewing area above city.
A man in a green tennis outfit hits a tennis ball with his raquet.
a man that is standing up on a stage
A room with a picture on a wall and a vase near the window with flowers in it.
A train moving along a track, approaching a light signal.
a small toy truck with a cat peering through a window
Bed with yellow blanket against a wall with hard wood floor.
A group of people are painting a bench in the park.
The pitcher just threw the ball to the batter at the baseball game.
A classic clock sits on a wooden table.
A giraffe on the dirt looks tall among the trees.
a living room with couches covered by sheets
A man in a suit and tie is playing a key board.
a number of oranges in a tree on branches near leaves
A hat is sitting on the top of a bed.
A man eating a hot dog on top of a bun.
Several birds are standing in a large nest.
Two woolly sheep in front of a wooden fence and barn
A couple of trick planes flying by each other.
Guy in shorts and a cap ride along top of wall with his skateboard
A person skiing downhill in the white snow.
Two men looking at a plane on a runway.
two woman sitting on the ground one is on a cell phone
Two Asian men standing in a office with business suits on.
A bunch of used appliances sitting on the street
a clock attached to a green pole on a building
a sheep standing in the grass next to a fene
A row of passenger buses traveling down a lone road.
A black bear lying down near many trees.
a couple of houses that are next to each other
Several broccoli plants planted next to a wall.
Train cars sit on the tracks next to a platform.
A group of lambs are running in the opposite direction of a dog who lays barking.
Fluffy white cat laying on a lightly colored bed.
A living room filled with furniture and a flat screen TV.
An employ looking kitchen has a black refrigerator.
The bathroom is mostly a red color. It looks very old.
A plate of food with a salad and very large chicken sandwich.
an image of a cat next to the feet of a person
A green train sitting along side a train station platform.
A man lays in a hospital bed while holding a teddy bear.
A man stands behind the counter of a restaurant.
A skate boarder jumps off a curb into the street.
A lit up display of teddy bears of different colors and sizes.
A cap that is sitting on a blanket next to a remote control.
a couple of cats are laying on a bench
a group of people standing around a metal briefcase
Several types of wild animals grazing in an open field.
A man sitting at a table with a pizza in front of him.
A large long train on a steel track.
Baseball player preparing to strike ball from the pitcher during game.
The woman is in a ski racing down the path.
A vase filled with flowers sitting on top of a counter.
An old parking meter sets with time expired in front of a parked vehicle.
A large white polar bear sitting on top of a rocky ground.
A kitchen that has white cabinets and a white oven.
a close up of a child near an opened refrigerator
a bedroom with a large window cover with shiny curtains
Some traffic lights suspended over a road by some parked cars and houses.
A row of parked motorcycles on the side of a street.
A goat with red painted horns on its head
A refrigerator packed with lots of food and drinks.
A surfer falling in a wave with other surfers nearby.
A huge cargo ship sits empty in a bay reflecting blue skies.
A man that is sitting at a table.
A slice of pizza with cheese and golden crust.
The pizza is topped with broccoli and onions.
a bunch of plates of food no a table
A tower that has a clock on it.
A man surfboards on a wave in muddy water.
A woman is skiing near a bunch of trees.
A caution light and traffic cones set up to block a street.
a black cat is sitting on a green bench
A narrow city city features colorful buildings and a large green bus with cars behind it.
A man standing next to a beautiful woman.
a kitten laying on a bed next to some phones
A big piece of bread is placed on a white plate.
A small white dog begging at a door to come inside.
A doughnut on a plate and a banana.
A plate that has a cooked pizza on it.
a man carving the turkey for thanksgiving dinner
Three sheep graze in front of a barn.
There is an old street sign leading against a building.
a bear in teh middle of a grassy field
a green and white street sign and a traffic light
Traffic light signaling green at the train tracks
A cell phone held open in a bathroom.
A teddy bear sitting in a very unusual spot high up
A man touches a hammer to the center of a clock.
A group of three people sitting on top of a green couch.
A couple of military men cutting up a  giant sheet cake.
A person eating food from a large white dish on a desk.
A man standing in front of a microphone wearing a suit and tie.
A cow laying on top of a grass covered field.
The side of an airplane that is parked, and an Air China sign on the side of the plane.
Shadows dominate the landscape in this dark, dreary scene.
A snowboarder is boarding next to a chairlift.
A woman in a tiara cutting a birthday cake at a party
a desk with a keyboard mouse monitor and a tv
A giraffe is crouching in the grass next to a tree.
A cat sits on the table next to a bowl.
two adults holding a baby while wearing ski wear and standing on a snow bank.
Giraffe looking through a set of bars in a cage.
a white building with a white clock and some trees
there is a red stop sign on this street pole
The tall zebra is following slightly behind the shorter one.
A skier is headed down the steep slope.
A surfer riding a small ocean wave on his surfboard.
A young woman is holding a cell phone open next to her face.
a brown horse with a white stripe on its head
A cat is sitting by a single shoe.
A single giraffe that is walking in a field.
A group of people posing for a picture.
A person is standing on the beach holding a kite.
A group of people in grassy field with kites in the sky.
Two women playing tennis on a tennis court.
There is a man interacting with a black dog.
A parking meter on the curb of a city street
A ceramic object with blue flowers on it.
The dog went all the way into the water to fetch the hat.
Cows and a sheep eating food from a red box.
there is a piece of cake and a fruit on a green late
There is a stop sign covered in snow.
a couple of people holding a martini in their hand
An airplane on a runway with another plane flying overhead and a truck nearby
A very tall white clock tower towering over a lake.
A corner of a rest room with a shower with glass walls.
Three sheep laying in hay in a gated area.
A group of people sit at a table with food.
A desktop computer monitor sitting on top of a desk next to a mouse.
A lot of carrots on a wood board for sale.
A man standing on top of his head while riding a skateboard.
A man dressed in a military style uniform shaking another mans hand.
A empty living room that has a table in the center.
A kitchen that has pots on the stove.
A large two story boat floating in a lake surrounded by mountains.
A person on a motorcycle is doing a wheelie
two white black and brown dogs are lying on a red couch
a vast, grassy field with animals in the distance
a gray cat is sitting on a wooden bench
An iPod with ear buds and a mouse near a book and keyboard.
A black leather case containing several pairs of scissors.
Several boats that are moored at a dock
a man standing on the corner and people walking down the sidewalk
A woman holds a colorful kite in a city park
a close up of a person playing nintendo wii
A commander cuts a cake at a military function.
A boy looking at the camera while sitting at a wooden table.
A large ship making it's way through the water.
A woman in a bustier holding a stuffed animal.
an image of two men standing in front of a Christmas tree
A vase with flowers sitting next to a glass tomato.
The lamps are on next to the pull out couch.
Black and white photograph of two people on a moped.
A kitchen with white cabinets and a stove on the counter top.
President Barrack Obama standing in front of a crowd while giving a speech.
A boy smiling as a large spider walks on his arm.
A giraffe standing outside of a building next to a tree.
The back of a moving truck that has a man standing on a lift with a royalty style chair next to him.
A duck is swimming in the pond to the next destination.
A bathroom with some of the wall removed during a renovation.
a couple of little kids in baseball clothes stand next to each other
A group of people flying kites over a beach.
a person siting on a bench with a dog near by
a black cat sitting on top of a black suitcase on a bed
Two computer monitors, two keyboards and two CPU's on a desk.
A cat looking inquisitively over the top of a car seat.
planes and cars sitting on an airplane tarmac
Trucks and cars going down a commercial retail street in a city
A young woman is preparing to hit a tennis ball.
A man that is at the beach jumping in the air.
There are many books and magazines in the small room.
A framed wedding picture on a crowded wooden table.
A child holds a string over the water.
The train has spots of rust that are obscuring the graffiti.
A series of photographs depicting bathroom before and after minor changes.
Dark cabinets around a white two doors refrigerator.
A man holds a card and wine glass with a woman who also holds a wine glass.
A casserole containing broccoli and  topped with cheese.
Woman standing behind open refrigerator door in modern kitchen.
A turquoise and orange station wagon with two surf boards on its top.
A tennis player in black shorts and white shirt looks up and holds back a red racket.
a train is passing over water on a bridge
A woman wearing a maroon sweater standing in front of crates.
A flock of black-faced sheep near a watering trough on a rural hillside.
Two giraffes standing outside while people watch them
Table centerpiece of a tall wine glass shaped vase with flowers
A man is holding a large piece of pizza.
The young child is close enough to pet the cow.
One horse has taken the lead in the race.
A skier struggles in deep snow with their lost ski.
A collection of different smart phones on a table.
Note with listed items on white refrigerator in kitchen area.
A man with a backpack and coat walks by a bus.
An magazine photo of a restroom toilet and sink.
A man choosing a piece of pizza from two boxes
A scenic view overlooking the water at night or early morning
Seagull flying through marina with many boats around .
A blue bird standing on the ground among large green leaves
A male skateboarder does tricks on a half-pipe course.
People ride on the back of an elephant while being guided along with other elephants.
A peeled banana on the front of a car.
Three people ski in a row in the snow.
A woman riding a wave in a wet suit on a surfboard.
A beautiful young woman brushing her teeth in a bathroom.
A scene containing a couch with flowers and a mirror.
An automobile with a timer attached on a city street.
An elephant placing its trunk on some plants and some people watching.
Several elephants dressed for the circus are in line next to people.
Two sheep sitting behind a fenced in area
A man flying through the air while riding a skateboard.
A close shot of a pizza plate with a rubber on it.
A bear observing something on the ground of a field.
A dog in a river chasing a red ball that is thrown into the water.
a person handing a child a plate of food
A bride and groom are cutting their wedding cake.
Two dump trucks driving down a two lane road with a white pick up approaching from the opposite direction.
Lady using oxygen in bed with a little dog.
An elegant kitchen has an attached stone fireplace
Two zebras in a field are eating grass.
there are many computer monitors and things on this desk
A man leaps in the air while on his ski board.
A woman dressed in a button up white shirt, suit and necktie.
The lid is up on the toilet bowl.
A dog is resting on the window sill of the building.
People milling about outside in a busy city
A young boy wearing goggles and a billed hat holding a stick.
A group of people walking under a leafy green tree.
A motorcyclist with a female rider in the back and a dog in a sidecar.
a woman poses in front of a giant pizza
A woman holding her hand over a giant pizza
Sun setting on a dark street and buildings.
Giraffe holding it's head mid way with a wooden gate behind it.
There is an elephant that is lying in the grass
A giraffe presses his head against another giraffe.
A plate with a pastry on it, topped with whipped cream.
The airplane is waiting at the airport for passengers.
A group of people on bicycles riding down a road.
A man and a woman stand in a field with cows and horses.
A bathroom in the process of demolition.
Black and white photo of a small air craft.
A person standing next to some old junk appliances.
a tennis player in a black shirt is wiping his face
A couple of plastic containers filled with lots of food.
A man is holding a banana in his hand.
a cow in a field looking into a camera
A view of a cell phone and a watch on a table.
A bowl with sliced avocado, eggs and tomatoes.
A empty water bottle sitting on the corner of a wooden bench.
A bathroom that has a broken wall in the shower.
A full course meal with meat and mixed vegetables.
A young man in a kitchen shapes dough into balls.
A couple of elephants standing next to each other on a dirt field.
A couple of dogs walking through a large body of water.
Two flat computer keyboards laying on a table
elephants in the wild surrounding a large tree
A large park with people flying kites in the sky.
A man with glasses on and a suit and tie.
A traffic light on a street corner with shops behind it.
A boy soaring into the air, doing tricks on his skateboard.
Smiling people are holding a large white snowboard.
A black cat and a "K" sitting on a green bench.
An attractive young woman holding an umbrella under a tree.
Grass roofed umbrellas on a bay with cliffs
A person holding a banana in front of a basket containing fruit.
two dogs playing in the snow as a one person wearing black uses a snow board to go down a hill.
Two cows are standing on the end of a boat
A picture of several street signs on a post.
A man walking with his surfboard on the beach.
a person on a beach with a kite flying in the sky
The cat stands on the edge of a bed looking at television.
The large scissors are sitting alone on the counter.
A pair of scissors and some stick like things in a bag on a wooden table.
two horses in a field of grass near bushes
Infant in a high chair eating a chocolate frosted chocolate cupcake.
A skateboarder jumping with two others behind him.
A painting that shows a vase with flowers and a table.
A person riding a horse, jumping it over an obstacle.
A family of zebras standing together at a zoo.
There is a train attached between two buildings as a walkway.
A black headed sheep sitting in a field looking onward.
Two people embrace while walking down the street under a pink umbrella
a room that has all kinds of christmas deco in it
A crowd of people standing around an old fashioned train engine.
A couple of very small cute kids in the rest room.
A BLACK AND WHITE PICTURE OF TWO WOMEN BASEBALL PLAYERS
A baseball pitcher delivering a pitch to a batter.
A Harry Potter novel is set next to a plate of eggs and toast.
a person milking a cow next to a wall
a woman watching a dog jump up for a frisbee
A zebra grazing on top of a grass covered field.
A large metal pan filled with peeled food items.
A woman at a baseball game talking on her phone.
White truck with painted words parked at night.
a white box with 12 sugar glazed donuts
A large black bear about to take a swim in a pool
A train has been painted with Christmas decorations and lights.
The large crowd watches a skateboarder descend a rail on a stair case.
A group of people having a picnic on the beach.
A little dog that has a frisbee in their mouth.
A small cup cake and a knife on a plate.
A frosted doughnut with sprinkles on a table.
A man in a yellow jacket is snowboarding
The flowers are in a vase on the table.
A close up of a vary unique looking vase in front of the tree.
A little girl standing and holding a remote in her hand.
A baseball player holding his arm up with a ball in his hand.
A pitcher in the middle of delivering a pitch.
some kind of room with some weird things in it
a bear partially submerged in a body of water
Young girl making funny face in residential home.
A table with a plate of food, utensils and some other items
A plate of cooked broccoli on a long white platter, next to a dipping sauce.
People inspecting a large, shiny semi trailer truck at a park
a couple of men compete for a frisbe
Artistic black and white photo of man on a motorcycle.
a cake made to look like two trains
A group of oranges stacked in a wooden bucket.
Chilled beverage in glass bottle next to orange halves.
A woman in playing with a green frisbee at the beach.
A stuffed bear sitting in a chair with napkins and cup.
A display of wild animals inside a building.
a big train passes under a big bridge
A man sitting on a bench next to a man.
Two kitchen stools sitting in front of an island in a kitchen
A bowl of fruit and a plate on a table.
A phone and a computer on a kitchen counter.
Someone having dinner in a dimly lit restaurant with wine.
A pole, light and traffic signal have all been painted green.
Cat sitting inside kitchen cabinet, near the dishes.
A red and white bus driving on the street
A living room scene with chairs, lamp and a clock.
A girl carrying a kite walks along a beach.
Two children in a fire truck amusement park ride.
A group of plates with grilled meat, bread, and appetizers.
A man in a blue blindfold reaches a doughnut tied to a string with his mouth.
A seagull wading in the surf at the waters edge.
People enjoy a day at a mountain lake.
Police tow truck parked on a city street in front of stores
Living room with TV playing and view of a hand in the picture.
People stand beneath umbrellas on a flooded road.
a dog jumping in the air with a frisbee in its mouth
A tree with a white low trees sign hanging off of it's side.
A man in a dirt field next to a group of sheep.
A baby girl wearing a red shirt holding a tooth brush in her mouth.
An upstairs bathroom is pictured in this image.
Woman laying down on a mattress at a store.
A skateboarder performing a trick on the edge of a ramp.
Man in a kilt and woman and white dress cutting into a cake.
A faded stop sign near a street side.
A snow covered wood bench in a park
Major League Baseball player taking a very fast pitch from the pitcher
A young toddler playing in a suitcase on a bed.
a group of people playing frisby in an open field
Woman kissing little girl's cheek under umbrella indoors.
A man is surprised by a very large doughnut.
People bringing in a loaded boat of vegetables to the market.
Two skiiers ski down a mountain in front of a village while it is snowing.
a close up of a pizza on a wooden spoon
A Bus Stop sign peeking out from a vined wall
Two green animal food bowls sitting on a tile floor in a room being refinished
Two people standing near the water holding surfboards.
The man is standing in the snow with his snow board.
a close up of a broccoli plant with leaves
A cow is standing near a fence in a field.
A sign that is standing in a parking lot.
A boy in white shirt playing with a Nintendo Wii controller.
a red umbrella is inside out in a city
A woman sitting on a bench while reading a magazine.
A dog riding on the back of a horse.
A catcher and an umpire near home plate.
there is a blue and black bus stopped at a bus stop
A young girl riding a skateboard behind a man on a bike.
some baseball players are playing baseball on a field
A black-and-white photo of a person sleeping in a bed.
Two people dressed a refrigerators walk down a crowded street.
a sink a picture a mirror and white tiles
Young boy inspects a picture on a table with construction paper materials.
A man is standing by a movie poster talking on his phone..
A sesame sandwich sits on a white plate with a cup of coffee.
A kitchen with a white stove top oven and a refrigerator.
A man sitting in a chair with a canned beverage in hand.
A motorcycle rider bends down on a track.
Several sandwiches sliced and neatly arranged on a white plate.
A street light pole with many street signs and warning signs.
The girl in a tan skirt is sitting on a bed.
There are electronics and other music equipment around a desk.
a person in a field with a kite flying in the air
Woman talking on cellphone in front of personal computer.
A red stop sign mounted to a wooden pole
A red and yellow fire hydrant in an open field.
A couple of giraffe standing on a grass covered hillside.
A guy on a tennis court holding a raquet.
an image of a woman at a ski slope
Rear-view of a horse as it grazes near concrete.
a cat is sitting on a white keyboard
The plate is loaded down with a lot of food.
A pair of leather chairs beside a table and matching couch.
a hot dog sitting on top of a mound of french fries
A young boy appears hesitant to eat some broccoli.
A woman with glasses making a call with a cell phone
Two bears laying against wood with a sign.
A plate that has broccoli and other food on it.
A blue and silver cell phone with an accessory.
They've gotten off the bus to stretch their legs for a few minutes.
Horses stand saddled in their paddock near the beach.
A man swinging a baseball bat on a field.
Chicken with sauce and broccoli is served in a serving dish.
A couple of boys playing frisbee against each other.
Large white birds with black beaks sit atop benches.
Two women who are standing under an umbrella.
A plate of food, bread and salad, sits on a chair.
a big boat is going down a small river
A caramel apple is sitting next to a jar of french fries.
yes we have no bananas we have no bananas today
a man gives children food to feed an elephant
A table covered in different kinds of baked goods
A bunch of dead, stuffed wild animals on display.
Two men loading up the back of a truck
There is a mug with a fork in it and an unidentifiable liquid.
two red double decked buses side by side
two people riding motorcycles on a street at night
A woman is looking at ribbon for children participating in an art activity.
A box of pepperoni pizza already has two pieces missing.
A room filled with people sitting at tables eating food.
A cutting board with pizza and a glass of wine
A woman with a purple umbrella stands on a brick street.
A man dressed in all white is posing on a motorcycle.
A mixture of food and drinks sitting on a table outside.
A man sitting on a horse in the sun
A man is sitting on a picnic table next to s ski slope.
A woman on a bed kissing a mans face.
A young boy puts together a kite on the floor.
A store has displays of pans and other things.
Several people in a large building that is filled with luggage tagged with yellow tags.
a perishing square tent set up with a bicycle
A carnival occurred on a beautifully sunny day.
A herd at of zebras are grazing in the field
A cow poking its head between skinny tree trunks.
a skier at high speed coming down the mountain
A bunch of vegetables and fruit arranged on a table.
A large slice of angel food cake sitting on top of a plate.
a baseball player swinging a baseball bat at a game
Two guys in suits are having a conversation at the couch.
A large truck driving down a city road.
a whole bunch with luggage standing outside
Multiple people standing in the water on a beach.
Two airplanes lined up on pavement near a building.
A man lounging in a computer room with a laptop on his lap.
A man carrying a kite while walking on the beach.
A young elephant walking in tall grass behind a larger elephant
A snowboarder leans into the snow with their board.
A lady with green hair and red boots sitting in the grass near a horse.
a man looking at the chocolate on his fingers
A group of children running after a soccer ball.
Woman approaching the door of a train at a station.
A group of people sitting on a bench in front of a restaurant.
A lot of animals that are in the grass.
some people are walking down the street with each other
A nutty cake is sitting in the grass.
A car sitting in the middle of the grass in the rain.
A lot of boats parked in a large body of water.
A man in a police uniform sitting on a horse by a traffic light.
A flipped image of 2 toned room with a small chandelier.
Light green and white painted fire hydrant with people walking in background.
A man and a woman sitting on adjacent couches focus on their laptops.
A golden bath area with a chandelier and blue and white bathtub.
The clock is below the dome of the tower.
A close up of a full cooked pizza pie.
two elephants are drinking some water on a sunny day
A small glass of liquid sits on a table.
A sandwich and salad on a plate sitting on a black table.
A woman is getting ready to hit a tennis ball.
Two motorcycles are parked next to each other.
A professional baseball player about to pitch the ball
A modern kitchen with a large window by the sink.
there are two men riding a motorcycle and holding a umbrella
Two racks on top of a white counter topped with cup cakes.
A person with a skateboard on a ramp.
The little girl is standing between the low shrub and the fire plug
A man and a woman beside bicycles with orange train cars behind them.
A snowboarder riding in the air above the snow.
A man and woman in the middle of a conversation.
A woman skier in costume at the beginning of a race.
A store window with stuffed teddy bears in it
A man bent down fixing a toilet .
Group of motorcycle riders being led by a police car.
Red passenger train passing over top of a bridge.
A small coyote is seen in the back of some tall grass.
A large red chair in front of a building.
A man is on snow skis on top of a mountain.
A baseball player is ready to swing at a pitch.
A woman serves out a sauce for her dinner party.
A young man is preparing to throw a Frisbee.
A brown and black dog holding onto a couple of crushed water bottles.
a nice black back splash a plant  some body oils and a black Kleenex box
A beautiful young woman laying on top of a bed next to a dog.
Large pizza sitting on a table next to beer glasses.
A fire hydrant with a blurry view in the back of it.
An infant is sitting in front of a computer.
A cook placing two pies in the oven.
A group of people standing around each other in front of a building.
A skateboarder performs a trick on a small ledge.
A boy on his skateboard at the top of a skateboard ramp.
a women that is playing tennis on a court
Very small remote control that fits in the palm of your hand.
A huge elephant is walking down the road.
A woman standing next to a man while wearing a short dress.
A MAN WITH KIDS ARE ON THE FLOOR
A kitchen with an island that has place settings
A moose is getting some shade outside an old building.
Jockey on black horse being walked around infield.
A person cutting into a plate of food on a table.
A long train is going down one of many tracks.
Several stacks of  different types of books on a bed.
A giraffe laying on lush green grass next to trees.
Two glazed and one chocolate doughnut placed on a napkin.
a police officer rides a motorcycle on a walkway
A child stands next to a window near a bear.
BABY IN BLUE JEAN OVERALLS HOLDING A CELL PHONE
A view of a Sony remote, next to a laptop.
A person on a skateboard is riding up a ramp
two people riding skis on a snowy slope
A baseball player up to bat swinging at a baseball.
A little girl laying in bed holding a book next to a black cat.
A woman cuts a cake while two dogs watch closely.
A fritter and a donut on a white bag next to a donut box.
Some chefs working together in a big kitchen.
A picture of a person brushing her teeth.
The pizza is topped with very unusual ingredients.
Bicycle parked at meter outside large building with column.
Three of six people standing and sitting at a restaurant table are on cell phones.
A brown teddy bear sitting next to a wall with a painting.
A small room cluttered with piles of books, a portable TV, and stereo equipment.
A yellow rectangle sign stating that pedestrian priority crossing is ahead.
A restaurant clock displays the time of ten twenty.
THERE IS A METER POST ON THE STREET
A group of men that are in the back of a truck.
A man in a orange and yellow outfit juggling tennis rackets.
A sign that is on the side of a building.
A horse wearing a pink hat pulling a carriage.
Many people are scattered together near an Orange stand.
Two men playing a video game inside of a room
A woman riding a surf board through the waves.
A pretty young woman walking a bike with a small dog in a basket.
A man standing in front of a brown horse.
A very big group of happy looking people posing together.
Different kites flying around in a field with a bunch of people.
A wall with a  black and gold clock and walkway above.
A plate full of a lot of good food ready to eat.
A slice of cake with icing on three sides, a knife and fork beside it, on a wooden table surface with a knot area visible in the wood.
man jumping up super high in a grey jacket
A person on a sidewalk holding a kite for the camera.
A very big pretty green vase with some flowers.
A polar bear standing near a tree on grass
A kitchen island has a farmhouse sink on it.
A sheep looks at the camera, by the side of the road.
Two pieces of bread coated with a dark brown spread.
The train car is used as an office by railroad personnel.
City scene of cars at sunset going past stoplights.
A red and white street sign that reads "no parking any time."
The towel bar is above the toilet in the bathroom.
Baseball batter hitting ball standing near catcher with mitt.
Two skiis and poles stand upright in the snow.
This kitchen table has fruits and vegetables on it
A cluttered room with a televisions that is surrounded by shelves that have various games and supplies all over them.
A view of a narrow kitchen with the only light coming from a glass door.
A woman standing on the sidewalk, looking at her phone.
A tall clock sitting next to a barren tree.
A vase that has flowers inside of it on a glass table.
A man in a blue shirt, blue hat and gray shorts playing tennis.
A person is near a row of luggage carts as one man pushes a cart.
A boy blowing out candles on a birthday cake.
A small locomotive on small train tracks with people inside.
A girl is standing against a wall in a room.
A breakfast plate of scrambled eggs and fruit.
Baseball player swinging a bat during a game
a large plane is parked at the runway
A man in a wet suit on a surfboard in the water.
Clock tower sitting in a pier with clear blue water.
Large group of motorcycles on brick street with trees
Woman standing on a surfboard in calm water.
Man riding a horse in a foreign country.
Woman on a bus looking out the window at another orange bus.
A rusty parking meter that is empty
A woman with blonde hair sitting at a bench in front of a building.
a man is riding a snowboard in the snow
A giraffe hiding behind a grove of very tall trees.
An elephant with no tusks walking in the woods kicking up dirt.
A person in a baseball uniform about to catch a flying baseball.
A pizza that is sitting on a table.
A man eating a hot dog and holding up a dollar bill.
A plane up in the sky viewed from below labeled "Cityjet.com"
A cat on the toilet peeks its head into the bowl.
A group of people gathered around a table outdoors having a meeting.
There are several people in this funny boat.
A man slicing pieces of bread with a knife.
A stack of pancakes covered in blueberries and whip cream.
A man is standing close to a tv playing video game bowling.
Two elderly men preparing a motorbike for a journey.
The motorcyclists turn the corner of the road next to the home.
A cat is playing with the bottom part of the umbrella
A traffic light and cars on a street.
A bride and groom teddy bear each in a coffee cup on a saucer.
A laptop computer sits on a girl's lap.
A street sign that reads, "right turn only."
A baseball player is preparing to swing while several people watch.
A man sitting behind a group of different wines ready to taste them.
A black cat rubbing up against a woman laying on a surfboard.
a person walking on a train station platform
A long train traveling past a forest near a road.
a herd of elephants walking down a path in some tall grass
A silver railroad train traveling down the tracks
A man with glasses showing off two cell phones.
A dog looking cautiously at its reflection in a mirror like object.
A man using his laptop sitting on the balcony with a water view
Plate of food with a variety of vegetables.
A blue bullet train stopped at a train station.
A red fire hydrant between two potted plants.
What can only be described as an interesting and presumably authentic dish.
The man has a tennis racket in his hand.
A large jet liner sitting on top of a runway.
A batter, catcher and baseman during a baseball game.
The train drives between the forest trees.
A green city bus traveling by a parked truck.
A woman is using the ingredients to make sushi.
Two men in a living room holding the Nintendo Wii remote.
A clock with roman numerals hanging on the wall next to flower patterned drapes.
A tray with carrots, snap beans, mash potatoes and an egg.
A dressmakers dummy with hat, coat and tie.
A plate of food has carrots and broccoli.
A mini stagecoach being pulled by one horse and driven by one driver.
A young girl holding a tennis racquet on a  tennis court.
A dog is in mid air with a frisbee in its mouth.
A skier in the air coming off a jump with a mountain in the background.
A cutting board topped with fruits and vegetables.
A pizza in it box siting on a table with a side dish.
A little bird standing a the twig of a tree.
A polar bear laying down on rocks by some water.
A woman sitting by a man at a restaurant eating food
A group of young men standing on a sandy beach.
People sitting and walking in the patio and grass area of a building with tented sitting tables and lawn chairs.
A man serving a tennis ball on top of a tennis court.
A white toilet sitting on a sidewalk outside.
Multiple skateboarders in the same outfit ride in a demonstration.
A fit young woman enjoying a game of tennis.
A decorative cake with several layers and an animal on top.
A brown dog laying in an open suitcase on floor.
A man bending over on a tennis court.
A slice of cake baside a fancy beverage on a wooden tray.
a car with luggage bags on the roof and in a trailer
A couple of men standing next to each other holding snow boards.
The bathroom is equipped with many electronic devices.
a person standing with a tooth brush and tooth paste
Two giraffes are standing together in a field.
a small cat gets petted in front of a laptop
A cardinal sitting on a small branch of a cherry tree.
There is a man wearing a dress shirt and tie.
A man about to sit at a restaurant table with a woman.
Many people around a chocolate birthday cake with candles.
A man riding down the side of a skateboard ramp.
A man sits on a broken toilet as people walk by.
A person flying a kite on a sunny day.
A couple of elephants standing next to each other.
A rainbow that is above a street corner.
Swans gather in the middle of a parking lot.
The skier is repairing his ski on the slope.
Four dishes of food are organized on a counter.
a kitchen with a double sink, stove and counter top
A small child is enjoying a donut at the table.
These two people are using the phones at a parade
A paper plate holding a piece of cake.
A beautiful young lady standing next to a  man on a tennis court.
A clean white bathroom with a simple mirror above the vanity.
A man standing next to friends eating food.
A bathroom toilet with a phone hanging on the wall.
a guy that is on a surfboard flying in the sky
Signs and wooden poles stand in front of houses and lawns.
A mouse pad sitting on top of a desk under a mouse.
A couple of trains move side by side down the tracks.
A man in a yellow shirt stands in a dirt circle.
Two horses standing on the grass near a body of water.
Two giraffes are in the foreground and there is a zebra in the background.
A young man riding a skate board up a ramp.
A small horse with his eyes closed standing on snow covered ground.
A bottle of liquor called Granite next to a half-filled glass.
A flock of sheep standing around in the middle of a pen.
A mans face coming out of a chili dog with a fez.
Several boats in a river with people in each boat.
People sitting in a subway station that is in black and white.
Two kids are holding sprinkled doughnuts at the table.
The man is sitting down resting before his tennis match.
A bowl of beans sitting on next to a sandwich.
Six people in a boat rowing on a body of water.
a plane flying by below a slighly cloudy sky
Old style bed has a cross on the headboard
A dog jumping up in the air to catch a frisbee in it's mouth.
A group of young ladies kicking around a soccer ball.
Two back-lit computer monitors and a keyboard and mouse.
A beautiful woman sitting at a table next to two pizzas.
There is a person jumping high on a snowboard.
A big orange cat sitting on a wooden bench.
A child sitting at a table with a hot dog.
hungry dog inching it's way toward the donut.
A brown ottoman sits near a black counter in a vacant room.
A pretty young lady carrying two large donuts in a restaurant.
A woman is getting ready to dive in to some donuts while two guys watch.
A couple cut their wedding cake while the bride makes a face for the camera.
A large animal laying on top of a lush green field.
Rows of books on bookshelves in a library setting
A room with two woman, a dog laying on the floor and table and chairs in it.
A healthy meal with various fruits and vegetables.
A man that is on the side of the wall with a skateboard.
An adult and young horse interacting in a field of grass.
two trains on a track near a platform
A dog is in the air catching a frisbee with a crowd watching.
a group of people stand by watching a group of elephants
Man with racquet about to hit ball tennis ball.
A man riding on the back of a white horse.
kids are enjoying a nice game of soccer
A man getting ready to take a picture in a field.
Several people in a canoe with oars on the river.
A tower that has a clock on the side of it.
A large clock is next to a pillar.
Cat staring at something while sitting on porch.
a woman plays a video game in a living room
a man snow boarding on a ledge with a snowy field behind him
A woman walking in the rain with an umbrella
Various toys and items on carpet that includes wallets and a camera.
Person holding a string with a white kite on the other end.
Young skier posing for photo in alpine ski area.
A baseball player holds the ball in his glove using his other hand.
A red motor bike is being repaired in the driveway.
A little boy on a skateboard on the road.
The long plate has cookies, fruit, and chocolate on it.
A group of people in a living room playing video games
A man in white jacket rowing a yellow surfboard on water.
a person that is riding around on a horse
A woman walking in a muddy field carrying an umbrella.
There is a birthday cake covered in this guys face.
Several people who are waling on a dirt road.
a nicely decorated living room with a big mirror above the fireplace
A person holding a little girl next to a sheep.
A town full of street signs connected to a building .
Fish, small potatoes and broccoli are arranged on the plate.
a large black sheep who has been shaved
The large detailed cathedral has a clock on it.
A ship in the ocean with a seagull and another bird standing on things on the boat.
Three helicopters are flying through the clouded sky.
cows with ear tags standing in a field
a laptop projecting an image on to a flat screen television
a person cooking a pizza in an outdoor grill
Parasails in the wind in front of a bridge on a gloomy day.
Two men on bicycles riding on the street
A group of skateboarders watch a skater perform a trick.
A rear view mirror view shows a truck coming up behind.
This is a traffic light signaling green in a downtown area.
A man jumping over a blue park bench.
A small infant holds a soft toy bat.
A frisbee that is laying down in the sand.
A man with wide eyes eating a muffin.
a giraffe grazing in a high line of shrubbery.
A batter, catcher, and umpire anticipating the pitch.
A big boat on the water near the shore.
A woman standing on a tennis court holding a racket.
there are two urinals in a public bathroom
A silhouette of a horse is seen against the back drop of the sea.
One lonely person on the platform waiting for train to open for boarding
A tennis player prepares to hit the tennis ball
A black bear walking through a zoo exhibit.
A cat sprawled out over the top of a laptop computer keyboard.
People walking along snow and trees with skis on.
A woman walking a gray horse around a field.
A dark vase is holding pink flowers in front of a window.
A parking meter reads "90" minutes on the window.
a man being feed a cake seated on a yellow chair
Two briefcases are stacked up on a desk chair.
Three horse and buggies are parked out in front of a building that has three steeples.
a woman writing something down on paper while the laptop sits on the table
A green parking meter on a city street.
A group of people on bicycles next to a passing train.
A woman is getting in her car on a busy street.
An empty classroom with four unoccupied desks and writing on a chalkboard.
People are standing by a small passenger train.
A elephant being ridden by a little boy.
an image of a man riding on a skateboard
two computer monitors are sitting on the computer desk
A train on one of multiple parallel tracks passes under a bridge.
A pink and a blue toothbrush are on a white background.
A person on some skis in the snow.
A person on a skateboard going up a small ramp.
Guy jumps high in the air on his skateboard off the hill ramp
Man shopping at a grocery store at the produce section.
Three children posing with their tennis rackets at a tennis court.
a keyboard a persons hand a mouse and a monitor
One building has a clock and the other one doesn't.
a close up of a train on a train track
There is an image of a bear jumping on another bear.
Several trucks and cars are driving on a muddy road.
its raining so all the people are carrying umbrellas
a lady and a man getting ready to fly a kite
Motorized scooter parked in front of a gated roadway.
A computer screen and keyboard on a desk.
A stuffed teddy bear sitting next to hay holds a stuffed dog.
Boats are docked by houses on the shore side.
A young boy riding a skateboard at a skate park.
A man holding a phone that has a picture on it of a man holding a phone.
two people in a body of water near a pier
A couple of young boys riding on the back of wooden bikes.
A man is standing in the snow with skis and ski poles.
Wavy wooden seat bench with a sidewalk, grass, and stones
A dog lying down on a couch next to a nightstand with a wedding picture on top.
it is extremely foggy and theres a truck on the road
A recliner chair sitting next to a table with a lamp.
a person in a kitchen preparing food on a plate
Two men in black wet suits ride surfboards on small waves.
A tennis player running to hit the ball.
A man on stilts is holding a pink, polka dot unbrella over a woman in colorful clothes in a park setting and there is a crowd milling about.
A white bathroom has an aqua colored container.
A very small child on a surf board near a big fake wave.
A large airplane flying through a cloudy sky.
A woman starts to remove something from the oven.
A man watching a single engine plane make an approach to land.
A row of retro kitchen designs in various colors.
A kid with a baseball bat on a field.
a person standing outside of a building with an umbrella
A stove pulled away from the wall in a kitchen.
Two pink roses sitting inside of a blue vase on a table.
A military jet is parked on the runway of the airport
A group of women hanging around a long table
a close up of a large and a small zebra
A surfboard is recycled into a unique planter.
Several colorful trains are parked at a station.
A man holding a surf board standing on rocks.
A black cat resting on a bed wearing a tiny winter hat.
A black and white dog is catching a Frisbee in the air.
A man eating food from a napkin in his hand.
A giraffe with an object in its mouth.
A baseball player up at bat in a game in a stadium.
People walking in the rain on a city street some with umbrellas.
A group of sheep sitting on the ground around a bench.
A crowd of people standing on snow covered ground.
A group of people crossing a street while holding umbrellas.
A glass cup holding three toothbrushes next to a wall.
Two pictures of the same woman playing tennis.
The room has a large china cabinet, and two couches.
Two guys reach high to catch the Frisbee.
People walking their dogs on a park trail.
A large wooden clock by a window in a room.
A brown dog standing next to a toilet in a bathroom.
Pink and white flowers in a blue vase.
A group of three horses standing on a lush green field.
A group of people eating at a table raise their glasses
Two people walking and talking on a huge air strip.
A man riding a skateboard on top of a ramp.
this is a mulicolored stripe sun umbrella near a palm tree
A group of skiers trekking through the snow
A clock that is on the side of a tower.
A boy is holding a video game controller in his hands in a living room.
A stool is inside of a walk in shower.
Some very pretty giraffes standing by a big fence.
A sink is shown in front of a frame covered wall.
Two jets landed at an airport facility with many service trucks.
an old school bus sits in a field in a retro photo
A woman standing over a cake with a knife.
Two young boys play a game of frisbee.
A young woman plays with a frisbee indoors.
A baseball player about to receive a pitch in a stadium full of people.
A man near a curb with a bag and a box.
A picked off cake somewhat resembles the original design.
A crucifix is on the wall next to a clock.
A close up view of a small broccoli plant.
A police officer is riding his motorcycle on duty
A cat sniffing a small teddy bear laying on the floor.
Two birds are standing on park benches outdoors.
Someone chopping up foods and placing them in bowls and plates.
A tow truck hauls a jeep along a busy street.
Woman walking beside a man riding a horse in a yellow shirt.
Adult in suit and tie with markings across back of hands.
A small older bus parked alongside a roadway and another behind it.
The horse is tied to a tree in this snowy yard.
A living room consisting of windows, rugs, chairs, and a coffee table.
A man holding a kite next to another man.
A bird standing on the sand near a body of water.
The bathroom in the house is clean and ready to use.
a toilet with metal walls and a sign
A woman takes a picture of the newspaper with her phone.
A large red bus on a city street.
a lemon sitting on top of some fish with veggies and rice on the side
A woman stands behind her luggage next to a building.
A group of men holding up a white and yellow frosted cake.
A few birds wading in some shallow water.
Herd of cows, walking in a drinking from, a river.
a man in sunglasses and a pilots uniform looking down
A skateboarder grinding down a red and black ramp.
Two men sitting at a restaurant table holding up a tray of pastries.
A food entree is shown on a platter.
This kitchen has wooden ceilings and two tables
Few horses out in the distance eating grass
A picture of a black bag with a motorcycle in the background, on a dirt path.
a train with green and purple on it on a track
a small vase sitting on the table with flowers inside
A stop sign outside of a building with the word Liberty on it.
Two older gentlemen sitting on a public bench in a park.
a close up of a pizza on a pan
A woman with a dog throws a frisbee to a hill.
The inside of a bicycle store with numerous related items on display
two sinks a toilet mirrors and a counter
A baby sitting on a couch next to a brown teddy bear wearing a t shirt.
A dog standing next to a sheep behind a fence.
A man is in a field flying a kite.
A airplane with striped wings is in the air.
A man in snow gear in skis at the side of a snow slope.
Small desk with electronic equipment in office type room.
A plane is flying over a gas station preparing for landing.
A man spins around with his forehead on a baseball bat.
A group of men tours a building that has had fire damage.
A white sink and toilet in a small bathroom.
A man sitting on a sofa using a laptop with his dog curled up next to his shoulder.
A man and a woman dress up in costumes.
A horse is standing behind behind a fence
A red, white and blue train on the tracks beneath a mostly cloudy sky.
A person doing tricks on a skate board at a skate park
A motorcycle sits parked in the corner parking space.
A professional baseball player in a white uniform on a baseball field with a bat in his hand.
We are looking at a photo of buses in a demolition derby.
A man rides a snowboard down a snowy hill.
A baseball player takes a high grip on the bat as a catcher scrambles for the ball.
Two guys are excited to be catching a Frisbee in this tournament.
A glass coffee table sits in a living room.
The man wearing a hat holds a kite near many other kite fliers.
A green plate topped with pasta, broccoli and a salad.
Two shorn sheep graze on tall green grass in a sunny pasture.
A dog on top of a building yawning and an airplane above him.
Various contents laid out on  a wooden table
a number of people near many bunches of bananas
A dog is chasing a frisbee in a park.
A man standing on a tennis court holding a racquet.
A toddler with a frisbee in his hand.
four children in a living room with one of the children holding a game controller before a television.
The men are playing a game of baseball in the yard.
A clean living room containing a couch, three tables and other decorations.
A female tennis player is holding the racquet, ready to swing.
Two plastic containers sitting on a table filled with food.
A man standing beside a robot with a camera around his kneck
a kitchen with a sink on a counter top
A woman holding an umbrella while a man walks behind her.
Men are eating hotdogs during an eating competition.
A bed sits in a white room with a window view of a nearby house.
A bus that is parked inside of a building.
A gigantic bird statue outside of a building
a street people cars trees and buildings and police
there is a man with a uniform at the supermarket
Two bananas that grew as one  speckled banana
A older woman is fixing a younger man's tie.
Kitty fast asleep on its back on the bed.
A BED NEATLY MADE WITH A TABLE NEXT TO IT
a giraffe sticks out his blue tongue at zoo visitors
Two giraffe sitting on a dirty lot next to a forest.
A cat snooping in a bag on a bathroom counter.
Looking up at the belly of a jet airpline
A young many getting ready to serve a tennis ball on a clay court.
An old man is flying his kite in the middle of no where.
Dog curled up in the bed under covers
A living room area with couches facing a television and windows on the side wall and on the wall behind the television.
Several people are skiing along on a snowy field.
Some bottles and glasses of wine surrounding entrees.
A woman takes a bite of her sandwich.
Food and a cutting board sitting on a table.
Here is a stop sign with graffiti written on it.
a young man is doing a skateboard jump.
A cake on a table with other desserts and pastries.
A baseball player swings at the pitch being thrown.
A zebra standing in  a dry field of grass.
This is a nighttime image of a church
A baby in a crib with a nighlight
The brown and white cat is sitting by the computer.
Three pies on separate plates on a table.
Outdoor subway train pulling into an empty station.
Sugar covered doughnuts are heaped in a pile.
A man standing on the grass preparing to throw a frisbee.
A man seated with a mouse, a keyboard and cell phone in front
A calf is laying in a pen as people gather outside to look.
Traffic is stopped on the road because of a red light.
A truck parked on the beach next to the lifeguard station.
A man is on his cell phone outside under an umbrella.
A colander sitting on a countertop in a kitchen next to a microwave.
A man in black shirt doing a trick on a skateboard.
A flight of red brick stairs which lead to an antique bench and a view of historical brick buildings.
a number of small boats in a body of water
A table topped with a giant penny and a tray full of vegetables.
A dirty kitchen stove with a timer located on at it's center.
An electric pole with a single light and two street signs next to a 5 eleven sign.
A yellow train parked at the end of the tracks
A guard with a dog walking around a bus in a parking lot.
A cutting board with a beet cut in half
A tall giraffe rests amidst green grass and trees.
A man is skiing in a snowy forest.
A tennis player dives for an incoming ball on the court.
Smiling man enthusiastically hugging a plush teddy bear.
A full view of some tall buildings in the downtown.
A bird standing on debris in the water
A young boy is skateboarding in the middle of a parking lot.
A man playing tennis with a racket in one hand.
A skateboarder riding on a park bench, on a cloudy day.
A black cat holding a Nintendo Wii controller.
Man smiling while displaying food item in kitchen area.
Two people stand by a motorcycle and a van.
A black and white cat walks near someone's legs.
A group of people gather near a motorcycle.
A man talking to another man sitting at a table in front of a laptop.
A kitchen with refrigerator, sink, and a curtain over a doorway.
Two teams are playing a frisbee sports game.
Carrots, cauliflower and broccoli sitting in a clear container.
A man eating food off of a plate
A tennis player hits the ball up from the racket.
A large jetliner sitting on top of an airport tarmac.
People sitting on a bench above the water.
A kitchen that has wooden cabinets and a kettle on the stove.
An orange and white kitten laying on a chair.
a cook standing in a restaurant kitchen while making a meal
A woman smiles while posing wearing skis and holding ski poles.
A woman in grey sweater lighting a candle at table with hotdogs.
A skateboarder performing a trick on a ramp.
Man and a rowboat next to a misty mountain lake
Several cut up carrots boiling in a pot of water.
A bus all lite up inside and out with diffrent colors .
A television and a cabinet on both side.
Parents and children dressed up as Santa Claus sit on a park bench.
A simple yellow vase holds two red and white tulips.
some people and a long thin boat water and houses
Baseball player coming in for the base, while catcher readies himself to get the ball first
A person on a skateboard and a person walking a bike.
A glass sitting on top of a wooden table. next to a keyboard.
Commuter buses parked in a lot outside an apartment complex.
a table full of vegetables and fruits stacked on top of each other
A plate with a sandwich with people in the background.
a police officer on a red motorcycle in the street
The child is pushing a cart full of luggage bags.
A young toddler is standing next to the high toilet.
A female nurse is standing next to a man in a bed.
a blue and white  road sign written in japanese
A small elephant standing next to a wooden tree.
A lone person snowboards down an empty slope.
An open laptop computer sitting on top of a bed.
a street light on a street next to a tree lined median.
a close up of a bench near a ledge with a statue
An antique car being towed on a flatbed trailer
Two guys are having fun while playing the Wii.
The young person in a cap is grabbing his skateboard during a jump.
A dog attempting to use its mouth to pick up a pair of scissors from the floor.
this is a pizza and a fork on a table
A group of women with umbrellas and a couple without umbrellas outside in the rain.
a woman tosses a frisbee in a public park
A train on an overpass over a parking lot.
three kids at the field playing frisbee together
A tennis player is in motion with his racquet raised.
A man who is holding a skateboard in his hands.
a close up of a sink with many dishes in a tray
A man on a phone with a sandwich in his hand
A red fire hydrant sitting in the middle of a forest covered in snow.
Two little girls sitting in the grass with toys.
A kitchen scene with focus on the microwave and oven.
A dog sitting on its dog bed in the middle of a living room.
An adult zebra and a young zebra stand together in a zoo enclosure.
A man is on a paddleboard in rough waters
a man holding a teddy bear standing next to a basketball.
A man flies a kite at the beach.
A sandy beach topped with lots of people and tens.
One cat lying on the floor, and another with its front paws up on a stool
A man holding a plastic gun in his hand.
A train is passing along a hillside during the day.
A baby in pajamas outside, sitting on a skate board and waving to someone.
A street sign hanging from the side of a pole.
This is a photo of a hotel bathroom, all nice and neat.
A pen of five sheep surrounded by other pens.
A person in a large yellow and purple train.
a large air plane on a run way
A herd of sheep shares the road with cars and motorcycles.
A paper mache "bandit" piece of artwork stuck to a pole under a "neighborhood watch area" sign.
People standing in line by several food trucks parked on the street
A man holding a huge slice of cheese pizza with a crying kid on his lap.
Man on purple tennis court swinging at a ball.
A knight riding a horse and greeting a crowd as he clutches his shield.
A broom is hanging on the wall of a house.
Picnic tables under a pavilion in a park.
An assortment of thrift store objects, including two vases and some miniature carousel horses.
A man riding through the air on top of skateboard.
A young child is palying with some books.
A giraffe standing up with its legs crossed.
LOTS OF PEOPLE ON THE STREET IN A DIFFERENT  DECADE
A tennis player reacts during a match in a tennis court.
A man is standing by a trolley station as one approaches.
A brown couch sitting in front of a flat screen TV.
a plaza filled with a lot of birds and some cows
The produce section of a grocery store
a couple of boats sit parked by a dock
A white and red moped is parked on the sidewalk.
A young woman with a smile on her face rides her bicycle down the street past some parked cars.
Four Blue Angel jets are flying in formation.
Traffic sign and barricades on roadway near large city building.
A zebra stands alone in some tall grass.
A blue and yellow train in the train station.
Several people sit on top of an elephant as another person watches.
A white toilet in a small tile walled bathroom
two people setting up a table of food
a girl is looking at her cellphone and a blond boy
a big tower that has a clock on top
Someone who is enjoying some rest on the edge of the pier.
A group of people flying kites on a field.
A man that is standing on a court with a racquet.
A sheet cake with a tractor frosted on it.
A very clean, modern and minimalistic style bedroom.
A woman using a Wii controller and playing a game.
Two giraffes in a zoo enclosure with a zebra.
A man wearing a glove pitching a baseball.
A woman with her face painted white and decorated with a dragon.
A traffic signal underneath some tall buildings.
A rear view mirror on the side of a car door.
A bedroom scene with focus on the window with the bed in the reflection.
Four men playing doubles tennis on a court.
A Chanel sign sitting behind a display window at a store.
A man riding a motorcycle with another man hang off it's side.
A dirt road through the woods with a rolling suitcase.
A  black motorcycle is outside of a house
The children watch the man and woman cut the cake.
many people in a boat in the water and trees
Glassed in bathroom with the sinks on the outside.
A truck waits in traffic next to a wooded area in the city.
A man standing next to a cat in a kitchen in front of a laptop computer.
A bathroom door is open showing a shower with the shower curtain mostly open.
a brown table and chairs and a vase with flowers
Several students sit at a conference table with their laptops.
A photo of a jetty and a body of water.
A surfer wipes out as the waves break.
Toddler getting ready to hit a t-ball with his bat.
Young lady falling to the ground catching a frisbee.
THERE ARE TWO ANIMALS THAT ARE PLAYING TOGETHER
Large brown bear sitting next to rocks in open area.
two boys with a skateboard and  a bicylce
A young boy holding a Nintendo Wii game controller next to a lego controller.
An umpire in the field, talking to the batter.
A group of people riding an elephant on a dirt road.
A group of trucks and cars are coming out of a tunnel.
A man walks past a weathered structure and parking meters.
A small hotel room with a king bed.
A male standing up with a Wii remote control strapped on his hand.
A guy is performing a trick on his skateboard.
A person standing on the river bank by bushes
A picture of an open air zone that looks incredible.
A man pulling a piece of something from a machine
A large stuffed bear is sitting on the ground with a cup next to it.
A man in a beige suit with graying hair.
a couple of vehicles on a busy city street
A man stands with his produce in baskets.
Adult preparing to catch flying disc in open area near trees and water.
People stand in line for an ice cream truck.
A man in a the middle of a bunch of cows.
A female tennis player hitting the ball with racket
People on a busy sidewalk, some on bicycles.
A fat ass sitting on a toilet with lady magazines.
A small white tow truck parked to another small white tow truck.
A man without a shirt squatting on top of a skateboard.
A man on a motorcycle driving beside a van.
There are signs on a cobble stone sidewalk
Black and white photograph of a man on subway with bicycle.
A stop sign in an older residential neighborhood is marked with graffiti.
a couple of large pizzas that are sitting on the table
The man is spraying down the toilet to clean it.
a bunch of people watch a person do a trick on a skate board
a man on a beach tries to fly a geometrically made kite
an image of a clock on a tower high in the air
A large black dog standing near an open suitcase.
A man standing behind a white frisbee on a green field.
A black-and-white photo with a colored red double-decker bus.
Line of people behind plastic fence with umbrellas
The street signs are clearly visible for us to see.
A pair of red scissors sitting on top of a piece of paper.
A baby feeding cake to a man with a fork.
SEVERAL PEOPLE STANDING AROUND APPEARING TO BE LOOKING AT SOMETHING
skateboarder in red helmet jumping on skateboard ramp
A person working at an airport, outside of an airplane.
A birthday cake for a child is sitting on the table with candles.
A dirty, overturned motorbike lays in the mud.
An airplane in the middle of a field with some jeeps parked near it
The front of a store that has a large teddy bear in the window.
Old suitcases piled to the ceiling on a luggage cart make art in an airport
A giraffe standing on a lush green field.
The white and black dog is in front of an open refrigerator
A small boy next to a table upon which sits a birthday cake shaped like a racecar.
A train runs down a track past run down buildings.
A man that is standing on a platform with a frisbee.
A black plate with a hot dog and fries
Two pieces of pastry sitting on a mat next to a spoon and fork.
A bunch of green bananas is on a tree.
A woman holding a video game controller and grinning.
A pan of some kind of food cooking in an oven
A man in a suit and colorful tie in a park.
A close up of a girls boots as she sits on the counter.
a black and white photo of a cow near a tree
Assorted vegetable along with cheese and nuts for food preperation.
Two tennis players are doubled in their pursuit of the two tennis balls.
A man on his skis on a snowy slope.
A comparison photo showing bathroom in regular view and in stretched version.
Small yellow container attached to a digital camera.
Two slices of square pizza on a plate with a fork.
a man in a tie holding a cup
A street scene of an intersection with a street light.
Three wild cows in a field on a nice day.
A small sink inside of a very clean and white bathroom.
a small machine is sitting by a cliff
A wall with words painted on the glass and people behind it.
A laptop sitting in someones lap and a dog lying on the floor.
A little girl posing for a photo next to an elephant.
A man is at an outdoor table under an umbrella.
A bathroom window reveals a snowy day outside.
Animals grazing in grass in front of an industrial landscape.
I THINK THIS IS A RECEPTION HALL AND ITS FULLY DECORATED
A red frisbee has been thrown by a man.
A blender filled with flower and eggs on top of a counter.
Colorful collection of fruits and vegetables with some type of "baby" decorations
A white boat on water with brick wall next to it.
A baseball player up to bat and missing a hit.
A park bench sitting in a snow covered field
A woman cutting thru a pastry on a white cutting surface.
A vase of flowers one being a large sunflower in front of a brick wall.
A woman biting into a sandwich with a happy look on her face.
A man sitting on a couch playing with a game system.
Two people walking down the sidewalk with a "wrong way" street sign directly above them
A couple of horses standing on top of a grass covered field.
An elephant is drinking with its trunk at the watering hole.
a close up of slices of bread on a table near a spoon
A woman with eyeglasses in a kitchen with bowls, spoon and glasses
A person is eating at the kitchen table.
Two women are facing each other and one is blow drying her hair.
a drawing of a big fancy court house
two elephants walking in  an stone enclosure.
A cake with a train on a track and a ground made of cookies.
A baby is laying on a bed with the cat next to and a man is looking over in the mirror.
A supply truck in a snowy area driving towards a tunnel.
A bunch of carrots are on a plate next to broccoli.
A fire hydrant on a sidewalk next to grass
A stirfry containing broccoli, carrots and other vegetables.
A wooden bench surrounded by potted plants in front of a house.
a close up of a teddy bear on a balcony
The storefront of a bakery that has been painted green.
Three stuffed animals hanging on the structure of a train track.
Children get off a school bus as a crossing guard stands among them.
Two horses underneath a canopy of green trees
Group of mixed vegetables sitting on a counter top in a kitchen.
a close up of a plate with a sandwich
A carnival atmosphere is effected by these colorful stuffed creatures.
delivery truck dropping off delivery at train depot
A young child laying down in bed with one arm raised up that is wrapped up.
A woman in black jacket with umbrella on a sidewalk.
Different types of fruit are shown on the counter.
Two individuals sit on motorcycles on a busy street in the rain.
two people riding motorcycles, one sliver one white
two girls and a boy standing in a kitchen
a couple of birds stand on top of a rock
A man in a white shirt with a red tie is standing in front of a door way.
a couple of small beds are in a room together
A white bedroom decorated with low level furniture
People looking at zebra and cows behind fences at a zoo.
A man standing on one leg on a baseball mound.
This tennis enthusiast, not using correct form, is practicing on the court in the city.
A large eating and living area inside a house
A lady walking with an umbrella during the day on a rainy day.
A person on a skateboard on the ground of a park.
A person sitting on a bicycle at night outside of a shop.
Two different types of dogs sitting together on a bed
People are riding in a boat on a lake.
Two horses rubbing necks together in a field
A zebra is chasing another zebra inside an enclosure.
Two zebra standing next to each other in a field.
A white toilet sitting in a bathroom next to a wall.
A woman standing on skis and holding poles in the snow.
A GROUP OF SHEEP WITH MOST OF THEM IN INDIVIDUAL CAGES
An empty alley with a street sign at the end of it.
A sandwich that has several toppings on it.
The man is kneeling low to hit the tennis ball.
A lone giraffe bending over to graze within an enclosure
A man driving a scooter with two women sitting on the back of it.
A very up close picture of a sign.
Someone is holding a toy bear in the mans face.
A tour boat on a man made lake under a blue sky
A man hitting a tennis ball with a racquet.
A young boy is doing a trick on a skateboard.
A man working in a market surrounded by produce and meats.
A dog sits in a suitcase with a doll.
this is a group of people eating together
A player is swinging at the ball in a baseball game.
A blue doorway with a clock mounted above it.
a teddy bear sitting in a hanging basket
A man talking on the phone while he walks down the street.
The bear is wondering about in the woods.
Two wooden desks holding keyboards and computer monitors.
A man cross country snow skiing near a wooden post.
There is a young child sitting on the floor in front of the refrigerator.
This bedroom features a bed and chest of drawers.
Two men stand next to a horse near a bus.
A long bookshelf behind the head of a bed.
A large fire truck with a water tank on the side of the road.
A group of people carrying bags of luggage through a lobby.
a baseball player swinging a baseball bat at a ball
There are people riding bikes on the street.
A LARGE PAINTING ON SIDE OF BUILDING WALL OF TOASTERS
a small bird sitting perched on a chain link fence
Tennis players at match standing over net shaking hands.
A city street lined with very tall buildings.
A woman in white shirt holding a kite on beach.
A table with a sandwich, sandwich makings and glasses of red wine.
The unique meal includes both carrots and peppers.
a kitchen with maple cabinets and black appliances
A wine bottle is being used as a flower vase.
The alleyway is lined with many parked motorcycles.
a woman is walking down the street in a sweater
Woman eating a really large sandwich at a dinner table.
BEAUTIFUL VIEW OF A BLUE SKY TOWERING OVER WHITE CAPPED MOUNTAINS
a white plate with three donuts and two drinks
a boat docked at a wooden dock on a lake
A young man uses a fork to eat food.
Motorcyclists perform a pyramid stunt in a darkened auditorium.
A person sitting down in front of a laptop.
there is a young girl that is feeding a giraffe
Cattle grazing and eating grass while looking at the camera.
A child, wearing a cat costume and umbrella, stands before a brick building.
The little league ball player is posing for his picture.
Three men sitting on top of a green bench.
A bouquet of flowers is stuffed inside an arrangement of wine glasses on a table.
Baseball pitcher in the middle of a windup.
A vase sitting on top of a plastic pedestal.
A mannequin wearing a jock strap, unbuttoned shirt and tie
A couple of men sitting on the side of the street.
An old flip cell phone inside a cozy.
two kids battle over a soccer ball while on a field
A red motorcycle on display at a show.
A stuffed animal sits atop a barbed wire fence.
A man sitting in a car on a cellphone.
A tennis player on the court stepping backwards in preparation to swing
A bedroom with a fluffy comforter and lights above the headboard
this is a man skiing down a hill
An older male with white hair holding a flip phone.
a female tennis player diving to hit the ball
A red and white plane sitting on a runway.
A group of people standing around each other.
Mom has to help him eat his hot dog and bun.
A kitchen counter with some bananas and eggs.
A group of people who are sitting on horses.
A small vase of pink and yellow flowers next to a candle holder.
A variety of vegetables laid out on a kitchen counter.
It is out in the open with various things in viewpoint.
An advertisement for Samsung Galaxy Golden cell phone
A man wearing headphones looking at the camera.
Children in the snow with skies and snowboards.
an exhibit featuring various animals under a wooden roof
A big bear standing out in the shade and sun light
A city is lit up at twilight near a river and a clock tower is lit up in the distance as a large boat is seen on the river.
a number of giraffes in a field near one another
Two soccer player on opposing teams playing soccer.
An Asiana Airlines plane taxiing at an airport.
A man riding a red motorcycle on a street next to a crowd.
A blue street sign in an Asian language and English.
Pink lunchbox filled with fruit and vegetables and snacks.
A white refrigerator on the side of a road next to cars.
Various vegetables in a roasting pan in an oven
A giraffe walking through a lush green field.
A bowl filled with apples, limes and lemons.
A man riding a gray elephant holding a ball in it's trunk.
A bus stops on a street corner as pedestrians walk down the street.
a bridge lit up with some  blue lights
He is heading for the beach with his surfboard.
A couple of kids looking out of a window on a subway car.
Two dogs are watching a television set intently.
This is a picture of a persons garage sale.
An elephant under trees in the night time
A chair and debt with laptop, monitor and a cat.
A black, brown, and white cat is near a laptop.
A teddy bear wearing green hat and jacket.
Several baseball bats leaning against a fence with a short hanging from the fence.
A person in red jacket skiing down a hill between trees.
A woman dressed in costume is sitting on the motorcycle
A woman on a tennis court is hitting a ball with a racquet.
A man in a traditional African outfit gestures while a black cow is in the background.
A lone skier is seen on the slopes on a cloudy day.
Children flank an old pickup truck in a parade.
There is a bird statue and clocks outside of an apartment building.
A baseball player holding a baseball bat over his shoulder.
A plate of food that includes meat and broccoli.
a person wearing an apron in front of kitchen appliance
The hot dog is loaded with many toppings.
A large jetliner flying through a gray sky under clouds.
A bunch of ripe bananas sitting on top of a table.
some baseball players are playing baseball and some trees
A man talking on a cell phone while sitting down.
2 zebras outside eating grass in a wide open space
A man in India herds a number of cows on the street.
A laptop and desktop computer on a desk with a light on next to them.
A group of baseball players standing on top of a field.
A train that is driving through some houses.
Cat and dog in the windowsill of a building.
A desktop computer is displayed at a wooden table.
A large group poses for a photo in their ski gear.
A man sits in the snow while breaking from snowboarding
A man standing and talking on a phone in a courtyard.
Automobiles stopped at an intersection because of a passing train.
A giraffe standing on a stretch of sand at a zoo.
a woman wearing a crown and a young boy smile at a table with a cake
An older man and a boy are on the beach with their surf boards.
A bench next to a lamp post on a cobble stone street.
Two parking meters that are nearly covered with snow
Cars present at an intersection with traffic lights.
A sign indicates when parking is off limits on West 25 12 street.
A Pizza with red peppers, zucchini and cheese.
A bird with a red face is standing on a rock.
A dog laying on the back of a couch.
a kitchen is decorated with american flags
a tropical bread on a branch surrounded by trees
A zebra standing in a dirt field next to green plants.
A colorful plate with a pizza sitting on top of it.
the people on the beach are flying kits over head
A man in white baseball uniform throwing a pitch.
A boy with a helmet on eating food across from a bicycle.
A large giraffe standing in a dry brush field.
Woman in grey and blue throwing a frisbee.
Dozens of people on a grassy field flying kites.
A man that is standing in the snow.
A cat is standing on a toilet with its front paws inside.
a small baby is biting into some food
Several pieces of pottery in the process of being painted.
Various types of flowers sitting inside of a vase.
A little girl holding a colorful umbrella next to a penguin.
A red piece of luggage sitting on top of a bed.
A black and white shot shows evergreens, bare shade trees, and bushes that slightly obstruct the view of a building with a low roof in comparison to its  clock tower, which stands more than twice as tall as the evergreens, against a grey sky.
A very long street with traffic under some cloudy skies.
The men are racing on skis on the snow covered race course.
some white and brown signs a tree and a building
The corner of college street and 5th street
Miniature Poors on the side of the road in a rural mountain.
a "use crosswalk" sign on a post in front of a rain-covered street
a person holding up a cell phone
A man sitting on a couch has two cats on his lap.
Military jet on tarmac near wooded area on cloudy day.
A desk with a monitor, keyboard, and laptop on it.
A plate with a piece of cake and a spoon on it.
A table with family photos, sentimental mementos, and a potted plant
A couple of buses that are on the lot.
Group of four ladies sitting at table overlooking parking lot
Flowers in a window box sit in front a closed window.
People behind a barricade watch a man ride a motorcycle.
The bathroom has been cleaned and is ready to use.
a woman on a tennis court holding her tennis racket up to hit the ball
A couple of men sitting at a table with pizza.
An orange cat  grooming themself underneath a piece of furniture.
a yellow hall with a brown floor and a mirror
a cat sitting on an organ looking out the window
One adult giraffe and two kid giraffes standing in the woods.
A man in cowboy hat on horse next to cattle.
A trolley at a train station at night.
A soccer player in front of the goal holding a soccer ball.
A couple of horses standing in a grass field.
A young person with an umbrella is crossing a busy intersection.
Several closeup shots of giraffes near a fence.
A airplane that is sitting on a runway.
two female tennis players are playing tennis on a court
A pair of workers unloading the back of a pickup truck.
A glass table contains a bowl of spheres and two fancy vases.
A man stands with several ripe and unripe bananas.
A cellphone next to a laptop computer.
There is a man on skis in the snow.
A humongous jumbo jet is on the airport runway.
Several people are flying kites in a field.
A black and white dog catches a Frisbee in the grass
A glass vase filled with different colored flowers.
A street on a city at night that says "Obama".
A large silver truck with a tractor parked on it's flat bed.
A large knife sticking out of an apple in front of a blood soaked wall.
A man who is swinging a tennis racket.
The orange and white cat is wearing a bow tie.
A huge group of people stand outside several buildings, holding umbrellas of various colors
Small bird feeding near chair in grassy area.
A cat laying on the ledge of a window.
A white and brown sandpiper with a long, black beak lifts up one leg.
A city street with people, cars and police.
two long haired cats laying on a bed beside each other
The man is cutting bell peppers near a large pot on the stove.
A man sitting at a table with a large plate of breakfast food on it.
a person standing at a tennis court holding a tennis racket
A traffic light near a building on red.
Suitcases revolving around on an airport baggage belt.
A baseball player throwing a pitch into the field
A tow truck and fire truck are at the scene of the accident
An ocean view with people water skiing using parachutes.
A cat looks happy while sitting in a bowl.
A child is flying a kite while sitting in a yard.
The fire truck red and the green pastures make it look just like Christmas.
Two people sit near many luggage bags using laptops.
A meal of noodles and broccoli being held by chopsticks.
A woman carries a basket of bananas on her head while some men stand around.
boy skateboarding next to a graffitti covered wall
A lad and a lady patting their favorite horse.
People are sitting on surfboards in the water.
a couple of people that are walking in some grass
People on a boat on a lake and two people jumping into the water.
People standing on a dock near a elephant on a phonton boat.
I am unable to see the image above.
a man that is sitting at a table with a laptop
a little boy that is eating a pizza
A baby zebra is standing in a pen
A large truck sits on the dock as a boat pulls up.
The bathroom has a shower area, toilet and sink.
A display case in a store filled with lots of efferent foods.
an extreme close up of many different types of bottles
A boy standing on the grass as bicyclists ride by.
Four men riding horses playing a round of cricket.
A man feeding a baby her bottle with a smile.
A bunch of candles that are on a cake.
a woman sitting on a wooden bench in the middle of nowhere
A very tall white clock tower sitting under a blue sky.
A bathroom with white toilet and walls and blue accent bars.
A woman is holding an umbrella while walking down a flooded street.
Two apples and a bowl and jar of applesauce on a cloth.
a line of very tall buildings next to a clock tower
A group of people who are standing outside.
The apple and banana are on the table.
Four trucks are parked in front of a paint store.
A small sandwich made on fresh bread with lettuce and mayonaise
A cat that is in a white sink.
A group of people sit on a couch in front of a kitchen.
A city street with busy traffic including a yellow bus, many cars and a person ridding a bicycle.
a number of baseball players with bats
A soldier who is standing near a goat to feed it.
A woman taking a hard swing at the tennis ball.
A person watches tv in a room with a couch and a laptop
A bed with two pillows and a backpack leaning against it
A yellow and orange double decker bus is shown.
A man surfing in the ocean as the sun sets.
A living room with white furniture and a small wooden table.
A group of athletes engage in an organized game of ultimate frisbee.
A cat on the floor next to a room with a sink
A living room tastefully decorated with flowers on the coffee table
1 12 loaded hot dogs and veggie side
A person standing next to a chair with two tennis rackets.
A man is swinging a bat at a baseball game
A toy fire truck sitting on top of a wooden table.
A man is riding a skateboard in an underground parking garage.
There is a wood bench in the garden.
A bright computer screen inside of a room.
Windsurfer kites are seen from above the beach.
A kitchen with white counters, sink, and stove.
The elephant is attempting to complete the difficult trick.
A small white plate of food on a table.
Two giraffes and one other animal grazing in a field.
a man holding a tennis racket on a tennis court.
a close up of a young person holding a kite
A sleepy tortoise cat laying in front of the monitor.
A man at a kitchen counter preparing food.
An old diesel truck driving down the path next to freeway
A beach volleyball game with a kite flying in the background.
Several baby elephants standing on a plain on the side of a river.
A large group of people sitting in the sun
a number of food trucks parked near one another
Semi trucks on a parking lot with orange cones.
A photo taken in a car looking at a dog in the back seat.
Young man surfing a fairly good size wave
BLACK AND WHITE PHOTO OF A MAN AND A WOMAN
Trunk and small chest in cream colored room.
A baby girl with beautiful blue eyes standing next to a brown teddy bear.
Men playing soccer on a field at dusk.
a building with a clock at the top.
A couple of clocks mounted to the side of a wall.
A young person riding a skateboard up the side of a ramp.
A man standing on a tennis court holding a tennis racquet
A white plate of food on a table.
two donuts left in a box of donuts on a counter
A woman wearing skis with her black dog in he snow.
A young boy smiles while holding a hot dog.
a pizza in an oven not yet cooked
Barry Bonds holding onto a baseball with the number 754 written next to him
A kitchen scene with focus on the pantry and a clock.
Several planes fly through the sky, close together
A smiling woman holding her cel phone up and open beside her face.
A small dog getting a bath by it's owner.
A man is riding his horse on the field near the blue trash cans.
A woman holding a cat in her arm.
A fake large cow that is standing in the snow.
A tennis player is bending over and reaching to hit the ball.
A long mirror is above the sink in a small bathroom.
A view of a city and a body of water from a plane.
A person that is driving on the street.
A twin engine aircraft is flying in the sky.
People are being served at the outdoor restaurant
I am unable to see the image above.
The lights of a vehicle streak across a modern bridge.
two people are cutting into a cake with forks
Several zebras from behind standing on grass plain with distant trees.
a plate filled with grapes and some sliced apples, kiwi and oranges
Four young people crowded in a bathroom brushing their teeth happily.
A boat docked at the shore of a lake.
Some people with wine glasses are smiling and laughing.
People with umbrellas looking towards the grassy area
A peacock standing near some metal grill fence
A young man in a tan suit and shoes
Two sheep on the top of a hill covered in grass.
There is a tennis player holding a tennis racket
People near a stone building with a clock tower.
Two pictures of a woman talking on he phone at a coffee shop
Baby elephant alone by a tree in the evening.
Patty on a whole grain bun served over salad.
A doughnut sits on a napkin, with red frosting and one missing bite.
A cat sitting by a microwave under a cabinet.
an image of a man in the middle of playing baseball
a zebra walking on a dirt path near a fence
Woman riding bike with basket and walking dog.
Scissors with a blue handle are in a plastic package.
The skateboarder is trying his latest aerial trick.
Several parked bikes sitting in the grass near a tree.
An airplane is flying high in a blue sky.
a large plane is sitting on a runway
A dog staring at a camera while laying on a bed.
a picture of public restrooms taken from the outside
The man is flying his kite high in the sky.
A bowl that has food and a spoon in it.
A bunch of children wearing winter gear playing ball in the snow
a close up of a zebra near a car window
There is a sign that warns people of work ahead
A man is jumping in the air to catch a frisbee.
A red bus driving down a street near a building.
a man holding a cell phone so someone else can look at it
A fire hydrant stands on the sidewalk in between two poles.
A stoplight controlling traffic in an urban intersection
A group of men riding on the back of horses.
A black and white cat sitting between bottles and a furniture leg
a group of people sitting on a bench in front of some blooming flowers
A girl in a striped shirt and red skirt playing tennis.
carrots rice and potatoes in a bowl with a spoon
A man with a helmet on coming up a ramp.
People riding horses on the sand of a beach.
A person on a surfboard in the water.
A man wearing a hat while standing next to a  purple teddy bear.
A black and white photo of sheep grazing near an old fashioned car.
A surfer is surfing in the ocean.
a young woman holding a cell phone in her right hand
A urinal in the men's bathroom and a small sink.
A kitchen view of a dining table with a bowl with bananas.
A train pulling through a grassy area with two children near.
A skateboarder in mid jump while others look on.
A couple of horses standing next to each other.
A robot built from a Lego robotics kit
Two tall birds are standing in some mulch.
A gaming system plugged into an electric source.
A person with cold weather gear on while skiing in the snow.
A woman on a horse silhouetted by the sun behind.
The two people are facing away from the screen
A man holding a tennis racquet and tennis ball.
palm trees in front of a building and mountains in the background
A person on a snowboard sitting in the snow.
A turkey sandwich and an apple are on a plate.
A view inside of a room with a television.
A brown kitchen table with chairs and brown high bar chairs.
A couple of small statue sculptures on display in a garden.
A small giraffe stands alone in some thick grass.
A number of little league baseball players gathered in the dugout.
A bowl of oranges with several on table around bowl.
A dog running through the grass holding a frisbee in his mouth.
Two people stand in the snow with their skis on their backpacks.
A red train on the track in between two buildings.
A group of baseball players is crowded at the mound.
The stop sign has two street names posted above it.
The woman is rocking the newborn baby and smiling.
Birds in the air in a circle with ocean and mountains and city in background
A car near a toilet sitting on a sidewalk.
a wine bottle and a small vase sitting next to a tiny pizza
Boats sit in the lake next to one another.
A yellow stoplight with a smiley face drawn on the lens.
A narrow kitchen with a refrigerator at the end of it.
a man in a red and black leather jacket on a motorcycle
A man in blue jersey throwing a baseball.
Three women sitting on a surfboard in the water.
there is a large building under construction and many parking meters
a big teddy bear behind a glass wall
A living room area with some couches and a television
a large group of zebras under a shingled roof
A piece of meat covered in marinara sauce, cheese and herbs.
A brown and black puppy sits in the sun.
SOMEONE BRUSHING THEIR DOGS TEETH WITH A TOOTH BRUSH
Several people sit on park benches by the water.
A man is taking a picture of a toilet from outside the restroom door.
A very happy surfer guy is loving life as he hangs ten on a beautiful wave.
A cat looking at the camera with a funny expression.
A sign post with two street signs and a stop sign
a paper plate holding a slice of veggie and sausage pizza
A train engine carrying carts over a hill side.
Skier on red and black skis jumping near a mountain.
an image of a bathroom scene with lots of hair products on counter
A public transit train going through a station.
A yellow fireplug in front of a blue street pole and store window.
an image of a man riding a horse
A STOP sign that has been written on with paint.
Girl in a pink scarf eating a pastry at a table.
a surfer in a wet suit is surfing in a sunny day
Two people are walking up a snow covered hill.
A bird standing on the rocks in front of water.
A counter top in a kitchen with various items on it.
White people looking ridiculous playing wii and drinking beer.
A man bareback riding his bike down the street
Small fishing boats lazily drift in the bay.
Three baseball players are standing by a base and smiling.
A table topped with lots of different types of cakes.
A train is approaching an opposite side boarding area.
A horse is standing in the grass with its head over a fence.
A puppy chases its tail, next to a mirror.
The two elephants are eating their grass for dinner.
A man is standing next to a city fence.
a person riding a surf board on a wave
An image of a man wearing a baseball glove and leather jacket.
An assortment of foods on white and blue plates.
A little girl sits with a piece of cake
A red and yellow double decker bus at a bus stop.
A dog that is laying on the bed.
A dog and someone laying on a bed in a bedroom
A man standing in a snowy forest wearing skis.
A simple vase with a few flowers in it .
a child on a tennis court getting ready to swing a tennis racket
A box that has different kinds of donuts.
A baseball player swinging his bat, while the catcher and one spectator look on.
A man is steaming his clothes in a bathroom.
A man rides behind a horse during a race.
A man in black gear skiing down the hill
Old photograph of baseball team posed on a set of steps
A woman at a crosswalk that has a green light.
A young man riding a skateboard on top of a rail.
a young lady holding her kitten and kissing its head
A man in a red shirt and sunglasses is playing frisbee.
Up close view of two zebras in a zoo.
a monorail making it's way down the track above a bunch of cars
A chocolate donut filled with cream and custard.
a large crowd of people in a park, a good portion of them are flying kites.
an open field with some people flying two different kites
A professional snow boarder flying through the air
Seafood with pasta and broccoli is on a plate.
A young person is biting into a hotdog
A couple of birds sitting on top of a large clock.
A woman leaning in and smiling at the camera.
A double decker bus is on a street.
A cat laying on the bed looking at the camera
A busy intersection with traffic captured in motion.
a pair of pet bowls on a mat is next to a screen
A man riding a skateboard on piece of concrete in a park.
There are two people posing and one man is holding a banana
A FIELD AREA WITH GREEN GRASS AND TING BUILDINGS
There is a couple standing among some fountains
A mp3 player sitting on top of a speaker system.
Large sunflower displayed in colorful vase on table.
A group of skiers watch as one members does a trick.
A man riding down a snow covered slope on skis.
A single giraffe standing by a tree and some rocks.
Man with tennis racquet, soccer ball, golf club, and hockey stick.
An airport is full of people's luggage and no one is there to claim it.
A man riding a horse in an arena with a bull.
A base with  yellow pink and orange daisies in it.
A cat sitting on a table next to a vase with flowers.
There is a train sitting on the tracks.
A white toilet sitting next to a bathroom sink.
A bag and it's contents sitting beside it on a floor.
A colorful display of hundreds of small teddy bears is featured.
a bed with a shelf above it with items and luggage
A living room with a chair, fireplace and mirror.
A tray of food sits on an outdoor table.
A double sided parking meter buried in the snow.
A black Macbook on top of a stand
A stop sign near a Star Bucks Coffee Shop.
A girl in dress sitting on a park bench.
A man standing in water holding a fishing rod.
A blue and green  hummingbird seems to hang in the air with its wings together and outstretched.
The man in the chair is playing a video game.
The airplane is on the runway t the airport.
A plate with some eggs toast and bacon on it.
Two horses graze in a large grassy field.
A bowl with rice and a side of broccoli in it.
There are two giraffes standing in the wild next to trees.
A cow grazing upon a hill on a foggy day
a child lying in a children's bed next to a wicker basket dresser.
A piece of birthday cake is sitting on a plate.
Three people ride on an elephant in front of a forest.
The fork is attached to the dinner tray.
A cutting board with chopped carrots and apples.
A parachute floats in the sky above the ocean
a man on a snow board riding through the snow
Zebras, and elephant and another animal standing near water.
The young child with the missing tooth is holding up a new tooth brush.
A man standing in a tool shed running water from a wooden sink.
A woman is cross country skiing in a forest.
A boy in jeans spreads his arms wide as he balances his skateboard on the edge of a pool.
A girl is sitting at a table in front of a skate board and helmet.
Two double deckers buses travelling on a city street.
A girl with a curious look in front of broccoli and chicken.
a bronze statue is looking at a clock on a building
A bike in front of a scenic welcome sign.
A kid wearing a Georgetown Day Shirt has a baseball glove in his hand.
a black and white cat is sitting on a yellow and red chair
A living room with two red chairs on front of a television set.
A plate of food sitting with a very elegant setup to it.
The remains of the breakfast table from above
A picture of a person laying on a bed.
black and white picture of elephants in a fenced water tank
Two people playing a board game involving cards and chips.
The giraffe is standing by itself by the gate.
A large jetliner flying through a clear blue sky.
BEDROOM WITH BED, DRESSER, TV, LAMP AND OPEN WINDOW
A woman dressed in white holding a colorful umbrella.
two girls sitting in a restaurant eating noodles
A man surfs a small wave on an overcast day.
fruit hanging from a tree with trees in the background
A desk with several computers and electronics on it.
A couch sitting in front of a rub on a hard wood floor.
A man that is on a surfboard sitting on a wave.
The people are piling on to a large truck bed.
A plane on the tarmac with airport personnel.
A man in a suit and a woman in a dress standing side by side.
The men are walking with each other wearing ties.
The guy is on the computer while there is a girl on the bed.
A large clock in a mass transit station.
People in a room with one man working on a laptop while another looks on.
Two horses standing together in an open field near some mountains.
a couple of players are out in a baseball field
A couple of small birds and a building.
A woman smiling while showing off her cell phone.
A baby on the floor biting into a remote.
The teenager is taking a picture of her male friend with her cell phone.
A prepared plate of dinner has meat and broccoli.
A man is doing tricks on his skateboard, him and it up in the air
A green and beige bus sitting on display behind a traffic light.
there is a small boy getting help brushing his teeth
A hand that is that touching a dog.
A lady making something in a home kitchen of some sort.
A man sitting in the middle of a fresh produce stand.
Large oak desk with laptop, keyboard, and pictures on shelves.
A small bed in a room with lacy curtains on the window.
a blue billboard sign in  a busy city
A black and white dog on shore of a beach.
a trey with some fruit inside of it
A cup of coffee and a banana are setting on this desk.
Birds are sitting on the arms of poolside chairs.
A remote and a container is sitting on a table.
A person skiing down a snowy mountain slope.
A doorway view of a bed window and doorway to another room.
A man that is standing in front of a television.
a man lighting the candles on a birthday cake
A group of people flying kites over a lush green field.
An airplane is flying through the sky during the day.
A baseball game is being played on the grass.
Several ducks are out in the middle of a lake.
There are some people walking on a beach with surfboards.
A stove and some books in a kitchen.
A small pizza with burnt edges and fresh toppings.
A living room features a gray and yellow couch, and wooden furniture.
a man with a hat on a bicycle beside a tractor
A pan filled with celery, onions, and carrots.
a person standing near a small motorcycle on a city street
A remote control that is laid on a piece of furniture's cushion, which is ripped and exposing springs and wood.
A man wearing gear on his feet walking in the grass.
some bananas oranges apples and other fruits and a bowl
THIS IS A TRAIN GOING THROUGH THE MIDDLE OF THE WOODS
Living room with large television and lit fireplace.
A herd of zebra standing on top of a lush green field.
a bunch of fruit is laying on a table
A large white bus on the side of the road.
Various street signs next to wall with a building in the background.
There is rice, broccoli mac and cheese, and turkey on the plate.
Some people are holding a union Jack umbrella.
A street sign saying Major Street with an arrow pointing to the right.
Sauce covered pizza in a box on a wooden table.
A couple of people standing next to each other.
A group of people standing and holding wii remotes.
A man taking a swing at a baseball
A bedroom with bed, chair, table and  bookcase.
Two large trucks traveling in the side view mirror of a car.
Pancake breakfast on wooden table with blue and white mat.
A cow walking by a creek with two swans swimming in it.
A stuffed bear sitting on top of a window sill.
A group of sheep sit on top of hay bales.
A dog wears goggles while sitting in the side car of a motorcycle.
A boat sitting on the beach next to a van.
Two traffic lights facing opposite directions with a street sign atop the same pole.
A red umbrella and chair are by the ocean.
A woman hitting a tennis ball with a racquet.
A bathroom with a pink toilet and pink tile.
A young man sitting at a simple desk with a laptop computer and bed in the background.
a boy swinging a tennis racquet at a tennis ball on a tennis court
A white truck is carrying three motorcycles on the road.
A tennis player is hitting the ball on a tennis court.
A man that is wearing a tie and is standing while smiling.
Someone's hand holding up a glass of wine.
Person of a surf board riding a wave in the ocean.
there is a male tennis player playing on the court
Some giraffes are walking around near some bushes.
A group of people traveling uphill on a snowy mountain.
Four people sit at a table full of pizza.
A street with many cars and busses in a city
Plate of food with mixed vegetables and a side of meat.
The motorcyclist is happy to be on the road.
A young girl smiles as she holds a cell phone.
The person in an apron is arranging boxes of fruit.
A bird sitting on top of a tall metal weather vein.
Thee zebras graze in the middle of the zoo.
The tower has a clock displayed in order to tell time.
a jet that is parked on a runway
a man standing on a red boat out on a large body of water.
a cat looks out of a room while on a step
a white cat is sticking his head out of some iron bars
A pitcher gets ready to throw in a baseball game.
Two two birds are sitting on a rock.
Five giraffe stand around a pole eating hay.
a baseball player holding a baseball bat inside a stadium
A woman holding a sign sitting on top of a truck.
A truck full of luggage has the hood opened
Motorcycles are going around a track leaned over.
A drink cooler with bottles of water, juice, and soda.
A train on one of two of the train tracks.
Indian woman selling bananas while others look at stand.
a female tennis player in a white dress is playing tennis
A baseball player standing on top of a field.
A vase of colorful flowers sitting on a table.
A white toilet commode sits on a tile floor.
Assortment of fruits with pastry and beverage displayed on table.
A skiier slides down a snowy mountain on his board.
An orange van with vehicles behind are sitting on the road.
a little grey teddy bear with a missing eye sitting by a tree stump
A teddy bear in a top hat and bow tie with the message "Me To You"
A couch sitting next to a white fire hydrant.
A man with a frisbee in his hand in the woods.
a few people that are playing with a white  frizbe
A person is choosing produce to bag at an outdoor market
A stop sign in Arabic, in a desolate location.
A couple of women sharing a toast at a table.
a black and white photo with a bench grass and trees
A house renovation showing an unfinished room next to a kitchen.
two buses moving on the street besides residential houses
An alley with a person on a bike and a girl walking.
A pita is topped with onions, carrots, and bacon.
A toilet has been fitted with a system to potty train a cat.
A cute dog has large pink ears and eyes.
Big green monument with a clock on top.
Someone going down a hill on a pair of skis
A white sink sitting under a bathroom mirror.
A train going by a platform in a train station.
A couple of boxes filled with hot dogs and fries.
a group of bike riders going past a yellow bus
Different types of luggage trunks stacked up together
a kitchen with a stove and a glass door
The sink and counter is in the grouplab.
A man in a gray hat and sunglasses on a cell phone
an image of  a man posing with surfboard
A baseball player standing next to a woman.
A double decker bus pulling up to a bus stop.
Four young skateboarders are holding onto the back of a bus.
A long train traveling down tracks with rusted cars.
A display rack of a variety of tools in packages.
A bird in a dark room perched on a stack of books.
a television is turned on in a living room
A black and white picture has a posing crowd.
There is a phone on top of a calculator
two black and white clocks a tan building and a white and blue bus
A motorcycle stopped on the road during nighttime in the city.
The two men stand next to each other looking out on the beach.
Two men standing on a sandy beach holding surfboards.
a man uses a bat gets ready to try and hit a ball
A television that is on with a white man talking and campaign signs
A gigantic size pizza on a table in front of a woman.
this is a truck driving over an overpass
A woman with her dog are seated on a bench.
Two cows in a grass field with a blue sky and clouds in the background.
a bridge over a body of water near a building
A yellow parking meter on the side of the street.
The couple is sitting at the table talking with friends.
A person riding a horse next to a big black and brown dog.
a number of people in a small boat with a car
A woman standing between two cows on a field.
This refrigerator has a monitor on its door.
A bus drives through a street with an arch.
A sink and bath in a small room.
A child with stuffed animals in the background
People playing a game with a Frisbee outside.
On that point are a bunch of individuals celebrating.
a man about to take a swing at a base ball.
A small plastic container of rice and vegetables with a few crackers.
A woman cutting the hair of a boy who's sitting in a toy airplane.
A variety of cookbooks stuffed into and around a microwave
A baseball player swinging a bat with a catcher and umpire behind him.
A group of skiers trudging up a snow covered hill.
Smoke billows from two smoke stacks of a steam engine boat.
Two Indian men decorate two different birthday cakes
A person riding a yellow motorcycle on a track.
TWO KIDS ARE PLAYING IN THE ROOM
A bird that is sitting by some water.
A large dog holding something yellow in its mouth.
A bunch of birds that are standing in the grass.
A police officer who is sitting on a motorcycle.
A large metal clock and some bright lights.
a very large pizza that is on a wooden table
A brown bear and a white bunny sitting next to each other.
a clock with religious icons painted on a wall
The surfing board is on the sand on the beach
A boy and his dog are playing in the snow.
a man uses a knife to chop up some carrots
A large jetliner sitting on top of a tarmac.
A man wearing a hat carrying two lamps in a field.
A group of people stand in a dimly lit area between roads.
a stop sign and street sign on a pole
a horse is standing with his owner next to a tree.
landscape of a snow covered field and mountains
A man in the park with a frisbee.
Group of double decker buses on road near crane.
a photo of a man in the credits of a film
A large blue train going down the rail road tracks
This is the front of a mobile library.
Black and white of man crossing to old style building with clock tower, possibly in Cuba as cars 1950's vintage.
A lone zebra grazes on grass in a pasture.
Two gentlemen discussing something being viewed on one of their phone screens.
A man sitting on top of a couch holding a game controller.
A man is in the water with a beautifully painted surfboard.
A living room complete with a couch sliding door and a window.
A view of a downtown area, looks very rural.
A bison and her babies walking through a field.
Red Light at a street intersection with people present on the corner
A man and woman having a drink on a docked boat.
A woman and a baby are looking at laptops.
This is a boy on skateboard about to go down a ramp
A girl in yellow dress eating a piece of cake on table.
a very nice draw showing a vase with flowers
An oven is shown with all of the burners in use.
Two young girls making pizza on a counter top.
A bathroom under construction with a white tub next to a toilet.
A person walking a dog on the beach
a beagle with its tongue sticking out standing by a water bowl
A stack of luggage by a curb and parked car.
A person that is in the water doing a trick.
A guy in a white t-shirt rides on his skateboard.
Airplane being loaded sitting on the tarmac at the airport.
A jumbo sized stuffed teddy bear waits on a wheeled dolly.
A man in competition gear on a red snowboard going down a hill.
Seven circus elephants,  on their hind legs, leaning on each other,  with a standing elephant in the middle of the line.
A female tennis-player with her racket in-hand in front of a crowd of onlookers.
Three sheep in a field of grass near a steep hill.
A man is surfing on a wave in the ocean.
Two blue suitcases right next to each other
A brick wall and several warning signs nearby.
some slices and pieces of yellow bananas on a towel
A row of luggage sitting on a wooden floor.
A giraffe licking a fence post while standing in a corral.
A bathroom with a toilet, television and bathtub in it.
a bus that is parked on a very large hill
The woman is holding a teddy bear in her arms.
Four cows in a pen on a sunny day
A baby elephant stands near its mother.
A person with a hat sitting down with an instrument between their legs.
Large assortment of traffic signals in outdoor area.
A plate of noodles, beans, broccoli and an egg roll.
Three Zebras standing in front of a gate.
A man with a surfboard walking across a bridge towards the ocean
Snow skiers enjoying the slopes in the mountains.
Partially eaten donut with glazed topping on wax paper
a cat that is  on a couch and lap top
a woman riding a bike down the street
A plate of food containing meat and vegetables.
A man sampling donuts and ice creams for a birthday party
There is a computer on the work desk.
A large jetliner taking off from a runway.
An older man holding up a handkerchief with an image of a woman in a bikini.
A room with furniture and a fire place.
Adult woman walking on sidewalk near yellow fire hydrant in city.
three people in Japanese clothing, two are carrying umbrellas and all are wearing sandals and they are walking past parked bikes.
A baseball player attempts a slide as a catcher and umpire look on.
a man holding a brush standing in a room
Trays of food that include couscous, apples, and raisins.
a cat in a blue hat is laying down
A large orange cat sleeping on a pair of shoes.
a person riding a skate board at a skate park
a sign showing no birds allowed while a beautiful bird stand there
A piece of cake in a plastic container next to a large cookie.
A man in a blue shirt preparing to throw a Frisbee.
A plate of meat, bread, and vegetables on a table.
A group of people standing around a chicken coop.
a man at the beach holding a surf board
A room with wooden floors and white walls
A young woman sitting on a city bench talking on a cell phone
A GPS device on top of a counter next to a book.
A person surfing in shallow waves near the shore.
A youth baseball team is grouped together for a photo.
A LOT OF PEOPLE WALKING THROUGH A BUSY SQUARE
A train with one of its doors open.
A man sitting down at a table using a computer.
Olive green vintage military truck, six wheeled.
Sculptures of zebras stand in the brush and grass.
A clock tower lit up at night  with an array of bells at the top.
A white plate of food that includes an artichoke and bread.
a close up of a vase with flowers on a table
The baby is helping its mom on the internet.
A laptop and books sitting on white sheets on a bed.
A man laying on top of a bench on a dirt field.
A new tv on top of an old tv.
people standing in line beside a food truck
A skateboarder rides the rail in an urban area.
An Asian building with a satellite dish on the roof.
A bowl full of something that appears to be nuts, which can be eaten with chopsticks.
Two people are pictured standing in front of an apartment building.
A train traveling down train tracks next to a forest.
The bathroom only has a broken door, a broken toilet, and a broken window.
two people playing with a dog on a leash
A metal bowl containing five oranges in sunlight.
An indoor fruit market with citrus and tropical fruit.
A living room that has a couch and television set on a table.
A beautiful woman standing next to a man holding a Nintendo Wii controller.
A scenic view of Big Ben in the evening hours.
A wooden table holding various bowls and food.
The dog lies down next to the parked motorcycle.
A red and white fire hydrant was given eyes.
a woman in glasses sits in front of a laptop
Surfer riding a large wave next to a platform with people standing on it.
A chair is outside of a window that a woman is cleaning by a bathtub.
a living room with a lot of chairs and  a big entertainment center
Stairs that have some fading green paint on them.
A man with a meal and drink at a round table.
A pizza with various toppings is sitting on a wooden slate.
Two coffee mugs by an orange juice and a juice glass.
A group picture of young men and women at an event at night.
The group of people in business suits is standing beside a large poster.
A smiling man poses with a healthy cow
A large commercial airplane taking off for flight
A woman holding a carrot in one of her hands.
A woman taking a picture of herself in a mirror.
a close up of two pots of food on a stove
The hand is holding a controller for the video game console.
A hotel with a large blue pool lined with lawn chairs covered in umbrella.
Two pieces of fruit, an apple and an orange.
A tennis player in action on the court.
A woman is shearing the coat off of a sheep.
The woman in the mirror is taking a picture of herself and the dog.
A flock of birds sitting on top of a tree.
A woman with an umbrella on a bicycle
Two bulls are resting on the sand next to a boat.
A man with a dog plays a game on his Wii.
A man snowboarding down a hill while wearing a coat, goggles and a hat.
A man is standing near a car with some luggage while another stands near by.
Someone's shoe stuck to a stop sign in the city.
A large clock sitting on a sidewalk in front of a brick building.
A man and woman are playing an interactive video game with controllers.
a small yellow Cessna plane flying on a clear day
A monitor, keyboard, coffee cup, and plastic bottle sit on a table.
Two polar bears in the snow surrounded by trees.
a bunch of cattle are standing in a grassy field
Two pieces of bread with sauce on them next to a bowl of chicken salad.
A smiling man eating at a table with people behind him.
A stove top oven sitting next to a mixer.
a cat lying on the floor in front of a mirror
Four giraffes standing in the grass in their enclosure.
A small engine plane making a slight right turn over farmland.
An older man is on the soccer field with the ball.
The two sheep are enjoying their time in the hay.
A skier approaches a huge snowball at a ski resort
A man doing a belly flop onto a bed
Boats floating on top of a large lake.
A surfer is surfing the waves in the ocean.
A man standing on his surfboard riding a small wave
there is a male surfer that is seen riding a wave
A wooden shop stand loaded with drinks and food.
A large tanker truck driving down a road.
A girl trying to fly a kite with a face.
Standing man and woman near a dining table full of food.
A group of people hiking along a mountain line on the peak.
We are looking down on to a small bathroom.
A parking tole on the side of the street with snow on it
A passenger bus stopped in front of another passenger bus ready to pick up passengers.
A very intricately designed old tower clock, with people coming through the arched doorway at the bottom of the photograph.
a small teddy bear dressed in clothing
A bathroom scene is pictured in this image.
Two sheep are lying on the ground under a tree.
a number of luggage bags on a cart in a lobby
A white cat sitting under an open umbrella
A kitchen with woodwork cabinetry and pendant lights is displayed.
A blue couch with a bunch of pillows on it
A bus stop next to a curvy road surrounded by traffic lights.
A man surfing rocking waves in the water.
A blue eyed dog panting as he walks by.
A team playing baseball on a baseball diamond.
A baby in grey shirt sitting on a toilet next to a tub.
The bird is perched on the gate by the mountains.
A woman wearing a red shirt blow drying her hair.
An airplane sits on the tarmac at an airport.
a woman looking at her phone in a crowded area
a room that has a couple of different computers
A man holding a slice of pizza up to his mouth.
A crowd of people standing underneath round lit orbs.
a man made pond inside some kind of enclosure
A man flying through the air while riding skis.
a green motorcycle is parked next to another one
A sign cheering on the Colts sports team.
Two women on an island pose for the cameras.
A pizza sits inside of a box on foil.
a bat and a ball on the ground next to a flower
There are seven chairs around the round table.
A person wearing a helmet riding a skateboard while a person stands in the background looking off into the distance.
Two skateboarding decks mounted on a grey wall.
Two zebras in an enclosure walking a dirt path.
A large room has a stone fireplace with candles inside.
A group of giraffe standing around a tree.
A pair of giraffes looking opposite directions in a forest.
A basket with a stuffed teddy bear hangs outside.
People sitting at a table with laptops and books.
A picture of three computer screens with two on.
A young girl eating something while wearing 2 different shoes.
Two men are working in a commercial kitchen to cook food.
a man with a bike sitting on a bench in front of some trees
A man in thought sitting in front of a laptop with a pen in his hand.
a close up of a child eating a banana
People standing at a park with a large yellow kite.
A foyer of a home that is leading to a dining room.
A black bear relaxing on a hammock supported by chains.
Old blue bus with bicycles parked on roadway near green space.
A man lifting up a lid on a toilet in a bathroom.
A group of people playing Wii and smiling.
Two birds are sitting on the ground together.
Several boats are docked in a harbor.
A sign advertising a reptile sale in May
one zebra drinking at a pond and another standing
A rain covered street filled with heavy traffic.
A large decorated bus has a couple of folks standing by it.
A close up of a doughnut covered in red, white, and blue sprinkles.
A purse sitting next to its contents on top of a table.
A zebra standing in  a forest next to a large boulder.
A bowl of fruit on a table next to a letter and a plate with two tomatoes.
A bear in the arms on a heart pillow.
There is a laptop sitting on a computer desk
a lady taking a picture of a long horned mountain goat
A railroad train heading toward a traffic light on the tracks
a line of people that have horse and wagons
A skier stands posing on a flat area in front of the lodge.
A young man holding a cell phone with two hands.
A big sign telling people to stop eating animals next to a building with cars parked outside
Fire hydrant on a corner with a smile painted on it.
Youngster on a skateboard, trying simple tip up stunt.
A woman getting food out of an oven while another woman stands by.
A small city bus with advertising on the side and back
A hotel bathroom with focus on the toilet.
A yellow yawning dog laying on the ground
A black bear is crouched down in the water.
A couple of men standing next to each other on a  lush green field.
Flowers are placed in a vase covered in shells.
a little boy with a suit, tie and glasses
A wooden shelf holding a microwave and small refrigerator
A giraffe sitting on a grassy patch of land.
A man sleeping in a bed by a dog and remote controller.
A jetliner sitting on top of an airport runway.
A man is standing talking on the telephone
Two skiers are standing at the top of a hill.
A table that has a pink hardcase carrier on it, along with several smaller containers.
People are walking along a sidewalk with their luggage.
Baseball player in the motion of swinging his bat at the plate.
A woman is talking on a cell phone outside.
some water and a person is flying a kite
A keyboard that is sitting next to a mouse.
The table has two wine glasses, a bottle of wine, and a vase sitting on it.
A man that is on a skateboard on a sidewalk.
A melted looking clay pot sitting on top of a spindle.
A black and white image of three people on a bench
a big lake with some boats out in it
A young man using a laptop computer with a large monitor.
A small bird sitting on top of a tree branch.
An airplane that is sitting on a tarmac.
The living room in the wooded house is empty.
a very clean bathroom with a walk in shower
An old, large clock hanging off of a building.
Two zebras standing side by side to each other in a zoo pen.
Two young boys carrying red and white surfboard.
a big airplane that is parked in the woods
An assortment of muffins are on sale in a Japanese store.
A close-up of a bear swimming in the water.
A young man is playing tennis on a court.
a woman sits on an ornate wooden bed with fancy bedding
Man sitting at table with pizza and beer in restaurant.
Several pizzas are lined up on a table.
A man driving a motorcycle with a sports car trailer and two dogs sitting in it.
a big building with a clock inside of it sitting in front of a water way
A man riding skis down a snow covered slope.
Three men admiring motorcycles in a sidewalk exhibit.
The inside view of a bus and its passengers.
A side view of a train passing through a mountain trail.
Looking down at a partially eaten salad sandwich in paper
A black and white dog jumping up catching a Frisbee.
A smiling little girl hugging a teddy bear.
a tennis player swinging a racket on a court
A man standing in front of a flat screen TV.
The yellow mustang car is sitting on the side of the sign.
A dog sitting on a chair underneath a painting.
Group of people outside and one pointing up to the sky.
Two teenage girls wearing hats are smiling for the camera.
A plate and fork with toast and vegetables on a chair.
A group of giraffes on a path near a few trees.
A laptop computer, some speakers, a cellphone, empty pill package, and bowl of chili are on a narrow table.
A man is sitting in the grass holding four cell phones.
an asian man pitching the ball during a baseball game
Young baby in crib laughing with bear
a person in a scarf and suit sitting outdoors on a bench
A street sign with two signs hanging off of its sides.
a teenager playing in a skate park with a skateboard
A police motorcyclist with a flag is riding while a large crowd watches.
A man doing a trick on a skateboard off a rail.
A person on a surfboard in the water.
The bed has yellow sheets with sheep on them
Two urinals next to each other in a bathroom.
A motorcycle is parked in front of two garages.
A person in skis stands over snowy ground.
A little boy playing a Wii game in his living room.
A man in a black shirt plays on an ocean wave.
A small kitchen that has small kitchen appliances.
a man and a woman playing tennis on a tv.
Bowls of lettuce, pepper, chapati and other foodstuffs
A dog is laying on a pillow holding his toy.
Two male tennis players, one has on a white hat. They look like they were mid conversation.
Many people are dressed as zombies covered in blood.
A white bathroom toilet sitting next to a urinal.
A dog sits on a couch with pillows.
A young boy is playing frisbee in a park.
a closeup of a person's hands as it plays with a Wii controller in front of booklet with Mario and Luigi characters
a note sitting between a couple baskets of oranges
A couple of men standing next to a man and his brown horse.
A bathroom with a hole in the ground for a toilet.
A man behind a cashier holding a red pen
We are looking at a simple clock tower.
A man eating a doughnut at his computer keyboard.
A lab with refrigerators and a man sitting nearby in an office.
a CGI photo of an animal sitting on some vegetables
A hodge podge of colors and patterns decorate a bathroom.
A child and an adult are paddle boarding in the ocean.
A parking area with trees next to a stadium.
A person communicating with two phones at once.
A laptop computer that is sitting on desk that has a lot of clutter on it.
A young girl smiles brightly over a chocolate birthday cake.
A person putting their foot up to a skateboard.
A person's hand pressing a button on a WiiMote for the Nintendo Wii gaming system.
Man walking on beach near ocean carrying surfboard and holding para sail handle.
A photo of an old white building with a clock tower.
A large grey airplane flying through the sky in the daytime.
A collage of photos including a restaurant, waterfall and a rose.
a baseball player holding a bat in the batters box
A man riding a skateboard on a street next to a park.
A mother and her baby zebra grazing on dry grass.
A woman petting a giraffe that's leaning over a rail.
A black laptop sitting on a desk next to a remote controller.
A plate containing three sandwiches, fries, and ketchup on the side.
A building that has a clock on the front of it.
A parasailer on the water with sky line in the far distance.
The group of zebra are eating and there are small birds in their pen.
A long-haired man with a beard is wearing a suit and tie.
Cross country skiers travel through the snow during a race.
A plate filled with cooked vegetables and meat.
a person jumping a skate board in the air
A bedroom with a day bed next to a window.
A giraffe comes close to a visitor in its enclosure.
A laptop computer sitting next to a computer monitor.
The buses and trolley cars run on the same street.
Apples and pears are in a box with grains and a bowl.
People are marching down the freeway with a banner.
a man is sitting with a laptop box on him
A full view of a flower vase with drinks and cups.
a baby giraffe stands in an area with some birds
A man wearing a silver tie near a clock.
A small bathroom with tiles appears clean and organized.
An airplane engine is seen passing a mountain in the distance.
A couple of brown horses walking down a street next to buildings.
A woman taking a bite of pizza in a restaurant.
A group of people use paddles while standing on boards.
A baseball game is being played on a dirt and grass field.
A dust plane is pulling sharply up into the sky while leaving a trail.
A man on a horse monument in front of a building.
A girl with a bun is sitting on a scooter type motorcycle.
The stop sign is clearly visible for all of us to see.
Advertising image with writing backed by bags of oranges.
An umpire signaling safe at a baseball game as a man slides into the home plate
A person and a child ride on a skateboard in this black and white photo.
a man with a tennis racket is running on a court
A plate contains chips and a sandwich.
A man in a suit standing next to a control board and computer.
A bus travels down a busy street  in a crowded city.
a traffic light with two street signs on the top of it
A picture of an outdoor area that looks great.
A man giving a thumbs up behind a computer screen
Men keep watch on a herd of goats.
A group of elephants gather around in a field.
A street post with street signs and lights on it
A plate of food with broccoli, radishes, rice and chips.
The cat looks through the door that is cracked open.
A man in snow shoes and his dog on a snowy path in the woods.
a building is shown with a big clock in it
Five stuffed teddy bears sitting in a row.
A group of people standing together under an outdoor hut.
A clean kitchen has dark brown cabinets and white appliances.
A person is riding a horse on the sandy beach.
a couple of stuffed animals sit next to each other
A variety of vases are shown on a table top.
This is an image of a baseball game with players at home plate.
A man sitting on a bench next to potted flower.
Two people on bicycles riding in street with signage in the foreground.
A salad made with yellow pepper strips and green sprouts sits on a square white plate.
The adult elephant stands near a large toy ball.
A skier with a backpack pauses to enjoy the view
A baseball player prepares to swing at the ball.
THERE IS A HOT DOG INSTEAD OF A BREAD HOT DOG BUN
Two men sit atop motorcycles and two men sit in sidecars.
A surfboard sitting in the sand on the beach.
A cell phone sits beside a small crocheted change purse.
A collection of green vegetables sits on a table.
A toilet and shower-bath combo in a small restroom
A room with a television, couch, chair, tables and potted plants.
Three oranges sitting on a dark black surface.
A black cat laying down on top of a refrigerator.
A book opened sitting next to some mushroom ornaments and a vase.
A very narrow busy road of shops with a lot of people.
Some teddy bears hanging from chains on a sale rack
Zucchini, summer squash and broccoli are mounded in baskets.
A large flock of birds fly through the sky.
A baseball player is holding his bat, and blowing a bubble.
A tennis player readies herself to receive a serve.
a tennis player hitting a serve on the court
a round table of people with drinks and a cake
A snowboarder and several skiers at the top of a run.
Several people around a boat on the beach with an umbrella shade.
The giraffes were outside the building in a pen.
A couple of people sitting on a bench.
A woman in a dark blue jacket playing Frisbee.
Zodiac on back of large boat in a lake.
Young people painting a mural on a traffic divider.
A woman street skiing with a helmet on putting on her gloves.
A closeup of several ripe bananas clustered together.
A woman wearing a bandanna and ugly sun glasses.
a family in a small row boat in a river
A pile of trash sitting on a boat next to an umbrella.
A man holding a banana over his face.
a kneeling woman taking a photo of her black dog
Woman eating a hot dog while walking down a street.
a white toilet two rolls of toilet paper and a phone
A large airplane is on a runway with clouds in the distance.
Child at bat in Little League baseball while teammate watches from first base.
Two chairs and a small birds below it
this bathroom has white sinks and black counters
A line of people sitting on benches in a courtyard.
A large concrete skyscraper on a sunny cloudless day.
A hand holding a PDA with an illuminated keyboard.
A man sitting in a motorcycle poses with his arms outstretched.
A man balancing a bike on a bench.
A trash can is sitting next to a lowered curb.
Dog laying down partially covered by a comforter.
A yellow double-decker bus next to a traffic light.
An empty, clean toilet stall with a stack of toilet paper.
A group of women sitting on the floor eating food.
a dog looking out a window with its reflection in the mirror
A blue and white street sign above fence and water.
A baseball player balancing the ball on his left hand.
A close up of a man petting an elephant.
A murdered monkey's head sitting in a white bowl next to bananas.
there is a cup of coffee and a half eaten sandwich on the table
A kitchen has a plain white fridge in the corner.
a plate of french fries, two sliced sandwiches, and a pickle
A man using an outdoor oven to cook a pizza
The sink is on the island of a large kitchen.
A cat is standing in the corner of the room
It is dusk, and the skiers have abandoned their skis and snow boards for social interaction.
A little girl cutting a piece of paper with blue scissors.
A group of people who are walking with umbrellas.
The flowers in a vase are dying.
A woman with short brown hair getting ready to bite into a hot dog.
a dog is floating on top of a water.
A baby and a young boy are inside of a rolling suitcase.
A black and white street sign that reads "end bird."
A teddy bear sitting on the edge of a toilet seat in a bathroom.
A woman is placing a flower into a cake.
A unfunished bed in the corner of a room.
One boy watches as another kid performs a skateboard stunt
A puppy cuddles with a shoe on a couch.
An adult giraffe extending its tongue over a fence.
a couple of planes flying through the air
A bathroom with a white toilet and window over the toilet.
a person in a red jacket skiing along a path
A living room with a large book shelf filled with books.
a bed room with a neatly made bed and two lamps
Three people standing near a table with several glasses on it.
A small, white dog laying on a bed with a stuffed toy.
a toothbrush on a table with a bunch of scissors
A pedestrian walk light is lit up on the corner of West 3th St. and Seventh Ave.
A man placing a tie on a woman's neck
A man flies through the air on a snowboard
A little kitty on the bed using a laptop.
Evening view of traffic light intersection with cars with headlights on and a building and trees.
A close up view of a hand on a keyboard by a monitor.
Here is an image of zoo animals.
A tire sitting on top of a green fire hydrant.
A subway sandwich on top of plate and napkin.
A group of people riding on the backs of horses.
Clock tower ascending into overcast sky from buildings below
A family of four playing a Wii game.
A lot of sheep eating grass in a ranch.
Many people are scattered together at the air port.
A plate with a drink and a variety of desserts.
A large pizza on a plate on top of a dining table.
The adjacent farm land hills attest to the height of the soaring kite.
A small bird is perched on an empty bird feeder.
A shot of a desk with two computer monitors with a teddy bear on top of one of the monitors.
A cheese pizza is on a tray with pieces missing.
a couple of zebras watch a giraffe walk through the grass
In the evening a large amount of open umbrellas are together.
It is almost like the dog is flying in order to catch that Frisbee.
A pile of busted up toilets and sinks laying on the ground.
A person wearing a helmet and riding a motorcycle.
A cat laying on a couch in a room.
A young skateboarder performs a trick on the stairway.
two elephants in a field near a tree
A child sitting on a wood bench typing on a toy-like laptop.
A man who is looking at his cell phone.
A large plane with propellers high up in the sky.
A double deck bus driving down the street.
A bunch of people heading to a big plane.
A youth baseball player throws a baseball outside.
A horse that is enclosed eating grass during the day.
The young man on the skateboard is practicing his tricks.
A hallway with piles of luggage and other things.
A cat under a table on a wooden floor playing with a jar.
A very tall clock tower towering over a city.
Seven people are posing for an old time photo in a large kitchen.
A laptop computer and a desktop computer sitting on a desk.
A cat sitting in a flower pot with no flowers.
A bike and a large pile of luggage sitting under a sign.
Two giraffes with dried grass and trees in gray light
A large red stop sign on a street.
A train that is going by a train stop.
A woman wearing blue crosses the street on a bike.
A bathroom with sinks, mirrors and a towel dispenser.
Several people on a beach one is parasailing , one has been wind surfing , and some are gathering up a picnic.
A woman and her son using an old iMac computer
A herd of sheep laying down next to each other on grass.
A couple of men sitting next to each other.
A large white double decker bus parked at a bus stop.
A table is adorned with red, yellow and green fruits and vegetables.
A picture of a sidewalk in front of stores.
close up of a toilet that looks like it is smiling
a fire truck at an intersection resting on its side
A couple of people carrying luggage through the snow.
A bird perches beneath a multitude of clocks
some baseball players playing as people watch on
The couple are posing for a picture while he is brushing his teeth.
Two men riding an elephant driven by a boy.
Wide angle view of a girl in a living room watching television.
A bathroom with the door opened to a toilet and separate sink and vanity area.
Two people sitting back to back on a train.
A red double bus is traveling down the road.
A man and woman on a couch playing the Nintendo Wii.
Two zebra standing next to each other on a hill.
A WOMAN TAKING A PICTURE OF CAT IN THE BATHROOM
A family open Christmas presents near a Christmas tree.
A sleeping woman cuddling a cat in bed.
Pedestrians walk on the sidewalk of a busy city
A man is sitting by a river and brushing his teeth.
A clean and bright kitchen with hard wood floors.
A professional baseball player holding a bat on the baseball field.
a bird in a tree branch with green leaves
A plate with stir fried noodles, broccoli, beef and carrots.
a couple of bowls with some fruit in them
Two people on horseback are posing while the horses gallop on a beach shore.
Man standing on shoreline by ocean holding surfboard
A large white tank sitting on top of a green lawn.
A white table with two laptops and a bag on it.
Two hot dogs on buns next to a glass of water.
A pizza topped with cheese and meat on a wooden table.
A photograph of a kitchen in the day.
A group of giraffes standing around their enclosure
A bedroom with a large bed with a white comforter.
A boat floating out in the ocean next to a  shore.
A man on skis walks on the ground.
a bedroom with two big beds covered with green blankets
A man holding a boy's legs learning to surf
A male surfer performs a stunt in the ocean
A bear reading a Christmas book in four separate shots
A pizza with pepperoni and sausage sits on a baking pan.
A group of kids that are sitting around a table.
A bathroom with a white toilet and white tub.
A motorcycle sits parked near a curb where two people are walking.
Motorcycles standing in a row in a museum.
Picture of an exterior place that looks wonderful.
A toddler eats cake with his hands in his high chair.
A thin leafy green tree branch with many oranges.
There are many cows on both sides of the road.
Two zebras are standing close in a field.
A CD case is sitting on a bench.
A cat laying in a bathroom sink, looking at the camera.
People browse and relax in a wine store.
a man sits on a bench while petting his dog
Yellow fire hydrant with a blue top sits on sparsely cut green grass.
A couple of giraffe standing next to each other near  a fence.
A person with their feet on a coffee table in a living room.
A man sitting in a motorized raft in the water.
in a barren field a herd of zebras move about. 2 seem to be fighting
A smile white dog by a bike on the road.
a close up of a person playing nintendo wii
A multi layer platter filled with different types of cup cakes.
A woman with pink hair riding a motorcycle.
A plate with different vegetables and bread on it.
Group of four zebras standing in a field of grass.
A man holding a Frisbee about to throw it.
The baseball player at bat is hitting the ball
Three different road signs are stacked on top each other, as a man on a bike approaches.
Two children are on surfboards in the water.
A woman is seen in the kitchen cooking on a white stove
A bus sitting on the side of the road.
A man riding a skateboard down a street next to a  tree.
A dog that is wearing a dog collar smiling
A small child holding a remote and a remote controller.
A person on a snowboard riding it in the snow.
A purple bird perched on a tree branch.
Double decker bus that is blue and green
A man with a playful look standing by a dessert.
A skateboarder performing a trick on his skateboard.
A man on skis on a snow covered slope.
A man riding on the back of a bike.
A man holding something with some beakers on a table like a science experiment.
Three people are looking at their cell phones and drinking wine.
A small child and a baby are lying down together.
A baby is laying down with a teddy bear.
A group of young adults play frisbee in a park.
Car parked in front of a donut store.
there is a baseball game on and a player is preparing to run
A close up view of Italian mini hoagie sandwich.
A close up of a man's hand holding a cell phone on his lap.
A man playing baseball prepares to run after batting.
a herd of cattle on the field grazing
A bed with a purple bedspread on it in a room with a picture on the wall.
Two very large vehicles side by side on a street.
Two red two-story buses are parked outside of a building.
a dish of food some small plates and a wooden fork and spoon
A picture of a restaurant interior is taken through a fish-eye lens.
A young girl standing on a field with a flock of birds.
A man skiing down a hill covered in snow.
A man standing on top of a field holding a bat.
Two bottles of champagne sit in an empty fridge.
A heart shaped cake with bear decorations on a pedestal.
A trash can and a white toilet in a room.
A couple of giraffe standing on either side of a tree.
A made-up bed in a drab-colored hotel room.
Two dogs are tugging on the same Frisbee.
A man holds up a hot dog covered in toppings.
a person wearing shirt and tie and looking up.
A yellow packet sits on a wooden bench.
This photo is shot from a side angle, capturing the dog looking out of the car window.
A dog looking out the car window as seen in the side mirror.
A group of people eating and drinking in a restaurant
A clock tower overlooks the city and tells the time
A hedge row with rock pillars and a blue gate with sheep behind the gate and a mountain in the background.
A ship is coming in to port and is about to be docked.
A red towel hanging in a black and white bathroom.
A skier is posing in skis and with poles.
A woman riding a wave on top of a surfboard.
Children in suits and ties are standing together
The bathroom is clean and ready to be used.
Some green bananas and coconuts are sitting on a picnic table.
Several elephants walking together in a line near water.
A group of baseball player playing a game of baseball.
A fully tiled bathroom with a bathtub and bowl type sink, and a wooden framed mirror.
A crowd of people standing on loading platform between two trains.
A cart by some water loaded with old traveling trunks.
A stop sign is shown behind two trees.
A man on a tennis court about to hit a ball
A furniture store display, with a chair and set out
A couple of people on the snow putting skis on.
A building sitting along side of a street.
a couple of zebras are standing in a field
Women and a child in a boat made of tree trunks
A woman holding a small boy while a man feeds him some rainbow colored cake.
Smiling orange shirted sports fan using cell phone.
A vase filled with lots of different colored flowers.
An oriental temple of some sort somewhere in the world.
A man is wind sailing on the lake.
A woman holds a baby on her arm and both are looking forward at an enclosed area with two giraffes in it.
Two baseball players from different teams holding their baseball caps against their chests.
A shirtless man on a beach with a disc in his hand.
Men and women standing and crouching in front of a door.
A child is holding a baseball bat at a game.
A man is holding something up that says PPK
two people riding motorcycles on a city street
A cat sitting on a towel that's covering a plastic chair.
a black and orange cat in a shoe box and shoes
A small bird on an orange chair back.
An SUV parked in front of a block of businesses.
a person riding a skate board on a street
a stop light a md del line road
A little boy is eating a donut with white frosting and blue candy.
A zebra running through the brush tail swinging
there is a man and a woman posing in a kitchen
An old fashion oven is shown in  dim lighting.
A long exposure photograph of a tattoo'd man skateboarding.
A group of friends gather on a hill to enjoy a day of skiing
A single tulip is seen in a small vase.
Small bathroom with a shower with red curtains on it.
A truck traveling down the street near a fire hydrant.
A hand is near a pizza that sits on a silver platter.
A woman jumping up from a wooden park bench.
A double decker bus waits at a bus stop.
a box holds some gloves and old-fashioned photographs, with ties hanging above
The police officer is riding the motorcycle through the streets.
A silver vase sits on a wood surface with sprigs of silver leaves in it next to a leafy green plant.
A bowl has a dish that contains broccoli and mushrooms.
A zebra bends over to pick up a stick off of the ground.
A bunch of bruised apples sitting on the cement
A big green Ford F250 Pickup truck parked in the city lot
A dinner of a pork sandwich and french fries, with beer as a beverage.
Street sign on pole outside of building with windows.
A window with so Michael light coming inside
A boy riding a skateboard down a hill.
A man with short grey hair talking on a cell phone.
A crowd is shown walking on the street.
a man that is on a skateboard on a ramp
Three mountain goats sit and stand on a rocky cliff.
A woman on a sidewalk against a wall on a cell phone.
Two children are playing frisbee on the beach.
A young man spreads his arms to steady himself in mid air, as he and his skate board soar over the pavement below the concrete stairs.
The two cats are laying on top of the computer desk.
The huge airliner has four engines on its wings.
A double decked bus from behind in front of building.
A tall giraffe is observed by people at a zoo.
A snowboarder makes a somersault on a snowy course.
A child drinking from a bottle in one hand and holding a remote control in another.
A man sits at a table and takes a drink of his beverage.
A flower bouquet in a glass vase and some writing on the photo.
an old picture of a person riding a bicycle
A man stands next to a very small plane.
The yellow fire hydrant is rusted on the sidewalk.
People walking and waiting around a baggage claim area.
a person wearing a suit and tie
A man who is holding a tennis racket.
A black dog laying on bed with a striped comforter.
A microwave oven door with a light bulb on inside it.
Cars driving on a city street lined with houses.
There is a vase with red, yellow, and orange roses.
A cheesesteak with a bite out of it along with someone else holding one.
Two teams compete for the ball during a soccer game.
A couple of neon signs sitting above a bar.
A kitchen stove with a microwave in the cabinet above.
A set of electronics and appliances sitting next to each other.
A pole with a clock on the top of it and a building in the background.
A very up close and personal look at a sugar glazed donut.
A lady tennis player is bent over slightly and off the ground.
A basket filled with items on top of a table.
A stack of plates is adorned with pictures of round cats.
Horses pulling carriages on the sidewalk along the ocean or a large lake
A woman who is swinging a tennis racket as two others watch.
A white vase filled with different colored flowers.
A man posing next to a couple of bikes on a street.
A girl sits between a mans legs on a skateboard.
a man is in the air on a skateboard
A red sports car parked next to a truck.
Three men ice skating in a line while one juggles a Frisbee on his head.
A group of people riding skis on top of a ski slope.
A bride and groom are sitting outside on a bench.
a couple of guys that are standing up with a wii remote
A parking meter on the side of the road.
Empty wrought iron bench outside the house on a tile base.
A cat drinking water from a toilet in a bathroom.
People are sitting and eating in a cafeteria.
A group of three people riding waves on surfboards.
asian woman with umbrella smiles at the camera
A train pulling several train cars full of coal.
a desk with a monitor a keyboard and a mouse
Man serving sliced pizza in brightly lit kitchen.
Planes lined up at the airport arrival gates on a snowy tarmac.
a bunch of kites being flown in the sky
A surfer stands with his board on its back in the water.
A subway car is coming down the tracks
A woman walking on a stone wall near two giraffes and a zebra.
A man standing on top of a snow covered slope with a snowboard.
The fire hydrant is by the building on the grass.
a silver car is parked in a lot
A cat is sitting on the dashboard of a car.
A plate of salad at a table setting with a glass of wine.
Horse statue displayed on stand in park setting with trees and flowers.
A child mixing food in a bowl on a table.
A pair of skiers sitting down looking at the scenery from a top of a hill.
Very finely made vases with painted designs on them.
A man is in a restaurant eating sandwiches off paper.
a close up of a bowl of broccoli on a table
Accident scene with a fire truck tilted on its side.
A room that has a couch, chair, and table in it.
A plate of meat, broccoli, rolls and rice with gravy.
A health hazard sign closing a beach to watersports with a sailboat in the background.
Two men laying on the ground near parked motorcycles.
A small airplane that is flying in the air near the airport
A dog laying on the ground with a pink frisbee in its mouth.
Little boys playing soccer together on a field.
A group of people wearing orange are standing next to a VW bus.
two men standing in a room near two microwaves
Fishing boats docked in a harbor with mountains in the background.
A train can be seen in the foreground and a shipping dock in the background.
A blue ceramic vase with fresh flowers on a window sill.
Tall buildings surrounding an alley way with birds flying over it.
Some people in white overalls working with some metal bars.
A table in the kitchen of a building with screen walls
There is no image here to provide a caption for.
Clean plates, cups, and spoons drying on a towel.
Two people  seated at a table with other people in the background.
Two birds are standing among leaves and sticks.
A snowboarder holding a board while looking at the mountain.
A large wave with some people on surfboards
Someone is surfing the breakers under a sky filled with fluffy clouds.
A sign with a gnome crossing symbol on it.
Two people playing a video game on a large television.
A busy city street with a traffic light on it.
A black and white photo of a cow running in the desert.
Some people are flying a kite at the beach.
A man sitting on the floor with a laptop as others walk by.
An old bus being driven by a bearded man.
a clock is up near a statue of a bird
A man shaving in a large bathroom mirror.
A man stands at a counter with food items.
A skier skiing between poles on a ski course.
two giraffes standing in an open field with their necks crossed
A bus with the windows broken down sitting in the open area.
A girl on a surfboard that is on the ground
A freeway is busy in the late evening.
A garden with yellow flowers on a sunny day.
A child skier is headed down a small slope on their skis.
A group of people stand in shallow water near a wind farm.
A white toilet is shown in an all black bathroom.
A laptop is powered and sitting next to a mouse and a cell phone.
A train that is on the rails in a station.
A man rides an elephant as it crosses a river.
A boy is eating a dessert on a table.
A woman, man and child standing near a food truck.
two hands are holding white video game controllers
A fresh fruit plate with grapes and oranges.
Two female tennis players shaking hands over the net.
A group of people are on a platform above giraffes.
The boy is sitting on his blue suitcase.
A large boat is carrying a smaller boat through the water.
A small clock on a pole in front of a building.
Two beach chairs with towels draped over on a beach.
Lawn area outside a McDonalds, no customers, appearing closed.
Some men are putting lots of bananas into piles
A cat is looking up next to a large television.
A man dressed up in renaissance clothing talking on a cell phone.
Four soldiers and retired officer jointly cutting ceremonial cake.
A cat sitting on the edge of an open car window.
There is a woman that is riding on a bike
A traffic light with a red light and an arrow pointing to the right.
A group of horses stand beside water and grass.
A group of bikers riding down a busy city street
A large cat is laying belly up on the bed
A fire hydrant is sitting on a sidewalk.
A lone zebra standing next to a sheep in an enclosure.
A woman and a man with a surf board on the beach.
A red fire hydrant raised up in the grass.
Tagged animals are grazing on grass in a field
A photographer with his nice camera walking in a dirty road
Person in a black wetsuit and gorilla mask carrying a surfboard on a beach.
A woman sitting on a curb with her feet on top of a skateboard.
A room with a large clock sitting next to a wall.
A motorcycle sits on the side of a building.
A young surfer rides the side of a wave.
A meal of broccoli and some kind of meat.
A bunch of cows in a field with a man standing near the fence.
a vandalized stop sign on a city street near a pole
Old style computer with keyboard and mouse sitting on rug.
A close shot of two separate trains.
A heavy set woman wearing a gray sweater holds a brown teddy bear.
A kitchen with a counter, refrigerator and a dishwasher.
A hotel looking room has another room through the door.
A grass yard that has a large sheep laying down on the grass next to a dog.
A woman and a little girl in blue are making pancakes and another person with her hands are putting on some cheese.
A large brown dog standing on the side of a small road.
A bunch of people who are standing around a table.
A dog is lying on the couch with its head on the arm
A bird sitting on a branch looking away.
There are moving motor vehicles on the road.
A woman playing tennis on a clay court.
A blue and gray commuter bus traveling through a shopping district.
A person sitting at the edge of the surf in a wet suit.
a black and white cat wearing a neck tie
A giraffe is in the wild standing next to a tree.
A group of people sitting around a table.
a suit case on the floor with a hat on it
A little girl sitting on a bed with a teddy bear.
a person in a blue jacket and is rowing a red kayak
A paper plate holds two slices of pizza.
The giraffe is walking beside the chain linked fence.
Two elephants are walking through the mud in a clearing.
A turkey that is cooking in a large roaster oven on the counter.
A man eating a piece of pizza on top of a plate.
A cat sits on top of a laptop computer.
A room of people standing around playing video games
A small black and white dog with its head on its paws
A small beige dog with short curly hair.
No dogs, only teacup poodles OK sign and fire hydrant.
a couple of people are typing on their cellphones
A woman is holding a phone and sitting in a chair.
A pan has fruit and vegetables on it.
A white dog stands on the back of a sofa.
A bedroom scene with focus on a bed and a teddy bear.
Men work on the basket of a hot air balloon.
A hand holding a small orange Japanese umbrella.
a statue of a cat sits next to some scissors
A great view of a street in the picture.
a man is doing a trick on a skateboard
A man standing on a tennis court holding a racquet.
An armchair with a stuffed bear on it on the sidewalk.
One small and large giraffe standing next to each other.
A cat sitting on top of a grey cloth and next to two staplers.
An older man enjoying a variety of pastries and breads.
A couple of fans have painted their faces red in a large crowd.
A person holding a hot dog on top of a bun.
Woman sitting near a table eating a cake.
point of view shot of man using a small urinal in bathroom
Man throwing a disc at a bush park.
A zebra eating grass on a sunny day.
a person riding a surf board on a wave
A vehicle with Melbourne Tigers painted on the side of it.
an orange small van and a white surfboard
A gray remote is sitting next to a black remote on top of burgundy fabric.
A man sitting on top of the snow holding skis.
People are meeting around a circular table.
A train stops at a vacant train station.
a ball player holding a bat and some business men
some people in a room with tables and two are playing a video game
Two women on a park bench looking at a digital camera.
A group of people that are sitting on a park bench.
A red fire truck toy on a table.
A man and woman on their cell phones by an umbrella.
A dog is watching a man ride a skateboard on his stomach.
A carousel view shows the circular, lighted center and several rides, including horses, a giraffe and an elephant.
A man is holding a large pepperoni pizza.
two birds standing together on a rock
A table with utensils, glass, plate of bread and salad, and stones, on a stone patio with chaise.
A group of people standing around each other near a tent.
A group of elephants that are in a field.
there is a flower in the glass vase on display
A tennis player in a blue shirt runs toward a ball.
Two men are skating on their skateboards in the middle of the afternoon.
Cows grazing on the side of a mountain covered in green grass and trees.
A small locomotive engine blowing a cloud of steam.
a couple of people on a tennis court pose for a picture
A person wearing stiletto heels laying in a bed.
A bride and groom exchange a fork-full of cake on their wedding day.
A nerdy woman brushing her teeth with a friend nearby.
There are lots of kites in the sky by the beach outside.
A man throwing a Frisbee on a sandy area.
Two oranges on a cutting board with a zester full of rind on a counter with pot in back.
A TV sitting on top of a wooden dresser.
A dog with closed eyes sitting on a cushion.
a close up of a cat walking on a brick surface
there is a cow that is drinking water from a hose
A woman helping along man put on a tie.
A boy laying on his side typing on a laptop computer.
A pair of dogs lie down beside each other on a bed.
THERE IS A SINK AND A SCREEN DOOR IN THE HOUSE
Four individuals on a basket ball court, one of them holding a tennis racket.
People standing in the sand flying colorful kites.
A large fancy clock on a building showing the time of 12:55pm.
A blurred picture of a laptop and a box of tissues.
A couple of one way street signs hanging on a traffic light.
A bull and two calves block a vehicle from going down a road.
a white plate with meat and a green vegetable on a glass table
A subway train painted with graffiti pulls up to a platform.
A person surfing a wave on a yellow surf board in the ocean.
A dog is sitting on a work bench in a shop.
Crowd of people at public market in urban setting.
A man uses an oar as another man looks on
an adult sheep stands by a tree as some baby sheep look on
An old man with a tooth brush head under his nose, mimicking Hitler
a little girl with her teddy bear sitting in front of a mirror
A group of older people sitting next to each other eating cake.
A police motorcycle is parked at a festival procession.
A guy walking on a field holding a Frisbee.
an image of a person making a video game character
A small bathroom, with only the toilet and sink visible
a person riding a skate board at a skate park
A bulletin board filled with blue pamphlets on a city bus.
A small bird sitting on a thin tree branch.
Two people and their dogs skiing along a trail in the woods.
A living room with red carpet and blue couches.
A man sitting at a desk with a laptop and a coffee mug.
A herd of sheep and cattle standing on a lush green hillside.
A woman in plaid shirt looking at a bird on a ledge.
three baseball players standing around  a base
A large airplane flying high up in the sky.
Sunset seen across an expanse of calm water
A man is holding an umbrella beside a truck.
Stalks in a ceramic vase against a mustard background
A man in white hoodie sitting in front of a leather couch.
The baby elephant is walking with a small object in its trunk.
Male surfer in a wet suit, on a board, about to be overcome by a breaking wave.
Baby elephant with ears spread standing in front of larger elephant.
A photo of stuffed large animals taken through glass.
Two plates full of breakfast foods are next to cups of coffee.
The two slices of toast each have cheese on them.
Two young men trying to catch the same frisbee.
A flotilla of small boats circle around water buoys.
Two colorful parrots perched on a tree branch.
a group of people watching a baseball game
A photo of a yellow and green fire hydrant.
A yellow teddy bear on a little girls bed
A train station with a red,white yellow and blue train pulling in on the tracks
there are many cows that are laying in this barn
White dog sticking its face inside a white toilet bowl.
Two guys in a bar eating pizza and drinking beer.
A white carriage with a white horse carries passengers through a city square.
A man crouched down with a camera next to a small white horse
three laptop computers sit on a table in front of a television playing the opening scene of a Star Wars movie
a teddy bear with a hat placed on his head
A brown horse with pink and black harness stands before a business with a short white fence.
a bathroom with a very dirty toilet and sink
A stainless steel sink is next to shelving in a room.
A small dog is beside a laptop computer.
A long row of buses driving bumper to bumper near trees.
Road sign on wooden pole shown upside down next to white wall.
Two leather clad motorcycle riders on a paved road.
baked round bready pieces of food piled on a plate next to bowls of vegetables and other sauces
A red vase with  dozens of roses sitting on a piano.
A man performs a trick on a skateboard in a skate park.
Snow-dusted evergreens and rolling hills mark the distance, while in the foreground a hunched over skier moves through a dip between two snow-packed slopes.
A dog is running on the beach sand.
A lone horse shades himself under some trees.
A pan with pizza and its cutter on it sit on  a stove top.
A half eaten cake sits on the table with a knife.
Three people water skiing at the same time while folks in another boat watch
A woman in a bar is wearing a tube dress.
A neat and modular kitchen with electronic gadgets and dining table and two chairs.
A clock tower and other buildings in a city.
A chocolate cake sitting on top of a table.
A woman and a man playing an interactive video game.
Some men hugging each other and a person with plates on their head and shoulders.
Several people are sitting around a table having a business meeting.
A woman is checking her phone outside on a fall day.
A young boy who is eating a chocolate doughnut.
A man who is sitting at a bar.
A man wearing a red neck tie and a blue jacket.
A display shows hummus and vegetables on white trays.
A black train sits on the tracks of a station.
Table with food on it including bananas and rice.
A skate boarder is doing stunts on a bench.
A hand is cutting into a large white cake.
An old truck with a broken side view mirror.
A lady with a brown hat and long white socks sitting on a wood bench.
Sun peering through leaves of a grand land scape in distance
A group of friends posing for a picture at a deli.
A large passenger jet on an airport runway near the coast.
A man standing next to a white horse.
A baby girl is using two brushes to arrange an older child's hair.
A young girl is sitting on a bench in front of a rock cliff.
A dog laying on a bed under a pillow.
An adorable grinning girl laying in bed between Miss Piggy and Kermit the Frog.
Coffee and powdered sugar doughnut on a woven cloth.
Two pieces of meat covered with gravy next to broccoli on a plate.
A flock of birds sits on top of a large giraffe.
A statue of man sitting on a bench overlooking the ocean.
a train on a track near a platform with people near by
THERE IS AN INSIDE OF A KITCHEN WITH A STOVE AND A DOOR
A baseball player is holding a base ball bat at the game
A young man riding a surfboard down the rapids of a river.
A red stop sign under a street light.
One man is on skis and another man is behind him as they stand in the snow near a pond as a group of onlookers stand off to the side.
A toy truck sits on top of a table.
The small bathroom has an electronic toilet near the sink.
A clean handicap restroom with plenty of toilet paper.
Two multicolored cows cross the road very slowly.
An airplane in a field with a freight train going by in the distance
An equestrian riding on the back of a horse at an event.
A picture of an old building with two towers and a clock is taken from below.
a computer that is sitting inside of a room
Three young people sitting at a table and enjoying some lunch.
a brick building with a blue sign on it in front of a metal pole
A group of people posing for a photograph at a black tie event.
A man sitting in a chair while working on her laptop.
A man is jumping as he tilts his skateboard to the side with his feet.
A hand holds a ball in a green sign that sits on a post.
a woman stretches high to hit a shot in tennis
a close up of a dog laying on a bed
The people are drinking from cups and smiling.
Two ovens next to a plastic bucket and trash container.
A giraffe enjoying the company of another giraffe.
A warning sign in front of train tracks.
a laptop with some other electronics on top of it
A bench is in front of a flower bed.
Three airplanes are lined up for take off.
A couch is sitting outside on the curb by the pole.
A cat relaxes in a suitcase next to a pile of clothes.
Cooked broccolini and greens on a white plate.
A leaning stop sign has a street sign on top.
there are many vegetables sitting on this counter
Children looking at a zoo giraffe and its baby
a motor bike sits parked on some ply wood
A donation station on the side of the street
A group of vehicles in street area next to a building.
A sumo wrestler is shown wielding a baseball bat and awaiting an incoming pitch.
A bathroom sink with no mirror behind it
A shiny new racket looks down upon the worn shoes of a tennis player.
Three people in the distance riding horses along the beach.
A young skier headed down to the ski lodge.
The beat up car is parked beside the building with a statue of several men.
A large long train on a steel track.
A dog is on the grass playing frisbee.
Two zebras chasing each other in an enclosure.
There is an animal walking along the hill.
A group of long horn bulls in a field.
A man is laying on the floor of a hotel room next to an open suitcase.
A view of a dock with a lighthouse in the background.
a person holding a hot dog with mustard and ketchup.
A man on skis on a mountain trail.
There is a hotdog and a side dish on a plate.
A black and white picture of a jumbo jet parked  on a runway.
A black and white photograph of something I cannot quite make out.
A zebra standing on a field next to lush green trees.
The man is sitting on the bench by himself.
A girl is petting a cow through a fence.
A woman wearing a short skirt kneeling on a tennis court.
Two black, white and orange stand on the grass near a cliff.
the baseball players are talking on the field.
People are walking on the street by a homeless person.
A grey clock tower above grassy area and building.
An elephant standing next to a body of water.
A person trying to fly a kite on a beach.
A dog with a bandana and goggles sits on a red motorcycle.
A pile of identical teddy bears lays on top of some pillows.
A couple of red double decker buses sandwiching a small white bus.
A grove orange trees filled with juicy oranges.
A man riding a snowboard down a snow covered slope.
People and cattle standing at the waters edge on a bright sunny day.
a little boy retrieving a mans banner flags that have broken
an overhead view of people at desks working on their computers
A person in a wet suit running beside the water, holding a surfboard.
A pair of seagulls resting on the top of a lake.
A close-up photo of a piece of broccoli upside down.
A large tall tower with a clock on the top.
A snow boarder in mid trick with the Hilton in the background
A bear and a dog sitting together on a hillside.
A person that is catching a ball in a baseball game.
a guy standing in front a large building holding a tennis racket
Three people sitting on a bench in front of a lake.
A bus driving down a city street next to tall buildings.
A couple of little kids sitting in the grass.
A black plate topped with lasagna and garlic bread.
a bus is driving down a snowy road near the days inn and suites
A vegetable and fruit stand on display at the market
Several bundles of green and yellow banana's hanging around a table.
A group of people who are sitting on couches.
Boy performing a trick in mid air on a snowboard
The people are cooking out and have hot dogs on the grill.
THIS IS A PHOTO OF A SITTING AREA WHERE SOMEONE HAS PLACED THERE LAPTOP
A young child sitting in front of a pizza.
A Siberian Husky dog is being brushed while he lies on the floor.
A toilet area with bright and colorful wallpaper.
A group of people playing a video game on Nintendo Wii.
There are plenty of apples to choose from in this outdoor market.
A woman and a small child watch a train as it passes.
a lock on a door under a window
Two vases of flowers are sitting on a counter top with bears.
A class room full of students and they are on their laptops.
An old stoplight with a clock and a troll doll next to it.
Buses are parked near a field with a fence.
A lady is pulling her luggage through a terminal.
A group of sheep gathers under a tree in a grassy field.
A BOY JUMPING OFF A CRATE WITH HIS SKATE BOARD
A black and white cat sitting on a suitcase.
An older man downhill skiing down a slope.
Large canyon surrounded by a series of trees.
A waiter lighting candles on a cake at a restaurant.
A man with skiing gear on top of a mountain with snow
A man skateboarding on a skate ramp at night.
a person spraying a horse with a water hose.
A man and a woman ties a boat to a wall.
I am unable to see the image above.
A black and green vintage engine moves along tracks by a station.
a see through sun roof cover is being used
A person in a room with knives and scissors hanging on the wall.
A train pulling several carts traveling down the rail road tracks.
A table with various meets, breads and tomatoes on it.
The two ladies is outside  talking in the rain
A woman in a plaid cap cross-country skiing in a group.
A small child with a kite walking on a beach.
a man sitting  in a lawn chair in the snow
A window with a bench is under a staircase.
a sign on a pole advertising free bus rides
a bunch of students stand around on the field behind some school buildings, playing Frisbee
A family who are selling bananas in a portable cart.
a person is holding an old cellphone outside
Girl in a dress throwing a red frisbee.
A group of people in the ocean on surf boards.
A large crowd of people gather in a square with Capitol Hill in the distance.
A plane with a sign attached to it flies high over an ocean beach.
The kitchenette has a stove, microwave, and sink.
A red, white and blue train filled with passengers.
a close up of a vandalized stop sign
A woman playing tennis holding a pink umbrella
a man and woman recline in a bed, each with their own laptop
There is a purse on the floor with its contents spilled out
A bunch of surfboards are standing in a room.
A man and a woman are cutting a cake while others watch.
kitchen with a wooden kitchen island and checkered floor
The two elephants walk next to each other in front of the water.
A bus filled with little monitors displaying video.
A laptop sitting on a couch with cell phone on a table.
A tile floor in a bathroom with a urial.
A jet airliner is on a runway on a cloudy day.
A vase of flowers sitting on a checkered table cloth
Two vases filled with white and purple flowers.
People walk up the stairs to get on a small airplane.
The young child is eating from a spoon.
An end table with a vase, remote, phone, candle and wedding picture.
large tourist clock near a body of water.
A parked motor bike on the side of a the street.
A bowl of fruit is presented with a pitcher of water.
A woman that is hold a device in her hand while standing on a court.
A strawberry shaped cell phone holder hangs from a belt loop.
A silver train traveling into a train station next to a platform.
Two women, one with glasses standing next to a sheep pen
A man is in the park with a hula hoop.
A car that has the front of it open.
A street scene with signs and people on bikes.
A plane is flying in the air nearby a mini van and rental truck.
A bird perched in the top of a leafless tree.
A stream with rocks outside by train, with hills and evergreens.
A picture of a gross looking cheese pizza.
A door is open on a white subway train
View from behind of two women under umbrellas
A man or woman skiing down a snowy hill.
There are sheep grazing together in the grass.
an image of two horses at an outdoor park
A woman and two men on skis on a snowy hillside surrounded by trees
Two women are riding motorcycles down the street.
Two street signs on top of a metal pole.
a brown horse with a brown nose laying down
A hazard yellow Navy plane sits in a hangar.
A couple of people playing Frisbee out side.
Several remote controls piled up on a flat surface
The people are moving across the snowy mountainside.
recovery tow truck towing a bus from a parking lot
A living room setting with two bookcases with books
a small refrigerator on the floor next to a freezer
Zebras are grazing next to a car in a field.
The curious kitten is looking down into the bathtub.
a yoilet is int he middle of a clean bathroom
A young girl puts a TV remote control to her face like a cell phone
A man engaging in a game of tennis on a court.
A clock on a red building letting people know the time.
A bunch of green bananas tops a large bloom.
A brightly painted bus pulls out of a parking space.
Front half of a commercial airplane on a runway closeup with dusky sky.
A vase of flowers sits near a window with a blind.
A street sign in front of an old building in Ottawa, Canada
some elephants are standing around some water
A computer mouse, mousepad, and computer keyboard on a table.
This is a picture of a popular sking mountain.
A woman walking on a tennis court holding a racquet.
A man that is sitting down holding a telephone.
A sand area that has various sets of vehicle tire tracks on it and one beach umbrella open and set up in the sand.
an old man holding an umbrella next to a bare tree
People fishing and enlarge Mountainlake with trees lining the shoreline
A tractor sits on the back of a large truck in front of a clock tower.
A boy feeding a giraffe something green with palm trees in the background
The clock tower sits in the middle of the pavilion.
A sign indicates directions of travel in a circle
An elephant standing in dirt under a tree.
There is a giraffe that is standing at the fence and someone is petting the giraffe
A young dog lies on a freshly made bed.
Three green birds perched on a limb with the sky in the background.
Three good friends having a bit of lunch and drinks together.
a person walking on a city street with signs and poles
a baseball player holding a bat on the field
a group of people on a paddle boat in the water
Some bananas are placed on a cutting board along with some yogurt and a package of creel.
A white toaster in the middle of an asphalt road.
We see a close up of a vegetable ad pasta salad.
A couple of men playing a game of frisbee.
a diced up credit card next to some scissors
A blue and red airplane is flying in blue skies
THIS IS A UP CLOSE PHOTO OF A PLATE OF FOOD
Red traffic light at intersection on paved four lane road.
a cargo truck is loading a train with luggage
Kites flown in large grassy open area with numerous onlookers.
Motion capture shots of a person riding a snowboard.
A large long train on a steel track.
Two black bears being kept in an enclosure
A man snowboarding down a snowy and hilly slope
A female tennis player gets ready to begin play.
One tennis racket is place on top of the other one.
A man on his motorcycle is attempting to mount before taking off.
a man holds his racket out while on the tennis court.
A man in white shirt and shorts playing a game of tennis.
A baseball player holding a bat while standing next to home plate.
A cat lying on carpet with its head on a banana.
A Macbook is placed on top of a book.
Many brown and white cows are in a dusty field.
A woman holding a tennis racquet walking near a little child and a man.
a person holing a hot dog with onions on it
A black and white photo of a glass bakery shelf.
A man on a boat preparing his fishing pole.
A football game on TV reflects in a bathroom mirror.
A baseball player holding a bat next to home plate.
A couple buses are parked in a parking lot at night.
A light brown dog laying on a leather sofa.
THERE IS A BABY THAT IS IN FRONT OF THE REFIGERATOR DOOR
A man is filming something on a cellphone
A little boy holds a toothbrush in the bath.
A photo of a bathroom sinks and tub taken in a mirror.
A cat sleeping on the contents of a piece of luggage.
A laptop, mouse, cell phone and a notepad sitting on a table.
Two giraffes grazing the in wilderness with a mountain in the background
A rusted locomotive on a hot summer day.
A group of men standing next to each other holding snowboards.
A man in a baseball uniform walking on a field.
A train travels along the platform in a train station.
The table has most of the items needed to keep in the repair shoulder bag.
A bird is perched on the branch of a tree.
A metal stove sits under a granite countertop in a kitchen.
A metal refrigerator freezer inside of a kitchen.
A person in winter clothing, a helmet and skis, doing a trick i the air with jos skis crossed.
there is someone holding a remote in there hand
There is a rowboat out on the water in this sepia tinted photo.
A bathroom with a shower and toilet decorated in pink and green.
A  woman in black standing near a bus stop
A black and white motorcycle parked on the sidewalk outside a store.
Lights shine on two matching, white, pedestal sinks.
A man with a dog in his backpack walks down an aisle on a bus
a clock on a building with a sky background
A new silver motorbike parked in a garage.
A black dog laying on a bed under a blanket.
A truck that is parked on the side of the street.
A guy with a broom and dog stands on a surfboard.
A young person laying down on a surfboard riding a wave.
a man and a woman are playing a video game together
A boy is sitting on a hospital bed.
Zebra alone in a field of dry vegetation.
A chef preparing food inside of a kitchen near  a window.
some people on some grass playing frisbee and some trees
A young boy standing on his tip toes playing a game on the Wii.
a surfer laying on their board about to catch a wave
A clock at 7 during a hazy autumn time of year
Two laptops sit atop a desk on either side of a phone.
A man and a woman sit together on a bench.
dressed up toilets in a toilet competition on fake grass
A group of sheep are out in the fog wandering.
A baeball player I l9e standing in a field
A brand new black stove in a primarily white colored kitchen.
A herd of cows walking across a river.
A baseball player slides in to base to try and take it.
The plane is flying very low to the ground.
A parked red and white motorcycle is shown from closeup.
Black British fighter jet doing a barrel roll.
a rusty old truck sitting in an overgrown field
A pair of men looking at a tablet perched on a table.
A pizza sitting on a table outside
a sotre front with bread in the display window
A cat lies on a rug and chews on a banana.
A bed is in a bedroom with two lamps on nightstands.
Woman as seen through window of red vehicle.
Spectators watch as a skateboarder performs a trick on a ramp.
A girl is on her cellphone surrounded by fruit
Two people using cleaning brushes on an outdoor monument.
a lone zebra stands between some trees with a zoo sign in the background.
There is a row of parking meters on the street.
A baseball player throwing a baseball during a game.
Cargo train is traveling on a track next to a forest.
There is a man putting bread on a shelf.
a tennis player swinging a racket at a ball
A large commercial airline taking off from the airport.
Tables of laptops are visited by various people.
A person riding a skateboard through the air on a ramp.
A woman swinging her tennis racket on a tennis court.
A surfer balances on a surfboard in the ocean.
A steeple of a large building, with a clock on it.
An airport runway with a jet airplane ready for takeoff.
A pair of people sit at a table with food and drinks.
A laptop computer on a desk beside a paperback book.
Bowls of soup sitting next to oranges and limes on a table.
A fire hydrant is shown on a sidewalk with a brick building nearby.
A large tray filled with tasty looking food.
a boy throwing a Frisbee at night in a park
a close up of a man with a mustache smiling
A flock of ducks swimming on a lake together.
there is a man in the water and a boat next to him
A very crowded busy street with many signs hanging from tall buildings.
a surfer is out at sea riding a wave
A man on a skateboard passing by large glass windows.
A train is on a bridge next to buildings.
These are special repair vehicles used on train tracks.
Person laying on bed by a window reading a book.
a group of people seated around a dining table outdoors.
An old fashion setup with cakes, candy, and tea.
A skateboarder doing a trick in a parking lot.
A bench with memorial bears and flowers on it.
Up close picture of baseball batter wearing gloves and helmet.
A woman crosses the street while she talks on her cellphone.
Young adult male is surfing and riding the waves.
a surfer walking down the beach looking at the ocean
The young man is practicing on his skateboard.
There is a person holding a Wii remote in their right hand while holding the nun-chuck in their left.
A man riding a wave on a skateboard in an ocean.
A cat is laying on top of something on the side of the road.
A man is talking on the phone while working on the computer.
an older person passing out plates of food to young people
A person on a snowboard is sitting in the snow.
A team of baseball players are posing for a group picture.
A plate of food with broccoli and different kinds of pasta.
A woman laying on top of a surfboard next to a black cat.
A  bedroom filled with bunk beds and a latter.
A few kittens in a bowl in a white void
A small dog wearing a colorful sweater leaning out a car window.
an open pizza box sitting on top of a stove
A donut with red, white, and black sprinkles.
A woman running to hit a tennis ball with her racket.
a baseball player holding a bat in a batters box
Military plane is being flown by a pilot
A cow stands next to a calf inside a fence.
there is a man on the beach that is flying a kite
Three boys playing a soccer game on a green soccer field.
A bowl of corn chowder with broccoli, and a spoon on the side.
an image of a group of people surfing
A man surfing with a photoshopped character on board in front of him
Two zebras grazing in front of a large bird
A zebra grazing on long dry grass in a field.
A man and woman looking at their cell phones.
A dog has a frisbee in his mouth outside
A young man swinging around a Wii remote.
a field full of windsocks and cars parked in the background
A couple of deer standing next to a zebra on a grass field.
A stylistic pot and vase sit on top of a mantle.
Two stacks of towels are laid out on a bed.
A classic car waiting at a 3-way stop sign.
A bathroom scene complete with a toilet, sink and bath tub.
Two women sitting next to each other on luggage.
A dog sits waiting while his owner cuts some meat.
A living room with chairs and a wall of windows looking to a patio.
Room with a lamp on a wooden computer desk.
A woman is reading a book with her head cupped in her hand, as she sits in front of a park.
Two towels hanging over a shower rack in a well-lit bathroom.
Five jets flying in the sky and making colored smoke.
A young child eats a hot dog on a bun.
A train passing by a field that has been cleared.
A cat who is sitting in front of a keyboard.
Snowboarder going down the side of the mountain of snow.
A bird is sitting perched on a branch.
There is a black cat that is sitting on top of a toilet
a little cat standing on the lap of a man sitting in a chair
a kid is riding on a surfboard at the beach
A small dog sits on the driver's seat of a car.
Two toilet stalls in a bathroom with a black and white checkered floor.
A giraffe standing on top of a grass covered field.
Two people siting together on a bench matching .
Several people seated at a table with pizza.
a small white hand held remove control device.
A person driving a motor bike through the sand on the beach flying a kite.
Two frames of a woman with a tennis racquet.
A woman with blue hair and a giant toothbrush
Four guys with game remotes playing a video game.
A photo taken from a plane looking down at the mountains.
An airplane moves along a runway at an airport.
two guys sitting next to each other with laptops
A man jumping to strike a tennis board with his tennis racket
A piece of cake sitting on top of a plate.
A kite surfer flying above the ocean in his wetsuit
Men and women sitting under umbrellas on the beach.
Vehicles at night on a highway near a large hotel.
a double decker bus is parked in front of a store
A large teddy bear sits next to a red wall inside a toy store.
An abandoned red train car in dirt lot.
Two women sharing a plate of breakfast are happy.
A porceline toilet sits outside on a sidewalk.
This is a train that is parked near a building.
A warped photo of an unoccupied bathroom in a home.
The manufacturers box for the Nintendo wii on the floor
Four people carrying luggage turning for a picture
A dog with a purple Frisbee in its mouth.
People standing on a sidewalk near a parked bus at night.
6 people gather and socialize in a kitchen.
A bird flies over the water near an island.
Green salad with broccoli and peas with fork and bowl
a woman laying on a bed with a sleeping cat and poodle
A man posing in his office work cloths
A sturdy, small brown horse looks back as he walks through the hot sands.
A young blonde boy sheers the wool of a sheep.
a group of pictures of the same table with multiple trays of food
a man is standing on a skateboard around people
A fuzzy picture of a man on skis
An old refrigerator is near shelves of bottles.
A herd of sheep standing on top of a lush green hillside.
Two puppies playing in the green grass of their yard.
A teddy bear is sitting outside on a chair near flowers.
a large kitchen with fancy counters and white cabinets
The pizza has more sauce than cheese and pepperonis.
A picture of a group of people surrounded by bananas.
A person in a purple shirt plays frisbee golf.
A stop sign and street sign stand on the corner of a street.
The woman on the horse is racing the course.
A rusted stop sign attached to a school bus
A cow that is standing in the grass.
A zebra taking a drink out of a basin at the zoo.
A white horse pulling a horse carriage down a street.
Cat laying on the floor wearing a tie around his neck
Two white plates topped with french toast and fruit.
A sub sandwich on a white plate on a table.
A woman sitting in a car while her dog hangs out the window.
A sheep in a field overlooking a lake and forest of trees.
A pizza that is laying on a table.
A young boy catches a soccer ball in his house
Laptop and mouse sits on desk in front of computer monitor
A pole with several street signs outside of a building.
A boat floating on a river that runs through a city.
A phone case, with a phone hanging on a belt loop.
A young man standing next to a skateboard.
A man sits holding jewelry near a woman.
A cat sits on the seat of a motorcycle.
A picture of an airplane flying high in the sky.
A pickup truck with a camper is in a parking lot.
A boy skateboarding down the a busy street
A train sits on the tracks by the platform.
a microwave sits on a stands with a vase on it
A food entree is served on a plate with skewers.
A man sitting next to a Wii machine with a Wii controller in his hand.
A building outdoors on a town street near some street signs.
a man wearing a wet suit in turbulent water
A group of cows walking across a grass covered field.
The modified school buses are in a muddy arena.
An assortment of different pottery on elevated shelves
A parking meter in front of building windows.
A woman stands with her green and black luggage.
People with a without surfboards watching a surfer in the water.
a blue vase holding some flowers next to a wall with a border
A clock tower in roundabout next to an ocean.
A half eaten pizza sitting on a table next to stuffed animals.
A street riddled with garbage and people walking, sitting and standing around it.
A woman inside of a room with many items plugged into wall outlets.
A  child riding a bicycle with a lady sitting behind him.
A kitchen table and bench made from a door
A black bird standing in the green grass.
a couple of women take a photo of a bath room
Plane on the tar mat of an airport.
an image of a woman eating food at the restaurant
A view from the street of two traffic lights and a building.
A clean bedroom with a tidy bed and large windows.
A man is on a field kicking a soccer ball.
A CITY BUS IS PARKED ON THE SIDE WALK
Many different fruits that have been organized by types.
A set of three piles of ripe bananas.
a very big bus moving on the street with no people
A newly married couple touching a strange mans hand.
Two business men with colorful ties looking to the right.
A breakfast plate including potatoes, biscuits and gravy.
Two white toilets in a alley with a tiled wall.
A old fashioned colonial dining room hutch and an anniversary clock on a shelf on the wall.
stop light placed near the ground beside a white building.
A man walks under an umbrella for The Bitter End.
The streetlight has several different  colored  lights.
A woman sitting at a couch with two cats looking out a window.
A boat is parked on the side of the dock.
A hand sitting on an open laptop computer.
a couple of people that are playing a wii
Three women are enjoying an outdoor lunch on a sunny day.
A large black and white statue of a cow.
A large building in the background with a clock and tower on the top of it and people walking in front of it down a sidewalk and paved area.
A surfer is bent over riding the wave in to shore.
an elephant in an enclosure at the zoo is walking
A man in a tie and vest looks seriously at the camera.
Several different kinds of vegetables on a black countertop.
A woman stands next to a traffic light.
Eight busses are parked in front of a field.
A white refrigerator freezer combo sitting in a kitchen.
Two older people walking two dogs on the beach with surfers in the water.
The people are dancing down the street with umbrellas.
a close up of a person in bed with a book
A cake sitting on top of a plate with a knife
A lady with a dog is talking to a lady and man.
A toothbrush with round and straight bristles on it.
Four people standing next to a net holding racquets.
A clock that looks like it has melted sitting on the edge of a shelf.
A peach cobbler is made in pizza style.
someone sitting on the couch while they use  their laptop
Two elephants with grass in front of them in an enclosure.
A very attractive and neatly kept bed room decorated in red .
Meat and cooked vegetables served on a white plate.
Man herding some skinny cows in a street.
a tennis player stretching to hit a serve
Bicycles in the bed of a pickup truck.
A black cat sitting on top of a bathroom sink.
a black cat is hiding in a box with shoes
A bouquet of flowers in a blue vase contains roses and large leaves.
A woman is talking on the phone and leaning on he xar
A kitchen with a black stove top oven.
Airplane flying over the top of a White Castle.
A man walking his dog on a quiet country road.
A man poses with a cane and purple hat in front of a woman carrying an umbrella.
A man speaking to an audience in an auditorium.
A long train traveling along train tracks in a train yard.
a hand is holding a silver cellphone against a white background
A flock of birds standing on top of a grass covered field.
A picture of a bench outside by the water.
A dog that is sitting by a computer.
THERE IS A BLACK BEAR THAT S WALKING IN TEH DEN
Man up at bat in a baseball game.
A couple of people pouring a glass of wine.
A bird is perched on a large rock near the shore.
A pot on the range with different types of vegetables.
some cupboards with a microwave sitting on top
A person is holding a doughnut with coconut on it.
The confused man is trying to read the sign.
A brown and white cat sitting on top of a desk.
A view of a sign on the side of a building.
A body of water near a city with ice chunks.
A plate with a chicken breast, ear of corn and broccoli with sprinkled parmesan cheese.
a boy performing a skateboard trick in a skate bowl at night
A baseball stadium with a crowd watching as a man holds his bat and another man throws a ball.
A bus is stopped while three people are crossing.
An adult and a baby zebra are walking through the grass.
A man on a tennis court holding a tennis racquet.
a yellow blue red and silver train engine and some tracks
A small toilet in a wood walled bathroom
A tennis player, playing in a stadium, in mid air.
A tan dog's head poking out from a dark colored backpack.
A bike is propped up against a building.
A professional tennis player walks at the back of the court.
Two horses grazing on green grass in a fenced in area.
this is a close up picture of a roosters neck
A cat that has just come through a doggie door.
A dog in an open doorway with a pile of green bananas in front of the house.
A skateboarder performing a trick next to a bike rider.
A boy in a hat is smiling while holding a Wii controller.
A cat lays between two parked bicycles in a black and white photo.
The batter on the Ray's  baseball team is celebrating a run, giving the incoming runner  his outstretched palm.
The woman is playing a video game on tv.
The woman is flying the kite on the walkway next to the water.
A bathroom with a wooden vanity and large wall mirror.
An elephant with seat on its back standing by a fence.
A little boy running on the beach with a kite.
a desk with a ton of televisions and monitors on it
A dog standing by a truck pulling a trailer.
Boat that just crossed under a bridge on the waterway of a city.
A beautiful young lady standing next to another beautiful lady and a man.
a person riding a horse on a beach
a young child standing in the kitchen next to an oven
Two birthday cakes sitting on table beside each other.
Bright and shiny red motorcycle parked on the street.
Two people sitting on a bench by a tree outside a building.
A man holding a tennis racquet pretending it's a guitar.
A dog is lying on a bed with a red blanket.
Teddy bears of all colors are in a big pile.
Three boys walking along the beach carrying surfboards.
Two bathroom sinks under two mirrors next to paper towel dispenser.
A very cute cat sitting in a corner.
A very big pretty bird by the water.
Two small children ski down a snowy tree lined slope.
Two women hold umbrellas outside a store with a young girl.
Woman in a folding chair with surfboard beside her on the beach.
A small bird perched on a fir tree
Four zebra stand near each other looking at the ground.
A woman prepares a fruit smoothie inside a blender.
a street view of cars parked alongside parking meters on a one way street
A man who is looking at a giraffe in an enclosure.
A white bowl that includes carrots and broccoli.
A white pickup truck is parked in a parking lot.
Several planes are admired in an airplane museum.
A woman walking down a street on a sidewalk.
Two cats perch on the roof of a car.
People are walking down the sidewalk in a storm,
The sheep are scattered to graze in the field.
a young kid stands in front of a granite table with a train on it
A pink cell phone sitting beside a tree.
A woman is milking a cow into a metal pail.
Boy doing a skateboard stunt with feet and board off the ground.
People line up in the snow for pizza and soda.
A woman holding a umbrella over her head.
Group of people watching something with man recording in room
Two large pizzas covered in sauce and cheese.
A billboard on the side of building features a bull.
Two plates filled with hot dogs sitting on a wooden counter next to drinks.
An airplane sitting on the tarmac with several service trucks around it.
A glass table with pink flowers and green plants.
There is a funny picture on the screen of the laptop.
A dining table is set with many different dishes
A bakery shop displays an assortment of cakes in a vintage case.
A girl displaying a sad expression while she eats.
A wooden trunk sitting outside with stickers on it.
A city view shows architecture and people walking.
a cat resting on top of a luggage bag resting on a bench seat
A person holding an umbrella leans out the train door.
There are two computer screens next to a lap top on a desk
An artist's rendering of birds flying past a lighthouse.
A boat is traveling on rough waters in the ocean.
A cobble stone path through a park leading to a bench.
A beautiful red haired lady preparing food in a kitchen.
The guest of the wedding are gathered in a house.
A train that is sitting on the tracks.
A herd of elephants walking across a stony river.
A smart phone is very companct and handheld.
A surfer riding a wave in the ocean, performing a trick.
some snow coming down on some street signs and trees
Several women sitting in front of a birthday cake and laughing.
A colorful railroad train arriving at a station.
A piece of cooked pizza that is on a plate.
there are two street name signs on a street pole
a man standing in the park while holding onto a frisbee
A box that is filled with oranges in the grass.
A view of a sign that reads steep descent on it.
A young man on his skateboard next to a rail.
A plate of cookies, a bowl of carrots and blue frosted muffins
A blue basket filled with bunches of ripe banana.
A woman prepares a large pan of food.
a sandwich with a bunch of mushrooms on a plate
a person sitting wearing a suit and tie
blue and yellow train carts on  the tracks
Multi-colored stuffed animals standing side by side in a shop.
A guy with a pet sits in a parking lot
a man is looking into an oven opening
A herd of elephants walking across a field.
A woman standing next to a man holding a cake filled with lit candles.
Modern bathroom with two sinks a toilet and a shower
A person holding a controller aiming it at a tv.
Sleepy dog guarding two remote controls on the couch.
Two checkered chairs and a clock in a room
He is flying over the steps on his skateboard.
An open box of pizza with toppings on a counter
A truck pulling out of a parking lot onto the street
Slices of vegetable pizza arranged on a white platter.
This is a bride and groom cutting their cake
The front view of a bathroom toilet inside a stall.
a man is watching a television on the floor
A teddy bear is sitting down wearing a bow.
A person is standing in the intersection of a street.
Two men in purple rush to catch a frisbee.
A group of competitive cross country skiiers in a race.
a white bathroom a sink toilet and tub
A person that is looking at something down the street.
The flat bed truck has a huge roll of tape on the back.
a man swimming on a large wave in the ocean.
A cartoon version of a bed and bedstand
A caved in street with a bench in the hole
A yellow train on the track at the train station.
a cat is being fed by it's owner in a bed.
Two young boys in shorts at park with hands raised.
A young boy pulling a pink piece of luggage.
A man who is holding a surfboard and walking in the water.
A young surfer riding a very nice wave.
Four photographs of a man shaving his face.
A painting of a dog holding a dead duck in it's mouth.
Some people walk on the sidewalk near a busy intersection.
Row boat sitting in the middle of a lake by building
A man riding a skateboard down the side of a ramp.
there are many people that are flying kites
A piece of cake with a fork and one and a half apples on the plate.
A young man is riding his skateboard on the road.
this is a man ridinbg down a hill on skis
An elephant crossing the road behind a car that has just passed
there is a pink rose in a glass vase
A couple of men walking with a large elephant.
a train on a train track on a city street
A tennis player getting ready to serve a tennis ball.
A man on a surfboard riding a wave.
Some vegetables in a stew of some sort.
A man riding a skateboard up the side of a ramp.
People seated on a stone bench on cell phones.
A photograph of a giraffe in the wild.
People sitting at a bar with a lady turned smiling at the camera.
Two men standing in front of a TV playing with a Wii.
A woman flying a kite and holding onto kite string.
Two people jumping up to catch a frisbee
A man sitting in a chair playing a guitar in front of a microphone.
a black gray and white cat is sitting in a sink
Large motorized model plane parked beside air field.
A young child at the table with a birthday cake and three candles.
A very big display of many kinds of pastries.
A baby sitting on a females lap staring into the camera.
A boat tied up to the pier next to other boats on a clear day.
A large clock outside of a window building.
This is an image of a giraffe with a city in the background.
A skateboarder has his feet off the board before a landing
Cattle walking in open rutted field on sunny day.
A girl making a "peace sign" with her hand and a woman holding a big black suitcase.
A laptop is next to a desktop compute near a window.
A basket ball player is posing in front of a basket.
A large sandwich being cut by a person
Large, mild waves are coursing towards two boats.
A giant clock is on the wall of a brick building between two windows.
A skier stands next to skis stuck into the snow.
A plate that has a sub sandwich on it.
A table has a plant next to the glass doorway in the kitchen.
An old fashioned passenger train traveling through the countryside.
a person sitting at a bench with a skate board
A woman tossing a frisbee on a lush green field.
Jockey riding a race horse on a runway.
A horse grazing in a field witha blanket over its back.
Two men in suits with one man leaning on a railing.
A man and child sit on the floor with game controllers in their hands
A blue and green plaid tie with a flag pin on it.
A tofu and broccoli dish simmering on the stove
A couple of chow dogs sitting in a car looking onward.
A crowd is watching a woman play tennis.
A small boy with a birthday hat on holding a tennis racket.
The view from a motorcyclist's point of view, looking down a street.
A man playing a guitar and other musical instruments
Batter winds up ready to hit the baseball
A chocolate caked frosted and topped with blueberries on a metal cake plate.
A large clock mounted to the side of a pillar.
A bird sticks its head into the water underneath a layer of plants.
A baseball player standing in front of an A's poster.
Woman in maroon shirt holding up a bagel.
A couple wearing skis at a ski slope
A white toilet sitting next to a large window.
A red and black motorcycle parked on the sidewalk
a yellow cat going after some corn on the cob
A baby cow with his ears tagged with yellow markers.
A red and gold painted fire hydrant on the street
a cat sits on a wooden cluttered table
Three Asian takes on hot dogs on display.
A gray haired man is wearing a blue shirt and has a tie draped around his neck.
a cake with a section missing sitting next to a burning candle
These two riders are far ahead of the ones behind them.
A woman talks on a cellphone while holding a pen.
A plastic container filled with sliced carrots next to a yellow object.
A girl swings a net a tennis ball.
A cargo train that is traveling down railroad tracks.
many small boats in a large body of water
An older zebra and younger one nuzzle in a field
The clock on the post has faces on four sides.
A woman holding her head out the side of a train.
A person on a motorcycle with a stuffed animal on back.
A flower is put into strange pots next to a plate.
Two zebras stand in the grass together near a fence.
A man in a surf board shaping studio.
very long and nice buses standing at the zebra crossing
a window shoing a man standing alone on a train platform
A red fire hydrant stands in the dirt of a stone platform.
A box containing three round doughnuts and a fritter
varies vegetables sitting on a black counter top
An orange cat laying on top of a black piece of luggage.
there is a man playing with a frisbee on the field
A large market display of citrus fruits including navel oranges and clementines.
A plate of assorted desserts and dessert sauces and a bowl of ice cream.
someone holding a half eaten hot dog that has mustard and ketchup
A bowl full of soap with a bowl of vegetables on the side
There is a rug on the lid of a toilet and another rug in front of the toilet.
A black and white modern bathroom showing he sink and mirror
A hospital bed next to a blue chair
a room wit ha chair a bed multiple windows
It's a very elegant looking bathroom with double sinks a large mirror and a tub.
A baby holding a spoon and looking at a pair of scissors.
A woman rides a horse through a grassy field.
a group of zebra drinking from a trough together
A skate boarder reaches the top of a steep barrier.
A plaque on the floor in front of a chair and grandfather clock.
a bus painted in white, blue and yellow
Some is holding a bottle of wine next to a huge hot dog covered in chili.
A cardboard box containing a reef of glazed donuts.
A bathtub with candles lit up around it and a stool next to it.
A man wearing a blue shirt and an orange and black neck tie.
A car parked next to a brick sidewalk on a street at night.
A man is flying a kite on a clear day
A picture of a lot of kites in the air.
A black and white image of a young men on his skateboard.
A girl using her laptop computer on her bed.
The young man hurls his frisbee towards the metal structure.
A clock above two pink colored stone arches
Vegetables being displayed with each other in arrangement.
A game strategy is hatched by the boy NOT wearing the boat like a hat.
A little boy wearing a bib eating a doughnut.
A bathroom with a urinal and tiled walls.
A photo of a living room with a purple chair thete
A little girl that is flying a butterfly kite.
a man eating food at an airport terminal
A child wearing pajamas holding a brown teddy bear.
A slice of pizza with lots of vegetables on the top of it.
a bowl with some fruit inside of it
a man is on a surfboard with a dog
a number of people standing near one another wearing suits and ties
The kitchen counter is cleaned off and ready for us to use.
Two young people sit next to a bunch of snowboards.
An airport filled with planes sitting on tarmacs.
A man on a horse during  a race jumps over a hurdle
The little girl is sitting in the chair eating candy.
Two zebra standing in the trees next to a fence.
The girl in purple is using her phone.
A double decker bus driving down a city street
Four sets of legs with one standing on a skateboard in the dirt
A table filled with food on a patio
A cross country skier traveling down a slight slope.
A man and horse near a painted man wearing shorts.
A man in a black coat sitting on a bench at night.
a city street with bicyclists, double-decker buses, and many lights
A book mobile bus from a library sitting by a street side.
A man sitting in front of a tv with a Wii remote in his hand.
A group of young children are petting a horse near the gate.
The keys 1, 4, 7, and 8 are clearly visible on the remote.
A laptop sitting on a bed near a window
a rocking chair siting in a house next to a green lamp
A busy street is crowded with umbrellas on a rainy day.
A little kid standing on a household appliance
A fridge with a bunch of papers hanging on it.
A pole that has different types of signs pointing.
Woman leans over as she serve the tennis ball back to the other side
A toy monkey sits on a desk beside a laptop.
A man in a suit and tie is smiling.
A young lady with blue hair is holding her phone, posing for the camera.
A group of horseback riders walk down a trail.
Two giraffes eating the leaves off a tree.
A flat screen TV mounted on a brick wall in a living room.
A close up of the luggage claim at an airport with many suitcases.
A baby with a teddy bear looking over his shoulder.
The skier is jumping into the air above a half pipe.
A tow truck driving down a rural road.
Two street signs located above a stop sign.
A group of actors and stage workers on the set of a TV show.
a tennis player about to hit a tennis ball.
The man is posing for a picture on his motorcycle.
A person riding a board on top of a wave.
A man in a wetsuit surfs a churning wave.
a living room that has a couch and a chair in it
A hand is seen pulling a piece of food from a toaster.
A man and a woman stand under an umbrella at a street crossing on a rainy day.
A group of people riding on the back of an elephant.
A pulled pork sandwich with a pickle slice.
A virtual woman in a rainjacket, carrying an umbrella.
A yellow cat is among the camping equiptment.
A pigeon stands on a window ledge overlooking a street.
Two men are checking out several wines in a crowded room.
Several sheep herding towards an outdoor pen on a county side.
Yellow train on the tracks running parallel to the trees.
people are taking samples of wines in a room next to an outdoor area where people are sitting
Large white passenger bus parked in a parking lot.
an elephant standing by some trees with it's trunk in the air
A dad or grandpa looking at a child both are smiling.
A close shot of a unique looking plate of food.
a couple of bears are sitting near a glass
A black and white dog examines something on the ground.
a big propeller plan flying through the air
A group of street signs in a display case in a room.
A small green vehicle model is on display next to a busy city street.
This is an old picture of a train at the station in Boyne City
A group of people sitting on a yellow couch playing a video game.
A zebra standing next to a tree on a dirt lot.
A lady with a young girl standing in front of a few english muffins.
A happy girl is showing off her Nintendo Wii.
A man in a crowded room gazes into the distance.
A series of street signs in French on a city street.
A person in a cross guard uniform directing traffic.
A man sitting in a  chair with a laptop computer.
A "One Way" street sign pointing to the right.
A man standing next to a news stand on a street.
A red bus parks in front of a building by a large tree.
A white busted up toilet sitting on it's side.
A young girl standing over a soccer ball.
A gray vanity with three spigots in a public restroom.
A small refrigerator sitting on top of a wooden counter.
A boy is sitting at a table eating.
A group of people is playing frisbee in a field
There is a cat drinking from a faucet
a close up of a bench near many plant life
This is the head of a giraffe standing in a fenced in area.
A plastic male doll is sitting on a toothbrush on its holder.
Multiple items on a metal bar near an outlet.
A big building with a large clock at the top of it .
The street pole contains traffic and street signs.
Two representatives from two different governments shake hands.
A large black bear traveling across a grass covered field.
A game of baseball being played in front of a large crowd at a stadium.
A park with kites flying in the air
A chair and a couch in a small room.
There are vegetables that look like they have seasoning on them
Two people riding horses down a sandy beach.
A bunch of busses are in a lot.
A plate filled with a chees filled meat sandwich with sauce.
A bowl of soup, rice and fish by a woman.
A group of scooters parked next to an old building.
A man doing a trick on a skateboard off of a rail.
A boat is running in the water with a low sun in the sky.
The jazz band is taking part in a parade.
A bunch of seagulls eating on the beach.
A little girl enjoying a sweet confection and awaiting a sugar rush.
Two giraffes stand in an open area with water and other animals in background.
a vase full of colorful flowers in a bedroom
A group of people riding boats on top of a lake.
A computer desk with a computer on it and a chair in front of it.
A tennis player trying to hit the ball.
a very large animal submerged in water with two people near it
An adult and a child sleeping in a bed.
A lady with blue pants and grey sweatshirt playing tennis.
a giraffe standing next to a tree with more trees in the back ground
A closeup shot of several zebras standing together.
a baseball player wearing green and yellow wearing his glove
a giraffe standing on a field near a bush
Woman getting ready to hit a ball on a grass court.
A young lady taliking on a cellphone in the hallway at school.
A woman holding a skateboard posing for a photo.
A black and white dog on a brown tile floor next to counter.
A black train traveling past a train station.
a tray that has a plate and a bowl with food on it
A woman sitting on a couch in a living room.
A stop sign with lights lit up all around it.
TWO TRAYS FULL OF FOOD SITTING ON THE TABLE AT A RESTAURANT
A train on train tracks that run parallel to many other train tracks.
A group of zebras are standing in a field.
A young child in a white dress holds a teddy bear while standing outside.
A man with a tie and a work badge
Five loaded hotdogs surrounding a tray of cheese fries setting on a round table.
A train goes through an intersection with traffic lights to stop traffic.
an image of two men that are walking down the street
A large sheep and a smaller sheep graze from a field.
A small bulldog sleeping on a bed while wearing a pirate hat.
A brown dog carrying a black frisbee in its mouth
A long row of wood and wrought iron benches along a sidewalk.
a young woman with a slice of pizza in her mouth
A pitcher throws the ball towards the batter at a game.
A cute little girl sleeping in a wooden framed bed.
A horse connected to riding equipment walking in the street.
A car parked on top of the curb next to a meter pole
A date book is next to a phone, calculator, and a keyboard.
a cat sitting in the refrigerator next to a gallon of skim milk and a bottle of gatorade
A man is prepping a turkey in front of a bottle of wine.
A wooden stop sign in a rural area
Small children playing with toys and stuffed animals
People sitting and eating in a restaurant.
A group of young kids playing soccer on a grassy field.
Tables and beach chairs on a sandy beach.
Some unfinished looking wood is in a white bathroom.
A baseball player slides into base while another leaps over him.
A boy is jumping into the air on a skateboard.
Children on a tennis court holding a tennis racket and tennis ball.
a person riding a skate board at a skate park
A young child standing at a table with a plate of food.
a couple of elephants walk in a caged area
The gentleman is taking a selfie while riding his motorcycle.
The dog is looking at the toy bird being held by him.
A bus driving on a street with people approaching it in the mountains.
A man is peeing and has his behind exposed.
A figurine with a plastic witches head is standing in front of a computer keyboard.
a bi plane with a nazi flag on the tail
A dog is standing in the grass with its tongue out.
A red car is parked by a parking meter.
A man sits on a blue and black motorcycle.
a young man holds a snow board
a tennis player swinging a racket at a ball
A COUPLE WEARING YELLOW DRESS STANDING NEAR TWO HORSES.
A television is on the beach near the ocean.
A white bear sniffing on to some rocks
a black and gray cat is sitting on a toilet
A bike tire and a boy with a skateboard
A sandwich with chicken and lettuce is on the table.
Various sized knives are hung on a wall magnet.
Two red traffic lights lit at a street corner
A rhino and a baby elephant by a river.
A girl is drawing on a birthday cake.
A herd of cows is standing in a grassy field.
A man laying on a bed bent like a pretzel.
A brick wall with a blue and white sign next to arc.
There is a building with surfboards outside of it.
An old image of a pickup truck broken down on the side of the road.
a mama goat and her baby walking on a slope
Two men standing in a store aisle with one holding a baseball bat.
A cat and dog are laying on a red rug.
Train on tracks riding pass bus and couple cars on the street
A bunch of animals out on the field.
A woman standing on a beach throws a frisbee.
Three workers stand behind a colorful fruit stand.
There are several boats docked at the dock.
A man standing next to a large elephant.
traffic lights besides the road with so many vehicles
An eagle soars through the sky near trees.
a bathroom with a corner bath tub and duel sink.
A man wearing ski gear and  skiing downhill in the snow.
large brown elephant making his surrounding look so small
Standing in the ocean waves, a man flies a kite.
a twin engine airplane stored at aviation museum.
Skiers grouped up in front of a vancouver sign.
A very large group of people are sitting at tables.
Man and woman enjoying video game in living room.
Several people cross-country ski on a snowy mountain.
A man stands in a room with a cardboard box sitting on a chair.
A man holding a racquet preparing to serve a tennis ball in front of a crowd.
A plate with cooked meat and vegetables served on it.
A toilet is standing in a room with a picture frame on top of it.
A Yorkshire Terrier is looking out the window of a house.
A person does a trick off a ledge on a skateboard.
A very big airplane that is making a turn in the sky.
A man lunging forward towards a frisbee next to three other men.
an old black and white photo of a large building
some big black cows in  a grassy field
A line at an airport with people and their luggage
a nice neighborhood with some green grass in it
A giraffe standing on top of a lush green field.
Two skateboarders are racing through an obstacle course.
Two giraffes and a zebra roam in a preserve area
Little Asian girl holding a wii remote control.
A computer monitor, keyboard, phone and various papers sit on a desk.
A blue suitcase is leaning against a post on the street while a man walks by.
A VW long van parked on wood strips on a grassy area.
A skier pauses near the side of the course.
Old passenger train making its way down from a rocky hill.
Two women are standing playing with a nintendo wii.
Large group of clothing sitting on top of each other.
A mug of hot beverage sitting by a computer.
a lake with a lot of boats on it
A partially open door with a bathroom behind it.
Many people are walking around in this square.
Two flowers are on the blanket across a bed.
Ingredients for a tasty bite, including peanut butter, oats, banana, preserves and syrup.
Looking at a barge cross a channel of water under a cloudy sky
a man putting a pan of food into an oven
Several people boarding an old fashioned airplane in a field.
Black dog jumping up at big screen television.
A person is holding a banana that they are peeling.
Box of dollar bills tooth brushes pills and spoons.
A hotel bedroom with balcony overlooking the ocean.
horses stand around on a neighborhood street in front of a car
A large crowd is watching a baseball game.
A zebra is grazing on scarce grass in front of a rock wall.
Various luggage tagged and stored on numbered shelves
A herd of cattle standing next to each other on a dirt field.
A child dips broccoli in dressing before eating.
Two black cats looking out of a window.
A dog in a grassy area with eyes on a flying frisbee.
A young male plays with a green frisbee.
A room with some big equipment and a toilet.
an elephant with a seat on it's back
A boat that is sitting in the water.
A bed next to two mirrors on the floor.
a tour bus with a wi-fi notice parked on the side of the road
A mostly white bathroom has a black toilet seat.
A man on rollerblades at a crosswalk holding a sign that says slow.
some buildings and a clock tower with two white clocks
there is a herd of animals running infront of a man
a person that is holding up a frizbee
People hanging out in a kitchen eating and drinking
Two animals that are looking at something in the wall.
A banana and a vanilla bean are next to a shot glass.
A young girl holds a Frisbee at a park.
A person flying a kite on a beach
Steak sits on a plate with broccoli and mashed potatoes, next to a glass of water.
a number of people riding skis on a snowy slope
A house plant on a sink in a bathroom.
an image of a female tennis player returning a serve
Perhaps he's a magician who will pull a rabbit out of that hat.
The man is using the toilet with the bathroom door open.
A person looks at their reflection in a bathroom mirror.
Two adult giraffes and a baby giraffe are in a cage.
A grupo of people in a field with tents flying kites.
An asian dish topped with sesame seeds.
Two dogs and a cat on a boat at edge of water.
Several plates of foods including strawberries and vegetables are next to a sippy cup.
A very long limo with a bunch of farm animals on top of it.
A couple of umbrellas in a small room.
A girl walking and talking on a cell phone.
A man with skis walks through a snowy area.
A view of a bathroom with a yellow towel sitting on the shower.
A red truck moving towards a busy highway.
Two people are walking towards some motorcycles to leave a market consisting of umbrellas over tables.
A close up photo of parking meter on a street.
A ship of people cruising along the water.
A couple of giraffes are standing in the wild.
A tray topped with sandwiches and cut up apples.
A person on a snow board performing a jump on a mountainside.
A large passenger jet flying through a cloudy blue sky.
A man swinging a tennis racket during a tennis match.
A computer desktop with a keyboard and monitor.
a small white vase is on a table
A bridge stands over a river before a city sky line.
A group of people riding skis across a snow covered slope.
A cyclist rides through a tree-lined path in the park.
A man drinking from a glass on top of a night stand.
A giraffe standing at a dirt road eating off a tree branch.
A pooh bear is sitting upright holding a honey pot.
a living room with a fireplace and a big brown chair
A man riding a motorcycle across a lush green park.
a couple of beds sit inside of a room
a herd of giraffes on a dry grassy plain
A group of cats sitting on top of a chair.
Some strawberries floating in a bowl of pudding with sparklers added.
A tablet, a laptop and a computer on a desk
Bathroom with granite counter top and single sink.
A train traveling along a rocky mountain side.
A man and a child are dancing by the water.
a girl and a dog are sitting on a bed hugging
A man and a woman standing next to each other holding tennis racquet.
A cat scanning the floor in front of an orange bucket
A wet polar bear holding a green cone in its mouth.
The pink Frisbee is laying on the snow covered ground.
Two computers on a desk in a small bedroom
a close up of a table with an ipod headphones and a remote
A destroyed toilet and sink lying on the ground.
two people on stage performing a song to a crowd
Two females sitting on BMW motorcycles under a tent.
A boy is holding his hands out as he jumps with his skateboard.
A boat filled with produce and people floats on a river.
A public official helping to feed some school children a healthy lunch.
A red parking meter sits on the sidewalk.
A man holding up his tennis racket .
The shirtless man plays frisbee in the water.
A group of people sitting at a restaurant table with food.
A wall with vines and old tools strewn on it.
A very big nice looking truck on a street.
A slug crawling on the seat of a toilet
A pole holding a traffic sign at an intersection.
A radish on a cutting board next to a knife
A bride and groom walking next to one another.
I am unable to see an image above.
A couple of women holding up a cake together.
A fire hydrant at a intersection at night.
A baby pressing a key on a laptop.
there are many different pies on this table
a bunch of people are sitting in a busy room
people in a large sleigh being pulled by horses
a close up of a person throwing a pair of scissors
A blue vase with sunflowers and other flowers
The snowboarder is standing on a conveyor belt with others.
A couple of  men holding a bunch of baby sheep standing next to each other.
Three people in work uniforms and visors standing together in front of various types of donuts.
a close up of the front end of a school bus
A young child walking down a street past two nets blocking a road.
Dog fetching a frisbee in a rough field.
A group of kids standing in a forest.
A person wearing a wedding ring has their hand on a teddy bear.
a very decorated work cubicle with a laptop
A catcher and a batter playing baseball in a park.
A guy jumping through the air with a Frisbee in the air.
Two large green and white jumbo jet planes on the tarmac.
Twisted bars of metal connected to a tall building.
The double sink in the bathroom is nice and clean.
a horse and foal grazing on dry grass.
A train traveling over a bridge spanning a river.
Bunches of fruit growing on native trees shown on cloudy day.
A man gets ready to hit a tennis ball with a racket.
A lamp on a table in a livingroom
And upload picture of some food in a bowl.
Lady with a slice of piece in front of a stack of pizza boxes.
A train is on the train track, which is surrounded by trees with autumn foliage.
A kitchen area with a refrigerator, table and doorway.
The person is taking a high jump on their skis.
The skiers are getting ready to go on their run,
A red two level bus with front damage to it being towed down a street.
A small bathroom with a yellow toilet, sink area and shower.
Seagull on rock with ocean and lighthouse in the background.
Men in a teaching kitchen discussing all the visible prepared food.
a wooden desk with a black and silver computer
A train is going through the pretty country side.
A herd of elephants drink from a river as two wander away from the group.
A group of people standing in the snow with skis
A picture of a person and a motorcycle on the street.
A white bowl of food with a spoon.
three giraffes standing up near some dry plants.
A couple of men riding motorcycles behind a herd of sheep.
A girl in a bikini sits on a towel at the beach and holds a pastry.
A large computer screen with keyboard on a small desk in a corner.
The zebra is standing behind the rocks in the exhibit.
A zebra eating hay out of a container near a rock.
A boy prepares to swing his bat during a baseball game.
A foyer furnished with a sofa, arm chairs, and end tables.
A motorcyclist is being followed by a familiar face.
A person sitting at a wooden table with pizza, and some other foods on a brown paper bag in front of him
A MAN IS RIDING AMOTOR BIKE IN THE CITY
The giraffe looks like he is in the wild.
a couple of giraffes that are outside a brick building
A bathroom with hand towel, mirror, and sink.
A man surfing on his surf board against the waves
A statue is set on top of some banisters.
a man walking through the water holding a surfboard
A man in a car who is on a cell phone.
A bathroom counter with a sink and various cosmetics and toiletries.
A blender sits on the concrete next to some greenery.
an animal in a field behind a fence
A metro bus approaches an intersection where a traffic cop is directing traffic.
a table covered in vegetables of all sizes and colors
A traffic light with a pedestrian crossing sign on it's sides.
A torn apart bathroom with a toilet in a bathtub.
A man does a trick at a skating course.
A double decker bus and a truck driving next to each other.
Boats at a dock near a large hotel.
Black and white photograph of a tennis team and their coaches
Wooden benches are lined along the edge of the water.
a group of people standing in the snow next to a building
a couple of red lights are on a pole
a man standing on a tennis court holding a racket
A chef is instructing two women on how to slice vegetables.
A woman looking up at the kite that she is flying.
A red and yellow fire hydrant with the lid taken off.
A man in a green shirt holds an appliance while another man stands by.
A small brown teddy bear sitting on a white bed leaning on pillows.
Three people sitting in the snow with snowboard on their feet.
a very tall tree in the field with nice flowers
A young boy standing next to a giraffe he can pet
A young boy with a fish hat eats a snack.
Skiers and snowboarders mill about on a mountain.
A young girl standing in front of a book shelf holding a red tie
A male skier navigates a course at the Vancouver Winter Olympics.
a kitchen that has a bunch of people in it
Happy girl in a green shirt holds onto her suitcase.
a bath room with a toilet and a window
A young man playing a ball game on a cement basketball court.
Multiple boats sitting dormant on a lake bay.
A bunch of people riding motorcycles down a road
Assorted food items displayed in white dish on wooden table.
A red double decker bus traveling down a city street
Chicken wrap cut in half displayed on wooden board near silverware.
a man in a wet suit rides on a surf board
Two horses graze in a pasture in the setting sun.
An airplane is flying in the air on a clear day.
some people are riding elephants in the jungle
A man hitting tennis balls on a blue painted tennis court.
A person with a laptop sitting in front of a window.
A baseball player runs across home plate after hitting the ball
A purple swamphen with a red crest on its head walks on the ground.
A delicious looking hotdog sits in cardboard with tons of toppings.
It's strange to see a bow tie with a military uniform.
a black cat sitting on a cement patio
An infant in a high chair covered in pink glop
Bicyclists ride down the sidewalk in front of several stores.
Some zebras that are sitting on the ground next to each other.
People watching a horse race image is fuzzy.
A bedroom with a bed and other furniture in it
A blurry photo of meat patties on a big meat patty
a buffet in a restaurant with some big crocks glasses and bins of other foods
a group of zebras standing around a food trough to eat
Brown gull in water on beach littered with seaweed.
A group of people wait for the start of the race on their bikes.
a man sitting in a chair watching another man pretend to be an elephant while playing with a child on the floor
A minimalist bedroom with low furniture and a quote on the wall.
Two birds sitting on top of a branch on a tree.
City bus next to traffic cones in the far right lane of a busy freeway.
A cat reaching up to grab a feather on a string.
a person leaning on a stop sign with a skate board
A woman in a blue sweater sitting at a table with food.
A child making a silly face over a tray of donuts.
The bed is located on the edge of the beach.
A bald man is using a surfboard to ride a waves.
Broccoli next to some meat on a small plate.
Shelves filled with pots, pans, and cooking utensils.
Two large boats sitting on a docking area in the evening.
A man jumping on a dirt bike while another man watches
A cow laying down in a grass field.
a blender sits on a counter top unplugged
A bicycle leaning against an old white building.
Three men, one caring a skateboard, are wearing matching t-shirts.
The clear shelves on a green wall that have vases with designs on each shelf.
Helmets should always be worn by motorcycle riders and passengers.
two people on a field wearing baseball equipment
A man puts on his jacket while standing near snow skis and poles.
A blender sits on a kitchen counter surrounded by baking supplies.
A small child wrapped in a towel brushing their teeth.
Green and white airplane sitting on a runway by the ocean.
A keyboard and monitor on a corner desk.
Two couples getting ready for a tennis match.
A street sign showing the words Gay Street.
A man in brown shirt standing in a kitchen.
People cut a cake outside for a celebration.
A large passenger plane is parked on the runway.
a close up of a red fire hydrant with a chain on it
A young child running down a rain covered walk way with an umbrella.
Couple people walking up the snowy hill wearing skis
A dog is sitting down in front of a mirror
a group of guys standing out on the road
Four people standing on a balcony with a clock
Why would the cow be grazing in front of those homes?
A woman is taking a picture in the bathroom.
a little boy and his father skii down a big hil
A man with camera watching a group of giraffes
Herd of sheep standing on pasture with stone buildings in the background.
a bunch of people walk on a beach to the water
A myriad of wind socks blowing in the wind.
A trolley rolling down the tracks in a forest.
A box of doughnuts and pastries with strips of bacon.
A woman on a cell phone at a station.
A pair of hand slicing carrots with a large knife.
A woman is sitting down in her kitchen to feed her young child.
A horse drawn trolley sitting in the middle of a street.
a smiling woman holding onto a pizza box
A picture of a computer sitting on the floor.
Two giraffes are found wandering around the buildings.
Jockeys on horses riding on a racing track.
The newborn baby is sleeping next to a teddy bear.
A girl riding on the back of a scooter on a cobbled road.
A girl in pink ski gear that is sitting in the snow.
A couple of people sitting on a wooden bench.
A flat screen TV sitting across the way from a laptop.
Two horses graze in a field surrounded by barbed wire.
Mass transit train waiting for passengers at the station.
Four women sit on a park bench with groceries.
A baseball player in a blue jersey standing ready with a catcher's mitt
A larger commercial jet is flying in the air.
a large pizza is sitting on a pan
A cow walking on the beach towards people on lounge chairs
this is stuffed teddy bears sitting in the grass
A man a woman are standing together holding tennis rackets.
A blue and white double decker bus on side of street.
Person in a parka taking pictures with a mobile phone camera.
A cat next to a grocery bag on the hardwood floor in a kitchen.
A man riding a paddle board into a massive wave in the ocean.
Man and woman in a bedroom holding up Wii controllers.
A group of people on some skis in the snow.
A woman working in commercial kitchen with stainless steel appliances.
A man sitting on a horse drawn carriage
A girl in black jacket drinking milk and eating pizza.
A  deep marble bathtub under an ornate mirror.
a woman posing on the street for a photo
A horse pulling a carriage down a street with other people.
People are huddled together under umbrellas on the beach.
a person sitting on steps talking on a phone
A large passenger jet flying through a cloudy sky.
A cat is lying on its back in a man's lap.
Several people are snowboarding off the top of a snow covered truck.
The adult elephant stands idly in his zoo habitat.
A man is holding a baseball bat while wearing a muddy outfit.
A woman is sitting with a suitcase on some train tracks.
A train traveling over a bridge over a freeway.
a couple of chairs are around a table outside
A man in vest and bow tie standing over a keyboard.
A cat sleeps on a pile of discarded shoes.
An open cell phone in a person's hand.
There is a dog walking down a path near grass.
Three giraffes behind a wire fence next to a tree.
A tennis player turns her racket sideways as she returns the ball.
A horse grazes by itself on a grassy plain.
People playing frisbee out on the lawn, on diving for it.
Some type of bed outside on the beach
Virgin Ameican Airline planes with passenger boarding bridges attached.
A woman walking a horse down a trail.
Two polar bears playing in the ice and snow.
A wide shot of a modern kitchen with a glass table in the foreground.
a photo of an old tall cathedral and bell tower.
A parking meter and a umbrella on a street.
A vase sitting on a table filled with flowers.
Damaged bathroom with a toilet, sink, and damaged window.
A batter up to swing in a baseball game
Boats docked in the water in a marina.
A young giraffe running across the road on an African plain.
a picture of a bulding with a open window and clock.
A plate topped with two pieces of cake and strawberry.
A man with a helmet is on a surfboard
Two men shaking hands while standing on a tennis court.
Visitors walk beneath huge airplanes on display in a hangar.
This train is riding a rail near some water
A stop sign that has another sign saying all way under it.
People standing on the street holding umbrellas near buildings
A knitted cap sits upon a red hat stand.
A desktop computer sitting on top of a desk.
The sandwich has chicken, melted cheese, and tomato inside.
A diesel locomotive approaching a rural grade crossing.
A Nashville bus with a big ad for Coors Light on the side.
Three people standing at a baggage claim at an airport.
five zebras standing in a row in the wild
a woman holding an umbrella on the street
A pizza cutter being used as a spatula.
A male flying a kite on a sunny day.
A group of people standing on top of a lush green field.
a person sending a text on her phone
A hand is holding a pack of Japanese donuts.
A couple of red chairs against a wall
Smiling and smirky people are in a small kitchen.
A building with a clock tower on it
A steam locomotive with passenger cars crosses a bridge over a channel
A female tennis player hitting the ball.
Various zebra in dirt field with mountains in the background.
A black and white image of a vehicle that is decorated like a dog.
a bucket of oranges sitting next to a bike
A freshly made bed resting on a tiled floor.
Two people in the middle of a skiing trail with trees lined on each side of the trail.
a street-side market with colorful plastic furniture.
A dessert that consists of a piece of cake and some ice cream.
A toddler in a kitchen trying to use a vacuum cleaner.
Three sheeps are grazing in a small field.
A very cute small dog laying on a big couch.
a table and chairs with silverware and plates a pan and bowl of food
Stuffed bears sit in the window of a store.
People and carts loaded with suitcases on a train platform.
A boat with a bed set by a set of windows.
a messy living room with the television on.
A living room with leather couch, settee and chair, rustic tables and a cowhide rug.
A red flower vase placed next to a clock on a window sill.
A  glass vase full of dried dead roses
Two googly eyes and a Santa beard placed on a microwave oven
A tabby cat sleeping on a wooden island in an old looking kitchen.
Three giraffes in the wild stand by shrubs.
A young woman wearing a white hat in a commercial kitchen chopping lettuce.
A person who is standing in front of a laptop.
A couch and a chair in a room.
A flat screen television mounted above a fireplace.
A cart with a load of suitcases pile on it.
A white sandwich has pink meat in it.
A bunch of people are on stage and the guy in white is doing something to the one child who is holding his skateboard and next to him is a child in a red helmet.
A banana plant with a large flower and unripe bananas.
A yellow and grey train on tracks beneath a traffic signal
A very nice looking trolley car on a city street.
a person wearing a black coat and a tie with bolt designs on it.
A giraffe stand alone in a zoo during the day.
There is a long wooden bench with a fountain in the middle of the area.
A couple pieces of food that are on a table.
A adult elephant and a couple children in the water.
Taking a moments rest on their cross country ski trip.
A bunch of cut meat sitting on a cutting board.
there is a luggage that is sitting on metal outside
Four people are posing for the camera with flags behind them.
a living room with book bottles a lamp and television set
A white plate topped with two different type of food.
A woman standing at a counter using a blender.
this man is jumping high over the grass
People are gathered to watch two women, one who is doing the splits.
Two men are standing and talking alongside an old fire company van.
a young girl is getting her temperature taken
A green tennis ball bouncing on a wood tennis racket.
A cat sitting on a chair looking straight at the camera.
A Studio apartment with minimal furniture and a refrigerator.
A giraffe is walking through a grassy field.
A room in a home that has a small table with one chair on the side and another piece of furniture in the next area.
A living room with white walls and stained wood furniture.
zebras stand next to each other in the zoo.
A train station that has a train pulled into it.
A stop sign in front of a Google building.
A woman standing in a grass field with a cell phone.
a close up of a dog on a desk near a monitor
A street sign that reads Ronald Reagan Allee.
A white plate topped with a piece of toast and eggs.
A cut in half bagel sandwich sitting on top of a plate.
A couple of bowls are on a counter by a man.
A man leaning on a fire hydrant on a city corner
a girl and a dog looking angry in a photo
The corner of a batch room with a white sink and red shower curtain.
a close up of a cell phone on a table near earbuds
A person holding a red phone nest to a flower filled plant.
A couple of cars that are in the dirt.
A pastry is decorated in a lattice style on a piece of burlap with a knife.
Herd of goats in grassy area with herder.
A semi truck pulling a trailer filled with logs.
A couple of people standing with a umbrella.
a man is in a store making donuts with flour
People at the table getting sandwiches to put on their plates
Four men and two women sitting on two different benches.
a whole bunch of bananas cut up in a large bowl
a male in a black shirt taking a photo in a mirror and a sink
A large passenger jet flying through a blue sky.
The display case has many different scissors on it.
A red fire hydrant sitting on a slab of cement in a patch of grass.
A cat sits on the couch next to the remote.
a close up of a dog laying down with a chew toy
A man holding his cell phone in front of him in his left hand.
A boxer dog faces the camera while sitting on a computer chair.
A man riding a snow board down a snow covered slope.
A man sitting in the back of a van talking on a cellphone.
a black and white clock on a gold and black tower
a baseball player that has a ball in his hand
A truck parked on the street with a man getting out
a woman hitting a ball at the end of a tennis court
A bathroom with sink and a toilet in it.
Two plates of food that include potatoes, broccoli and sausage.
Two gentleman are playing on the Wii.
A living room has a fireplace and bookcases in it.
A soccer player chasing a ball in the air.
Two cows standing close together on a grass field.
A laptop computer sitting on top of a wooden desk.
A knife being slid into a wooden block.
a man pouring liquid in a line of glasses on a table with a hat on his head
Several people in a field, some are flying kites.
Some items sit next to the door.
A food truck parked along side the street
A man swinging at a tennis ball with a racquet.
a fire house with a grass field in the back of it
A brown cow under a tree in a grassy area.
The insect is flying around on the porch.
A woman puts her market shopping in her motor scooter seat
Four hotdogs in buns sitting on a white platter.
A group of boys wearing white shirts, black ties and red caps.
a woman hitting a tennis ball with her racket
A transit bus pulling through a shopping area.
A cluttered restaurant has a boy at a table with a phone.
Two girls sitting at a table near a dishwasher.
A desk topped with a laptop computer and speakers.
A passenger train is going down the track and people are in the car.
A group of sheep grazing in a large open field.
A teddy bear is next to a banana in the air.
A bowl of fruit is on the floor in front of some feet.
A woman riding down the side of a skateboard ramp.
A beautiful girl playing a game of Frisbee with an orange Frisbee.
A plate with a slice of cake on top of it next to a fork.
A man loading luggage onto a machine as it comes off a plane.
A man and boy are talking behind a rickshaw.
A woman dressed in colorful clothing preparing a meal.
A vegetable pizza on a plate on a table.
A small, black cat sleeps next to a mouse and keyboard.
A black dog laying on a rug next to a TV.
A stop sign with street sign at an intersection.
two woman standing in a kitchen by astove
this bathroom is all white and has a white toilet and a tub
Large dog laying on top of a bed and looking up at mirror.
A room with some windows and a clock and a air sign.
Group of men with skateboard celebrating while in grassy park.
Several skis and snowboards laying around in the snow.
A dining and kitchen area with high wood ceilings.
Two older men throwing a ball on a baseball diamond.
A dark frame surrounds the window and mirror in this bathroom.
Two children playing Wii while adults look on.
some toy buildings a fire engine and a police car
A tablet is set in front of a Dell computer screen.
Airport baggage handlers loading luggage into a cart.
A picture of an animal catching a frisbee.
Two women laugh and show movement in the picture.
There are keyboard keys on a wooden table.
A wooden cutting board topped with a sandwich with a knife.
A woman in green cardigan with brown dog at a table.
a big bear walks through some grass
A young woman riding on a brown house through a course.
a black and silver motorcycle is parked and some people
A busy street in the city on a sunny day
a baby elephant walks through some shallow water
A boat traveling along a river surrounded by grass fields.
Two people are crossing the street as they are heading towards the stop sign.
A person performs a jump on a hill on their snowboard.
A shelf that has a wedding photo on it with flowers.
A gentleman is trying to pull off a skateboarding trick.
A laptop and a keyboard are on a computer desk.
A small boy skateboarding in a city mall
A pile of broccoli with a sprout sticking out of the top.
some people playing soccer while a crowd watches them
a couple of birds that are standing on a beach
a bunch of bottles are in the fridge
A white cup of coffee sitting on top of a wooden table.
A green cloth holding a white tray full of food.
An open laptop computer sitting on top of a bed next to a mouse.
The cathedral has two clocks on each of it's walls.
He person that is doing a skateboard trick.
Two pieces of pizza on a plate with a small servor.
Several cars and a motorcycle are parked in an alley.
Blue bullet train waiting at the train station
A plate of some sort of a vegetarian pizza dish.
a man with a hat holding a baseball bat
an elephant is scratching his head on a tree
a train engine and box cars on the track
Two people sitting on a bench with their dog.
A man smiling on skis in the snow.
A cow running on to a road near a town
A large wooden structure displaying boxes of fruit.
A bird is sitting on the top of a log.
A young boy riding a small skateboard on a pile of dirt.
An SUV driving on a rain soaked roadway past a red stop sign.
A very big building with a clock on it.
An unmade bed is covered by a comforter and a bowl.
A picture of a street light through a rainy lens.
A man is about to hit a tennis ball during a match.
Several women in a kitchen preparing many identical meals.
A man and woman are walking and the man is pulling a suitcase.
A woman riding a horse wearing a white outfit and helmet with yellow stars on it.
A black and white traffic sign under a cloudy sky.
A group of people sitting around a long white table.
A bowl of pasta sits on a table with a candle.
A line of buses parked along a wall by a building.
a semi truck driving on a road with a sky background
A very tidy living room with a white couch with pillows on it.
A stop sign in English as well as some other language.
Dozens of brightly colored kites lined up on a beach.
Freshly cooked lobsters served at home with vegetables and salad
Some people stand on the beach and others go in the water.
The people are riding motorcycles on a racecourse.
A person sits on the road near their motorcycle.
A Kenyan Airways airplane sits on the runway.
A male tennis player stands with his racket poised.
a large air plane on a run way
A white cats sleeps on the seat of a chair.
A beautiful woman holding a hunk of cake.
A field with large clear balls and a large amount of people in the bleachers.
An action shot of a moving bus on the street at night
A man is skateboarding in front of two women.
A stove top oven with a couple of pots and pans.
a hand is holding a kites string and a flying kite
Colorful graffiti on an old Canadian train car.
There is a picture of a traffic sign with north and south arrows in the foreground and a graveyard in the back ground on a projectable slide.
A bedroom with a balcony in a hotel
A fire hydrant is attached to a building wall.
A living room with a white circular table in it's center.
small boats in a large body of water
A fluffy dog is walking up the beach
A man on a court swinging a tennis racket.
Two custom pizzas with different and interesting toppings.
Zebras and wort hogs living together on the plains.
A wooden statue of a man near a window of stacked donuts.
A couple of shaggy haired sheep grazing in a field.
A group of surfers walking along the beach with surfboards
A group of people skateboarding in park area next to palm trees.
A grass level shot of a small heard of zebras in the wild.
A child is tossing a baseball to another child with a wooden bat.
A man is spraying an elephant with a water hose.
A horse-drawn carriage traveling down a city road.
Two giraffes standing together next to a wall.
Couple of attentive enthused women playing Nintendo wii
A group of women standing around a cake cutting slices.
A street scene with a couple taxis lined up.
A child in a vehicle holding some toys.
Two fire trucks from Seattle sitting in a lot.
An older gentlemen reads in his hotel room
A man surfs down the waves of a beach
A herd of horses grazing on bales of hay.
A large clock constructed of landscaping plants and flowers on a small rise.
A stop sign that is next to some plants.
A flying bird seen through a liquid filled feeder.
A close up side view of a zebras face
A man on a water board speeding down the ocean.
Four adult elephants and a younger elephant walk through dry soil.
a couple of anmails standing next to a truck
A giraffe is overlooking a barren plain, behind trees.
A woman and child walk, holding hands, under the large freeway sign.
Two people are eating pizza at a dinner table.
A cat eating at something dead on the beach.
A cluttered desk with a laptop and discs sitting on it.
A cat in a bathroom sits on the lid of the commode.
A little girl with a broken arm standing in her bathroom.
A man is skateboarding near the parked cars,
A stuffed animal dog sitting between to trash cans.
A refrigerator with magnets on it sitting beside a trash can.
A milking a cow in the middle of a  pen.
A black cat lying down on a laptop.
A red stop sign next to a brick building.
a sheep eating hay next to a log cabin.
a group of zebras on a farm in a field
Two people are in shallow water with horses.
A hot pizza on the table is loaded with pepperoni and cheese and sausage.
Two boys sit on chairs and play video games.
A lot of fishing boats have  a lot of men off loading their catch.
Group of people sitting in auditorium with a screen.
A person is holding two spoons over the sink.
A young boy who is surfing on a surfboard.
An assortment of computer devices resting on a large wooden table.
a fenced in park on a city street
a couple of zebras are grazing on some dead grass
A baseball player taking a swing at a ball
a boy on a skateboard is skateboarding on the ramp doing a trick
There are adult bears that is sitting in a den
A small cell phone sitting next to a glass of Pepsi.
A couple of bananas hanging from a metal hook.
a bag that is filled with pens and scissors
People sitting outside along a concrete wall on a sunny day.
A large tiger cat sits on a chair.
A man riding on the back of a motorcycle.
A bike parked out a store front with a lot of boxes.
a group of people under a tent celebrating something
The young men are playing a game of baseball.
A yellow door is detached from a refrigerator outside
A gyro and french fries with a drink displayed on a table.
Large hotel room with a king sized bed and large view of the ocean.
There is some chicken with cherry tomatoes and edamame.
A bed with white pillows next to a wall.
A person with a blue, red, and green plaid umbrella
A room with a bed, chairs and various boxes.
A bunch of little kids playing a game of soccer.
An airplane parked on a runway in the day time.
The children are fascinated with the making of the cake.
Ties of various sizes and colors are hanging on a portable shelf.
A cat is sitting on the floor while watching television.
Three different colored apples and a banana next to one another.
A boy playing tennis on a tennis court swings his racket.
A skateboarder up in the air over a snowy hill.
Man in grey uniform during a baseball game.
A woman standing next to two baby elephants.
Two bicycle riders are on a trail through the woods.
A cat sleeping on top of a brown chair in a yard.
A mostly empty train station with two trains ready to depart.
A man and a woman holding remote controllers in front of a television.
Sun is coming through a window in a living room.
A young boy riding a surfboard on a wave in the ocean.
A pretty little girl flying a kite on a lush green field.
A picture of a police man riding on a motorcycle.
A parking meter with a picture of a bicycle on it.
The thin pizza is sitting on the plate.
Tan suitcase behind a match magazine and CD.
Five sausage, egg, and cheese egg muffins.
A woman with pink hair walking next to a man with a suitcase.
A group of skiers pose on a snowy slope.
An outdoor table and chair setting on the curb
The old time fire engine joining the parade.
A tiled mosaic empty shower stall with bathroom mirror.
a desk with a laptop and a desktop on it
A young man on a skateboard maneuvers around traffic cones
a train on the railroad near a forested area
a person on a bike rides next to a city street
Skier on slope in alpine mountain area on sunny day.
a man in a suit carrying a drink and a red and white sign
A glass of alcohol sitting next to an open laptop computer.
This old wooden fishing boat appears to be permanently dry docked.
THERE ARE PIZZA THAT IS ON THE TABLE
A male tennis player bouncing a tennis ball.
People standing around in the street talking near buildings.
A bus and cars sit on a street.
A closeup of a empty boat surrounded by dark waters.
a black gray and white cat is sitting on a bookshelf
a black silver white blue red an orange parking meter and a hand flipping it off
pitcher with grey and white shirt throwing a pitch
A slice of pizza is on a round white plate.
three motorcycle riders some dry trees and a few green trees
Three people, one in a suit, are posing for the camera.
A toilet and trash can behind a wall in the bathroom
a group of people pose for a picture at a wedding
A man is sitting on a bench, taking in the city.
A man sitting on a white chair on top of a tennis court.
In the station people are standing and talking.
A man and some giraffe standing in a field.
A zebra with a left side pose while standing in a field.
The man is dressed in a suit and tie posing for a photo.
A man wearing eye glasses is staring at the camera in front of a room.
A white and blue vase with a peach rose in it.
a black and white photo of a tooth brush in a cup
A group of sheep and some birds in a fenced in area.
a small child is looking at the kite flying.
A yellow bus driving down a street next to a ball building.
A person with blue hair takes a photo of themselves.
An adult with a child riding skis down a small hill.
The group of three friends are sitting on a fallen tree in the woods.
A black cat laying on a parked car.
The man sits cross legged while typing on a laptop.
A couple of women riding on top of a blue motorcycle.
A clean looking bathroom has a white shower curtain.
An open door on a public transportation system.
A small clean simple bathroom contains a sink tub and toliet
A clock displays the time on a brick building
People walking on a beach, many carrying surfboards
a person riding a skate board at a skate park
There are two people and two motorcycles by a brick building.
A little girl out on the beach with a fish kite.
a little bedroom with some curtains blocking the window
A brown horse grazing in field behind a fence.
Hundreds of people cycling in front of several skyscrapers
this is a sandwich and french fries on a plate
A man is swinging a baseball bat on the field.
a close up of a cup with tooth brushes
a bunch of people sitting under a umbrella
A small dog buried in the covers of a bed.
Two man preparing their surfboards to go surfing.
Two giraffes standing in front of a wooden wall
A computer on a desk with two cds lying on top of the keyboard.
A small cat sitting on the edge of a toilet seat looking into the toilet.
Someone skiing down a hill on the ski slope.
A cartoon of a person surfing a big wave
Urban street with storefronts and parked trucks, on a rainy day.
A man takes a bite out of some sort of food.
A man riding on top of a wave on a surfboard.
A remote control lying on a wooden table
a gothic clock tower beneath a blue sky
Two skateboarders are riding on a slanted walkway.
A group of kites flying through a blue sky.
A ripe banana sitting on a table next to an apple.
Some passenger buses that are driving down the street.
A little boy is at a dining table in public.
A double decker bus driving down a street next to a tall building.
Pizza and appetizers with a side of ranch dipping sauce.
A large statue of an Italian chef wearing an orange tie.
a man standing at the beach in the water holding a kite
A group of people in a circle, while holding tennis rackets and standing on a hard surface tennis court.
A dog that is running on the grass with a Fribee in its mouth.
A clock that is sitting on the side of a tower.
A bed with a colorful blanket sitting under a picture.
A stop sign on a corner with water and snow covered mountains in the distance.
Snowboarder bundled up in winter clothing while on slope.
An old plane is sitting on a runway.
An old clock is seen on a foggy street.
A man cross country skiing through the woods.
A parrot sitting on a person's hand while eating fruit.
A group of eople binding over fastening their ski boots.
A young boy holding onto a parking meter.
An elephant roaming the grassy areas in his natural habitat.
A few people standing on a court playing tennis.
A couple of bears are outside, both on logs.
A spiral glass water feature showpieces a commercial bathroom.
A man attending to food by a pile of fruits and vegetables.
A baseball player with a mitt on one hand.
a person jumping a skate board in the air
A man who is eating a glazed doughnut.
A computer on a countertop with a tangle of cords behind it.
A tray of food consisting of vegetables meat and rice.
A unique style bed with red covers and a mirror behind it.
A large and a small teddy bear at the teddy bear museum.
A person water skiing behind a boat full of people.
A man on skis standing at the base of a mountain.
A container with a variety of vegetables, desserts, breads and other types of foods, with one spoon on top of the food items.
A pair of plush animals dressed in halloween costumes.
A black dog in a yard jumps up toward a yellow Frisbee.
A horse reflected in the surface of water
a toddler standing while holding onto a toilet and reaching for a towel
Two men with suitcases and a lady nearby.
A black and white dog curiously looking at something on a counter.
an adult and two children snow skiers snow and trees
A group of children running after a soccer ball
A blurry image of yellow flowers with a fence in the background.
The long meat and cheese sandwich is wrapped in plastic.
A man standing in front of his tv.
A train riding a group of people around.
This is a plate holding a double decker sandwich.
A giraffe grazing from a tall tree next to a rock.
A youth baseball team and their coach poses for a photo on the field
some people are sitting under umbrellas at the beach
A surfer carries his board through the snow, and rides a wave.
Apples and leaves on the ground with a cat in the background.
A dim living room with modern furniture and potted plants.
a big group of people that are standing under a shelter
A street sign on a pole on the side of the road.
a woman taking a picture of her microwave
A living room with everything in it labeled
people watching young boys playing a game of some sort
The young man is practicing his tricks on his skateboard.
Lambs in a sheltered place are eating and laying around.
A living room scene with the television and a Christmas tree.
A living room has two couches and a television.
Two men who are looking at a passenger jet.
A man sleeping in a bed with two cats.
A woman bending over holding and kissing her cat.
A person jumping in the air on a skateboard.
The stop light reads green, and there are two huge buildings in the back.
The back ends and legs of three elephants, including a baby, are seen on the side of a road.
A baseball player wearing the number thirteen at home plate.
Two large white commercial airliners on an airport runway.
Nested measuring cups and spoons on a gray surface.
A man playing with a Frisbee in a gym.
A crowd of people carrying umbrellas across a rain soaked street.
three fourths of a pizza with meats and vegetables on a pizza pan
A woman and a man standing with a horse in a boat and a dog laying next to it.
A dog is sitting on a counter in what looks like a factory setting.
A large tow truck drives down the street.
A silver, stainless steel refrigerator in a kitchen
A living room filled with furniture in front of a fire place.
A woman with green lace underwear is walking away as tennis balls are hanging all around her.
A photo of a dirty bathroom with a sink and toilet.
A gray bird perched on top of a tree branch.
An elderly man and a teen play video games together.
A keyboard and monitor on a wood desk
People sit and wait, looking at papers and phones.
Traffic light in a blank space with lit green light.
A woman at a table eating with two pizzas.
A man standing next to a woman holding an umbrella.
a man that is looking into a stove
A desk with ruler, whole punch and scissors on it
A bathroom that has a mirror in it.
A long empty road with an over pass bridge.
A group of young children sitting around a table eating food.
A man is skateboarding down a path next to some grass.
A child at a store display selling green bananas.
an adult feeding a baby some cake
a suit coat shirt and tie hanging on hooks
A zebra walking through a green field of grass.
Two photos containing food with hot dogs and pastries.
A picture of some delicious pizza ready to be eaten.
A small white and brown bird resting on a twig.
A herd of black cattle grazing on a lush green field.
A toilet in the bathroom with a wheel in the window.
A CUP OF COFFEE AND A PASTRY ON A TABLE
A black and white cat is sitting in front of fall foliage.
Two adult males enjoy playing a videogame together.
a living room with a low ceiling and it has a couple of couches
The personal sized pizza on the plate has many toppings.
A little toddler boy sleeping on his couch with a remote in his hand
a cat dressed with a collar and tie decorated with irish symbols
Little boy in boat with two halves of a banana in mouth.
a man on a pay phone holding his hand out to someone
A bunch of food sitting on a plate with a spoon
A giraffe leans its neck as it walks through the bush.
A toddler brushing her teeth with an electronic toothbrush.
Little girl with a group of children watches a show.
The tray has fries, meats and vegetables.
A long row of train carts sitting in a yard of tracks.
a woman in a dress prepares to hit a tennis ball
there is a woman sitting on a couch holding a piece of cake
Several sausages cooking on a grill with glowing charcoal.
Black and white photo of a large clock located outside.
this person is doing his work on two computers
a dog and a person stand on an edge with a mountainn in the back ground
A table with plates of food and an orange on it
A hanging street sign that says Rockefeller Plaza.
Two people with remotes in a living room.
A couple of people on a field playing baseball.
an elephant resting in the water next to the shore area
Black and white photograph of a woman surrounded by pigeons on a city street
a couple of animals are standing in a field
some boats going down a tree lined canal
An overhead view of a lot containing many parked, empty buses.
A sign indicating Florida Avenue and another one stating the speed limit is 35.
A guy holding a piece of food up to his mouth.
an image of a dog with one paw out the window
A book is open and kept in front of a soft toy.
A disturbing doll sits next to a clock in a mirrored image.
people bicycling down a city street in daylight
A group of people sitting next to each other on a bench.
A person walking down a sidewalk carrying a back pack.
A reflection a person catching a frisbee in a mirror like object.
These military guy is celebrating something big with a nice cake.
a very big elephant with some clothes on carrying three people
Two bears coming out of the woods to a road.
Broccoli and a deep fried food lay on a black and white plate.
There are two elephants standing next to each other.
Three cows in a barn eating food off the ground.
A truck driving through and intersection waiting on a pedestrian to finish crossing the street.
A car at the light getting ready to go because the light is green.
A book setting on a green bench in a park.
A small herd of cows grazing along a path on the side of a hill.
Helmeted and uniformed military men travel together on horseback.
two men in suits standing next to each other
Three people are standing in front of a truck while another is in the background.
A person that is going out some candles.
A man and a dog on a skate board.
The serving counter of a restaurant is quiet.
A man tossing a teddy bear off the side of a bridge with a parachute.
Two men sitting at a table with a very large pizza.
A couple of ships in the water by some buildings.
A big building on some grassy field during the day.
a boy laying down on a surfboard in the water
A plate of food that has pita bread, green peppers and tomatoes.
A man that is sitting on a moped.
A picture of an oven with food baking inside.
A man riding down a snow covered ski slope on skis.
The travelers stare outside a tram as it approaches a giraffe standing by the roadside.
A man filling jugs with water from a bathroom sink.
A white cake with blueberries and oranges on top.
Leaves and purple flowers come out of a brown vase on a desk.
A toddler is running through a kitchen while some adults stand close by.
A cat laying on top of a wooden desk near a monitor.
A couple of people on a field with a Frisbee.
A view of the street signs "W 122 St.", "Seminary Row", and "Broadway" in front of an old red brick building.
A skier lifts their ski poles in the air on a slope, with other skiers nearby.
A protest sign painted like a stop sign stating "stop harper"
The face of a dairy cow in a pen.
a person ina field playing with a frisbee with trees nearby
There is a person flying a kite at the beach
A bowl of raw fruit on a table by a painting
A plane flying over waves and a small island.
A open suitcase containing shoes with a table on top.
A man poses with a pinwheel against the blue sky.
A modern looking living room in an apartment.
People walking down the stone sidewalk in the rain.
A group of people are around a dining table.
A lot of people that are in the street.
A zebra standing near a tree in a field
A man eating chocolate donuts and a woman smiling next to him.
A group of people posing around a woman holding a cake.
a person getting ready to swing on something
A pretty young lady carrying a white umbrella.
A flamboyant man wearing a tight green marching band uniform.
Someone has set up a make-shift photography workshop in the field.
A gray train is on a track on a hill near water.
Two giraffes stand back to back and eat leaves
Few persons are seen on zebra crossing on road and an elephant with a banner is there.
A small herd of cows stand in a high mountain meadow.
Two elephants are in the middle of a circus ring.
A person holding an unusually thin Chiquita banana.
The bathroom is in the process of being worked on.
A plate full of broccoli with fries and carrots
A horse is grazing in a grassy field with a view of mountains.
a wet black dog has some sand on its nose
A man sitting in a wheel chair under an umbrella on a busy street.
A white fire hydrant sitting outside a building with a mural painted on it.
A herd of animals standing in a large field.
A row of white toilets sitting on top of a dirt ground.
A tennis player is lifting is bending his leg off the ground and reaching his arm up in order to hit the ball.
A young boy in a wetsuit on a surfboard.
Two young women playing a game of soccer.
A plane sitting on a runway beside water.
There  are people skiing next to a dog.
A grey and white cat sitting in a sink
a person with a large afro and glasses
A man holding up a kite so it catches the wind.
A country scene has a rocky trail leading to a body of water.
A hand holding a bagel covered in almonds.
A girl taking a bite of a slice of pizza.
A middle-aged man in a suit with messy black hair.
barren clean white kitchen with white appliances and stainless steel sink
a plate full of vegetables with seasonings sprinkled on top
A young man in blue jacket riding skateboard in snow.
a hydrant in a place near some houses
A female tennis player jumping up to hit the ball.
An open laptop computer sitting on top of an office desk.
A picture of a building with a very nice clock.
There is a large plate of tomatoes and a pan of sliced tomatoes
A car that is sitting near a green street light.
A young woman using a cel phone, in a college tank top.
A sink and some counters in a small room.
A man riding a skateboard on top of a ramp.
A couple of young people standing in front of a TV.
A person putting some food on a white plate.
a person at a table with a plate of pizza
Bright blue train carriage awaiting passengers in Peru
Two slices of plain pizza are sitting on a plate.
A grey table with a white plate of food.
two people sitting on a bench near trees
a guy in the photo looks sad and dark
A group of people seated using cellphones, three ladies with handbags
The side view of a man with coffee casts a shadow as he ponders at his laptop
A large brown dog sitting next to a frisbee.
A woman in a white and green tennis dress setting up her shot.
Three women in a kitchen at a table full of food.
Two giraffes in their pen at the zoo.
Very pretty clock with the base surrounding by brick floor
A small transport truck with a white trailer.
An open faced sandwich, chips and sauce are on a plate.
White dish piled with ham slices and broccoli.
A train with lots of red cars traveling down tracks.
A man on the beach has a large umbrella.
A picture of a man in a green baseball uniform batting for his team.
two brown and white birds sitting on a roof
A herd of flamingo birds in the water near a construction site.
a boy in a baseball uniform standing in a field
A red and black long van is parked in a parking lot.
A giraffe next to the road on a safari ride.
A group of cups that are sitting on a table.
Biplane flying over blue ocean next to coastline.
There is snow on top of the snow board.
An open living room with hardwood floors and a vase of flowers
A picture of a bathroom with white tile walls and a window with white blinds.
A herd of sheep roam in the grass.
A sidewalk area with a red fire hydrant near a light pole
A slow moving subway train that is going down the track.
A woman hitting a tennis ball with her racquet.
A young child sitting on a leather couch holding a controller.
A restaurant is filled with many people and newspapers.
A couple of donuts are on the plate, ready to be eaten.
A stove top cooking in a pot and frying pan
A street sign with the name of a street on it, and next to it is a post with various names up and down the post.
close up of a bulding in the mirror of a vehicle
A living room with a fire place and lots of furniture.
a bunch of bright lighted signs and mopeds on a street
a man with a helmet is touching some food
A silver hippy van and a bus for vegans.
A Southwest airplane is parked on the runway.
a close up of a person holding two birds
many different clock on a shelf near a wall
The small bathroom has a glass shower door.
Train traveling through countryside near tall brick structure.
Two girls outside, one flying a kite and one sitting down.
A kitchen scene with focus on the sink and counter with vegetables.
A cat sitting on a windowsill next to a painted pumpkin.
A family on skis posing for a picture.
The big rig truck is parked in the parking lot.
A cat saying on a sofa with many pillows.
A group of animals grazing on grass in a field.
A man riding a snowboard down a snow covered ski slope.
two people on a beach next to a large body of water
A banana with a face written on it in front of a mirror.
A table topped with three plates of food.
An old ornamental building features many beautiful windows and a clock.
A light blue airliners is parked on the tarmac.
A man handing out slices of pizza to protesters
A bus and a car travelling in the same direction on a sunny day.
A computer mouse is placed next to a computer keyboard.
Somebody is sleeping in the bed next to the clock on the table.
a baseball player holding a bat on a field
A young bearded man holding a partially eaten hot dog.
Woman in white and black outfit on a tennis court.
A white bus driving down a street next to people.
A sofa with pillows next to floor and blue rug.
A smiling woman holds a banana up in the air.
A yellow tractor digging next to a yellow and red fire hydrant.
A snow boarder is going down an indoor slope.
a couple of trains sit parked as it overlooks a city
Group of people holding up umbrellas in front of cactus.
Female tennis player preparing to serve the ball
A man riding a skateboard doing a trick.
A double-decker Liverpool Street bus on a city street
A person cutting a pizza with toppings into slices
A large bird sitting on top of a metal spire.
A tennis player raising his racket to hit a ball.
A tray with food on a table
A person takes a picture with their cell phone.
A white toilet sitting in a bathroom next to a TP roller.
A stop sign that has some foreign words written on it.
A skateboarder rides his board in a concrete pool.
Soup presented in a bowl on a plate.
Two buses under a large open structure at a station.
A baby elephant reaching for grey bag at a zoo.
Multiple pictures of a man brushing his teeth.
A salad to be eaten with wooden chopsticks and a drink.
A couple of men in police uniforms sitting on horses.
Two children sitting in the grass eating food
A herd of elephants walking across a river.
A line of stuffed animals in a child's room.
Plate of food that are on top of a table.
A computer is sitting on a computer desk on the far side of the room.
A giraffe sticking its head through the rails of a wooden fence.
A white and black bus on street next to a building.
A woman talks on her cell phone as she skates down the sidewalk.
Two zebra and other animals grazing the grass.
a lady that has kids in her lap at the table
Two men jumping to catch a Frisbee while people watch them playing.
A man wearing all blue with an oil can walking around a train engine.
A young man walking down a sidewalk pulling his travel bag as others watch.
A street view of a protest and a woman with her fist raised.
a cat on a bed with dishes on top of it
A green and white airplane behind a fence.
A yellow fire hydrant on the side of the street.
A tall obelisk sitting next to a tall white building.
A herd of zebra standing behind a wire fence.
A cat relaxes in this tan leather chair.
A group of people standing around a table filled with fruits and vegetables.
A man with a racket walks on a court.
a person is playing tennis outside on a court
A man in a black jacket riding a skateboard on the street.
A group of young men standing next to each other on a field.
Cutting board with various fruits, utensils and spices.
A little dog is running around an outside shopping stand.
Two tangerines and a banana atop a blue plastic bowl.
Several traffic lights are seen near a busy highway.
A yellow and white train traveling down train tracks.
A group of people sitting on horses in a row.
A bench outdoors on a path near a fence.
An animal stands in grass on a hillside on a sunny day.
An almost empty box with a partially eating doughnut and a knife in it.
A clock tower next to a large building
LARGE SANDWICH CUT IN HALF ON A PLATE.
A lone train is parked on the train tracks at the station.
People skiing down a slope with many moguls.
two blue and white trains buildings and some wires
A street scene with people and cars on the street.
Young professional looking man looking at the camera.
A large lizard float is rising in the air.
A young girl with a tennis racket is in a parking lot.
a person wearing a dress and riding skis indoors
A WOMAN IS NEAR A CAMEL WITH A UMBRELLA
Box of cereal sitting next to a box of donuts.
A woman on a skateboard riding on the sidewalk
A red passenger bus makes its way past Big Ben in London.
A tooth brush in a blue glass sitting on a counter.
a person in a tie and suit sitting before a white plate of food and wine glass.
A man approaches an intersection in the rain.
A man sits at a table that has a surfboard propped against it.
People in business suits standing in front of a building.
Several elephants walking around grassy area in the wild.
A tennis player holding a a racket on a tennis court.
A single seagull swimming towards a rocky shore.
A man in beard and glasses with a red and white suit on.
A man skiing down the side of a snow covered slope.
A view inside a refrigerator that is completely packed with food.
A lamp post with traffic signal, street light and street signs.
Baseball players playing on a professional baseball field.
A spacious bathroom with lots of lower cabinets and a toilet.
an image of a guy that is walking by a train
Stop sign is above a red triangle sign next to a barb wire fence.
A hand holding a water bottle in front of a cat.
four plants being grown outside in a planter
A surf boarder stands as he rides a wave.
A museum display featuring professional baseball jersey and bat.
A zebra standing in grass next to trees.
The old man is talking on the phone.
An unmade single bed in an upstairs bedroom in the early afternoon
Young skate boarder doing a nearly vertical stunt
The cat lies next to a cat sitting inside of a sport bag.
A partly eaten pizza and a fork with wine on the table.
An older man sitting at a small table about to eat a slice of pizza.
A man surfing down a rushing rivers wave
A man walks a dog near a large bus.
A bike rack full of bikes and people every were.
a small yellow boat set in the water by large rocks
Giraffes in the wild on a sunny day
A woman stirring a large metal pot of food.
A pitcher preparing to throw a base ball.
a plate with half eaten food on it
A large white bear walking across a river.
A bike parked next to a parking meter on the side of a street.
A white bowl topped with a sandwich filled with meat and veggies.
two black horses are grazing on green grass in the field.
People milling about a bus terminal getting ready to board.
The cat is balancing on top of the door.
A man holding a white and black umbrella in a large parking lot.
people sitting on a bench facing the water.
Two men sitting down at a both eating .
Several people standing around and looking at a vintage plane.
A white plate topped with vegetables.covered in sauce.
A girl is riding a surfboard in the water.
A bright red motorcycle parked with other motorcycles beneath streetlights.
The street sign is in the middle of the flood waters.
A stuffed panda bear is sitting on a bench near a Buddha statue.
Small bird standing on rope near open ocean.
A plate topped with pasta, meat and broccoli.
A woman with a snowboard jumping in the air.
Cars are driving through the intersection underneath traffic signals.
A person holding a tennis racket on a tennis court.
two baseball players standing close to the base
A black and brown dog rests on a couch.
A black dog sitting in the middle of a bathroom.
a motorcycle with a boot on the back wheel
A woman and girl watching donuts being made through a window.
a scooter with a rifle bag parked in front of a fence
A nun sharing pizza with two young men.
A snowboarder is on a board and is jumping in the air.
A little boy in a inter tube at a water park.
Two black cooling rack shoaling pieces of pizza.
Room with patterned carpet and wallpaper and dark wood furnishings.
A door to a bedroom is open with a wooden dresser in view.
A car parked in a lot with a surf board strapped to the top.
A man, women, and child sitting at a table.
4 seagulls stand on rusty rods with people in a boat in the background.
A train is stopped in its tracks next to a building and cars.
Various items on white surface including a cellphone, keys and camera.
A monkey hanging from ropes eating bananas strung to it.
a group of children playing soccer on an open field
A rose, an entry way to a forest, a water fall and a lounge sign are in a series of photos.
a wedding cake with a picture on it
A man sitting on bed looking at a television and person in mirror.
A couple of large birds standing by some eggs.
A hotel bathroom has a granite vanity with a big mirror.
A young man is holding up a skateboard.
An umbrella is strapped to a blue bike.
A couple of pairs of skis in the snow.
A sliced panned pizza on a table ready to be served.
Three giraffes stand in front of blurry trees.
A green passenger train stops in a station to pick up passengers.
Three pedestrians crossing a street at a stop light.
a man with a beard a deer and a pink fire hydrant
Two male tennis players meeting at the net for a high five.
A very tall brick building sitting next to a traffic light.
A man jumps and reaches for a frisbee
A group of people sitting around a table.
Man holding up a plate with a brownie in the shape of a spaceship.
A small garden area features a few springs of growth and a small busy plant and a few bricks.
A traffic light with stormy skies in the background.
A girl thinks she is being funny while eating pizza.
The surfer is barely hanging on to his surfboard.
A stop sign that has spikes sticking out of it.
A plate of food that includes chicken and broccoli.
A white toilet in a very small bathroom.
A crowd of people watching a baseball game where a batter just hit a ball.
A person holds a pink frosted donut with jimmies.
some white black and brown sheep in their pen
A black truck is driving on an open sandy area
A snowboarder flying up in the air with the sun behind him.
a table with a blender and a glass on it
This young girl is learning to throw a frisbee.
a couple of geese are on the water
A row of boats on a river with trees in the background.
a red fire hydrant next to a stone brick wall
A man sits on a bed talking with hand gestures.
person walking down the sidewalk at night in rain
A giraffe standing next to trees on the plains.
Three horses on a green pasture with an old building in the background.
a couple of boats are sitting in the water
A man flying a double string kite in a large grassy area.
A man in a ball cap riding on a mule.
An old red truck is driving by the water.
An assortment of vegetables sit out on the cutting board.
The two zebra stand in a black and white photo.
A girl swings a raqcuet at a tennis ball.
A young girl sitting with a young boy at a table with food.
A silver and black train passing under a bridge.
A person that is going out in the water.
a large red bus is at a stop
an open kitchen and living room in a daylit house
A young boy standing on top of a rug in a living room.
The head and arm of a person flying a kite.
A busy market sports colorful umbrellas that shade the vendors.
A bird is perched atop a computer monitor.
A chocolate cake is being frosted with chocolate frosting.
A large dog has a collar with clock on it.
the truck is going up the hill in the snow
Adult with laptop with dog lying next to him.
A white plate topped with fries and sausages.
An old man n his computer in front of the fire.
A worker performs maintenance on a fire hydrant.
Some donut are on a round white plate.
a cat standing on some rocks next to some bushes
the bus is blue and is stopped. Some people are standing waiting for it
A man on a surfboard is riding a huge wave with his feet out and arms extended.
A living room filled with furniture and a rug.
Two small birds in a large green grassy field.
two older people stand next to a statue of a horse head
A boy in the stands of a baseball game biting into a hot dog.
A very pretty shallow stream in the woods.
A busy intersection in the city is full of people and signs.
A gray minivan on the curb at W 38th st in a big city.
Man playing racquetball about to hit a ball.
A couple of chairs sitting on top of the back of a truck.
Two men standing next to each other holding giant sugar donuts.
pepole eating at a restaurannt meat and veggies
A close up of a parking meter by a parked car.
A living room with windows all around it .
A photo of an old clock tower next to some buildings.
Several stuffed animals sitting in wooden boxes outside.
Several long boarders are riding long boards down a quiet street.
The animals are roaming in the backyard outside int he grass.
A kitchen with furniture and decor in it.
Crowd of people standing around while someone flies a kite
A man watches another man that has numerous bananas on his head.
a fake mouse is in a box of doughnuts
A sink with several faucets and a large circular basin.
Two park benches with one man sitting in woods.
A large white bed covered in two white pillows.
A group of people that are sitting on benches.
A living room features a wood ceiling, stone fireplace and large glass window.
a desk with a monitor keyboard and mouse
A young boy about to hit a large ball with a large baseball-like bat
A woman stands behind a cake and baking decorations.
The dining table and chairs are outside the small kitchen.
A cat is sitting in front of some steps
a male skateboarder in a black shirt doing a trick
An elephant moves is gesturing toward a bus.
Many people are sitting around tables with dinner plates on them.
Two men standing in a living room holding Wii controllers in their hands.
A person walking on the shore with a surfboard under their arm.
A kite made like an airplane flying above several American flags.
an image of two bags set on a hotel bed
Keyboard, sunglasses, book, pen, and various items on a table.
Collection of books scattered all over a bed.
This blurry picture has a male in a suit in it.
a woman wearing a cowboy hat face to face with a horse
A beautiful young lady sitting on a park next next to an old man.
A yellow and red fire hydrant in a yard.
A little boy is waving at the runway as a plane is sitting waiting for takeoff.
a lit candle sitting next to a plate filled with food
a picture of a hang glider on a beach
a clock that has two figures sitting on a mantle
A male and a female holding up their cellphones
An empty bench is on the curb side of a grassy area.
A man in casual wear holding a baseball type bat.
A close up of a person's hand with a scissors cutting something wet.
Wine glasses sit in a row on a wooden ledge
A large grassy field with giraffes and a few other animals.
Smiling child with a tooth brush in hand.
Pitcher at mound throwing ball to baseman near runner and umpire.
A luggage cart stacked with a very tall pile of luggage.
a road with many traffic lights and cars driving
Two people flying a kite in a park
Wildlife standing near water area in natural setting.
Two businessmen talk over a cup of coffee.
A bunch of construction barriers near an old, worn down building.
A flock of birds flying over a body of water.
A tennis player hitting the ball with the racket.
An overturned skateboard lying on a grassy field.
Two boys are playing catch with a frisbee.
A black and white zebra grazing on grass.
A cat sitting on top of a chair.
A clock that is in between two windows on a building.
A stop sign set on the inner curve of a curving dirt road.
A cat sitting on the floor by three shoes.
A man holding a ski board and parasail rope.
A tennis doubles team with one player in the air, her racquet in motion.
A person standing on a surfboard riding a wave.
Seagull in the sand near a boat launch
A public restroom with focus on two urinals.
an image of a man carrying luggage in a cart
A CGI man sitting on top of a CGI hospital bed.
Child sitting down in a chair eating a sandwich.
The cat is looking at the television screen.
A small child stands in a shopping cart with an umbrella.
A plain piece of bread resting on a wooden plate.
A train door from the inside of the car with exit signs and grab bars.
The man is sitting on the bench typing on his laptop
a train car sits parked as people stand next to it
Food trucks serve customers in the parking lot at the event.
A man and woman seated at a table in a restaurant.
A number of train tracks with a train on it
A plate full of spinach salad with dressing
a couple of people that are playing with a Frisbee
A wooden stand with many types of fruit.
A white plate holding two pieces of cake on a table.
A giraffe handler training a giraffe at a zoo.
A small sink area is packed with items.
A pole with many different stop lights in different directions.
A tree filled with lots of fruit and leaves.
A brown horse standing on a lush green field.
A brown towel that is sitting on a tub next to a toilet.
A blue bowl containing various fruits such as apples and bananas.
A bear sits next to another bear on a white blanket
A dish of vegetables mixed together in a bowl.
A boy is doing a trick on his skateboard.
A large bus on a open city street.
A clock with a colorful drawing on it.
a person riding a surf board in a body of water
Caucasian and African-American business men standing in line to buy 'Po-Boys from a catering truck.
An old picture shows a man up to bat on home plate.
A plate of bread , eggs , and bacon .
Many people and a few cows are spending some time in the water and on the shore.
A man and a woman cut a cake together.
A bear walks in the bushes and plants in the wild.
A man is riding a horse in front of several buildings.
a basket of apples oranges and avacado on a table
The cup that contains a toothbrush, toothpastes are placed next to the mirror.
A red car and red motorcycle parked at a curb near a woman walking with an umbrella.
A toddler is sitting in the bathroom sink playing with toothbrushes.
A woman posing next to a double layer stack of donuts.
Man intercepts man over a game of frisbee
Man surfing on an ocean wave in the summer time.
An aerial view of a city and waterway with ships in the water and a bridge.
A bunch of books that are lined next to a clock.
A man in a blue suit eating a hot dog in a gym.
Pink and white flowers planted in an outside area.
A skateboarder rides a ramp in a skate park.
A dog laying on a couch with a Frisbee.
The drink in the glass is garnished with toothpicks and rosemary.
a man dangling in the air over the ocean
A chocolate and ice-cream dessert in a restaurant
a single person walking the beach with a dog
The train is traveling down the tracks by the station.
A room filled with computers and laptops on a desk.
A group of people sitting around a table with clutter on top of it.
A bedroom containing a bed without sheets and a dresser.
A couch is made into a bed in a room with a desk.
A player at bat in a baseball game.
A boy is flying a kite on the beach.
a woman in a gray top is cooking outdoors
A man sitting with his back to a dining table, with a laptop on his lap.
A young guy standing by a tree while playing outdoor activities.
A couple of white parrots perched on top of a tree branch.
A man holding up a tennis racket as he coughs into his arm .
A young man is tilting a skateboard up with his feet.
A woman is sitting on the curb with a decorated parking meter.
A frozen pizza box with the cooked pizza lying next to it.
A brown vase sitting inside of rocks next to a set of green plants.
A half eaten sandwich is wrapped in white paper.
A bald man with a mustache wearing a suit.
This is a yellow and blue double decker bus.
A picture of a wooden hedge hog clock with a price tag of twelve dollars.
Four people with a birthday cake on a table.
young children getting healthy food from a table.
Some street signs point directions to various places
A Water Dept sign is placed in front of the fire hydrant.
A plate has beef on it near a glass of wine.
A cat looks back over its shoulder while laying on top of a fuzzy white blanket.
A train engine carrying carts down a track to a station.
The electrical components of an oven are being tested with a multimeter.
Young man wearing shorts throws a frisbee among trees.
The woman is playing with a wii controller
A person on a skateboard does a trick in a bowl.
A man that is holding a knife and a pot with broccoli.
The Halloween display includes a spiderweb and lots of pumpkins.
There is a neatly made bed in a bedroom of a log cabin.
White swans swimming in a harbor with docked boats.
A novel is on the seat of a green metal bench.
A man standing and posing for a pic in formal wear.
A few airplanes on the runway at the airport
Baseball team holding batting practice on the field
A man with glasses talking into a microphone.
There is a hanging clock in the hallway of the home.
THERE IS A GIRAFEE THAT IS WALKING IN THE WOODS
A surfer takes a ride on a wave near a mountain.
A freshly baked pizza resting on a table.
A man sitting at a table with a glass of juice in his hand.
a kitchen with a counter some chairs and a sink
A woman riding a skateboard in the street behind a man on a bicycle.
a couple of guys that have emt equipment
A man wearing skis holding two ski pose on top of a snow covered slope.
A small potted bonsai plant is on the floor getting licked by a cat.
A parking meter that has a blonde wig on it
A large truck driving down a busy road with the back full if dirt.
A clown talking on a  phone next to a building.
A tennis player on the court holding a tennis racket.
A plate of fries and a hot dog sandwich.
A double decker bus stopped at a bus stop.
TWO BUSINESS MEN WITH TIES ON CONVERSING OUTSIDE A BUILDING
A bed sitting in a room next to a wooden door.
A view of a person's hand on a computer mouse.
A park bench in the woods with a bag on it
A cat is looking out of the window.
four giraffes standing in a field 2 are facing forwards
A pile of oranges sitting inside of a basket.
On this table there are mugs of hot chocolate with shapes and half eaten donuts on plates.
a teddy bear nailed to a tree suspended above garbage
A room with a toilet, a door and shoes in it.
A public bathroom area with orange tile walls.
A watch and class with a beverage sitting on a wooden table.
Room with many hanging clothes, a bed and dresser.
A photo of a group of bikes behind a bus.
Pendant lights illuminate a bathroom sink for two.
A prepared pizza is sitting on an appliance.
three people standing in a room and eating food.
A plate that has a sandwich and a bowl of fruits on it.
A woman posing with a bat and wearing a batting helmet.
A man wearing a white lab coat walking a cow down a field.
A pony grazing on grass in front of a lighthouse.
Man spreading peanut butter on an English muffin
A train is traveling down a track in the middle of an arid plain.
A woman is playing Frisbee with two dogs.
a vintage photo of man standing in the middle of some waves
Personal pan pizza on a wooden table top
A woman is standing in front of a stove
A man stares at a cake with candles
A surfer is riding a yellow surf board as he hits the waves.
Several cars driving towards a public market.
a traffic light and a street sign on poles
A church steeple rising high in the sky.
A jet airplane flying in the daytime sky.
A tablet sits on a table with two pizzas.
People walking down a sidewalk on a street.
A yellow fire hydrant near a grassy field.
a man in a black jacket is holding a hot dog with mustard
Two friends are eating an extremely large pizza.
There is a toilet and a bathtub in a bathroom.
A guy skateboarding indoors in front of a crowd of people.
A cat that is looking out of a window.
there is a woman holding a baby and a pizza pie on the table
A desk with laptop, mug, paper and a monitor.
A woman with a tennis raquet prepared to hit the ball.
Three shelf deli display case with bottle beverages on top.
The kitchen has five beams running across the ceiling.
A woman grabbing a piece of cake off the top of a plate.
A sandwich and a salad are on a tray on a wooden table.
a kid stands on a hillside while flying a kite
A man in a red snow jacket is on skis.
A woman standing next to a little girl playing a game on  Nintendo Wii.
a woman is playing tennis on a court
A crowd of people standing outside of a brown brick building.
A woman is preparing to bite into a sandwich.
A young boy tying paper kites to a string stretched across a room.
A dog that is sitting down by a bench.
An airplane sitting on the runway in the snow.
A man about to put a leash on a large cow.
People are lined up along a train station waiting for a train.
Young man on a skateboard approaching a street.
A red and white fire hydrant on a sidewalk at the park.
A man and woman walking across the lawn carrying an umbrella.
a person riding a motorcycle on a city street
a young kid performs a trick on a skate board
Men playing soccer on a field at night.
A couple of women riding skis on top of snow covered ground.
Two ladies using the Nintendo Wii in a living room.
Some vegetables on the ground are in planters.
The clock on the side of the building is also a sculpture.
She appears to be hanging on the street sign.
A giraffe on  a large plain with herd animals in the background.
A man is about to swing a baseball bat.
A child is in the snow with one ski on and one off.
The bed red couch from the Mc Donalds commercial sitting in a living room with a fireplace next to it.
A sign above a white stove and refrigerator next to it.
Some babies playing in the bath tub one holding a tooth brush.
A man drives by on a person holds onto a ladder below an airplane
A lone zebra standing in the middle of a field.
A woman is playing tennis on a fenced outdoor court.
Well decorated restroom with sink and chair for sitting.
Skateboarder in the motion of turning on his skateboard.
A large grey horse is behind a wooden fence.
a mountain with a bunch of animals next to it
A dozen people smiling for the camera at a large wooden table in a restaurant.
A black dog laying on top of a rug on a hardwood floor.
Two giraffes in an enclosure are bent over peering at visitors.
A man riding a skateboard up the side of a ramp.
A little girl that is sitting in front of a laptop.
A professional female tennis player engaged in competition on grass.
Two computers are sitting on a brown desk.
Beach umbrellas and chairs next to each other.
A person who is on a barrel on a snowboard.
A man standing outside holding a sausage dog in his hand beside the food stand.
the mirror is showing a picture of the microwave in the kitchen
a man is making some food in a kitchen
A snow filled street with a stop sign on the corner.
This is a wide perspective of a room in a region.
a close up of slices of pizza on a plate
A pair of boats stacked up on a beach.
A man doing a jump on a skateboard
A bunch of cars driving through down town New York City.
A group of people are flying kites in a field.
A large airplane sits on the runway at the airport.
A group of people sitting down at a dining room table next to dishes.
A group of young people playing a game of soccer.
A guy riding the an incoming wave on a surfboard
a little bird sitting on a ledge as it looks at the window
green peppers red peppers a tomato corn and hot peppers
Biathelete skiing forward with her rifle on her back.
Zebras racing each other in their zoo enclosure
A beautiful young bride standing next to a her husband as they prepare to cut a cake.
Pizza with pepperoni, mushrooms, olives and sausage on a pizza pan.
Bicycles and a motorcycle parked on a city sidewalk.
A loading truck carrying boxes and a Stop sign
A man jumping up to catch a frisbee
A living room scene with a large window.
a close up of a doughnut covered in sprinkles
A person with a bike and a dog on a leash, boarding a train.
a close up of a person wearing a shirt and bow tie
A kite flying over a sandy brown beach.
A group of people enjoying a day at the beach.
A kid standing in the dirt with some fruit.
A group of people are together in the snow on skis.
a person in an open area flying a kite in the sky
a train on a train station and people walking near by
An old fire hydrant casts a shadow on the sidewalk.
A man driving a yellow car on the road
A kid is playing on some toy drums
A street with many signs on the corner
A merry go round with lots of colorful giraffe and other animals.
People are playing ultimate frisbee and someone is about to catch it
A woman that is sitting on a bike.
Two women trying to compete for a Frisbee during a game.
The child in the black helmet is swinging at a tee ball stand.
a public transit bus on a city street with people near by
A laptop computer sitting on a cluttered desk.
A pedestrian sign has been devised in comic fashion.
A man wearing a pair of glasses and a tie.
Two dogs near a carry-on bag on a tile floor.
A dog wearing a bandana rides a skateboard.
Mini pizzas on shelves waiting to be bake.
there is a white toilet that is broken on the street
A messy bed in a room with large glass windows.
A black and white view of a clock tower with a ferris wheel in back.
People at an outdoor market under a canopy.
A computer monitor in a home style office
A woman reaches out to pet a giraffe who stands in confinement with his companion behind a fence.
A man on a cell phone resting his legs on his luggage
A black and white photo of a man walking around with an umbrella.
a tall giraffe standing in front of a wood fence
A clock tower with lighted clock faces, against a twilight sky.
The sign on the sidewalk shows a U turn.
Two people are aiming controllers at the television set while other sit on the sofa watching.
A kitchen with steel dishwasher, refrigerator, cabinets and microwave.
Two men play Frisbee in the sand while others watch.
A brown and black dog laying on top of a wooden seat.
A blurry image of a knife cutting into frosted cake.
A man in surf gear walking down a crowded street.
A man in a warehouse riding some moving object.
people flying very high and waving their hands
A man cross country skiing in the country.
a building with some really big and fancy clocks on the side of it
A close up of a giraffe with its face against a pole.
A person flies a kite in a field.
Some cooked vegetables are sitting on a plate.
A plate of food and some cups of drink on a table.
Several slices of pepperoni pizza sliced into squares.
A large fed ex plane flying over mountains.
A banana, tomato and apple laying on a desk
The laptop is connected to a full size keyboard to make an effective work station.
A giraffe is standing in the bushes and tilting its head.
A man riding a skateboard while flying over a board.
a broken up DVD in front of a keyboard
Two people in a room playing a game of Wii.
a white horse sniffing the hand of a person in front of them
Some chopped vegetables layed out on a pan
A woman throwing a tennis ball up in the air to serve it.
A baby elephant following an adult elephant by a fence.
Small group of people playing video games in a living room.
A woman in pink dress playing a game of tennis with people in background.
A group of airplanes fly through the sky.
Skateboarder in purple shirt riding on top of his board.
Several young Asian people are snowboarding and skiing.
A kitchen with appliances that include a sink, dishwasher and a refrigerator.
Four giraffes are standing next to a bare tree.
Three beds with clothes laying folded on each one.
A man has his neck covered by clothing.
The warning sign is below two street name signs.
A group of people watching kites being flown in a park.
A man and a woman eating donuts and having drinks.
A jockey sitting on the back of a horse
a red and white sign in front of a white house
A man rides a donkey pulling a trailer of hay
A variety of sandwiches on a table with photos on it.
A dog with it's nose on a couch and an open laptop
A young boy is sitting on the wooden bench.
Clouds loom over the city skyline with a clocktower in the front.
A group of men holding cell phones down at their waists.
A guy that has a burrito in his hand and is eating the burrito.
Stuffed animals are sitting on top of bookcases.
A woman playing a game of tennis on a tennis court.
A close up photo of a baked food in a pan on a stove.
A white bowl filled with different colored vegetables.
A pile of carrots and broccoli next to green onion.
a surfer in a wet suit is surfing on a white board
A lady wearing a hat talking on a cell phone.
A female equestrian is riding her horse in a show arena.
A little dog sitting on a wooden bench.
A single skier is the only person for miles of flat snow.
Someone is displaying a colorful pinstripe wallpaper on a cell phone.
A very rusty old car near some pretty flowers.
A group of men sitting next to each other holding cell phones.
A cat that is laying down on a couch next to a remote.
A man is jumping and guarding in mid air while another guy is throwing the frisbee.
A bear laying inside a decaying mass of some sort.
Two train cars are beneath some trees on the top of an incline.
A man is doing a trick on a skateboard.
A laptop computer is on a table in a nice back yard.
A jar of food on a wooden table.
Thee people stand in a lot while one holds an umbrella.
TWO BALL PLAYERS ON THE FIELD, ONE RUNNING TO BASE
Men standing and one pointing to an object on a street.
A man swinging a baseball bat as another looks on.
The baseball team getting ready to walk off the field.
A small kitten walking on a laptop keyboard.
A CITY HAS A CLOCK ON ITS BUILDING
Train that is very aerodynamic in its appearance
A person wearing skis, standing in the snow.
Two plates of broccoli are sitting next to each other.
A woman sitting at a table across from an entree of beef.
A line of food trucks parked on a city street.
A yellow commuter train pulling into a station.
A large picture of a man with a mustache and a bird on his shoulder.
A large group of people at a table using laptops.
Bird sitting atop a wooden railing among the trees.
A guy holding a cellphone from a display.
A view of a street with multiple store fronts.
A woman helping a small child on snow skis.
a plate filled with assored meat, some fruit and veggiesm and a roll
A person crouched over on open lid toilet
A man using scissors to cut white paper.
Look at how high the snowboarder is in the air.
A large yellow and brown boat floating on a body of water.
close up of a large stuffed pasta shell and vegetables on a plate
A line of bicycles beside a street where a bus is stopping for passengers.
A ginger cat sits and looks out a window
A holiday cake with holly designs on it.
A woman feeding a giraffe under a tent.
A clock on the side of a church tower.
an image of a girl walking on the sand on the beach
A promotional photograph of professional MLB player Travis Buck.
There are several hot dogs on this plate along with two sides.
Baseball players are watching as a hitter hits a baseball.
Several pictures of someone baking using an old school outdoor wood fired stove.
A duck swims along a large body of water.
Large group of motorcycle riders coming down the street with flags.
A horse is walking down the street alone.
a small child is playing in a field
A blender and a glass on a counter top.
A red stops sign stands on a grassy island that has grass and is near a street.
A baseball player up to bat during a baseball game.
a multi-colored boat with tents sitting on the water
A break room with a sink and a microwave.
A couple of toilets sitting in a  bathroom.
A locomotive on tacks with smoke coming out of it's stack.
A group of baseball players standing on top of a field.
Small herd of sheep walking and grazing in fenced farm field.
Sheep are grazing on fresh leafy vegetables that have been given to them.
An incoming train is approaching a railroad crossing.
A SURFING BOARD STAND WITH A PERSON STANDING NEAR BY.
A boy that is holding a bat in the grass.
A family gathered around a dinner table getting plates of food.
An older man is holding luggage outside a transport center
A girl is standing next to a horse.
A giraffe stands next to a lone tree in a grassy area.
A white building sitting below a brown tile roof.
Woman in center of dirt intersection holding pink umbrella.
A pizza is shown displayed on a plate.
A green road sign with a bike painted on it.
Someone holds a bottle of mayonnaise near a hashbrown sandwich.
a young person riding a skate board on a wooden surface
Two cows with heads through bars eating hay.
Two large elephants walking behind a wire fence on green grass.
A towel rack in a bathroom topped with two stuffed animals.
Woman in bathing suit sitting on a beach chair, drinking a soda.
Two sheep in a  grassy field with a rabbit nearby
The person in the bodysuit is surfing a wave.
A small plane flying through a blue sky.
a old jar that is sitting on the ground
Many pedestrians are navigating around a street corner
a man in a suit standing in an office
An orange cat is sitting on a bag.
A landscape photo of a large swimming pool area.
A cat outside a window looking at a Buddha statue.
A batter has just hit the ball but has not dropped the bat yet to run.
Trolleys in the mountains travel through the snow.
A photo of a woman sitting on a train on her cell phone.
A plane is parked and being examined by several men.
A group of skateboarders atop a concrete surface.
A man with sunglasses dressed in a suit and tie
There is a baseball game going on, the hitter is about to hit the ball.
People look on as an airborne snowboarder competes.
a glass wall to a shower in a bathroom
A water skier holds on to a rope being towed by a boat
An unmade bed in front of a poster on the wall.
Three people on horse back at a rural road intersection.
A woman walking around a living room next to a TV.
A douhnut and coffee are on a table.
A person covered with snow on the mountain with skis
Three women sit on the beach with two of them holding onto some umbrellas.
A woman in a red bandana slicing a banana.
A man is paddle surfing alongside his dog.
A plate full of half eaten food with utensils.
a person in red is snowboarding on a hill
a dog sits in front of a window on a bed
A classic building in the background frames a stoplight.
A group of men standing on top of a baseball field.
Many people sitting under umbrellas on a sunny beach
Two zebras standing by a log in a grassy field while people in a car watch.
A woman on a court swinging a tennis racket.
A person in a red shirt is riding a skateboard.
Three different vases are on a shelf.
A woman in a red dress talking on the phone.
Carrots fresh from the ground with dirt and gardening gloves
A fireman is getting water out of a boot.
Two men are holding video game controllers preparing to play.
Some guys are watching two others playing the Wii.
A young person in plaid doing snowboard tricks
a iced cake that has been cut up with a server resting on the plate next to it
The living room looks into a small, well organized bedroom.
Cooked broccoli and beans are a side dish.
A group of people standing in the sand with a kite.
A group of people sitting around a wooden table in front of a projection screen.
A person looks on as two other people prepare to fly a kite.
this is a person flying a kite in the water
Man looking at a screen while holding a Wii controller in his hand.
A man with a tennis ball sticking out of his skull.
A donut factory with donuts on a conveyor belt
A building with a clock tower and a light blue roof.
A store shelf filled with different heart shaped boxes.
A man is smiling as he eats his passover dinner.
A city as the sun sets with a gas station next to a traffic light.
Model car sitting on a table next to a slice of chocolate cake.
A man and his shadow on a red tennis court while the man swings a tennis racket.
A birthday party for a baby with it's parents
There is a big room with furniture and items inside.
Two wine glasses sitting on top of a table.
Two zebras face each other and graze an open field.
an  image of a guy that is on skiis
A little girl riding a pair of skis on top of a conveyor belt.
A pastrami sandwich being held by someone
A fanciful dressed piece of pizza on a plate.
A small Frisbee is lying in the water.
A man holding a small white dog while wearing a black hat.
some white birds flying over very long grass
A red double decker bus parked near a curb
Two computers are side by side on a desk.
barefoot little boy holding a hairbrush in his hand
A boy throwing out a pitch in a ball game.
Some sport players are competing in the Frisbee game and having fun.
A train platform with passengers and two stationary trains.
Two horses trot on a field with their handlers.
A white cow makes a face as he stands near a stone wall.
An open top double decker bus driving down a street.
a desk with a laptop and a monitor and keyboard on top
A giraffe standing in an open field next to some rocks.
A group of three people sitting on a couch.
A vase with ref flowers in it on a table.
Poised to slice into an iced multi-layer cake.
A bench next to a small pond with a white bird standing in the water.
A black cat underneath a umbrella in a room.
There are many birds flying near the boat.
A room with a wooden desk and matching shelves
A clean and tidy kitchen counter with nothing on the counter.
A couple of girls standing in a livin groom holding Wii controllers.
Inside a restroom stall, a rag floats in the toilet water.
A closeup of a deep dish pizza in a restaurant,
Seven vases sit displayed on top of pedestals.
Men are in a life raft which is beside a ship.
A large boat with people on the back in the water.
A giraffe bust hanging by a Rain Forest Cafe Sign.
a man that is skateboarding on a ramp
some forks people and a white cake
Man riding on the back of a painted elephant.
Two women with clear umbrellas stand near two people in uniforms near a building with a thatched roof.
A motorcycle parked in front of green doors.
a plate holding a slice of broccoli pizza next to a bottle of beer
Two mean getting ready to hug each other while standing in a classroom.
A young man preparing to throw a frisbee.
A man on a surfboard surfing in the ocean.
A close-up picture of some food on paper plates
A baseball player at home plate with a crowd of onlookers watching
Some children are playing game in the room.
Twin beds with pillows, and a lamp and vase
A table full with a display of cupcakes and donuts.
A chicken sandwich and french fries are on this plate.
A vintage tennis team posing together on the court.
A group of trucks on a mountain side trail just sitting there.
Someone skateboarding in the park and doing a trick in the air.
a cat that is laying down on some carrots
A black and tan dog laying peacefully on a sofa
A pole with a lot of street light signs on it.
A table with many fruits and vegetables, including carrots, potatoes, squash and apples to name a few.
The brown dog is waiting for his owner to play frisbee.
A shot from the crowd of a player during a tennis match.
The train is stopped on the tracks to pick up passengers.
Some hotdogs and plates are on a table.
Black train cars on tracks next to trees.
A man with a helmet on, on skis at the top of a slope.
A vase with yellow flowers sits upon a red and blue table cloth.
An office desk with several monitors and birthday balloons
A sleepy dog wearing a cowboy hat in the back seat of a car
A truck hauls a group of tractors down the road.
a large clock resting on a poll by some trees
An Italian meal with marinara sauce served on a long tray.
A town square with a statue in the middle.
Large variety of fruits and vegetables on display at a market.
The complete perspective of a washroom with numerous things to see.
An object that looks like a dog sitting by a miniature cell phone.
A MAN IS PACKING UP SKIES ON THE SNOW LAND
A group of people holding candles on a sidewalk in the snow.
a big yellow school bus shown through the rear view of another school bus
A woman on the phone standing in the kitchen with her mouth open
A bathroom with shower, sink and a mirror.
Player and referee at tennis match on red court.
a man holding a bat gets ready to swing it
A BATHROOM THAT IS IN SERIOUS NEED OF A REMODEL
Father, mother, and young son playing in the water.
A man in a tennis match is swinging his tennis racket.
A cow resting on the side of the road.
A dark bathroom with a white bathtub and a white toilet.
The street sign has numerous street names on it.
A giraffe walking through a zoo type enclosure.
A stuffed monkey sitting alone on a bench.
A guy doing tricks on his kate board
A red table topped with two plates with slices of pizza.
A man and a woman with three dogs read the menu outside of the deli.
A group of people walking down the street in what appears to be a marketplace.
A red box on a pole with a solar panel on top.
A white plate holding a sandwich and fried potatoes.
A train stopped in a station with people walking towards it with luggage.
Lady standing in front of two couches with a remote control in her hand.
A bunch of airplanes are parked on the runway.
Several small white boats on the open water.
A couple of people on surfboards in the water.
A view of a restroom urinal covered in filth.
the start of a broccoli stalk in the garden
A toilet in front of a window, and next to the shower are shown
A person on a field swinging a baseball bat.
A cat that is cleaning its paws while sitting on a suitcase.
A bear lays on a pile of food
A pair of giraffes standing in a pen at a zoo.
Three men stand in front of a beige building and the man in the middle who wears a hat holds a white Frisbee.
Young girl gets ready to blow out candles as family watches
A young man doing a skateboard trick while others watch.
A boy is eating a slice of pizza at a table.
A man holds scissors to his protruding tongue, as if to cut it off.
a sign for Bras Basah Road next to a pedestrian stopwalk
A man standing in a field holding a small parachute.
A microwave oven mounted into the side of a wall.
The city street is quiet this time of night.
A young boy standing on the top of a sky slope.
Several kites of different colors laying on the sand on the beach.
Workers in uniforms next to a truck and construction equipment
A bowl of vegetables with a silver spoon.
People sitting at a table and eating soup.
An orange and white cat chasing a feather
An Australian Shepherd herds cattle in a pen.
Doorway view of a bathroom with a toilet and window.
Two women standing under an umbrella having a conversation.
A picture of some people playing with a frisbee.
A cat playing with a shoelace of a tennis shoe.
A man riding down a snow covered ski slope on skis.
Two donkeys are standing together.  One is facing out and the other one has his head bent.
A cat is lying in a houseplant on the window sill
A couple of glass items that are in a room.
A plate filled with lots of different types of food.
The cow is hoping for a way out of the fence.
a small white and red plane parked at an airport
Three men standing together while on of them handing another one a frisbee.
A female surfer stands on her board in the water.
The extra long passenger bus is entering the intersection.
A dog is crouched down beside a toilet looking up at the paper.
A bicycle is parked between a welcome sign and a street light.
A close up of two teddy bears hanging from two strings on a hook.
four sheep grazing in a open snow pack
Two men are seen eating something standing on the street
a large building with people outside looking around
a bed room that has a couple of beds in it
this is several zebras in the grass running
A girl laying down on the couch holding something in her hand.
A railroad train pulled into the station with people boarding
A family riding on the back of an elephant across a field.
A stop sign affixed to a cyprus tree in a body of water.
A bed made up with flowered comforter  in a room with two windows.
A group of Asian people seated around a restaurant table.
Several people in ski gear standing in the snow and in front of trees.
one brown cow and one black cow standing in mud
A large open living room with a decorative rug.
A train moving along a track outside during the day.
A man is holding an apple in an advertisement.
A giraffe and a zebra grazing the grass.
A desktop and a laptop on a desk.
A number of signs hanging from buildings.
A group of people sitting around a restaurant table.
A person on a field with a baseball bat.
A jar filled with different types of fruit on a table.
A giraffe is standing in a grassy field.
A bowl of food is sitting on a table beside a glass of wine.
Three men holding snowboards on top of a mountain
A lone cow walking in a large field near houses.
a toilet a bathtub a rack bottles and a shower curtain
THERE IS A MAN THAT IS PLAYING BASE BALL ON THE FIELD
A skateboarder with a hat is skating down a ramp.
A group of kids at a skateboard park doing tricks
A small boy on a guys lap with a toy guitar.
A parking meter sits in the foreground before a church and other large buildings.
Laptops, keyboards, and other computer equipment on display.
A short boy with a penguin backpack stares at a large bear in the zoo.
A cat laying on top of a laptop computer.
A cat is sitting on a wooden surface behind a vase of flowers.
A large propeller airplane flying through a blue sky.
A baseball player getting ready to hit  with a catcher and umpire at a game.
Barrack Obama eating a hot dog with his young blond boy toy.
there is a plate that has meat and rice on it
Tennis players stand together for a group photo.
A smaller giraffe is standing in the green grass.
A wooden bathroom with a wooden toilet next to a window.
A man stands with a tennis racket on turf.
Two gulls perch on a mossy concrete wall overlooking the sea.
Two little boys sitting at a restaurant table with an adult.
A man holding a cabinet in a kitchen.
There is a mountain behind the light house.
A young woman taking a picture with her phone.
A train is parked near a platform at the station.
a couple of buses parked behind the other in the street outside some buildings
A boy, three dogs and a frisbee in a dried up creek bed
Many people are walking around the dock near numerous ships.
A living room filled with furniture and a large TV.
A white stove top oven with two tea pots on top of it.
A small gathering in the living room with drinks being served.
A sign saying no drinks allowed is hanging
A giraffe is standing with his front legs apart.
A room that has stained glass windows separating another room.
An instructor pointing at something on top of a screen.
a building with a clock tower near other buildings
A woman holding a baby near a long horn steer.
A crowd gathered for a small-town parade looks on as the next float comes down the street.
A brown dog with it's head hanging out of a window.
A man sitting on a concrete structure on the beach.
A young man riding a skateboard down a curvy road.
A living room with a sofa and built in tables.
A guy leading a bunch of people in a choir.
A large long train on a steel track.
The two hot dogs are prepared and ready on the plate.
An airplane is flying high in a blue sky.
A man standing on a field talking on a phone under two colorful kites.
The man is carrying the bananas down the road.
A sailboat is floating outside on a lake.
A wide building with many glass partitions has a front pavement with standing and milling people, some of whom are headed to the open door of a bus also resting on the pavement.
Two people are lying in a bed with a computer.
A big bus and other traffic on a busy city street.
No parking signs hanging on a pole.
The man in black came up to the brightly colored food truck.
A group of travelers wait to receive their luggage.
The motocross driver races down the dirt hill.
A very comfortable looking bed with big plush pillows.
A blue boat skims the ocean with a crew of several people.
A pair of racing motorcycles coming to a start line.
A hand lifting a slice of pizza off a pan.
Several people are sitting at a restaurant as staff work.
A tennis player reacts to hitting a ball.
Two giraffes, one is closer and larger then the other, appearing to be curious about the photographer.
A banana sitting on top of a white plate.
A man and woman look at a piece of paper
A skateboarder rides his board at a skate-park.
Stuffed animals displayed on table with assorted items.
Many kites are lying on the field on a cloudy day.
A woman holds a little girl's hand while cross-country skiing
a desk with multiple monitors and a laptop
A beautiful black and white dog catching a frisbee in midair
a group of people walk through a rain storm
A group of children are standing in line.
An orange sign with black lettering near a city street.
A person attempts to para-sail with a parachute.
A man is riding a wave on a surfboard
Several people on the beach with chairs and umbrellas.
Two zebras with one of them laying his head on the back of the other
A group of people in a park watch a man in a green sweatshirt and hat catch a white frisbee.
Horses bumbled up next to each other in an enclosure
a male sitting on a toilet with a laptop
Several potted plants in front of a window.
Man playing tennis in motion with crowd and tennis court
A man standing on a tennis court holding a racquet.
a bus that is filled with people crammed together
A picture of a stainless steel stove that is in someone's kitchen.
A surfer in a bodysuit rides a wave.
A picture of a toilet taken from above it.
a large monitor and a small laptop are on a desk
A bench that looks like a round hut.
Commuter bus on roadway at night in city setting.
Lunch recipe calls for whole eggs  baked inside bread, served with tomatoes on the vine.
A handsome sink on a long pedestal in a bathroom
a man on a horse rides through the streets while others watch
A group of black and white cows are on the grass.
a toilet a tub a brown wooden floor and a mirror
Three cars traveling down a street in front of a large building.
A white table that has black chairs in a kitchen.
two zebras standing together in a field a by a small tree
A close up of a fire hydrant with a skyscraper in the background.
A person standing next to a pole working on a traffic signal.
This is a cluttered room with alot of boxes of stuff.
A toddler pulls himself up next to a toilet
A white toilet sitting in a bathroom next to a wall.
A bicycle parked near parking meters both covered in snow.
A man is jumping near a ramp on a skateboard.
A surfer looks back as another surfer catches a wave.
A small train is going through a bushy field.
A beautiful woman taking a picture with her smart phone.
A group of people flying kites over a sandy beach.
A man holding a bat on the beach looks down
a woman is cutting a fourth of July cake while two other girls watch
A table has a handbag, brush, mints, wallet, and cell phone on it.
A man sitting on a stone wall talking on a cell phone.
A smiling man holds a bunch of freshly picked bananas
A person crossing a street next to a crosswalk.
A small blue car parked outside a house
The woman is posing for a picture on the side of the road.
two hands are toasting some wine glasses and a person in a black jacket
A pizza with spinach on top of the sauce and cheese
A group of people walking through building with large umbrellas.
a pastry with some powdered sugar on top of it
A crosswalk signal with a lighted red figure.
Two trains on the track at a railway.
A montage of people shaving and cutting their hair.
A plate of pizza sitting on a table ready to serve.
A yellow fire hydrant is on a city curb.
A close up view of a mirror reflecting cars parked on a street.
Asian man and woman sitting and looking at cell phones
Man removing a pizza from a home oven with a peel.
An old restaurant in Lucerne that apparently has wonderful wiener schnitzel
The little girl is eating lunch and having milk.
A bedroom with a bed under two framed paintings.
several young students working at a desk with multiple computers
A man cutting a cake on top of a table.
A large group of sheep stand near the water all looking down eating
A white and brown cow eating grass in a field.
Smiling friends posing over a bag of donuts
A behind the scenes look at a photoshoot for a bunch of bananas
A zebra in a fenced in area next to a man.
A man in a grey apron with a sandwich full of barbecue.
A large teddy bear with pink camouflage on the street.
A very tasty looking pizza sitting on a table ready to be eaten.
A black TV sitting on top of a desk next to a couch.
Two horses eating grass by a body of water.
A beautiful blond haired woman talking on a cell phone.
A tourist looks at sheep grazing in a yard
These motorcyclists are waving their American and Marine flags
Buses and cars stopped at a traffic light.
Close up of metal post with a walk signal and a Do Not Enter sign with profane graffiti with building behind.
A crowd watches a softball player with a red helmet.
A man about to hit a tennis ball with a racket.
A man doing a trick on a skateboard while people watch.
Three adults watch a child holding a toy doll.
A cat that is eating some food on the ground.
A table with a book camera and shells
A dark colored cat standing on a wood floor.
Two white ferries passing each other on a body of water.
Orange cat walking across two red suitcases stacked on floor.
A stop sign on the corner in front of a row of stores.
A cat that is sitting near a sink.
A toothbrush is sitting on a sink that has the words mystery toothbrush on it.
A baseball player takes a swing at a pitch.
a small boat in a large body of water
a man sits on a bench while holding on to a dog
City two way street with cars lined up on both sides.
Four different food dishes including rice and chicken.
A man is wearing a blue shirt with a black coat and a gold tie.
A black cat sitting on top of a red couch.
2 farm cows stand on a baron field
Two female skiers are standing in the snow wearing purple attire.
A sheep grazing in a field above a pond.
a pizza that is in a pan that is on a table
A man on a skateboard riding over a hill.
A bathroom with a large green plant growing on the wall.
The group of people walking in the city have umbrellas up.
A pretty yellow city bus on a wide street.
The pre-school child is trying to kiss the toddler.
A large group of people playing frisbee with onlookers.
A black-and-white shot of a woman in a dress holding a tennis racket.
We have a distorted view of a bus and a pillar.
An upward photo of a man in suit staring in the distance with another man holding a finger up.
A small teapot is on  a plain wooden table.
A kitten is laying on a laptop watching a video.
A girl wearing glasses posing for the camera while holding a tennis racket.
A woman is painting a green fire hydrant.
A couple of people standing in a room.
a man on a skate board does a trick in the air
A woman riding on the back of a brown horse.
a little kid is looking at some doughnuts under a display
Man in a field walking behind two Clydesdale horses.
The zebra and giraffe gaze into the open meadow.
A person with a pink umbrella and a suitcase next to a taxi cab.
Signs showing different street signs on the corner of the street.
a display of a giant bear standing in the middle of a shop
A stop sign in an area with grass, trees and small buildings.
A man stands near a podium in a gray suit and blue tie.
a woman standing in a kitchen while preparing food.
a person holding a kitten and feeding it milk
A young person is playing a soccer game.
Two small dogs look around in the yard.
A large truck is parked on a street.
A man in the water on a surfboard.
A train covered with snow sits in a train station.
Some bananas are for sale at a store.
A cat sitting on top of a television
Slivers of cut, sun-dried tomatoes lay to the left of a pair of food shears there are uncut tomatoes on the right.
A giraffe is walking near a fence at a zoo.
A commuter train stops at a train station with it's doors  open.
Wild animals walking in large open field and path.
A person is standing in front of a store mannequin in the dark.
Several young boys are playing a baseball game.
Somebody is having in the peaceful of the picture.
Bus, cars and a motorcyle all stopped in the street
2 Motorcycles are sitting in an empty office
A female tennis player swinging to hit a tennis ball on the court.
A man looking down next to several hanging bunches of bananas.
A woman in black shirt and skirt playing a game of tennis.
A little girl smiles next to a foil wrapped cake.
a zebra is eating grass in a stable
an emaciated man wearing tie standing erect showing teeth.
a motorcycle that has some sticks on his back
A bride and a groom look ridiculous as they stuff cake into each other's mouth.
A table topped with paint and construction tools.
Lady wearing a hat and sunglasses riding on an animal.
two people at a bar holding drinks
A three story white building with cars parked on the street in front of it.
An apple is carved with facial features and teeth
two young children in a garden eating greens
Woman and her dog tends to the herd of sheep
a man riding on an elephant near a stream of water.
some people are traveling down the street in a city
A stop sign leans to the right at a small town intersection
The man is just getting ready to serve the tennis ball.
a mechanical robot holding a base ball bat
A giraffe bends over to nibble grass in a rock and lawn area at the zoo.
luggage is packed and lined up for traveling
people dressed in costumes at a ski resort
Three people are having a cook off in the kitchen.
The skateboarder is learning how to complete his trick.
A kitchen counter with  a lot of empty bottles on it.
People are sitting in chairs with laptops, papers, and cups.
A multi-hued teddy bear wearing a royal robe and blue ribbon.
A group of giraffes feeding next to a tree in a caged area.
A beautiful woman in a bikini surfing with her dog.
A black handled toothbrush with new bristles on it.
Many different types of small boats on the water.
a security officer sitting on a fence while talking on a cell phone and holding onto a segway
Several kinds of doughnuts are in a cardboard box.
A girl is going to the field with her soccer ball.
A dog lying on a couch while wearing a collar.
A multi colored train parked on a train track
Blender on a messy counter in a kitchen filled with food.
A slice of pizza with vegetables sitting on a plate near a drink.
A man wearing a black helmet swings his baseball bat.
Broccoli and waffles with a mushroom sauce on a plate with a spoon beside it.
A pan has a slice of pizza left in it.
Toy cars line the parking lot of a toy setup.
there is a baby elephant standing in a field with tall grass
A humming bird flying over a red bird feeder.
A boy and a girl with a blue frisbee.
A man on a bicycle passing by a taxi.
A baseball player swings and makes contact with the ball.
This is a large kite flying high in the sky.
A man standing on a tennis court holding a tennis racquet.
Tall green pine trees in back of large grassy field.
A bench that is by some trees and grass.
A man is standing in a semi-dark room making a call on a cell.
A white vase of flowers sits on a wood table.
A few empty boats at a river ride
Two men holding surfboards while standing in the ocean.
Dog on skateboard wearing t-shirt during parade event.
A toilet and sink sit in an empty bathroom.
a sink sitting in front of a bathroom mirror
A compact bathroom with a shower and a mirror.
A shaggy dog lying on a green and blue blanket.
Elephant with young rider standing next to adult elephant near parking area.
A small bathroom has green walls and beige floor tiles.
A plate of pizza on top of the table
Someone is frosting a cake that is on a glass plate.
A computer monitor and speakers on top of a desk.
A snow skier is being pulled by a rope overhead.
A table with pies being made and a person standing near a wall with pots and pans hanging on the wall.
A large fleet of boats in a large body of water
A man wearing a white shirt and tie.
a bunch of food is on a white plate
A teenager has his feet off the ground holding an umbrella.
Two children stand near a large teddy bear.
People in a market shopping for fresh produce.
A brightly colored quilt on a bed in a furnished bedroom.
A train that is yellow is moving down the tracks.
A living room that has wooden shelves with many movies on them.
A man riding a motorcycle over two cars.
A black and white checkered bathroom with toilet
four motor cycle cops on a city street
A white toilet sitting inside of a red bathroom stall.
A surfer raises his arms for balance on a wave
Three people on a bench are smiling and waving.
a group of lambs walk across a grassy plain
a person riding a surf board on a body of water
A young elephant at a watering hole with other elephants in the background.
A white polar bear standing on a concrete surface.
A man jumping a motorcycle over a row of parked cars.
A man that is doing a trick on a skateboard.
A bathroom with a shower, toilet, and multiple sinks.
A CAT IS SITTING NEAR A TOILET SEAT
A group of four men riding horses holding flags.
This tech wizard leaves all options open, equipping his computer area with both a laptop and desktop machine.
a black and white cat a hand and a laptop
Four people playing a game with a frisbee in a grassy area.
A reflective mirror at the junction of two hallways.
A very big dining table with some people at it.
The surf boarder is coming out of the water.
A man in an old-fashioned baseball uniform hits a ball with a bat.
People are wearing hats with umbrellas attached to them.
a man is in the air riding a skateboard outside
A parking meter that is placed on a sidewalk.
A stuffed teddy bear and memo sitting on a bunch.
two zebras close to one another inside of a fence
black and white stripped  poles with stop lights attached
A dog with his leash attached to a bench
A bathroom sink, mirror, soap containers and a towel shelf below.
Large statue holding a black and white umbrella.
A little, brown bird on a tree branch
many different bikes on a city street
A wooden and metal park bench sets at the side of a path.
There are many traffic lights on this busy street.
Pedestrian traffic and advertising in an Italian airport
A group of people on street in snow next to cars.
A young man who is drinking a glass of wine.
A building and cars parked in a lot.
Kitchen with white cabinets and refrigerator and black countertop.
The person who decorated this bathroom likes cats.
A large room has many different planes displayed.
A man is painting on the side of a wooden compartment.
People waiting to cross at a busy intersection.
A baseball player standing on the pitcher's mound
The street view of an average city street.
Electrical plugs are coming out of a box on top of a box.
A herd of zebra standing on top of a dirt and rock field.
a boy with glasses a cheese pizza with onions on a silver platter
Several house barges lined up on a river.
A policeman on a motorcycle waiting  on the street
An orange cat on carpet outside of a door.
A picture of some kids playing a soccer game.
A bi-plane in the sky on a sunny day.
A couple of small white bears on some rocks.
A donkey joins a group of zebras around a water trough.
A man has a ponytail on top of his head
The man is playing baseball on the baseball field.
A submarine sandwich sitting on a white dinner plate.
A girl in grey jacket and tie standing on a street.
A couple of kids hovering over a pizza sitting on top of a wooden cutting board.
A woman in a black dress swings a tennis racket
a man takes a bite of a doughnut
THERE ARE YELLOW TOWELS IN THE BATHROOM HANGING
The woman is standing in the kitchen empty.
A man on a couch talking on a cordless telephone
a group of people at a park playing with a white frisbee
Apples and oranges are being sold in a market.
A cat is standing on top of TV near a huge bookcase.
A man is standing next to a tall surfboard.
A city filled with traffic next to a tall building.
A red fire hydrant with the paint chipping off, next to a wire cable fence.
The room has red wall, white carpet and matching furniture.
Rain makes the brick streets shiny and dramatic
a close up of a person using a cell phone
A cat that is wearing a festive hat.
a cozy living room with a couch and two chairs, a coffee table and lamp
a baby standing in a suitcase and a mom
a plane flying high in the air below a blue sky
A giraffe in an enclosure standing by a tree.
Train traveling on tracks near populated area near waterway.
a man is snow boarding down a hill at night
Young girls sit at a table making paper kites.
a table with a calculator and phone siting on it
A man is baking something in a portable miniature oven.
A bathroom with blue tile in the midst of restoration.
A brown and white cow standing next to a stream.
The woman is posing on her bed with clothes.
A couple cross country skiing with their dog.
Open bottles of various wines on a glass table
A very cute small child brushing its teeth.
A spotted dog and a black cat hanging out in a bedroom.
Two men are playing a video game with a motion controller.
A close-up shot of a zebra eating grass.
people walking in front of a building
a woman filling a bear at a build a bear type place
A bathroom with a bathtub, sink, mirror, and toilet paper roll.
A bunch of very cute cows going down a road.
A purple frisbee is shown flying high above the sand.
Two pictures hung on a refrigerator by magnets.
Kites flying on the sandy beach on a sunny day.
Cluttered apartment with a large T.V. and a great view.
a man holding a tennis racket and ball
A green double decker bus called "Green Rovers"
A man doing a skateboard trick on some stairs.
A vase with kanji holds flowers and is displayed next to a purple mug, a mug with a dog, and a white mug.
Two women walking on  a train platform
an old diesel locomotive coming upon a track switch
an airplane next to a large body of water
Group of table and desktop laptops sitting on a workbench.
A small bathroom photo focused on the toilet
A pizza with shrimp and basil on a table
A black and white photograph of a traffic intersection.
A man riding skis down the side of a snow slope.
Somebody left the toilet seat and lid up.
An adult stands by a young child on a fake cow.
A young man eating a sanwich while working on a laptop.
A small dog sitting next to a wall in a hallway.
A long black train sitting on top of railroad tracks.
A computer mouse sitting next to a  laptop computer.
A pair of zebras runs in tall grass.
A picture filled with many things all inside.
A woman looking into a mirror while blow drying her hair.
A woman is giving her dog a bath.
A bride and groom are slicing a wedding cake
A pack of zebra standing in a field next to an ostrich.
Veterans riding in the back of a military truck.
A person with glasses on the skateboard as others watch
A chef at a pizzeria behind the counter
A brown dog in a grassy field with a purple frisbee.
A stuffed animal is in a porcelain sink.
Group of children and adults playing a video game.
A toy train track is set up with two trains, houses, and a tractor.
a male in a tan shirt is playing a video game
A girl is smiling while riding a gondala.
A broken fence seen through a broken window.
three men and a woman pose for a picture on the tennis court
A man and woman put ketchup on a hot dog bun.
A man riding on a skateboard on a sunny day.
Brown bear laying down on a log of wood in the forrest.
two men holding wii controllers in a living room
A woman is standing by a truck smiling at simetjing
A bird is sitting idly near some flowers.
A woman walks in front of a horse next to a red trailer.
The bowl has broccoli, celery, and lemon slices in it.
a white stove top oven siting next to refrigerator.
A snowboarder in winter gear riding a snowboard and a steep slope that is snow covered.
A couple sheep on a steep grassy hill.
An outdoor image of a fence at a dog park with a fire hydrant
A bear has just taken a dip in the water
a person riding a skate board jumping in the air
A group of skiers trekking up a hillside in a snow storm.
A meter on the street reads a time of zero.
A kid is holding a controller on a coach
A clock and its reflection placed near a sidewalk.
Outdoor table set with wine and breads in the center.
White cat sitting on sandy area near walkway.
A man in suit and tie wearing a white beanie.
An aircraft that is inside of a building.
An ostrich in a zoo a long with three zebras.
A little boy holding up a packaged electric toothbrush and smiling.
Many people play sports in a grass field.
The banana in the car seat is aging and browning.
A traffic light sitting on the side of a road.
A close up image of a little girl getting her hair done.
Towels stored under a bathroom sink with a glass countertop.
A couple of men standing next to each other holding glasses.
A person standing in front of a stove top in a kitchen.
A person standing next to a building holding an open red umbrella.
A cat with a peculiar look sitting on a bench.
A man riding between two oxen as they travel through water.
Two black crows sit atop two tree branches
A man laying alongside of a white toilet near a sink.
A bedroom with an almost empty bookshelf and desk
Two people holding up cell phones with photos of a young man and woman.
Young lambs with adult in fenced grassy area.
An airplane flying in the sky during the day.
A long-haired grey tabby cat resting on a sofa.
There is a long line of cars behind the rearview mirror.
A lady wearing a white shirt trying to tie a tie.
Skateboarder riding through the middle of park benches.
A man with no shirt rides a skateboard over a ledge of a skateboard park.
A soccer player in the midst of kicking a soccer ball.
The cool dog is riding on a motorcycle.
Two surfers walk onto the beach from the water.
A man is sitting on a black couch with a cat.
A soda can sitting next to a laptop and remote control.
A man on skis on a snowy trail.
a giraffe looking over fence, at person walking away.
there is a young girl and her mother boarding a plane
An outside bathroom carved of wood with a toilet and sink.
A bed covered in clutter and clothing with blankets.
The young person sits on bench seeing the tranquil lake
An open laptop computer sitting on top of a desk.
An unattended office containing several computers and a chair.
Several people sitting around together eating and drinking at a venue.
A group of people that are on a soccer field.
A woman swinging a tennis racket on a court.
a person stands while holding on to a pole
Skier with backpack down hill skiing in the sun
A yellow and red train traveling down train tracks.
A walk in shower sitting next to a white sink.
This is a cake  and a fork in laying in a plate.
An apple is being cut with a sharp knife.
A motorbike parked on a road with a man.
A snowboarder goes airborne over a snowy hill.
Person on skateboard in mid air with color lights above.
children holding stuffed animals and a parent holding a baby
A young girl climbing on a painted fire hydrant
Two people that are skiing together in the snow.
People standing behind a clock in a clock tower filled with massive golden bells.
Four persons are skating on the skate board on snow.
A room of chairs and sofa with red stairs next to it.
A very large semi truck on a wide road.
Someone taking a slefie with a large camera in a large mirror.
A window looking out at a brick building
A pile of luggage on top of a cart
A cart filled with lots of luggage driving down a street.
Man wearing riding gear sitting on parked motorcycle.
A group of people who are skiing on a snowy hill.
A man posing for the camera holding a skateboard.
a group of peeled oranges with purple flowers on top of them
a person is holding a baseball bat by a brick wall
A grey cat sits on an office chair in a home office.
Umbrellas litter a sandy beach next to a beautiful blue ocean.
Several kites sit on the ground, with a few people in the background.
A pink bicycle leaning against a fence near a river.
A dock that is separating the harbor from the ocean.
A yellow and silver train pulling away from a train station.
A microwave or other small kitchen appliance is seen from behind.
A piece of toast and grapefruit half is on a tray.
A stack of four oranges on a table.
a person that is standing in a kitchen next to a icebox
Two elephants are walking through trees side by side.
Pasta with a mixture of different vegetables sitting on a plate.
zebras and antelope graze on the planes next to shrubs
Game pitching plungers into a toilet in a field.
A male is skateboarding in an outdoor skate park near the ocean with many people standing nearby.
A woman with purple hair taking a picture of herself in a mirror.
a man dressed as jesus holding a cell phone
Two brown dogs lying on a burgundy comforter.
Two people in a public bathroom painted red.
A woman in a bra laying on a white surface.
A meal of beef, broccoli, and mushrooms is eaten with chopsticks.
Small sailboats are sitting on the water all over the lake.
A fire hydrant outside a shop with graffiti.
A gentleman is walking through the boardwalk with his surfboard.
A cat looking intently out of a window.
A sliced chocolate desert covered in powered sugar
A grey stripped cat on a table in a room with many books.
a tangering sitting on top of some bananas
The clock is located near the body of water.
A man bending over scooping food into a pan.
A park bench surrounded by a green forest of trees.
A person that is holding a kite in his hand.
a airplane that is flying through the sky over some snow
A cat looking out from a box designed like a bus.
Some cars that are driving through an intersection.
A dog catching a frisbee with a man in the background.
a bunch of sports items sit in the grass
A city street with business signs on buildings
Two people watching a small jet on the tar mat of a airport.
A poster behind a gate against a fire hydrant
A couple of green street signs sitting above  a stop sign.
A baseball player holding a bat on a baseball field.
Two men are playing ball with some elephants.
A pizza in a pan sitting on top of a wooden table.
A cowboy leads a cow through a paddock.
A man twirling a yellow frisbee with his finger
A wide eyed teddy bear with a scarf is sitting on checkered bedding.
A man gets his picture taken at a ski resort
Old fashioned kitchen featuring a two compartment sink.
A group of elephants are walking away from water.
A little kid with a uniform, glove and hat on during a baseball game.
A bunch of oranges hanging from an orange tree.
A woman sitting on a bench while talking on her phone.
a small couch overed with blankes and pinapple designed pillows
A dog laying on its back on a made bed.
a black and white photo with two males on cellphones
Various types of apples and other fruits at a market
Someone getting food from plates with a bunch of different foods on them
a male in a red tie and some other people
A plate with steak, vegetables, and rice being served.
A bed is shown next to a stand and TV.
A pair of pizzas sit on trays with ingredients on top
Jet airplane parked on a cement runway under a large white cloud.
A small herd of cows with halters and bells tied to a cable fence.
there is a woman playing with a dog with a toy donut
A white bed with black pillows and a patterned throw.
A refrigerator door is open and full of condiments, food and drink.
a metallic suit case in front of a couch
A black cow and a brown cow walk near a motorcycle on a village street.
A baby sitting in a chair getting a haircut.
The police officer is observing the airplane in flight.
Large red bed in room with dresser and futon.
a close up of a young baseball player touching his cap
frontal view of airplane with cockpit facing on white airplane
a close up of two stuffed animals siting on a table
THERE ARE PEOLE SITTING IN A WAITNG ROOM
a man is cooking some food on a grill
A car with a wheel lock on its wheel next to a parking meter.
A woman sitting next to a child on a couch.
A tall giraffe eating leaves from a tree
a man standing at the edge of a tennis court getting ready to serve
A teddy bear sitting outside in a chair.
Black and white photograph of a man sitting at a bench.
A kitten laying on a man's lap while a woman plays with a Wii controller.
A young child smiling for a picture, she has a plate of cake in front of her.
A man holding a tennis racquet on a  tennis court.
a teenager attempting a jump on his skateboard
A man in blue shirt walking on street with building in the background.
A table topped with a pizza surrounded by people.
these people are waiting for a train at a station
A little baby that is sleeping on someone.
A group of people gathered together, one holding up an umbrella.
People are loading onto an old red, yellow, and green train.
Individual plates of sausage sushi with ketchup packets
A dog follows a cyclist along parked cars.
A lot of red apples are put in a box.
A couple of giraffe standing under a tall umbrella.
The inside of a bathroom leading out to the hall way and a room across.
A guy skateboarding on a big ramp somewhere.
A middle aged lady is decorating a cupcake.
A baseball player has just thrown a ball.
A neat and clean  kitchen with cooking range,microwave.
Two city buses traveling down a rain covered road way.
A fry pan with a mixture of vegetables in it.
A lot of food that are growing on a tree.
City scene with parked buses and people walking on the sidewalk.
A street shows several street lights and an empty intersection.
A couch with clothes and items scattered allover
a two story bus on a busy urban street
Several men are playing baseball on a baseball diamond.
A train rides down the tracks near a hilly area.
A black and white photo of a dog standing happily on a horse.
A woman sits on top of a motorbike.
The dining room has four chairs at the table, and a hard wood floor.
A kid is sitting on a skateboard with another kid behind them.
The people are having a group meal at the table.
a desk with a laptop, some speakers and a mouse on it
A zebra standing next to a  group of three trees.
Two pedestrians underneath their umbrellas walk across an open plaza in a rainstorm.
Two brown horses pulling a black carriage and driver.
A guy in a big grassy field flying a kite.
A cat holding a toothbrush in its paw and chewing on it.
a man that is skiing down a snowy hill
A fluffy quiche or pizza is loaded with vegetables on top.
A brown teddy bear holding a glass vase in front of a grave.
A big commercial plane parked by some vehicles.
Two urinals in a tiled bathroom with windows.
a man and a woman standing in the living room with her holding a remote
A stop light tells motorists to go across the intersection
A person is showing their feet near a book and headphones.
a couple of large planes are on a runway
a couple of chairs sit under a umbrella
Four dogs are sitting together on the bed.
A bunch of green bananas hangs from the ceiling of an outdoor structure.
A young man that is standing by a big pile of luggage.
A skier in an orange jacket looks out over a snowy valley.
A skateboarder is balancing on the rim of a bowl.
A three dimensional rendering of a woman sitting on a giraffe.
A yellow cat sleeping on the hood of a black car parked in the garage.
a fire hydrant on a city side walk
Several young soccer players playing soccer on a field.
a woman in a white top some lights and a cake
Two medium sized dogs sitting next to each other.
A thin pizza is on a plate with a spatula under it.
A piece of art hanging from a yellow wall in a living room.
A woman and child sitting on the bed with an open book.
A group of men playing frisbee on a field
A dog lies down and waits on sand at a beach.
Three vases of different sizes and shapes all holding pink flowers
A dining room features both chairs and a bench.
Several just baked cakes on top of a stove
two plates some food and a fork knife and spoon
A bed that is unmade next to some plants.
A man standing next to a smile giraffe.
Two teddy bears sit on a rocking chair.
A microwave oven on a mini fridge in a room.
A city bus coming up at the corner and someone is waiting for it.
A couple of zebra standing next to each other on a field.
Brown cabinets and dual mirrors and sinks in a bathroom.
An ostrich watches as a giraffe leans over as it eats some bark from a tree.
A pizza cook getting ready to cook some pizza in the oven.
A tennis player makes a strong return during a match
Teenage girls with skateboards at night in front of a restaurant.
A jar of water with a flower inside.
Four luggage bags are stacked close to each other.
A bathroom with a large mirror above a white sink.
An old train is on the track near a small shed.
a man holding his cell phone to his ear
THERE IS A DOG THAT IS IN THE POOL WITH PEOPLE
A bear is swimming in a cold river.
an image of a man with other men on skiis
a desk with a laptop a monitor and a keyboard
A bus parked outside with Asian characters on it.
a man getting ready to hit a tennis ball
A clock tower with a statue in front of it.
Five surf boards arranged in an arc on a grassy area.
A double-decker bus with few passengers aboard drives down the road.
A little boy against a wall while holding a tennis ball and tennis racket.
A skier skiing past a tree at Snowbird ski resort.
People walk on the sidewalk near the buildings.
A snow boarder laying in the snow after a run
Goats and geese standing near each other in howling pen.
A man sitting on the bed watching tv
A large green train covered in graffiti.
A dog is seated in the living room watching tv
A man has his hand around a zebra as they stare at each other.
Some very pretty zebras grazing in the grass.
Motion blur photograph of a busy city esplanade at night
A bathroom with a toilet, sink and a window in it.
A green bowl of corn and broccoli in a white stew with a spoon and a biscuit  next to it.
A siamese cat laying on top of a white sink.
a person riding a large skate board on a street
A flock of sheep standing in a grassy field looking at the camera.
Shot of a small bathroom with a bathtub and a toilet.
A picture with no head but a suit and tie and flower
A sink with dishes in it and lined by various bottles.
The umbrella's on the street are decorated with messages.
The woman is sitting alone on the bench reading a book.
A man kneeling down on a baseball field pitching a baseball.
An adorable little girl holding a brown teddy bear next to a wooden table.
there is a pair of slightly rusted scissors in a rusted handle
A guy sitting on a big bright purple bench with some headphones.
A group of people standing on top of a snow covered field.
A bedroom is bright with colorful accents in it.
A large zebra and small zebra are standing by a tree.
Pedestrians with umbrellas cross a rainy street corner.
A U-Haul truck with a driver sits in a grassy field.
A red teddy bear sitting in a chair with potted plants all around.
A dining room table with some beautiful plants sitting on top of it.
a man in a black jacket standing by a red and black motorcycle
A woman talking on a cell phone and looking into the distance
A man on a skateboard going over a black box at a skate park.
A cake sitting on top of a plate with a knife in it.
A wooden table with a remote control that reads "control a woman."
Large public transportation bus stopping to let passengers on and off.
Close up of white USAF fighter jets in a blue sky
A bunch of vegetables sprinkled with pepper sitting beside each other
People ridding elephants and one is holding a camera.
The side of a truck that has spray paint on it.
A large shower head in a bathroom shower.
tree are two woman standing in the rain under a pink umbrella
A slice of  vegetable casserole on a plate.
A person with a hat standing by a parking meter.
Three motorcycles stop at an intersection at an oriental restaurant.
Two people with boards riding a ski lift.
Motocross rider going around a bend on the track.
A woman hitting a tennis ball on a tennis court.
A gray and white kitten walking through a square hole.
Two people in a room with assorted luggage
A dog is wearing a paper hat with a star.
A large bear in a river with some rocks.
A giraffe is posing close to the camera in its enclosure.
A large elephant with a couple people on the top.
A helicopter that is sitting with its back wheels on the ground.
A male skier dressed in orange and black performing an airborne stunt
The contents of a back pack are spread out on the floor.
Black and white bags above people on a field.
a young man rides a horse down a paved pedestrian area in a town
the man is swinging the bat at the ball
A sculpture of a man reading a newspaper sitting at a bench.
A boy and girl riding bicycles with a small dog.
Five people just got off that gray bus.
A young male is riding his skateboard in his empty pool.
A commuter train passing by a field of wild flowers.
A skier cutting a turn on a slope.
the hitter prepares the to hit the pitch
A suitcase that is packed to the brim with things.
A park with trees, bushes, walkways and benches in front of a skyline of buildings.
A stop sign on a piece of paper.
A woman sits at a table in a wooden cabin next to a lamp
A toy model train station with a train on some tracks.
An umbrella on a beach with a towel.
The bus has the lights on as it travels down the road.
A group of people standing on a field under a cloudy blue sky.
A man that is standing on a board in the water.
A big crowded beach with some guys playing with a disc.
The room has a television and sports jerseys.
Three buses in a row that are different colored.
A mantle with several glass vases of flowers.
A man holding a baby girl while seated in a cafe.
A comics page from the paper lies on the floor of a bathroom stall.
A plane preparing to take off on an overcast day.
A cup full of toothbrushes and tooth paste.
A bedroom with a bed, radiator and laptop.
A man works on an old steam engine train.
A large yellow school bus driving down a road through a park.
Filtered photograph of a man jumping on a skateboard.
Two people next to a bench at a dock above the water.
A couple of cats relaxing with each other on the bed.
A mom and her kids ride together on an elephant.
An assortment of shaped kites flying in the sky.
Two horse drawn carriages traveling towards a big house.
A bunch of big colorful kites flying high in the sky.
A parked pick up truck with a flame design on the hood.
Jetliner with "Saturn" on the side flying over a body of water
A woman wearing a net on her head holding a box in a kitchen.
Some old guys in funny costumes on some fake horses.
A Kingfisher plant parked at an airport with a food service truck in front.
A Eastcote welcome sign in a suburban neighborhood
A man standing next to a yellow and orange fire hydrant.
A woman cuts a cake at the table with a red cloth.
A smiling man with a goatee sits in the backseat of a vehicle surrounded by luggage.
Players at center court with camera man during tennis match.
Two large white sheep standing on a lush green field.
a stop that has been defaced with graffiti
A black and red train engine next to train station.
A cup with a straw in front of a laptop.
The baseball player is sliding into the base as another player is blocking it.
A bus sitting parked next to a building with people in it.
a plane flying high in the sky on a cloudy day
A very shaggy ram and a smaller lamb in the grass
Two men that are shaking hands behind a table.
A very large commercial air plane on the tarmac.
A man riding a motorcycle driving through a mountain side.
Some kids are outdoors playing baseball during the day.
a single giraffe stands tall in field of bright green grass
A group of four giraffes standing next to each other.
Bottles, cans, and foodstuffs within a wall's recess
s close up of two dogs eating cake off of a table
A man is talking on a phone while standing in the street.
A white metal piece of artwork in the city.
Someone is riding a white horse with a grey mane.
A skateboarder heads down a decorated ramp against a panorama that includes an overcast sky, a line of trees and a field of snow dotted with people in winter clothing.
A man in a large room with baskets and pottery
There is a red car being towed on a truck
A woman holds a string in her hand on a beach.
A white table with umbrella and two chairs on a deck near a railing.
Two people cycling on a road as others walk by
A giraffe standing next to several tree branches.
A woman in a seat is on her laptop.
Two women in bathing suits next to a cat with planes flying across
A man in sunglasses holding a sub sandwich
A close up of a bowl of vegetables containing broccoli and carrots.
An open door shows a small bathroom space with a toilet and a shower while a sink sits near the open door.
A child hugging a stuffed animal while surrounded by stuffed animals.
A plane with stairs next to it sitting in a large lot.
a close up of a traffic light on a city street
Some guys in a dark room playing a game on a big  TV.
A cat laying on top of tie dyed pillow.
A variety of healthy foods arranged on a table top.
A room with a bed, fan and a dining table and chairs.
a close up of a cat sitting on a pillar
There are flowered vases and framed pictures set against a wall with balloons hanging above it.
There are several modern lavatories in the rest room.
Traffic light on a long yellow pole in front of apartment balconies.
A man in a ski suit sitting in the snow with a snowboard.
Several employees are standing behind the bar of a restaurant.
A car turns the corner of an intersection in the rain.
A red toy train stopped on tracks near toy figurines.
empty train cars sit in a snow-covered deserted train lot
Three bears stand together near a fence.
A woman that is standing up with a doughnut.
Bikers and pedestrians populate a street featuring many shops and stands.
Long billed bird standing in green weeded area of fodder.
A pair of surfers carry their boards along the shore.
The person rides in a yellow motorboat with a dog.
A boy in a blue shirt catching a frisbee.
A plate full of meat and broccoli on top of a table
A single zebra walking by some water in the dirt.
A lush green field topped with lots of vases.
A pizza that is sitting on a plate.
Men playing recreational basketball on a hot day
Two young men and a dog standing on a snowy road.
Two people are playing Wii games in the living room.
Couple standing in snow on skis posing for the camera
The clock has beautiful gold detials on the face.
A man holding a kite string as a woman releases the kite.
A birthday cake has an airplane on it.
a labrador retriever bring a frisbee back for his owner
A meat sandwich on a bun with a side of Brussels sprouts.
A zebra and a giraffe foraging together by some trees.
The red and white train is relatively short in length.
A person laying down with a book in one hand and a cell phone in another.
a person in a costume standing talking on a cell phone
A tall building sitting next to a bunch of trees.
Bathroom sinks and a mirror lit by sunlight coming through a small window.
There is a giraffe that is looking at something
A woman standing on top of a green field next to two men.
A male tennis player on a court with a racket and ball.
Adult elephant standing near a multi-wired electric fence.
Different markings sitting on a bag on the floor.
A pair of giraffe are walking in a field in Africa.
A group of three zebra standing next to each other.
Three vases that are red with flowers on them are on display.
The home office features several important business tools.
A grey tiger cat staring at himself in the mirror.
an overview of a marketplace sale with child toys
Several surf boarders at a city wave pool.
a man lays down on a surf board as he paddles through the water
two teddy bears sitting on a chair and wearing costumes
A small single sink in a home bathroom cluttered with items.
This painting shows a perplexed fellow staring at a laptop computer.
a small bathroom with a sink and a toilet the toilet lid is raised.
A modern living room in a cabin with food.
A bunch of horses are walking two by two down a road in a city with a few riders.
A little league batter await a pitch at home plate.
An up close shot of a woman wearing a badge on a lanyard opening a banana.
A couple of people riding skis down a snow covered slope.
a plate of meat and bananas on a table
Various different animals that are standing in the grass.
A dog laying on the floor chewing a toy while a man laying on a couch watches.
A father and a daughter flying a kite in a park.
A train on the tracks blowing smoke out of the engine.
A group of ninjas wearing all black hold up small white fans.
Two people in orange jackets smile as they ski up a road.
A Michael Jackson birthday set is shown in gems
A woman is standing outside in the snow holding a snowboard.
Two zebras cross a dirt road outside a village.
A banana laying next to a plastic container with lid.
A small living room area with black furniture and curtains.
a kitchen with brown cabinets and a big door
A guy with a cast does some flips with a skateboard
A giraffe towers over thorny treetops in the day.
A little boy that is holding an umbrella.
a couple of people play a game of wii
A living room scene complete with two couches.
A family plays with a Frisbee on cobblestones near the water.
a dog in a field with a frisbee in its mouth
a polar bear standing next to a cliff
A snowboarder gets some big air off a ramp.
An airplane sits alone on an empty tarmac.
A small family of Giraffes are together near a couple of trees.
2 professional tennis players competing in a game of tennis
A herd of sheep standing in a muddy pen with a chicken.
A slice of cake with a single birthday candle sits on a plate.
A bird is jumping off of a branch.
A teddy bear sitting on the ground next to a garbage container.
a bench that is outside in the woods
A lady is sitting in a restaurant eating while holding a jar of peanut butter containing a comb.
A very fancy wooden mantle clock with ornate design.
A large white boat floating on top of the ocean.
A dog sitting at a picnic table peeking out from behind someone's legs.
A DOG QUIETLY SLEEPING IN HIS BED ENJOYING THE SUN.
two males are playing a video game and chairs
Black and white photograph of a bowl of apples.
Man in a black jacket snowboarding down a hill.
Three horse grazing on grass near a street sign.
A person in a ball cap and holding a Frisbee with a dog.
A bunch of bananas on a banana tree.
a big man running to hit a tennis ball
a light colored bear in a grassy field
A base ball game in progress behind a fenced in park.
This is a picture of a kitchen that is also used as an office
A man riding a skateboard through orange cones.
A table covered with arts and craft supplies.
A man riding a skateboard on the side of a rail.
A very cute old looking fire hydrant on the curb.
A stop sign is standing in front of a palm tree.
A man plays a video game as a woman sleeps nearby.
A group of people standing in the middle of a walkway.
The zebra is walking through the short green grass.
Four cows are grazing on the short green grass.
a person jumping in the air with a skateboard
The mounted officers ride near buildings with flags on them.
A pineapple, orange, and bananas sit on a plate in a kitchen.
A city street has diners eating on outside tables.
A chair and a couple of pieces of furniture in a room that had been burned.
THERE ARE CHRISTMAS DECROATION ALL OVER THE PLACE
A mirror sits on the side of the tracks of a subway.
A Japan Airlines plane waits at the gate while it is towed in.
A person holding a wine glass with a dark beverage in it, in front of a television that has a cartoon on it.
A steer is walking through the grass with large horns.
A young man in a sweat shirt is standing on a wooden walkway.
A vase with flowers on the table
Two men hold a kite together outside surrounded by chairs.
Several men looking at phone in one's hand.
A piece of pizza sitting on a plate.
A polar bear keeping cool in the water.
a table that has all kinds of plates of food on it
Two glasses vases are next to each other with flowers in them.
A snowboarder is in midair preparing to land.
A street sign, with two signs on it.
A young child that is sitting in front of a birthday cake.
A bowl filled with oranges on top of a wooden surface.
A kitchen scene looking toward the living room in the background.
A very pretty dog laying on a person on a couch.
A white tub sitting next to a sink and a toilet.
a woman is hitting a tennis ball across the tennis court
A bathroom with white vanity, toilet and tub and open frosted windows.
A baseball player takes a swing at a low ball.
A Jeep towing a boat out of a body of water.
A couple of men standing on top of a soccer field.
A person holding an electric tooth brush next to a cat sleeping on a bed.
A vegetable pizza on the edge of a table
An older woman preparing cookies and bread at a table.
a photo of a man wearing a tie with a tv monitor in front of him
An umbrella is tied to a bike on a rainy day.
A sign warning drivers to slow down because of the presence of children.
Tired dog rests on top of a teddy bear.
A bike parked in front of a red brick building.
Three people walking toward a small airplane on a tarmac.
Airplane with smoke coming out flying through blue skies.
two women out in the snow with their skiis
A tray of food in foil and a fork.
Cross country skiers are engaged in a race.
A basket filled with food and a cup of salsa.
Group of cars parked in front of a large building.
Several signs posted on a metal pole near a pharmacy.
Small boy in yellow shirt holding onto a white frisbee.
A person on a surfboard in the water.
A large bird is flying over a beach.
A black cat with crazy eyes wearing a bib.
A man with a suitcase walking in the road.
Three giraffes standing in a zoo enclosure with trees.
A group of people on a field playing baseball.
The side of a stainless steel vehicle with large wheels.
An adult in a wetsuit surfs a small wave.
A beach with people flying their kites in the sky.
A zebra walks by an alligator near a watering hole.
A kitchen area with a stove, sink and dishwasher.
A man sitting down holding a brown dog wearing a blue tie.
A suitcase sitting next to the subway rail.
a man taking a nap at the end of a bench
The bedroom with the bedspread is dimly lit.
A woman in white shirt climbing onto an elephant.
Two women in skis standing by a sign and trees.
an image of a child that is playing tennis on the court
A small air craft is heading in for landing.
A large black bear standing next to a stone cave.
A boy is sitting in front of a laptop.
A woman kneeling down next to a fire hydrant with cans of paint.
A father helping his child brush his teeth.
A photograph of a thing in the picture.
A man is standing under an umbrella next to a tent containing clothes for sale.
Two small children in green shirts on a baseball field.
People walk in a narrow alley way while clutching umbrellas.
A horse has a harness on its face.
A dog that just caught a frisbee.
A cat is laying on a laptop on a coach
A red fire hydrant next to the curb with parking meters in the back ground.
A kid in a car hiding from a zebra that is poking it's head in the window
A man swinging a baseball bat in front of a man with a glove on.
The adult black bear is inside of a pool of water.
The two green military vehicle are parked in the field
A man sitting next to a large pile of luggage.
three women stand by an elevator with their luggage
The two teens are on the sand dune, racing  to catch the frisbee.
A road bike rests against a park bench.
A living room filled with furniture and a wooden book shelf filled with books.
A man jumps his skateboard over a fire hydrant
Man serves tennis ball at high speed while other watches.
A toilet that is on the ground near a trash bin.
A poster that indicates the letter S stands for sandwich.
An opened stick of butter sitting near some scissors
a street pole with a sign on top of it
A woman with a child in a carrier standing in front of a giraffe exhibit.
Outside view of white horse in the window
Two shots of a woman swinging at a tennis ball.
A bed above a desk with a computer
A half-eaten pizza sits in an open takeaway box.
A colorful dish of several fruits and vegetables
A sports motorcycle is parked on a gravel road by a river.
A very large orange cat lying on the roof of a vehicle.
A small very messy rest room with many books.
A woman throws a frisbee into the goal in frisbee golf.
A very cute bright red fire hydrant by some bushes.
A Mack truck parked in a parking lot.
Fruit, grain and vegetables have been putted in separate bows.
A giraffe walking through a jungle next to a large tree.
Man looks at another man that is holding a Wii controller in his hands.
A bowl of vegetables containing carrots sitting on the stove.
two long lines of boys paddle a canoe
A lone elephant walking through the desert grasses.
A woman sitting at a table cutting a princess cake.
A man sitting on a high chair on a tennis court.
This person is riding their horse near the water.
A woman holding the head of a horse wearing a bridle.
Street signs on lamp post in large city.
Chefs working in a kitchen at a restaurant.
A man and woman posing with tennis rackets
The man talking on a cell phone has glasses on his head.
The red bus is driving down the street.
The pizza is on the dish and ready to be eaten.
Small boy in dress clothing sitting down on a white bench.
A teenage girl with black hair and black makeup wearing kandi bracelets on her hand and holding up a sandwich.
People are on the beach with water fun equipment.
A man with his arm around a woman in front of several skiers.
A person laying on top of a bed next to a white dog.
A rock wall extends out from a stone building and tower.
A pair of scissors sitting on a plastic chair in an office.
a white plate with eggs ketchup and a fork and a cup
a chocolate doughnut on a saucer, coffee in cup.
The young woman is selling many types of cupcakes.
Three adults on the beach fly a very odd kite.
A pastry is lying on a blanket on grass.
A white airplane is on a asphalt lot as the sky is covered with clouds.
A single giraffe looking into the camera on the plain.
A view of a mountain range from an airplane.
A family holding ski's posing for a picture on a mountain.
A man is on the beach playing with a frisbee.
there is a woman that is standing in the snow with her skies
A person loading a bite of cake onto a fork.
A stop sign by a cross roads on the roads.
A family is in a living room playing the Wii.
Two  large elephants laying down in the dirt.
All the items that are going to be packed for a trip.
there are many lights that are on in all of the buildings
Crowd of people with backpacks line up on the runway to enter the plane
A herd of giraffe walking across a field.
a vase with bright flowers sitting next to a man usiing a platform
two hotdogs topped with a dill pickle tomatoes and tofu
a man is holding up a box of doughnuts
some people standing around by a table and chairs
a desktop computer monitor with a keyboard and mouse
The horse is in the water with a man.
A close-up of the dirt in a garden with a small umbrella in the ground.
A toilet seat with a picture of a dolphin on it.
A horse looking over a fence on a snowy day
a round window overlooking a parking lot filled with cars
tree is a man holding a small red guitar
A group of people playing a game of frisbee on a beach.
a number of zebras near one another on a dirt ground
A plate of food with mushrooms, beans, sausage and two kinds of meat on it.
A woman about to enjoy a good lunch of a sub.
A small park with benches and buildings in the back round.
View of down town in a city and traffic driving on the opposite side of the road.
A white bus driving down a street past a semi tall building.
Hot dogs are being cooked next to bins of toppings.
A herd of giraffe standing around a pile of rocks.
lemons and limes in baskets in the produce section
A man in a white outfit, holding a tennis racquet.
Two zebras are walking in front of some trees.
several multicolored scarves hanging on a display case.
A beautiful woman holding a brown dog in her arms near a refrigerator.
A bedroom with a bedspread and a window.
a white and brown cat is laying on top of a keyboard
A person helping another person fix their skis.
there is a man with a pink shirt holding two surf boards
A lot of cows are walking on a field.
Three zebras that are standing in the grass.
A young person riding a skateboard at a skate park.
Collection of vintage motorcycles sitting on display at a museum.
A group of cows standing on a road with a vehicle looking on.
Two elephants walk along the bank of a river.
A pine tree branch in a vase decorated with a dove and colorful star.
A person riding their bicycle in the rain.
A dog sitting in front of a open book.
scones sitting on a plate at a cafe
A train travel at high speed with buildings reflected in the windows.
The man is ready to throw the frisbee.
A man and a woman standing their surf boards next to each other at the beach.
A small Christmas teddy bear is hanging on a tree.
an image of a bedroom bed with a bookshelf in the background
Five dessert samples, on clear glass plates, are displayed on a wood spoke wheel.
A steamer filled with different types of vegetables.
A fan sitting in the middle of a room next to a sink.
Woman looking at cell phone while outside in the bright light.
A man on a surf board riding a big wave.
a person walking with a cow in a parking lot
A cordless land line phone is all lit up.
A tractor and a herd of cows in a farming field.
A woman and two young girls are blowing out a candle.
a person standing in a living room playing nintendo wii
A burnt pizza covered in cheese and toppings.
This woman is playing tennis on a court.
Street signs at the intersection of Partridge Way and Pear Tree Lane.
A room with chairs and a clock and a floor.
Window display of a suit and sewing machine.
there is a sign that has whoa on it and there as a truck behind it
A brown and black cat underneath an umbrella.
A man standing on a tennis court holding a racquet.
Man on large open area covered with snow.
A very long large train at a station.
a couple of buildings surrounding a pond with boats
A man holding a tennis racquet in his right hand.
a sprinkled piece of cake on a pink polka dot plate
there is a man on the beach flying a kite
A van parked on a road side, covered in snow, ice and sleet.
a woman holding a pole skying on the snow
A suitcase has been re purposed into a charming bench seat.
A large dog sleeps in front of a tv.
A vintage image of a lady holding a baseball bat.
A row of motorcycles parked next to each other.
A man sitting at an office desk utilizing a computer.
Three people in suits posing outside of a bus
A view of bathroom with a sink, toilet, tub , and mirror.
Baseball player wearing protective hat with a bat warming up before his turn.
A skier in green snow pants recovers from a fall
Passenger train at stop waiting for consumers to load
A woman in a swimsuit with a racket in her hands on a tennis court.
a dining room table that is in a room
A clock tower on the side of a brick building
There is a cross country skier wearing full gear
An elephant,fanning his ears is standing on the ground.
A plate of food with meat and other vegetables.
A woman surfing a wave on her surfboard.
A lady walking down the street with a red umbrella.
A young boy is standing on a skateboard.
Pizza, orange juice, and red wine sit on the table.
A white kitten is sitting on a laptop computer.
Three men are sitting on the couch, one is on the laptop.
Two giraffes eating together from a feeding station.
A batter standing at home plate has just swung at the ball.
a cabinet with a coffee pot, toaster radio and microwve
A lamp sitting next to a red vase filled with flowers.
a skateboarder with white tennis shoes is doing a trick
A chocolate style cake with candles on it by a cutting knife.
A woman brushes her teeth and looks at the camera.
A building with a stop sign next to it with a man on a horse.
A woman and child are about to cut a cake
A red stop sign sitting under two street signs.
A man sitting in field next to a herd of cows.
A bed with an orange headboard, a green pillow, 3 regular pillows and the bedspread turned down.
There are two people watching another one play tennis.
A pan with carrots, apples, meat, and potatoes.
A group of cars that are parked on a beach.
A sandwich sitting on top of a white plate.
A person standing on a sandy beach next to the ocean.
Men are standing together outside of an old train.
A man is flying kite in the park.
Young couple cutting white cake at indoor celebration.
Giraffes walking around outside in a wildlife park.
a toilet sits inside of a cramped bathroom
Two people on hard ground throwing a frisbee.
A passenger sign on the tracks at a station.
A young man riding a skateboard through a  puddle of water.
A group of people enjoying a cake and pizza.
A herd of zebra standing on top of a lush grass covered field.
Two skiers are going cross country in opposite directions, one taking the high road and other the low road.
A toy chicken standing beside a flower vase.
Cat sitting on top of a chair near door.
a mixture of vegetables including broccoli and squash
a little kid that is standing next to a suitcase
Two pizzas being placed on top of a column of plates with an employee checking the pizza on a stone stove.
A man petting a cat that's sitting on a kitchen counter.
Man posing in front of a pair of giraffes in background.
Plated lunch with condiments and utensils on dark table.
An old cellphone stand next to a mug and a statue of Jesus.
a tie on a pole outdoors in a field of grass
The meal is prepared and ready to be eaten.
There is a clock on the side of a building
A family posing on skis with a young child in the snow.
This is a portrait of a bench next to the ocean.
Two sheep standing next to each other in the snow.
Three people in uniform cutting a cake with others watching.
Three bikers in a busy street riding in front of a bus.
A woman bundled up in the snow skiing.
A person in a purple shirt standing on a couch playing wii
A silver train traveling down train tracks next to two men.
A chick is siting on the edge of a bathtub.
The man has just thrown the frisbee in the air.
a group of zebras grazing on dry grass in a large field.
A man returning a tennis ball in a tennis game.
jockeys riding horses in a fast horse race
A seaboard soars majestically over the green-blue ocean.
Two men holding hands while holding a snowboard
a man standing on top of two horses
a woman wearing a wig holding a tennis racket
Three donuts are on paper next to a coffee cup.
A man skiing is doing a rail grind.
Male surfer in wet suit, just thrown off surfboard at the peak of a wave.
Red Oral B toothbrush in a blue cup.
A pitcher, batter, umpire, and other baseball players on the field
Two menus sit atop some colorful decorations next to a green box with lights on it in front of a restaurant.
three people sitting on a motorcycle in a street
Little girl holding up a sheet of uncooked rolls by oven.
two elephants in a encloseur at a zoo
A colorful chain with a note attached is wrapped around a parking meter's post.
The child is jumping on the beach above a body board.
Street signs on the corner of Fillmore and Filbert
Baseball players take various poses as a ball floats above the pitcher's mound
A sink and toilet in a bathroom being remodeled.
two ripe fruits on the floor ready to be eaten
A woman is playing tennis on a hard green surface.
Young boys playing soccer trying to kick the ball.
Two giraffes stare at a crane from behind a fence.
a bunch of cupcakes stacked up on trays
A person reaches out to pet a pony.
a close up of a cat sitting at a table
A cat's head sticking out of a leather bag.
A glove laying on a stuffed animal in the grass
A woman is walking through the park texting on her cell phone.
A table has potatoes, carrots, onions and broccoli.
A group of lambs standing in a grassy field.
A boxed lunch with a sandwich, veggies, fruit, pickles, and a dessert.
A man sitting at a table at a diner with a basket of food in front of him.
A microwave oven with a plate of nachos inside of it.
Cat sitting on top of a person's computer.
A women in mid swing hitting a tennis ball.
Two plates of food in front of two dogs.
A baseball player hitting a baseball with a bat.
A cluttered desk filled with monitors and various items.
A clock on an outside information board with snow all around it
Four pieces of a television remote disassembled or taken apart.
A couple of police officers in the middle of a street.
A white paper topped with square slices of pizza.
A spoon is resting in a bowl of cooked noodles and vegetables.
Three guys are in the kitchen together preparing some type of meal.
A device fashioned to look like a yellow car sits atop the desk blotter.
a bowl sitting on a table with flowers inside of it
A person riding a wave on top of a surfboard.
A black cat with a conspicuous look on its face in a bag.
Two hot dogs in cardboard plate one with pickle and the other with cheese.
A reproduction steam train waiting at the station
The man watches the little boy on the surf board.
a black and white photo of children siting posing for a photo
A little boy reading his book on top of a toilet.
A clock sitting next to a brick sign under palm trees.
A person on snow skis is pulling a rope that is attached to something heavy.
A man and woman toasting with martinis with olives.
a public transit bus in a field with a sky background
At least nine giraffes live in the enclosure.
Four boys with skateboard relax by an iron fence.
there are many people gathered here in the snow
Several people interacting in a spacious living room.
A blue and aqua colored train and people on the platform.
A group of brown horses standing on a snow covered ground.
The clock tower stands tall and reads almost five-o-clock.
A toilet, shower, and sink in a bathroom.
A young man on a skateboard near a half pipe
A picture of a modern looking kitchen area
A row of parked jetliners sitting on top of a dirt field.
A vehicle pulls up next to a building.
A train on the tracks under a walkway from one building to the next
A baseball player holding a bat during a game.
an orange caution sign stating fresh oil in the street
A lady is playing tennis game in a tennis court.
Two surfers carrying their surfboards in the sand at the ocean
A Macbook sitting near a clock and a lamp on a desk.
A couple of men in skies on a snowy slope
A little girl holding a baseball bat on a field.
A dog and a cat laying on some platforms.
A person holding a cellphone that is opened upright on a table.
A man in a suit standing in front of bookshelves.
The woman is playing tennis on the court.
A fire place sitting below a brick and plaster mantel.
A group of men in hats next to planes on a runway.
a lady on a horse and people taking a photo
A person is watching animals in the wild with a camera.
A stop light is shown over a road.
A train is going down the track under a bridge.
Here is a compact kitchen that uses it's limited space well.
an image of a group of people outside for an event
A subway train is parked at the station
A jet plane flying through clear blue skies.
A table topped with food and a remote control.
A clock tower with elaborate details decorating it.
People riding motorcycles along a street with a lady riding on the back of one giving the peace signal.
a plate of food with a banana and a sanwich
A large long train on a steel track.
A man hitting a tennis ball with a tennis racket at the tennis courts.
Several people walk up a slope as others are coming down at an intersect.
a man and woman are sitting on the back of an elephant
A refrigerator sits in a temporary spot in front of a doorway.
A herd of giraffes and two zebras are grazing in a field near a fence.
a bus stop with a white bus picking up lots of people
Something outside the window has captured the dogs attention.
The woman sitting in a red chair is smiling while holding a cell phone.
a person riding skis on a body of water tethered to a boat
a person riding a surf board on a wave
A man holding a device and a coke bottle in a clearing in a wood.
A large jet sits at the gate at the airport.
a close up of food on a plate on a table
A person hitting a tennis ball with a racquet.
Four people standing on balcony and a parking meter
A white toielt with a standing rail in front of it for support
Three backpacks loaded with a variety of stuff sitting on a tile floor.
a bright yellow 'watch for rocks' sign in front of the blue sky.
People are riding bicycles and walking across an intersection.
Two men jumping in the air across sand to catch a frisbee.
Scissors and material being made into small purse
People in a stadium watching some men play baseball.
A man has his hand up to his ear as he walks past a bridge.
Three people are cutting into a yellow dinosaur cake.
some fireworks in the air above a clock tower
A red stop sign sitting on top of a yellow gate.
A man with black suits next to a surfboard
A person with dark hair throws a frisbee.
a bike that is parked next to a brick wall
Yellow fire hydrant in between two blue posts.
A row of boats on a beach with a dog near the boats.
A blue and white train pulling up to the train station.
a woman eating out of a small bowl next to a computer
A woman with her hand on a blender on a bicycle
Young women playing a game of softball in the hot sun.
An old photo of a group drinking in a restaurant.
Some sushi rolls, apples and vegetables are in lunch containers.
A boy holding a Frisbee on the beach.
Two small children hiding their faces behind umbrellas.
a boy wearing shorts and tennis shoes riding a skate board
A man throws Frisbees in to the dark
colorful umbrellas and chairs in the sand on a beach
Birds sitting on wires are silhouetted against the yellow sky.
A desk with a computer, office items, and CDs on it.
a custom motor bike is parked on some gravel
A street sign in grass with building in the background.
A man in a shop working on some motorcycles.
Animals walk around a grassy area together.
People walking on a snowy road in a village
a person sitting on a motorcycle on a city street
A man in a hat and sunglasses eating a banana.
A pastry of sliced banana on a white plate.
A plate with a sandwich and fries on a table.
A very tan man driving a wooden boat on the open water.
A close up of a woman smiling while looking at her cell phone.
A sign that says stop under a red light.
A bay view with a city in the far distance.
A herd of horses in a rocky field.
A woman wearing red with a red purse while holding her cell phone.
A black and white picture of a man wearing a turban walking down a street.
Several vehicles providing ground transportation are shown in the photo streetcar, tourbus, classic car and family cars
A white fence in front of a house next to a yellow fire hydrant.
A clock tower is on the side of a building.
The sign on the pole says Wall Street.
Two skiers sitting on top of a snowy mountain.
The bird is an owl flying low above the grass.
A dog and a little girl riding a tricycle.
A semitrailer truck as seen in its outer rear view mirror
A skier taking a leap off a pile of snow.
single guy on a skate board skating on a roof top
An underneath view on a beach umbrella with a table to the side, and some people in rows of chairs on the beach.
a laptop placed on a wooden table in a room
A large crowd is watching a baseball game.
A man riding a wave on top of a surfboard.
Children in a room with many beds
A fire hydrant sitting on the side of  a road.
A red and white airplane is on the runway.
Two people sitting at a table across from each other.
A group of people at the beach flying kites
the man is holding on to a small boat craft in the water
A man standing in a sport coat and looking down at his hands as a woman passes in front of him.
A covered horse grazing on grass while being fenced in.
People are sitting on the ground petting a cat.
A train on some train tracks near trees
A train passing by fields and greenery on a track.
Two zebras on top of a dirt terrain.
A group of people standing in line to get on a red bus in the city.
a plate that has some food on it
A surfer crouching in to a choppy wave
A empty, set table in a modern style kitchen.
a table with some food and beverages on it
Two gentleman in suits smiling and posing for a picture.
A young boy eating out of a can.
A baseball player standing on a field holding a baseball bat.
a cargo train being led by an orange and black engine
Two shaggy white sheep together in a fence.
A young boy dunking a basketball into a yellow hoop.
A person with their pants down next to a smart phone.
A couple of people sitting on top of a bench.
A stop sign and several other road signs attached to metal posts.
A produce section of a grocery store with a wide variety of fruits.
There is a gray cat sitting on top of a gray luggage
Taco salad bowls full of taco salad and a salsa container.
Cat sitting on a bookcase intently watching out a window.
A child's hands holding a fresh orange with a leaf and twig attached
A could people stand around a food truck to get their dinner
A cat is standing on a desk in front of a computer.
Two men standing on either side of a pink inflatable object.
some zebras are standing on a green hill and rocks
A man holding food and smiling with a full plate of food on a table.
A man standing along side of a truck trailer.
A man with a superman custom under neath his clothes posing
A man riding skis across a snow covered countryside.
a cat laying on the keyboard of a computer
A large wall clock on a white wall.
Pair of kites flown on grassy area with several onlookers.
A person on a racing motorcycle making a sharp right turn.
A man holding a pizza above a table filled with bowls food.
A man is sitting on the couch and watching TV while holding the channel selector in his hand and a black guitar is sitting in a corner.
a man and woman cut into a wedding cake
A man playing tennis prepares to hit the ball.
Two women are dancing with video game remotes.
baseball player swinging metal bat at home plate.
A microwave that is wrapped in plastic and is inside of a larger piece of furniture.
A couple of people in the water with surfboards.
a strawberry pie with whip cream and strawberries on a green plate
A couple of men on hot rod motorcycles parked in a lot.
A small bathroom features a small sink, toilet and mirror.
Close up of the over-used bristles of a tooth brush
An open white  box of assorted decorated doughnuts
A little boy swinging at a pitch during a baseball game.
A group of boats that are sitting in the water.
Two giraffes are in the enclosure surrounded by a group of people.
a dual screen computer on a desk in a room
there are three giraffes embracing in the wild
A man with a hat in the air with a skateboard.
A table topped with breakfast food and a cup of orange juice.
A hand holding a mouse next to a laptop on a table.
Young girl with brown hair and a flowery blue hat in kitchen looking downward
a man on a surfboard in the water
Many pieces of luggage sitting neatly beside one another.
A ripe banana, a pear, an orange and a strawberry.
An art exhibit with two chairs and a blue vase.
A flower pot that is sitting on top of a chair.
Three older individuals with luggage, standing near a sidewalk.
An old red VW van sitting on the street
Two different slices of pizza on a plate.
A microwave in a puddle with leaves scattered around it.
Two people run for the Frisbee in a local park
A man kissing a woman's forehead while laying in bed together.
A dog sitting with a woman looking soulful
A man riding a wave on a surfboard.
Two children in blue shirts squatting under an umbrella.
A monster size truck moving down a quiet city street.
a young man playing tennis on a sunny day
A white cat sitting on top of a woman sitting on a couch.
A multi colored train riding on the tracks
an image of a cat that is playing with a pair of tennis shoes
Professional baseball player hold a bat and scratching his armpit.
a bunch of cars drive in different directions on two sides of a street in a city
Giraffes huddled next to a tree in their natural environment.
A passenger jet taxiing on the tarmac of an airport.
A cow inside a brick building with people looking at it through the door way.
Some baseball players sitting in a dugout watching a game
Skateboarder jumping off his board on a concrete course.
A chapel filled with benches, a book stand, and other accessories.
A cute little dog sitting on top of luggage.
A young boy playing with a toy oven with a fake plastic sink.
A dog inside a pin wearing a hat.
Laptop computer sitting on top of a table in a personal office.
A boy in a grey sweater is holding a blue kite with a whale picture on it.
A small dog sitting inside a red duffle bag next to a frisbee.
A woman crossing the street in the rain.
Small white toilet sitting in a small corner next to a wall.
A small family seated at a table in a pizza parlor about to enjoy a meal.
A large grey elephant walking through the middle of an auditorium.
A cow in a barn cage looking towards a camera.
A boat with a long cabin sits in the water close to shore.
A baseball player swinging at a ball with a catcher and referee behind him.
A white bathroom sink with a crack and a mirror.
A baby elephant walking into a pool of water.
A bathroom with a sink and several towels on the counter
A batter poses with a bat over his head.
A grass umbrella and two chairs on a tropical beach.
This bathroom has a toilet, tissue roll, bathtub, and two towel racks.
a couple of men are standing on a snowy mountain
Wearing a red shirt, a surfer rides a wave on a white surfboard.
A baseball game in action with a man at the plate with a bat.
A toilet that is next to a bathtub.
Men in army shorts on skate boards near ramp.
A couple play tennis on the tennis court.
A brown plush teddy bear holding a heart
A small boy holding up a tennis racket
A picture of a vegetable that is starting to grow.
an open book laid on top of a bed
A person that is on his computer on a table.
Orange placed in bowl next wet marsh land
A man swinging a tennis racquet at a clock.
An outdoor garden area with verdant plants and a tree.
A purse has a cellphone located in a side pocket.
A woman hitting a tennis ball on a court.
Several skiers are standing on a snow covered area.
a couple of men ride on some horses as they race
busy city  showing a big blue moving truck with graffiti  on it next to a  white van.
A group of young boys standing on a lush green field.
Black and white photograph of a busy city beach
A laundry room in a dimly lit place.
A woman in a yellow apron ties the top of a bag of popcorn in her concession stand.
A red train engine sitting next to a tree.
Four zebras stand in a meadow in the black and white photo.
a close up of a person holding a hot dog
An individual on a kayak riding through waves of water.
A plant sits on top of a refrigerator in an empty room.
A woman sitting next to a child on a large grey teddy bear.
A elephant that is standing in the dirt.
A sandwich on a white plate on a table.
This is a picture of two bowls in a restaurant.
an ostrich walking sneakily towards a couple of zebra
a woman rides on a bike down a street
A big pile of building material is placed on the floor in the wooden structure.
A man in a business suit in an office building.
A man that is on a bike next to a woman.
A group of birds sitting on a horizontal pole.
This is someones couch in their living room in their home.
A baseball player is bunting the ball at a game
A piece of cake with a dollop of cream filling next to it.
An outdoor swimming pool has people in it.
A picture of some people holding a sign.
The bathroom is white with the shower curtain open
A television playing on a desk in a room with colorful art on the walls
A man steering cattle in a water puddle.
an image of a couple that are on the couch
An escalator with a guy standing a kayak next to him.
A group of people sit next to each other on a bus.
A child sitting at a table smiling with its eyes closed .
A desk that is cluttered and has two laptop screens.
A skateboarder reaches the top of a ramp.
Several boats filled with goods sitting in the water.
A table full go delicious meals, the closest being seasoned shrimp over broccoli.
A living room with a covered couch and coffee table.
Man on a snowboard going down a hill.
a young boy standing on a surfboard at an amusement park
Sunset scene with surfers coming out of the water
A dog is staring out over a body of water.
a boy is looking at his cellphone in a bathroom
An adult teaching a small girl how to play tennis.
A man holding tie devices in his hands while he looks at his laptop.
A body of water filled with lots of boats.
Many people are waiting with bags and possessions.
I am unable to see the image above.
A red bus is at a bus stop.
A sheep standing on the side of a lush green grass covered hill.
A bear climbing across limbs and fallen trees.
Man in black uniform holding a soccer ball in front of a net.
A bathroom with mirror, lights, sink and bath tub.
A man standing in a kitchen holding a bottle of ketchup and a hot dog.
A woman is reaching for the ball on the court.
A green street sign near a palm tree in a city.
A giraffe is looked at by many people on a balcony.
A naked baby lays on a towel in a bathroom and chews on a toothbrush.
A dog lying on a couch next to a computer.
Several glazed doughnuts in a white box container.
A fire hydrant that is sitting on the sidewalk.
Man in black and white uniform swinging at a baseball.
A living area with a television and various places to sit.
there is a police man riding a motorcycle on the street
A red fire hydrant sitting beside a lake.
Two trains traveling along a snowy railroad track.
A woman sitting at a table in front of a pizza.
A bird is taking flight during the day.
A fire hydrant is partially under a tree.
A child on a snow board stands in the snow.
People are riding on bikes on a road after it has rained.
a man riding a wave with a colorful surfboard
The zebras are eating grass in the field.
a child practicing his bating in a batting cage
A dirty bathroom stall with white toilet and papers
various pieces of pottery lining the shelves in a workshop
A blender on display next to some small glasses.
He rides his motorcycle through a narrow alley.
A black and white photo of a dormitory with several beds in rows.
Person in yellow shirt playing tennis on a court
A man in red jersey standing on a pitchers mound.
A man riding on top of a brown horse while wearing a hat.
Two giraffes are standing amongst a bunch of trees.
There is a woman that is riding a bike
A large metal clock hanging with chains from a roof.
A man pushing a surfboard with a small boy standing on it
A very plain and dull bathroom that's in someone's house.
a man walks next to a giant bike piled high with garbage bags
A man in white shirt riding a skateboard down a hill.
A blue street sign sitting on the side of a road.
The young catcher in black is throwing a baseball.
a food dish containing red peppers, broccoli potato and chicken.
thERE IS A CLOCK IN THE MIDDLE OF A LARGE TRAIN STATION
A cat sleeping on top of a blue towel.
a squat down toilet with a door
there's a white building with gold trim and a clock
A woman is raising her hands at her desk.
A small child sits in front of a decorated cake.
a collage of photos with a child near a cake
Male and female at a party celebrating in front of balloons.
A woman and a man flying a kite against a city background.
A white sailboat floating across the ocean over waves.
A bus is parked on the corner beside a large stone building.
A train loaded with cargo crossing a bridge
A photo of a bed that has been made,
A flock of birds floating in the ocean next to a cement wall.
A man reaches under his leg to catch a frisbee.
A bus driving on a brick street
a lady in a canoe with fruits and her personal items
A display of apples and tomatoes in their own crate.
adult and baby sheep walk across a field
A train on the train tracks surrounded by greenery.
A city street has a fire hydrant, trash bin, and parked vehicles.
a stove with a pot cooking tomatoes and another holding a strainer
A green and white bus parked in front of a small building.
a person riding skis in the middle of a snowy street
a group of birds sitting on back of a bench
A stop sign on the side of a street
Multiple white cars passing next to train at a train station.
a city street with a car and traffic lights
A group of people that are standing in front of a surfboard.
A stern man is speaking in the center of a political rally.
Some zebras are standing in the middle of a grassland.
A woman with soccer ball playing with two boys next to a fence.
An airplane in a very bright blue sky.
A man getting ready to hit a ball in baseball.
an image of two zebras side by side
A man laying stretched out on the back of a boat.
A country pasture with cows, grass and trees.
The woman is laughing as she gets ready to eat the sandwich.
A baseball player is in the outfield of a baseball field.
Black and white photograph of a man with an umbrella.
Two boys sitting next to each other holding stuffed animals.
A plate topped with three donuts next to a cup of coffee.
Two men are talking to each other during a presentation
A woman performing in an arena with her horse.
A man is playing Frisbee with a group of other people.
A woman standing with a cell phone in her hand.
A plate of fruit with bananas oranges and other fruits.
Old black and white image of a man starting an airplane propeller.
white and green street signs at an intersection next to buildings
Two cows in a field are staring at a motorcycle
A white faced clock with roman numerals surrounded by a painting.
Two men on a dirt path in a grassy field.
Several different types of apples sit in white bins.
A person is leading a horse with a saddle down a beach.
a small child holding a tennis racket with two hands
A man surfing inside a half pipe wave.
A giraffe walking in a grassy area with a tall bird.
A boy and a girl play on the Wii gaming system.
a person taking a photo in a bath room mirror
A kid in black glasses pretends riding a red motorcycle.
Skateboarders are attempting tricks in a concrete skate park.
A skateboarder is in the air as he performs a stunt.
TWO OF THE SAME PICTURE OF A BLACK DOG BY A WOOD CHAIR
Two teddy bears sitting next to a plush hello kitty.
a male is wearing a white shirt and black jacket
Sculpture fashioned to look like a cat holding a pole.
A hand made felt sloth with a button nose.
A group of motorcycle races flying down a race track.
A fridge in the kitchen of a house with blue walls
a lady on her bed with a laptop smiling
An old fire hydrant in the middle of the woods.
a man on a surfboard riding the top of a wave
a white counter top in a home kitchen
A flowered plate of meat and vegetables on a flat surface.
A brick patio with a bench and flower pots.
Multi-colored miniature stuffed bears that appear to float at the ceiling.
a little girl is dressed in a uniform outside
A man at the beach leaping in the air to catch a frisbee.
A man is paddleboarding in the ocean on a cloudy day
An unkempt bed, with a pillow, a blanket, and a book on it.
a plate with a sandwich on it with a side of salad and ketchup
a person sitting on a couch with a cat
A young boy getting ready to fly a kite with his father on the beach
A slice of macaroni and cheese pizza on a plate.
A table filled with several different camera's and people sitting around them.
A kitchen and dining room area with a fireplace.
a couple of stuffed animals sits on a street corner
A variety of food dishes are shown on display.
A train station with an incoming or departing train.
A girl sitting on a stone wall and eating.
Two women are on an advertisement on the side of a pink bus.
a young woman walking on a sidewalk next to a firehydrant
A dog laying in a room near a television and dresser.
Baby elephant standing in the grass beside a truck.
A small, green bathroom with a sink and a toilet.
A person is flying off of they're skateboard
a family in the living room playing with a wii video game
A man sitting in a chair drinking something out of a cup.
Man man setting up a network inside a business.
Two men are sitting at a world economic panel.
Flowers are in a vase on top of a table under some pictures.
Woman in white jacket holding a snowboard in the snow.
a couple of people that are playing a wii
A bunch of ceramic containers that are on a shelf.
A man posing for the camera on his skis
A plate of food that includes broccoli and white dough balls.
A model train countryside scene with a bridge and plants
A makeshift bathroom is equipped with a foot landing and a tiny hole for eliminating.
Bathroom with destroyed walls, a sink and a mirrored cabinet.
Smiling woman standing with luggage in front of her car
A television sitting on top of a television stand.
A group of sheep are being herded by a dog as people watch.
Commode with unusual bowl displayed in bathroom stall.
a teddy bear wearing a red dress and shoes sitting in a chair
A couple of men playing a game with remote controllers.
A laptop with a phone sits on a desk.
Corner kitchen with refrigerator and counter space next to table
A street scene where a vendor is standing and some ladies are doing window shopping.
Three double decker buses are parked outside of a building.
Two fire trucks in front of the station.
A group of different mopeds sitting in the street.
Six men standing on stairs in front of building with large columns.
A white swan standing on a lake next to small waves.
Clocks on the face of a building below a steeple.
A person with a toothbrush in their mouth with a baby.
One man leaning on a parking meter talking to another man.
A man with takeout sitting on the floor watching television.
Woman sitting on the bus with her dog next to her in other seat
A female professional tennis player preparing to serve the ball
A couple sitting together on a bench in a park near water.
A city intersection with several street signs and instructional signs.
A stop light and a home built chair on a brick floor
several people i the water para sailing near the beach
A train is pulling into the station beside waiting passengers.
a group of tennis players chatting with one another
A red train traveling down a track driven by an engineer.
Two signal lights displaying the 'red' stop light.
A bus that is sitting on the side of the street.
A person in a room with a television and a fireplace.
A close up view of an open laptop in a room.
a desk with a monitor and some remote controls
Family poses in front of their house with horses next to them.
A man in a suit helps a smiling boy straighten his tie.
Green apples, lemons and oranges are in a sink.
A view of a kitten sniffing a pair of high hill shoes.
Two people sitting on the back of a horse carriage.
A woman sitting at a table painting brown vases.
A person biking in a roller skating lane during sunset.
a little girl that has a big doughnut in hand
A serving of meat covered with gravy and a side salad on plate with utensils.
Two cats that are looking at a camera.
A grey and white cat watches a cup of tea brew.
An elephant peers through a wired fence as far as his tusks will let him.
Two men working in the back of a pickup truck.
A woman holding a red umbrella in the rain.
A commercial airplane with the door open and people walking in.
Black and red bird standing in front of a caged in area.
A man with a piercing in his left ear smiling.
Some cars are stopping at a stop light.
A couple of women standing next to a couple of soldiers.
A women sitting in front of several laptops looking at her cell phone.
A tennis player poses, racket in his right hand, left arm behind him.
A group of people riding horses in a line along a trail.
Three horses are in a pen and they are blind folded.
A red fire hydrant sitting in the middle of a green field.
A hand holding a pair of scissors next to a chair.
Some people that are hanging outside my car.
A plate full of food with potatoes and cheese.
an airplane is flying past a large city
A cow standing in the grass with a tag in its ear.
The legs of a person resting on a train with a backpack nearby.
Three people look at paper work in a hospital room.
A hot dog sitting on top of a bun in a wrapper.
a close up of a plate of food with broccoli
A blender with a mixture in it sitting on a counter.
Several different kinds of donuts on a tray.
A bunch of airplanes parked at the airport
A person snowboards down a large snowy mountain.
Two men and two brown horses pulling a cart in barn
A man plays tennis on a tennis court.
A lonely zebra galloping through a wildlife enclosure.
A little girl puts something into her mouth while looking at the camera
A small very neat kitchen near a bedroom and another room.
A man paddling a surfboard on a lake.
A wooden bench on the side of a trail has a backpack left on it.
A man drinks wine while another man chops vegetables.
A bunch of street signs sitting on the side of streets covered in snow.
a train traveling on an elevated train track.
A bridge and clock tower are lit at night.
Someone takes a photo as they stand in a bathroom, near the mirror
A  train that is parked in front of a large cruise ship, with a blue crane next to it.
A motorcycle stands in an exhibits beneath some roofing.
Many flat bottomed boats on a swampy river.
A computer station with monitor, keyboard and personal items.
A bathroom with a double-sink and some mirrors.
A baseball game in progress with the batter starting to run.
An industrial type bathroom with an open shower.
A yellow dump truck that is near a building.
A boy with a kite in his hands in a grassy field.
a woman standing outdoors with a cat on her shoulders
A LOAF OF BREAD IS ON THE TOP COUNTER
A black and white picture of dunes, two benches and a trash can
a group of men play soccer in a dirt area
A skier skiing down a slope wearing a dark snow suit.
a male skateboarder in a white shirt doing a trick
The woman is talking on her cellphone while walking down the street.
A young man riding a bike past a car while talking on a cell phone.
A motorcycle sitting on top of a wooden book shelf.
A man and a women who are running toward a Frisbee.
A black dog running across a green field with a frisbee in it's mouth.
a dog that is sitting in front of a frizbee
The reflection of two men in the mirrors of a public restroom.
A cake donut sitting on a plate at a bistro.
A boy pouring some drink into a cup at a counter.
a group of giraffes sit inside of a caged area
a man is sitting in front of some food at a table
A bird flying into the side mirror of a red vehicle.
A skier is posing in front of the sunset.
A large sheep grazes at a countryside farm.
a photo of a man over a table of food smiling at the camera
A man kneeling down next to two large dogs.
A landscape of some mountains with a plane flying above them.
A pan filled with food sitting on a stove top.
some baseball players are playing a batter and catcher
Men with suitcases at an airport ticket counter.
Man in the motion of running and throwing a frisbee from his hand.
Man with a yellow jacket riding a scooter.
Man standing holding a remote control towards a component.
A plate with a brownie and vanilla ice cream.
A family gathers around a table with cake and beverages on a deck at night.
a number of people holding surf boards close to one another
a man is standing in front of a table
An old classic red truck is parked in front of bank as a man stands near the window and a woman stands in the background.
Two elephants walking near a pool of water and a forest.
A woman carrying a cake with lit candles towards a young boy.
A man making a phone call has no shirt on.
some white sheep are eating grass on a hill
Two girls posing for a picture with painted on neckties.
A clock and two vases sitting on a small table.
A woman with a stuffed animal on a train platform
a glass of wine a table with dishes of food
A man grabs the back end of his snowboard as he soars off a jump.
A crowd of people are standing in line.
A cake cover is made to look like a wire birdcage.
Back view of three men on a baseball field.
A boy is hugging his stuffed animal toy
A construction working holding a stop sign while standing in the street.
Two ladies are riding horses on the beach.
A baseball field filled with players and an umpire.
A white bowl filled with lots of ripe bananas.
A man taking a swing at a tennis ball
There is a country styled kitchen with wood flooring and white walls.
a plane at the airport landing and people besides it
An office with a desk and chair with the door open.
Three urinals in a restroom each urinal is at a different height to accommodate adults and children.
a man riding a snowboard into the air.
a person holds a horse that stands on some beach
A man looking into a refrigerator door for ingredients.
Someone holding on to a dog collar while the dog has a frisbee in his mouth
a big window showing the reflection of a building across the street
The men are going to ride their bikes in the dirt.
A white polar bear is laying his head on his paw.
Sandwich sitting on a plate next to a glass of juice.
A living room with a Christmas tree beside two couches.
A man in a courtyard reaches out to catch a Frisbee.
A red and white stuffed animal with a tv remote in a bed.
The bathroom has a toilet, sink, and mirror in it.
a baseball player that is at home plate with a bat
An overhead view of a man sweeping the street by a sidewalk.
A black and white dog looking out a window.
A black cat with white paw laying in a hanging cat bed.
A woman with long blonde hair wearing a men's neck tie.
At the birthday party there are plenty of snacks.
A man jumps to catch a frisbee with two hands
A dog is asleep on a white blanket.
A group of giraffe standing next to each other in front of a building.
a black and white clock on a pole a building and a flag
an old and nasty bathroom with a toilet and shelf
A person lying on the ground posing with a snowboard.
a man and a woman along with a baby sit an watch a lap top
A duck and elephant stuffed animal sitting next to each other.
A cat is sitting on top of a toilet seat.
A person holding a hot dog with yellow mustard and onions on it, at a sports stadium.
The back of an Apple iPhone with the front on the table.
A woman holding a container while milking a cow.
A bird sits on the thin branches above colorful leaves.
A herd of elephants walking along side of a river.
A man wearing red is skiing down a hill.
A red train with cars traveling with a mountain in the background.
Some people in a very big area flying some kites.
two guys play firsbe on a grass field
a red double-decker bus next to a bus stop.
a group of people that are posing for a picture
A historic clock tower turret still keeps the time.
A lounge with chairs, shelves, and a fireplace
A living room couch with a display of large mirror and flowers.
there is a large truck that is carrying many things on it
A window in a room with different shelves nearby.
A woman buys a bunch of bananas from another woman.
A salad and a partially eaten sandwich on a plate.
A man looks at what he is currently holding in his hand.
A white cat that has yellow eyes looking straight ahead.
Two zebras are battling each other on hind legs.
three surfers wearing we suits are riding the same wave
A city sign that is underneath a stop light.
A woman in a striped shirt in the kitchen next to the fridge.
A man wearing skis standing in a victory pose.
There is a truck parked on the side of the road.
a man reading the label on a food package
A person who is standing up holding a frisbee.
A couple of kids standing next to each other.
a man that is standing in front of a stop sign
A person reaching for a wii controller
A chocolate cake sitting on a plate with ice cream
A square white plate is holding a vegetable heavy entree.
There is elephants both young and old on this African bush land.
a baby and a bear play on a sofa
A giraffe in the middle of the street blocking traffic.
people with a carmel at the beach playing
a plate with some pizza, salad, and some sauce on it
A baby sleeping next to a brown teddy bear.
Modernistic couches and chairs surrounding a big-screen television.
There is a box that has a lot if wired inside of it
Two guys are sitting at table.  One is looking at a cell phone and a computer.
A living room with blue seating and wooden tables and cabinets.
A refrigerator that has its door closed and then opened.
There is a person sitting at the tablr
A woman laying on top of a bed in red shoes.
three zebras standing next to each other looking into the camera
A man laying on top of a sandy beach laying next to a surfboard.
A snow boarder jumping off a ramp at night
A small boy laying on the ground with a large stuffed animal.
A BOY WITH A BLUE SHIRT AND JEAN PANTS DOING A TRICK WITH HIS SKATEBOARD
A few cars are parked in a parking lot at night.
An apple is being cut into slices on a cutting board.
A series of little weird cars in fron of an european arch.
A picture of a very green plant and red flower.
A large organ van is parked next to a smaller van.
A bathroom has a diaper changing table in it.
A woman standing on a blue mat with two broken tv's and a bat in her hands.
A white bridled horse carrying blankets in the desert.
a bunch of bears that are in cases
A person riding a wave on top of a surfboard.
A male surfer on a white board in the water.
A person lifts a slice of gooey pizza
A large white polar bear walking through the snow.
a close up picture of President Obama
A bear that is going towards some water.
A beach with flags in the ground and kites overhead in the sky.
Man sitting on the floor with a case full of pamphlets.
A man is poised to hit a tennis ball.
The building has a large clock on the front.
A group of people sitting around a table with glasses of wine.
A sandwich on a plate and full wine glass are under blurry lights.
A street lined with buildings and red double deck buses.
A large white parked airplane and some trucks
A white sink and toilet in a room.
Three black bears on rocks on the side of the river.
An online game player playing while two other men look on.
A bedroom with a desk, bed and entertainment center.
A group of boys playing with kites in a field.
A zebra eats grass with another zebra beside them and a third zebra nearby.
The man is riding his dirt bike on the street.
A sandwhich in a deli tray, with a soda and a book sitting next to it.
a lady driving a wagon with red spoke wheels being pulled by a horse
a pedestrian traffic light with street name and pedestrian crossing signs
A painting of a woman holding a Frisbee.
The table has meat and donuts sitting on it.
A man walking on the sidewalk with a cart that is piled with a stack of luggage.
A woman standing on top of a tennis court with a racquet.
A couch and a table in a room.
Two giraffes lick a branch on a grassy field.
A clock tower in an open space with decorative plaques under the clock.
A dog is hiding half under a bed with its nose and rump sticking out.
People posing with a white two door refrigerator
A long train traveling across a road on train tracks.
Cars are driving past two tour buses on the road
A calico cat lounges in a blue chair in a home.
A kitchen sink with kitchen utensils in containers.
A double-decker bus is parked in a large field.
Dessert on a white plate next to a silver fork.
A woman reaches to hit an approaching tennis ball.
Guy holding his mug why sitting in front of the computer
an iron bed with a hand made quilt on it
A stuffed pink teddy bear laying next to a doll in a dress.
some baseball players are playing baseball on a field
Two cows with big horns are on a dirt road.
a woman is taking her surf board out to the sea
a person on a skateboard in front of a car on the road
A ferry docked at a ramp with people exiting.
The two snowboarders are relaxing at the bottom of the slope.
Two men sitting on the backs of horses in a field.
a city bus drives down a city street
Two towels arranged in a heart shape on a bed.
a boy is holding a tennis racket outside
Several cows laying down in a hilly area near a body of water
A traffic light flashes green against the backdrop of a city.
A man with an umbrella and other pedestrians walk down a street.
Four people brushing their teeth in a bathroom.
Player preparing to return volley during major tennis match.
A bathroom sink with a towel rack, soap bottle and an air freshener sitting next to it.
A man is riding a skateboard over a ramp while wearing a helmet.
A group of guys playing basketball on a city street
Many different types of toppings on multiple pizzas.
a living room decorated in beautiful red, white and black oriental imagery with vases and scrolls
A giraffe eating something out of a persons hand.
a bunch of giraffes are in a large pin
A man holding a metal cup on top of a wooden table next to a window.
A cat standing in a laundry hamper looking down.
A motorcycle parked near a curb with a man on a bicycle riding by.
There are two men preparing their boards for a sport
A little girl standing on top of a wooden chair.
A man standing next to a pair of sheet while biting his clothes and holding a meat cleaver.
A zebra walking across a dry grass field.
a close up of two people talking on cell phones
A brown bear walking through an enclosure.
A cheese pizza made with mac and cheese and flat bread.
An airplane monument placed beside of a road.
A pile of ripe bananas sitting on top of a table under an umbrella.
An elephant and a bunch of cattle at a watering hole.
A man on his phone in front of his laptop at a cafe
A banana and some sliced cheese are on a cutting board.
A tan muscle car sits outside a home on a gravel drive.
Steamed white rice and a variety of dishes for lunch
Two people are riding on top of some elephants.
a man standing on a surfboard inside the water
a couple of men are playing tennis on a court
THERE IS A HORSE THAT IS EATING GRASS
Adult baseball player preparing to throw ball from infield area.
two men in the park playing with a frisbee
A very tall building that has a clock.
Large elephant walking forward down a dirt road.
a person riding on a horse behind a fence
a close up of a young child eating something
A kitchen with cabinets, wine glasses and a refrigerator in it.
A small bird sits among a bunch of branches.
Two bags are full of fruits on the table.
A person reaches to catch an incoming Frisbee.
A cat curls up on a soft and comfortable bed.
a train is preparing to leave a train station
A boat with equipment on it riding through a waterway.
Warning signs outside a fence at a transit station
The young person is jumping over the back of a blue bench.
A bunch of very cute fluffy sheep in some hay.
A clock that is on the side of a building.
A couple of men moving a large book shelf
A man at a podium with another holding an umbrella over him.
A smiling woman sitting on a motorcycle in front of a building.
an overhead view of many people on motorcycles
A cat on flora fabric with Obama on tv behind it
an old car sitting on the side of the road
There is a close up picture of bread and eggs
A red scooter is parked on the side of the road.
a bird on a beach with a ship in the back ground
A toll booth next to a highway at night.
An owl sits in the grass with his eyes shut.
a person holding a skateboard riding an escalator
A herd of sheep with two sherds moving down a road in the mountains.
A white and black motorcycle sits in a parking spot.
A flock of ducks swimming across a lake.
a woman clips her babies finger nails off
A kite that is sitting up against a house
a big brown bear with two young cubs
A peacock with very large feathers walking down a street.
A watermelon pound cake with icing with a slice taken out.
A cat is snuggled up in a black backpack sleeping.
Two motorcycles parked outside a building on a busy street.
A girl throwing her frisbee so her dog can go catch it
a bunch of cars sit parked down a side walk
A plate of food next to glasses and bottles of wine.
A woman standing on a beach, holding a kite.
A huge bathroom with a large window overlooks the ocean below.
A man in a suit drives his car.
A plate is piled high with a meat and broccoli entre.
Wine glasses and several items used in photography sit in a studio.
A couple of people standing on a beach holding surfboards.
A man para glides on the water near land.
A bear sticks out its tongue while climbing.
An array of vegetables including tomatoes, turnips and others.
a room with drawers full of books and a screen
People riding a sky lift watching others ski down the slopes.
Kids plastic tools and toys on a table.
A large living room filled with art pieces
a bathroom with a black counter and a big mirror
a living room with a big black couch in the middle of it
a white horse with a white cover and some grass
A woman wearing fishnet stockings sitting on a bed.
A bunch of people are sitting together eating pizza and talking.
A yellow finch perched on a white fence.
A crossroad displaying the signs for Creek Road and Amethyst Street.
A crowd of people are watching two teams of athletes perform.
A woman in a living room playing a game system.
Three people in the water, one of a surfboard
Two green freight trucks parked on the side of the road.
A brown stuffed animal dog with a black collar sitting in front of the mirror.
Bowl of oranges on a wood surface with more oranges on the side.
A shirtless man riding on a large motorcycle on the beach
A tennis player is being watched by a crowd.
The man has his hand on a rack of small yellow objects.
Man flying a kite from a roof top in an urban area.
A man in a black dress jacket is talking on a cell phone.
A very attractive young lady using her cell phone.
A dog is sitting under a bench outside
A little boy that is standing in front of a counter.
A cameraman taking a photo of a skateboarder in action.
This black and white photo was taken by water.
A man standing on top of a sandy beach near the ocean.
A black bear sitting on a rock surface.
A group of people surfing in some water.
A red fire hydrant is leaking onto a side walk.
A man dodging a frisbee flying at his face.
Three giraffe standing next to each other at a zoo.
A little girl standing in front of tall wooden doors next to a dog.
A man with a tie and glasses is by a house.
There are people camping and flying kites in a field.
A group of people standing on top of a dirt field.
A boy smiles at his friend while his kite soars high.
A baseball player that is standing in the dirt.
A cat is lying on the hood of a black car.
A couple of birds are flying over the beach
Two black cats are casually laying on a computer desk.
An assortment of food and four wine glasses.
a person is pulling apart a eggplant
Several streamers float above people on a beach
a person standing on skis on a snow covered slope.
A silver colored video monitor sitting on a gray table.
A stop sign and a no u-turn sign.
Skiers of all ages skiing down a slope and gathering at the bottom.
A boy and two girls taste testing different vegetables
A metallic refrigerator freezer sitting in a kitchen.
A cat that is sitting on top of a speaker.
One of the giraffes is peering into the building.
Large dog laying down on a blanket next to a table.
A crowd watches a batter in a baseball game.
A man standing in front of a pile of food under an umbrella.
A breakfast of bacon, waffles, and fried banana slices
A crowd of people sitting in a room on to of a wooden floor.
A white toilet sitting next to a white sink in a bathroom.
An umbrella and camera equipment sitting in the corner.
Giraffe standing tall in open grassy field with fencing.
A man on a surfboard on the waves surfing
The view of a busy urban area at night.
A family stands at the top of a mountain while skiing.
Man in a tiger suit in front of another man on the phone
A table with a chicken sandwich and a cellphone.
A view of a modern building with skylight and a fire hydrant.
A baseball team in the dugout preparing to bat.
Man in boxers on couch with two laptops
Three limes are next to a small bushel of bananas.
Four seagulls are standing in a line on a large logs in the middle of the sea.
A pitcher partly covers another baseball player during a game sponsored by Comcast.
The man is skiing down the snow slop.
A young boy eating a custard covered donut.
A sandwich on a white plate on a table.
The foreheads of two zebras standing side by side.
A girl on a bench outside a salon checks her phone.
A long commuter train passing by a train station.
A dog running in the snow with a Frisbee.
Four horses and a man with a hat sitting on one.
A plate filled with meat and different kinds of vegetables.
a living room area with a two-person couch and various living room furniture
there are many different donuts on a yellow plate
A white plane getting ready to take off on a runway.
Two people in a small boat floating by some greenery.
Three adult and one baby giraffe standing outside.
A table topped with coffee cups and plates of food.
A bathroom with a toilet, sink and bathtub.
A man has laid out all of the items he plans to pack.
A ski resort area with various skiers in the snow and several in line on an automatic transport belt.
A bird on a table drinks from a tea cup.
A man standing in a dry field, holding a Frisbee.
A person on a cell phone on a street.
A bird floating on top of water in the rain.
A man in black jacket skiing down a hill with a kite.
A green suitcase sitting on a wood floor.
A bathroom area with a toilet, trashcan and tiled floor.
A skateboarder coming up out of a dry pool.
there is a small lap top surrounded by other things
A dog and man rest on the bottom of an overturned boat sitting on the bank of a body of water.
A picture of a man wearing a suite and tie in a picture frame.
a bride and groom a purple table and a purple and white cake
A group of people walk down the pa towards the beach
Two sailors are shown walking in a parking lot.
There is a dog wrapped up in a blanket.
People ski at a ski lodge during a snowstore.
Cross country skiers in a competition with number 33 in front
A white bathroom with a sink and mirror next to a shower.
a plate that has a table full of food
A person in a purple jacket is on a snowboard on a snowy hillside.
A surfer carries his board as he runs through the water.
a bath room with a toilet and a sink
A young baseball player is getting ready to hit.
Two dogs plays together on the ground in the dirt.
The passenger train drives around the curve of the tracks.
A man and a young woman walking down an alley way.
A man swinging a baseball bat at a ball during a game.
A man riding on the back of a parked motorcycle.
A birthday cake with a number one and three candle.
a child sitting on a car eating a hot dog
A large colored bird perched on a power line
A man with a surfboard walking along a beach.
A red double decker bus on street next to buildings.
A plate of Mexican food with beans and tortillas.
A gold and blue clock that is on a building.
a person laying on a bed while reading a book
The crowd of people are looking to fly their kites.
A toilet stall that is white all around.
a large green and yellow train on a track
Cars are parked on the street next to an old fire hydrant.
A close-up of a hawk with a group of people in the background.
A newborn foal nursing his mother in a corral.
Dump truck alone on road with buildings and bare trees and shrubs behind it.
Two men in the Navy cut a cake shaped like an aircraft carrier.
A woman riding a wave on top of a surfboard.
there is a plane flying very high in the sky
A crowd of people walk along a sidewalk near a busy road.
A person is standing at the edge of the water on a beach.
Adults and children gather near a dock on the beach.
The ride attendant watches over the wave park.
A residential street with large houses during sunset.
A street scene with two men napping on a bench, a woman walking, and two other men looking at their own reflections in a shop window.
A red stop sign targeted specifically at bicyclists.
A large plate is adorned with broccoli and a rather small piece of meat.
The man on a bicycle is using a cell phone.
An Asian lady in a red dress petting a small elephant at a zoo.
A small bird on a sandy beach near the water.
a close up of a plate of food with broccoli
Two people standing at a food truck placing an order.
A red stop sign next to a street corner.
A woman holding up a fairly large pizza.
A plate of vegetable stir fry with sauce.
A batter and catcher assume their stances as an umpire looks on.
A dirt bike rider is racing through the dirt track.
Two giraffes standing next to one another and interlocking their necks.
a white bathtub in the center floor of a bathroom with a sitting chair and a window with drapes.
A bus parked at a stop beside a small home
there is a man that is throwing a frisbee between his legs
A man eating a piece of pizza at a table.
a person walking holding an open umbrella
People and dogs sitting in a boat floating on water.
Two flowers are allowed to grow in a beer bottle.
Girls reaching for the  basketball in a gym
Ocean fairing ship near land seen passing markers.
a couple of bowls with some food inside of it
A man playing swinging at the ball during a tennis match in front of spectators.
A couple of single beds with a phone and remote control by them.
a small child stands in a tennis court, about to serve a tennis ball
a girl in glasses is sitting at a laptop
A wooden surface with three frosted doughnuts on the top.
A tiny suit case full of girl's doll clothes
a city bus parked on the side of the road
a person jumping with their skateboard by some stairs
An image of a city skyline taken at night.
a blender with mixed fruit sitting in a container
A shower stall set up with handrails and a seat.
A slice of chocolate cake is on a small plate.
Two photos of a living room- one without a ceiling fan, one with the fan installed.
Young girl walking up steps to dog at pier area.
a cat that is standing on a red chair
A man standing next to a truck parked on the side of a road.
a building with a large clock above an archway
Birthday cake with a three candle and six other candles.
A young precocious girl clutching her teddy bear.
Five baseball bats on a silent auction table.
Two brown cows looking at the camera.
Two people laying on a green bunk beds
A silver bin holding different kinds of vegetables.
An old, rusting, yellow fire hydrant n weeds.
A stuffed animal is standing on a table
A computer that is turned on with piles of paper to the side
A person riding a board through the air.
A dozen doughnuts sitting in a box and ready to eat.
Close-up view of skateboarders lower body performing a trick on a high wall.
A table with food and a drink on it
Skiers make their way down the trail through some trees.
a bird sitting on a shore next to a lake.
A motorcycle with a side car parked with other motorcycles.
Glass enclosed shower with white tile walls,brown floor
A man riding a board while hooked up to a parachute.
People are standing in front of a castle type building with an eerie gray background.
A small boy cutting out things from paper at a kitchen table.
A dog is sleeping on the bed and having fun.
A set of lights on a light blue motor vehicle.
A man is standing in the water next to a boat.
A small bathroom with a commode and sink, and empty corner.
The refrigerator, stove and microwave are on the same side of the kitchen.
a man on a skateboard performing a trick at a skate park
A cat sitting on a motorcycle that is parked in a driveway.
A dog leaps in the air to catch a Frisbee.
A man in a grassy field throwing a Frisbee.
Statute of a horse and rider on top of a block wall.
a black cat laying on a bed with a colorful blanket
Three trucks with lawn mowers in the bed and people near by are parked side-by-side.
A man brushing his teeth with a tooth brush.
there are two sandwiches that are on two white plates
A stadium full of people are watching a baseball game.
The large bathroom has two beds in it.
A group of people riding skis down a snow covered slope.
A man doing a trick on a skateboard in a park.
A man holds an umbrella and looks over a flowery hill to the sea beyond.
A man talking on a phone while standing on a corner.
A girl holding a tennis racket up with both hands
A woman looks over her shoulder as she pauses while cross-country skiing.
The stands are full as a man in a blue and white uniform holds a bat in front of a catcher and umpire.
A pug dog with a pirates hat licking a bottle.
A living room area with eclectic furniture and accessories
Two birds sitting on top of a rear view mirror on a car
A man surfing on a green surfboard in front of mountains.
The man on the grass is playing with his soccer ball.
a dog on a table on a porch
A bathroom with yellow walls and a picture of  man over the toilet
A vase filled with flowers sitting on top of a table.
a blender with a bunch of food inside of it
A pizza cutter slicing up a food item on a cutting board.
A wooden table topped with lots of veggies and greens.
A horse-drawn carriage ride stopped at the gates of a European castle with three towers.
A hummingbird hovers near a bird feeder.
A man in his ski gear is in the air.
A small bird perched on a metal bar next to a tree
They are selling a bunch of bananas at the fruit stand.
A horse walks through the grass near sand.
Dinner plate with prepared steak, broccoli and sauteed mushrooms
A man in a blue jersey swinging a bat on a baseball field.
Three horse drawn carriages in front of a huge house with a clock on it.
A view of a small plate of food with a orange.
Bananas and coconuts are sitting on an old fruit stand.
Guy in a hat flies a kite on the beach while other people are in the ocean
Group of kids eating some food on a table
A woman sitting down next to some bananas.
Men hitting ball with round discs near brick building.
A man that is standing up in a grass field and holding a kite that is over his head.
Flat pizza like object sitting on table with a person taking a slice
A photo taken within a sleeper car on the train looking at the window.
A man poses in a double-breasted coat with a fur hat.
Donuts in an open box on top of a table.
A bathroom with a sink and toilet next to tile wall.
A large commercial jet in the air with the landing gear down.
There is a woman sitting under her blankets
A plastic hand reaching towards a plastic toy blender.
a woman is sitting with a red guitar and bananas
A man is sitting on a chair holding a sign up
this is a man on a bike in the woods
A pizza with basil, cheese and tomatoes displayed on a table.
Young girl posing at table with cake lit with candles.
A cat is sitting on a cushion on a sofa.
There is a woman swinging at a tennis ball
A man on a skateboard is going down a ramp.
A jet airliner flying over a building with sky in background.
Four zebras at the edge of a lake with a multitude of flamingos in front of them.
This bathroom has a toilet and a duvet.
there are two men standing and playing a video game
A city block intersection with cars stopped on a corner.
A large herd of sheep standing near each other
A stuffed dog with a wizards hat on it's head.
Herd of black cows grazing on a hillside.
there is a skateboarder doing a trick in the air
a short yellow school bus parked between two cars
An man taking a picture of a sink through a mirror.
a man is riding a skateboard in a bowl
A giraffe with his head out of sight over a covering.
a toilet attached to a wall in a bath room
Hands putting motorcycle models onto a birthday cake.
That is using physical motions to play the video game.
a zebra standing alone in a pool of water
A group of people sitting at tables with paper and laptops.
A bunch of cows that are standing in the grass.
Looking out from under a frayed sunshade at a beach and water view.
A child brushing teeth in a blue sink.
Three sheep eating grass near a water source.
A boy is jumping off his skateboard a the top of a skateboard ramp.
The giraffes are bending their necks down to eat from the bush.
A woman with a scarf and sunglasses standing next to an human size stuffed dog that has an outfit on.
A silver mirror hangs above a sink in a bathroom.
A shelf containing books, stationery, and a clock.
Tourists riding in a British double-decker bus that is making a stop.
Man in yellow and black body suit on skateboard.
Child wearing a red jacket skiing down a slope near the trees.
A man in yellow shirt doing a trick on skateboard.
A skateboarder doing tricks in a half pipe at a skate park.
A lone woman stands posing in a large kitchen.
A young girl in a chef's outfit cuts raw broccoli in a kitchen
A man in a coat and tie and biker shorts carrying a backpack.
Several graduates call friends and family on cell phones.
A meal of french fries, salad, and meat is sitting on a table.
Black and white photograph of a skateboard with its rider leaping above it
An opened door to a bathroom with a counter and a tiled wall.
A peeled banana sitting on a wooden fence.
people standing around in the snow with some snowboards
A person gets ready to release a kite.
A tennis player in an orange shirt and black shorts holds black tennis racket on a tennis court surrounded by onlookers.
Two people in ski gear standing at the top of a mountain.
A tree filled with unripe apples in an apple orchard.
A busy New York city street at night.
there is a pair if scissors leaning on a rock and paper
a couple of men are playing video games in a room
An old man standing next to a forest of trees.
A group of men on skateboards on a ramp.
A crowd is watching horses go down the street.
A cluttered kitchen with white cabinets and tiled floor.
A man is who is kiteboarding on the ocean is airborne.
A red traffic light sitting on the corner of a street.
A photograph of papers and a computer at a desk.
A red and white sign reading "Whoa" and a red a white sign reading "Caution children at play".
A baseball player holding a ball and a glove.
a black and white photo of a person in a suit and a person in a dress
A yellow and green train traveling under signals.
A reflection of a dog sticking its head out a car window
Black container sitting on top of a white toilet and a bathroom.
A woman standing in front of a door with a broken surf board next to her.
Tennis player about to hit a ball in front of an ad.
A man reaching his arm to catch a frisbee.
there is a green bike parked by a red bus
People lined up on a sidewalk near a bus.
A bathroom featuring toilet paper hung from a chain.
a street sign next to a tree lined street.
A soldier wearing an Army uniform rides a regulation motorcycle.
Their is a little kid using a phone
A man walking a brown horse wearing a red blanket.
Two giraffes inside a building near a beam.
A cat sits on a wooden park bench.
A orange tabby next to some black birds
A horse drawn carriage going down a city street.
People are flying kites on a beach near the boardwalk.
A scooter with a helmet hanging off it's handlebars.
a person on a skate board does a trick
A man with a catcher's mitt reaches out to catch a baseball.
A child in a living room is swinging a bat.
Two small children are laying in a bed under blankets.
a tennis player hitting a serve on a court
A plate with meat, onions, gravy, broccoli and cheese.
Refrigerator and freezer are filled with soft drinks and beer.
A giraffe standing next to some tall building
A little boy holds a small dog while he sits on a bench
A child in a giraffe costume and a child in shorts cooking in a kitchen on chairs.
A shelf filled with organic mango peach juice, bananas, oranges and eggs.
A dog celebrating its birthday with a cake.
a couple people on the beach flying a kite.
a bowl of fruit in black and white.
Some traffic signs in front of a church.
A small brown monkey sitting down while holding a banana.
A woman cutting a cake with a knife.
A young boy holding a toothbrush and toothpaste getting ready to brush his teeth.
A kitchen with a black automatic dishwasher next to a  doorway.
there is a farmer market with lots of fruits
The white cat is sitting underneath an umbrella
a young boy standing in a living room holding a wii controller
a train on a track near many trees with a sky background
A van is pulled up to a boat docking area while a cow stands alongside the signs.
A person sits with their feet up with a boxed pizza.
A boy sitting down with a shoe in his hand.
A silver train traveling down train tacks near other trains.
A demonic looking life like doll sitting on a bed next to pile of human skulls.
Two teddy bears, one a police officer bear sitting in the lap of the other, a white bear, both of them sitting on a wooden chair.
A wide view of the patrons of a large library.
A picture of a trolley that is on some train tracks.
Two male chefs cooking in a kitchen while another staff member uses a mobile phone.
a man riding a motorcycle down a city street with luggage and a sleeping bag attached
A cat that has curled up in a bowl.
FOUR SHEEP IN AN ENCLOSURE WITH SNOW AROUND THEM
Basil, cheese, tomatoes and bread on a plate.
The elephant is an extremely large animal.It has a bug tusk.
A couple of men that are standing near luggage.
A man laying on a blue couch in a living room under  mirror.
A glass vase of yellow daffodils sits on a checkered table cloth.
A kitten is eating cat food from its dish.
A man is holding a tennis racquet and hitting the ball.
An open refrigerator door with very little contents.
Two small brown sheep in a fenced in pen
A man rides a skate ramp on his skateboard.
A sink in the bathroom next to an open toilet.
A post with several street signs on it, including the name.
A teddy bear and another stuffed animal next to bookshelves.
The man who uses this bathroom shaved this morning
A man wearing sunglasses talking on a cell phone.
a man is standing and holding a controller
A man looks somewhat blurry on bike as others look on.
a person riding a race bike doing a trick
Four zebras drinking water in a sandy field.
A long desk area with a desktop computer at one end and a laptop computer and Wii video game system on the other end.
An old Gothic style church with a clock in the tower.
A rhododendron bush is in full bloom beside a park bench.
A lot of colorful umbrellas lay out on the grass.
A young man sitting on a couch using a laptop computer.
a desk with many laptops a monitor and a mouse
A woman standing in front of a cabin in the snow.
a bed with two tables a purse and books stacked in front of
There are two red and white street signs that show directions
A white and green bus on road next to a car.
The woman runs to hit the tennis ball coming towards her.
Five delivery bicycles are parked aligned along the wall.
A herd of horses in a grassy field near a hill top.
An elephant statue sitting in front of a clock.
A food combo has noodles, cabbage, eggs and meat.
A train sits on tracks near power lines  and a street sign.
A group of people at a long table eating dinner together.
A couple of people playing a video game with remote controllers.
A boy is putting peanut butter on a sandwich
a person in a living room with a emote control
A blue and white plate with ham and vegetables on it.
The kitchen counter has a cutting board with chopped vegetables on it.
A bus driving through traffic in a city with skyscrapers.
a bathroom with shower, toilet, and sink with shelves
A bunch of men standing in a building and one of them is on a cell phone.
A train in the middle of tracks with people.
A group of snowboarders riding in the white snow
A skier makes a jump on a very steep hill.
Baseball memorabilia is displayed in glass stacked casings.
A teddy bear is sitting on the rail of a wire fence.
A cable car in front of a tall building.
A little girl sitting at a table with lots of food.
A dog sits and stares at the TV.
A person that is playing in a tennis game.
A young women in wet suits carrying surfboards.
A man riding a snowboard down a snow covered slope.
The skateboarder is about to perform a trick at the cones.
Three giraffes in an outdoor setting with one giraffe drooling.
A cat sitting on a couch , with a shirt covering it.
Iguana eating fruit in fruit stand not intended for him.
These young grey hours are playing Frisbee with their owner
A man holding a container of two hotdogs.
a messy kitchen counter and sink covered with dirty bowls and other cooking ingredients
A compact kitchen with white appliances and shelving units for storage.
A pile of luggage at a transportation hub.
Herd of happy zebras in a field of grass
A lot of flowers that are by a walk way.
A man and a woman eating lunch at a restaurant.
A glass vase with flowers resting on a grave.
A quiet highway with a street sign up ahead.
The room in the house needs to be picked up.
Two people with their arms wrapped around each other sitting on a bench.
a purple mug is next to a bowl
There are some men playing a game of baseball.
Two giraffes stand in their enclosure at the zoo.
A large clock on a pole on a street.
A red fishing boat floating on the water.
A group of three men riding in the snow.
A woman having fun with a baby elephant
A woman showing her hot dog to the camera.
There is a cat walking along the edge of a sink
A large herd of cattle is in a field.
Two men skiing across a snow covered slope.
a woman leaning on a counter poses for a picture
two zebra standing next to each other while one kisses the other in forest field.
there is a bench under a very large tree
A lady puts a frisbee in a frisbee goal.
A small blue car that has been hit by a  city bus
A grouping of bananas and other fruits against a wall.
a group of men play a game of frisbee in a park
A group of people in a park flying kites.
A semi truck parked at a rest stop.
a close up of uncooked pizza on a surface
A variety of kitchen utensils hanging from a peg board.
a cat is sitting in front of a television
a sink with soap a towel rack and a towel
An open marina with boats on both sides
A hanging traffic light at an intersection with another traffic light visible in the distance.
a bedroom with a big bed, and a lamp.
A toilet and sink side by side in a bathroom and a mirror.
A train parked inside of a train station next to a loading platform.
A linden tree overlooks a park bench on the banks of a lake.
A blue counter top with lots of pairs of scissors on them.
The cat is playing with the shoes on the floor.
The bulldog has a mean look and is protecting his home.
A couple of street signs hanging from the side of a pole.
A very nice motorcycle in a drive way.
a baseball player is swinging his bat at a ball
Two skiers race while a crowd looks on.
some people a stool a counter some lights and bottles
An intersection with a crosswalk and street lights.
A red frisbee stuck in a tree at a park.
A black bear is surrounded by black birds on grass.
A person with a lighter lighting candles on a cake.
A man looks a donut hanging from a string.
A women sits in bed with her white dog and she is looking at her cat.
a cow stands in front of tall stacks of hay on a grassy field
A bunch of shirtless dudes walk down a road
a bathtub with bed behind it and big window.
Someone looking out their window at vehicles on the street.
A living area with a  coffee table with food on it.
A kid and an adult are flying a kite.
a small brown and white bird sitting on a branch
couple sitting with a dog wearing a cowboy hat
Four young men sitting on a bench with four skateboards.
A woman and two men on the beach with surfboards.
Young lady with her legs in the air laying on a bed in a room.
A person sitting in a chair watching a computer screen while playing a guitar.
Kitchen with wooden cabinets and a center island.
Several boats are docked along the side of a river.
A brown and white dog laying on a floor.
A Safeway truck that carries merchandise for the stores.
Two zebras are standing in the shade of a building
The people are posing for a photo out of an airplane.
A colorful bird sitting on a branch full of leaves.
A bathroom with a toilet next to a sink.
A boat that is sitting in the water with a sail.
A man riding a blue motorcycle on the road.
A bedroom with a large, unmade be, a ceiling fan and other bedroom items
a giraffe in its pen and two people are feeding it
A woman is standing on a tennis court and holding a racket.
The glass bowl holds a broccoli noodle dish.
A red fire hydrant surrounded by yellow flowers and grass.
A person with a ring smiling holding a object.
A man and woman playing tennis on an asphalt court.
A red fancy bus is parked by a standing man.
Four umbrellas lying down a beach during the day.
a white bathroom with a urinal and two framed pictures of clowns
A lady is playing doubles tennis with a man.
Group of zebras standing on a dirt field together.
A salad that contains broccoli and oranges in a blue bowl.
A small dog sitting on the back of a cow.
An animal is covering up the keyboard with it's long tail.
A man skiing with a dog close to him
A colorful plate of vegetables, fruit and beans
a grey cat sitting on top of a couple of plants
A man making a vase on a pottery wheel.
blue and white working truck sitting on the street
A black and white image of a baseball game.
A cat laying on a pink couch with a large brown hat on
A man standing on skis next to a sign.
A cat in a room with an assortment of luggage.
A pretty young woman sitting at a desk working on a desktop computer.
Person on the beach flying a black and red kite.
A woman sitting at a restaurant getting ready to eat her food.
The man and the dog walk near tall stacks of plastic chairs.
Carrots and dressing on a plate with some yogurt.
A giraffe in a pen looks down towards the ground.
A plane sitting on a runway in the middle of the day.
Three mountain goats on a rock with grass around it.
A man wearing jeans sitting on a parked motorcycle.
This is an image of three children with play phones.
A person riding on the back of a white horse.
a cat sitting between a window and security bar
A bunch of different types of doughnuts together.
A couple of air planes flying through a blue sky.
A train's bathroom with a sink and a toilet.
A large group of skiers waiting in a formation.
a coffee maker is sitting on a marble counter top
A young man tossing a Frisbee in a  park.
a fridge sits in a kitchen next to a door
The bathroom has a sink, toilet, and a shower.
a bench in a field  looking at snowcapped mountains.
Two roosters walking next to a fence, near a fire hydrant.
Little children on a field playing soccer in a park.
a woman on a train holds up her camera to take a picture of something outside the window
Three boats in the green and blue water.
Close up of a street sign in front of a water tower.
Woman in a jersey standing next to a large elephant.
A man hosing a dog off while talking on the phone.
A red stop sign sitting next to a street sign.
A yellow vespa parked in a lot with other cars.
A meal from Japan or China on a tray.
A man standing near the ocean with his surf board
A chair and a clock attached to the side of a building.
A person on a blue snowboard going sledding between trees
Snow boarder riding during the night over a fence.
A train moving through the station with a man on the bench.
A little boy is standing on a refrigerator shelf.
A woman is sitting at her jewelry display and talking on the phone.
Fresh flowers and produce sitting on a counter top.
A group of people sitting at different tables.
A brown teddy bear holding three pizza boxes.
The Big Ben clock tower towering over the city of London
A woman on a court with a tennis racket.
A group of sheep gathered together standing next to a donkey .
A young person on skis flying high through the air.
A large bed sitting inside of a bedroom next to a  lamp.
A keyboard, computer screen and mouse are on a table.
A teddy bear in a chair dressed in clothes
a cat that is standing in front of a person
A living room with a Christmas tree couchs and a black dog.
A couple of cops riding on the back of motorcycles.
A woman in a skirt is side saddling on a horse.
lots of snow on the ground and the ocean is ahead.
Small boy smiling with his head tilted to the side.
A heard of animals in a field approaching the water.
A large ship is on the water near docked small boats.
A decorated Chinese vase on a side board.
A HERD OF GIRAFFES STANDING AND LYING UNDER THE TREES
Many birds gather in the middle of buildings
A dog is approaching a statue of a white bull.
A baby wears sunglasses and plays with a pink suitcase.
A group of people with drinks watching a game be played.
A frisbee barely hits the surface at a lake
A man and little girl are sitting on a bench in front of an airplane.
A wine glass set on a counter of a kitchen area with a reflection of the kitchen in the wine glass.
An airplane outside of buildings near people sitting in chairs.
A dimly lit remote control and image on screen
An adult bear and three babies cross a road
A small bathroom stall has a maroon toilet rug.
A man in a burgundy shirt playing Wii bowling.
A dog is looking out the window of a car.
Trainer shows man his elephant in tropical setting.
A person riding a moto bike in the mud.
A gas stove in a small simple kitchen.
A person holding some some of electronic device.
A cat that is sitting on a dogs back.
Two cats cuddle on the chair in the living room
The desert cake is frosted in two shades of pin, and topped with fancy frosting flowers.
a laptop besides an alarm clock maroon in color
A bedroom with two beds sitting under four framed pictures.
A dog lowers its head to the ground
A person holding a snow board in the mountains.
A couple of cows standing next to a building.
A group of zebras on a grassy plain.
two dogs laying beside each other on a couch
A man is getting a haircut while another man sits.
Group of three players in a baseball game.
A group of jet perform in the sky.
A man flying through the air on top of a skateboard.
This shack has a small table to the left, a stove in the back, and a counter top on the right.
a man sits against a wall with punk accessories
A man in a safety vest standing next to water hoses
A display case in a bakery filled with lots of dessert.
A tall building with a clock embedded at its top.
this paper plat has the word cat and a cat drawn on it
A woman has an apron and head scarf while touching carrots at a produce market.
A mother and baby zebra standing in their enclosure.
A person on the water flying a kite.
A person with their feet on a desk with a plate of pizza and a can of soda.
A fork holding a pink food item on an upside down plate.
A bed is in the middle of a well lit room
A woman in glasses is sitting on a butterfly bench.
A guy and a boy on a motorcycle with a side car.
A man is looking down at a small cake.
A man on a pink and blue bicycle on a crosswalk in a city.
A bird sits on a car's rear view mirror.
A close up image of a giraffes face while eating.
there is a small baby that is holding a small racket
A pile of debris in front of a purple and red building.
a few ladies are playing tennis at school
Two cats are crouched in the refrigerator, among food.
A vintage photo taken of a street sign on a dusty road.
This piece of paper has three hot dogs on it.
A group of people at a wine tasting with a variety of wines.
two bears touching noses standing on rocks
Three people sit under umbrellas at the beach.
a number of sadnwiches and wine on a cloth near a body of water
Table of sampled chocolate cake and ice cream on a table.
A person walking over to a black and yellow kite in the park.
A pan with a crust filled with raw broccoli, carrots and cheese.
A table with a bunch of kids tools sitting on it and other items.
Hotdogs cooking on a commercial grill with condiments nearby
Bottles of infused oil and a glass vase full of glass flowers
A large white cruise ship sitting in a harbor.
a stop sign and a pole in a dark knight
A left hand holding a partially eaten, pink, iced donut
A red fire hydrant in front of a building.
A woman in a bikini standing next to a man on the beach.
The action during baseball as the pitcher throws
Several baby and parent giraffes sitting around a cut down tree.
A black and white photo of two female skiers in a mountainous landscape.
A woman standing in front of a table of baked goods.
A man is cutting an onion on the cutting board
A man filming a women holding a microphone on a street corner.
A man with a red hat, tie and white shirt
A baby laying on a colorful quilt with a bib around his neck and a string in his mouth.
A living room opens up into a kitchen.
A large herd of sheep are grazing in a field.
A young boy posing in a baseball uniform.
There is a woman standing n a field around kites
a batter holding a bat waiting for a ball to come
Snowboarder performing trick on snow with trees in background
A man winding up with a frisbee on a court.
A women in military uniform who is giving a cow a shot.
A lady is touching her lip, holding her purse, on the bench.
A young boy eating breakfast in bed.
The space shuttle ridding "piggy-back" on a NASA 747 airplane.
Woman in a white shirt laying in bed looking at a laptop.
a stop sign that has some signs on top
A herd of sheep make their way down a rural path.
A man plowing with oxen on a dirt road.
Duck leaning forward towards a body of water from a concrete footing.
A fire hydrant is covered with graffiti and spray paint as it stands in front of colorful building in the background.
A large white building on the corner of a street .
A view of a woman sitting on a chair with a guitar blocking her face.
A stop sign has been amended with "driving" bumper sticker.
a church with a tower with clocks on the top of it
A dessert is sitting on a small dessert plate.
Table set with black and white dishes with a scissors and dotted line motif.
Picture of a plate of food and a drink.
There is a lot of traffic outside because of the fire truck
A picture of an airplane that is sitting at a terminal
A street sign on top of a stop sign outdoors.
A living room with a glass coffee table, couch and television.
The clock has many different measurements on it.
A kitchen with three tall bar stools next to an island.
a bathroom with two toilets and a bunch of toilet paper
a few people that are walking down the street with some umbrellas
Three birds flying high in the overcast sky
The back of a woman's head in church.
A fancy bathroom with a stand up shower.
A horse pokes his head over the metal railing.
An empty bathroom with white tile and a large mirror
A guy lying in bed with a bag of munchies and holding a game controller in his hand.
Two girl holding tennis rackets on the court.
A man in a suit a and tie with a umbrella.
A white fire hydrant sitting on a street corner with a face painted on it.
A man is sitting on a one wheeled bicycle next to a smoothie.
A herd of cattle standing on top of a grass field.
A group of people that are sitting in the grass.
A group of people sitting around a table with food.
A newly remodeled kitchen with stainless steel appliances.
a close up of an elephant walking on a dirt ground
A couple is walking by a store front windo
An older man takes a pizza out of the oven.
A bunch of pans that are hanging on the wall.
a man is riding a board at the beach
A man wearing a wetsuit in the water on a surfboard.
Two elephants are locking trunks with each other.
A reflection of a kitchen microwave and cabinets.
A bathroom is adorned with a quilt pattern-inspired floor and walls.
A person in a hooded jacket is near a transit bus.
A person is holding an umbrella in a snowstorm.
a person wearing gloves kneeled down in front of a toilet
a wash room with toilet and wash basin are seen.
A long white paddle boat with people riding on top of it.
A somewhat dark image of a laptop sitting in the background of a bedroom.
Two men at the beach one of which is holding a surfboard and a para sail.
a ferry boat and a jet flying over head
A computer and a laptop sitting next to one another.
An aircraft is releasing a red substance below them.
A box of pizza that is opened has tomatoes, cheese, and spinach on top.
A luscious desert tray to satisfy all tastes.
A train on the tracks is parked while people board.
woman in long, light red dress with orange umbrella.
A row of motorcycles posed on a floor next to a flag.
A white bird with it's wing extended floating in the air.
A group of people sitting around a living room together.
A picture of a man and woman on the screen of a lap top computer.
A worker in front of a kiln holding a vase.
An older man drinking white wine from a glass.
Clock tower and official buildings on the other side of the river.
A white cup holding a tooth brush on top of an orange table.
A lone polar bear walking across a frozen landscape.
A coin meter next to a trash can on the sidewalk.
many people riding horse drawn carriages with umbrellas
A group of surfboards and people at a beach festival.
A bunch of birds in the air flying with kites.
a basket is behind a brown bicycle seat
A very small plant is inside a cup.
A busy street is blocked by a crane truck while a construction worker walks by.
A family sits on the gravel of a beach flying a kite.
Two people enjoying a picnic by a river.
Three zebra standing next to each other on a lush green field.
A dark room lit only by one lamp and a computer screen
Guy riding his  gold motorcycle giving a signal.
a young girl standing above a teddy bear taped to a chair
A Pacific National train is stopped at the station
A black and white dog laying on top of a pink and black frisbee.
a close up of a plate of food on a table
Young boy knocking over his t-ball stand in the backyard
A sparse room with a bed sitting in the corner.
a yellow fire hydrant standing behind the tall grass
Someone is trying to eat a slice of vegetable pizza with a knife and fork.
A professional motorcycle rider leaning into a curve.
A group of men on a field playing baseball.
two shots of a man climbing stairs, then jumping down them with a skateboard
a big plane flies through the blue sky
Three people sit around a table eating a meal.
A herd of four zebras in an open field.
A desk with a computer, printer and other various items.
A small blue and white gazebo sitting underneath a lush green tree.
A pink double decker bus driving down a street.
A dog is shown in a car rear view mirror.
A skateboarder is using a ramp to jump into the air.
A jockey with his horse and dog standing in a field.
A metal bowl filled with oranges and tomatoes.
Traditional narrow boats on a river with fruit and people.
A bathroom with two sinks and a large mirror.
there are two men playing Frisbee one is jumping in the air to get it
A batter prepares to hit a ball in a professional baseball game.
A large black train on a track with steam coming out.
Three people holding wine glasses in a bar.
A man is jumping and doing a skateboard trick.
They have a variety of pizzas to choose from.
A street with people in cars and bikes is shown.
a bus driving down a street with people seated on the roof of the bus.
a vintage photo of some people getting ready to cross a street
a dog laying on a bed with a stuffed animal
People are shopping at a farmers market on the street.
Children standing in the grass on a field.
A cat that is laying on the back of a chair and sleeping.
A white sink and towels in a room.
A city bus is slowly making its way down a very crowded street.
A sport team is posing in a park.
A man and woman are playing doubles in a tennis match.
People laying in the sun on the beach on a sunny day
Two hot dogs covered in toppings on a blue tray.
A person on a skateboard does an air trick.
A picture of a bunkbed that is very clean.
A lady in a bath robe touching something near the ceiling.
A couple of boats parked on top of a beach.
A group of sheep eating grass on a very sunny day.
A man holds a large hot dog and hamburger
A couple of people standing in a room with remotes.
A large group of people sitting on the ground.
A bed that has been made in a small room.
A woman taking a swing at a tennis ball
A silver microwave oven sits near a wooden cabinet that has a silver handle.
A seagull is standing on a ledge and one is flying across a river that is flowing.
a woman stands in a bathroom blow drying her hair
A group of boats on a body of water with clock tower in the background.
A group of young women standing around in a half circle holding tennis racquet.
A bird is chirping out of its nest.
A steer and a baby brown cow staring into the camera.
a bell tower with a clock face on it
One woman leaps to hit a tennis ball while her teammate guards the net
A bus driving down a street next to buildings.
A television screen that has a video on it.
A group of people standing and sitting on the sidewalk, watching a parade with horses.
A really nice hotel room with a gorgeous view.
A herd of zebra standing below a tall hillside.
Two ladies are sitting on their laptops at the table and one of them is on their phone.
A photograph of a train traveling down some tracks.
Items are laying on a long table in a narrow kitchen.
A desert that has some Oreo cookies crumbled on top of it.
A boat with a man fishing on it on a lake.
A oneworld passenger plane taking off from an airport.
A group of people running and being sprayed by a fire hydrant.
A couple of buses parked in front of a two story home.
A woman is taking a picture of herself in a mirror.
A young man with acne holds up his necktie.
Two one way signs are on the same pole as a stop light.
A woman sitting next to an older man holding a Nintendo Wii game controller.
a vase and flowers are sitting on a table
A vase full of some yellow flowers sets on top of a counter.
A large Banana tree on an island near the beach.
A train traveling through a jungle next to a  bridge.
A row of seats have closed off a stairwell.
The view of a large kitchen with a breakfast bar and stools.
A man flying through the air while riding a skateboard.
A little blonde girl standing in front of a fridge.
A cow and calf sitting on the ground.
three people standing wearing umbrella hats near one another
A large airplane flying through a gray cloudy sky.
A picture of a building and some grass.
A pizza cut into 8 pieces on a pizza pan.
Two elephants walking in the dirt near water.
Adult giraffe with offspring in structured zoo enclosure.
A card showing the right position to ride a horse.
A donut with white and brown swirled frosting.
A man is doing a trick on a skateboard.
a tray holding three plates of food including vegetables and fruit
Two cameras on a pole near a stoplight.
A train is on the tracks in a country area.
A city street with a fire truck, school bus and taxis.
a kitchen with a double sink a refrigerator and a counter top
A boy cutting a piece of paper at a table.
a man doing a trick with his skate board
A brightly colored train and a santa clause.
AN ADULT BEAR IS STANDING IN THE FIELD
The view from the inside of a large clock tower with several people and bells inside.
a modern looking bathroom with solid wood paneling
A bus driving down the road with several other cars.
A messy bed with many books on top of it
Skier on top of a mountain admiring view as sun rises.
A group of people in the woods holding up clocks.
Two women and a pink umbrella riding a bicycle down the street.
A boy is waiting by a train and train tracks
A front view of a street stop sign.
A group of skiers trekking a mountain in snow
A man sleeping under a book bag on a floor.
there are many bike riders racing in a street race
An infant sitting on a table with a pink cake and pink decorations
a green plate of food with a fork.
a man is using a banana as a smiling mouth
A dog looking around while standing in a window.
A hotdog with mustard put on it by a mustard bottle hanging upside down.
There are many different vegetables grouped together here.
Umpire makes a signal during a baseball game.
A kitchen with all white cupboards and appliances.
A person in a yellow shirt is standing on a long holding a water ski.
A parking meter on the side of the road is covered in snow.
A couple of men wearing uniforms playing a game of baseball.
A white horse and a black horse standing in a field eating grass
Young boy gets ready to kick a ball.
An old suitcase with several worn stickers on it.
a man with a beard  is holding some food and some people walking
A brick oven with pizza baking inside next to fire.
Teenagers siting on crates are gathered around a small campfire.
A piece of newspaper holding bananas with drawings on them.
a large bacon, spinach and cheese pizza with a large crust
Male and female rams climb search for food on the side of a snowy hill.
a tall clock tower with a sky background
A clock that is hanging on a wall above a window.
Skiers skiing down a snow covered ski slope.
a white plate with some broccoli and some noodles
a man playing with his kids with a kite
A bright orange and yellow engine pulls this train.
A large blue and white airplane on the ground.
A cupcake with frosting  and a star on top
Two young man playing soccer together on the field.
a large herd of horses standing in a field eating the grass
a man on a skate board grinds on a ramp
A smiling blond haired little girl is hugging a teddy bear.
A diamond shaped sign is sitting in the middle of the street as cars are riding on the side.
Three giraffes eating in a heavily shrubbed area.
A skier in a red jacket walking along a snowy forest.
The surfboard is painted in grey and pink splatters.
A skier in the air over a jump.
A couch on a trailer hooked to a bicycle.
A small bed next do a daybed and coffee table.
A meal sits on a table next to the ocean.
Three young women hanging out on a bed.
People standing near luggage placed on the floor.
An Equestrian jumping their horse over a white jump.
A tusked elephant is walking among the greenery.
A giraffes head peaking over bushes and trees.
A team of ultimate frisbee players jump for the frisbee.
A man with beard and tie on a subway car.
Male skateboarder displaying leaping ability over steps with handrails.
female surfer walking carrying surfboard on her side
One slice of pizza let with toppings on a pan.
A man and woman posing for a picture.
Many people are gathered to shop and eat.
Young girl having a meal in outdoor setting.
A man with blue jersey holding a baseball bat.
A mouse that is sitting next to a keyboard.
A man an woman are sitting under an umbrella on a park bench.
A mauve colored toilet bowl on the sidewalk
A girl sits on top of a bouncy house texting on a phone.
A man taking a turkey out of the oven
a locker with some books and school supplies in it
A room with a couch, bookcase, and flat-screen television
a toothbrush is laying on a white sink
A small child sits on the floor and watches tv.
Two young children are playing a video game.
Two dogs looking at some fenced in white cows.
A cat sits looking out of a window.
An airplane with people under the wings at a field.
A baseball player signing a baseball bat for a fan.
Five old fashioned looking airplanes in formation in the sky.
A pizza is shown on a plate with a serving knife.
An elephant in a fenced off area under a shaded tent.
A brown purse is sitting on a green bench.
a desk with a laptop and a monitor sitting next to it
A salad that has a white dressing on it.
A bayside cafe with piers and boats in the water
Two kids that are standing in a living room.
A man walks around with two sheep on leashes.
a man sitting alone on a black bench
a couple of small figures of a man and a horse
Ah, look at these sumptuous desserts under glass.
A woman that is sitting in front of a cake.
A living room with expensive furniture and a large window.
A woman in a bikini showing a type of food
A man holding a baseball bat in front of a catcher and umpire.
a small cat and small dog looking in the opposite direction.
A slice of pizza sits on top of a plate.
The back of a semi truck on the freeway.
A group of people that are standing in the snow.
Some people are sitting and playing Wii in a family room.
A small dog sitting on the ground at some ones feet.
A red city bus parked on the street
A woman standing near a large green pillar with a clock on it
Two red street lights that are on a wire.
Two plates filled with lots of hot dogs on buns.
A guy's hat falls off as he plays tennis
A women on ski's going through the air .
A group of  people gathered around a laptop computer.
An airplane that has just taken off into the sky.
a bathtub with a small shelf above it
Girl standing with a Wii controller in her hand
A space shuttle is parked in a museum while visitors look around.
A mother carries a dish to the sink, and a young man carries a beer bottle toward a counter, as a young girl looks on.
One kite flying and two stuck in a tree.
A series of photographs about dinner at a skyscraper restaurant
A woman holding a baby and sitting next to a dog.
A tennis player's feet and shadow on a court made of clay.
A red velvet cake next to an alcoholic drink.
A couple of giraffe standing next to each other.
a building with art work and a sidewlak with afire hydrant on it
a cake that is less then half on a plate
This living room is large and has a glass sliding door
Two zebras and a giraffe in a dirt and rock covered area in front of a muddy pond.
Men on a horseback at a polo competition.
The vegetables are sitting in the white bowl.
Bear behind fence of enclosure as official inspects him.
A horse grazes on grass in the shadow of a mountain.
Guy patiently waits on his surfboard for the best wave
Adults shopping in produce section of grocery market.
A young man skating boarding on a half-pipe.
A small metal bowl holding an orange flower on purple sheet.
A dingy with some pigeons on it in the water
A sign on the side of a building on a street.
A happy little girl lies in bed with a stuffed bunny.
This man is holding a breadstick and a bun.
A Juicerator sits on a counter and dispenses a yellow juice.
A crush soda on a white back ground with orange halves.
a close up of a motorcycle license plate
A close-up of a table with three boxes of pizza.
A cow and a bull walking down a skinny alley.
Two orange and silver trains passing on a street.
Five chocolate donuts and three unfrosted ones and a Canadian penny sits on a blue pokadot cloth.
A pole with multiple traffic signs near trees and bushes.
Group of men in white shirts and white hats holding tennis balls and tennis rackets.
A bride and groom on their cell phones.
A pizza sitting on top of a wooden table.
A couple of girls sitting in a bed in a bedroom.
a brown and white owl and some green bushes
An upscale bathroom sunken tub with chandelier above.
A man smiles as he holds a baseball bat in an historic photo.
Two zebra walking past a grassy forest in the daytime.
Two chairs and a glass table sitting in the middle of a well put together room.
A room that is divided by pillars has two overstuffed chairs, coffee table, piano, a table with flowers in a vase.
A toilet is in a small room with windows.
Three elephants are on a dirt road.
A white bowl filled with soup sitting on top of a counter.
A woman holding a skateboard on the sidewalk.
A mouse and a computer sit on top of a wooden desk.
Two people are walking in the shore of the ocean.
A clock is shown on the side of a sidewalk.
A man stands in a tree with an umbrella, observing birds,
Man ironically holding up holes of scissors to eyes
a short woman helping a tall man fix his collar
Five birthday cakes all in different and unique shapes for kids.
Small boy holding a bat above his head on a cobblestone street.
Pedestrians walking underneath a traffic light by a city road.
A brown and white cow standing next to a tub.
There is a plate of broccoli  and vegetables
A woman rides a horse quickly around barrels.
a person in a living room watching a television
A black and white small dog sitting on a  foot stool.
A man in a kitchen prepping a tray of food.
A vase filled with flowers next to bottles of wine.
The stripes on the zebra almost disappear on its legs.
A wooden table holding a white laptop and glass of wine.
A teddy bear on a table and some red jello desserts
Two zookeepers feeding two giraffes in a zoo.
a living room with two chairs and a tv
A large boat floating on top of a large body of water.
A make shift office space in a bedroom.
A large cock sitting in the middle of a street.
Three boys look on at a little league baseball game.
Four people are on a bench next to a store.
a large herd of sheep walking down a dirt road.
A fire hydrant next to a sign of a fire hydrant.
Kitchen corner with refrigeratorfreezer and microwave next to an open closet door
A person in a police uniform sitting on a motorcycle.
A white terrier dog on a leash with a brown spot on his eye.
A skateboard is skating down the sidewalk on his skate board.
A computer desk with a turned on computer in front of a book rack.
A man taking a bite of a large piece of chocolate cake.
A group of people watching a black cow eat from a blue pot.
A small silver and red airplane sitting on the ground.
A pigeon on a brick street under a park bench.
A few people are getting ready to ski.
A large tiled  bathroom with glass sliding doors
A crowd is watching a man on snow skis.
Some food is about to be served for a meal.
several boats docked at a marina with clear water
a lady and her dog on a paddle boat he dog as a life jacket on and hey are happy
THERE IS A YELLOW FIRE HYDRANT THAT IS ON THE GROUND WITH A BLUE CAP
A man with a surfboard about to go surfing.
A cowboy sitting on a horse at a festival.
a bunch of hot dogs that are in a bowl
Two guys that are sitting on horses in the dirt.
Two birds are perched upon a snowy bank.
A bathroom with white fixtures and tiled floor
there is a piece of chocolate cake on a paper plate
a little dog trying to pick up a Frisbee
A skateboarder is doing a trick in the air.
A horse figure is on a snowy track
A person who is holding a hotdog in a napkin.
A horse and a dog positions for a picture outside.
a mom and her son eating at a restaurant
A white outhouse toilet sitting inside of a stall.
A man and a child on a ski containing a seat.
A black bear crossing a road as a bus draws near.
A dude in shorts playing baseball with a bat.
A container that has a bratwurst in it.
A do not enter sign sitting on the side of a road.
A group of jets that are flying in the air.
A Continental airplane is waiting for takeoff at the airport.
A man prepares to fly a kite in a grassy area.
A woman in black jacket holding skis next to trees.
Three middle age men looking at a piece of machinery.
I bet he will finish this entire meal in no time at all.
A man power sliding on a long board
Players and a referee playing on a football field.
A motorcycle rally is attended by numerous riders.
a collection of lemons, limes and oranges in front of books and a mug
a computer desk area with electronic devices on it
a white purple and red double decker bus and some buildings
A group of people at a park flying kites.
A close up image of a bike gear and chain.
Two indians with pony tails are with some horses.
Chocolate and caramel sauces are on a tray with sliced bananas and strawberries.
A train driving past mural of working men while billowing smoke.
A plate with a large pancake cut in half.
A white pickup trucking is lacking doors, bumpers, grill and one headlight.
A photo taken outside a restaurant with tables and chairs.
a male in a blue shirt is playing ping pong
A little red-haired boy standing in front of the refrigerator.
A computer is shown with a keyboard and a mouse.
A bathroom sink at a hotel with the usual amenities on the counter
A bunch of people stand in front of a car and next to the nose of an airplane on the tarmac.
A green and white bus on street next to dirt area.
A person riding a wave on top of a surfboard.
A person flying a kite in the snow
An elephant standing on top of a wooden stool.
Many people and a dog under an umbrella on a beach.
The vase is holding the budding flowers on the table.
A kitchen scene with wood floors and wood style cabinets.
Blue umbrella on picnic table in front of food truck
The skier is carefully descending a snowy slope.
A little girl riding on top of a skateboard in the street.
A very sexy woman laying on top of a bd wearing fish next stockings.
The animals look to be walking a one direction.
This is a train on the tracks that is filled with doors for houses.
A cat is lying on a cushion on a couch.
a man is on the court playing tennis outside
A tennis player striking the tennis ball for his next shot.
A man walks down the road with some cows.
Crates used as tables, full of fresh produce at an outdoor market.
a person in a sweater holding a cake over a paper plate
a large group of children holding their kites
A bus stopped at an intersection in front of a church.
a red fire hydrant is between a couple of poles
A woman in a pink hat looks at her phone in a crowd.
Man posing for a shot wearing a suit and tie and carrying a briefcase.
there is a dresser that has a mirror and many things on it
a double decked bus drives down a city street
Three birds on some rocks near the ocean.
A photo of several bunches of yellow bananas.
A grizzly bear sitting outside in the grass.
a man dressed in riot gear wearing a face mask and holding a red and white umbrella
A refrigerator in a corner of a room.
A group of birds flying over the water looking for food
An alsation dog paddling through some water in front of a building
A child playing with a baseball bat and a ball.
Some leafy trees are hiding a black bear.
A giraffe looking over a fence on a summers day.
An attractive young woman speaking on a cell phone.
A silver and black computer mouse stands next to an open laptop.
The man is standing by a large herd of cows.
A person on skis coming up a snowy path.
Two adults and a child walk on the beach in front of a cruise ship.
A refrigerator covered in pictures and stickers in a kitchen.
Four young men in a sitting area stand looking towards the opposite side of the room.
Plates of food and two glasses of red wine are on a table.
a man and a small boy standing on a tennis court holding tennis racketts
A bird displaying its decorative plumage among some leaves.
A man and woman sitting on a park bench
A GIRAFFE STANDING NEAR TO TWO DEER IN A SEMI-ARID GRASSLAND.
A row of chairs and some umbrella's on a beach near the water.
A man carrying his surf board out of the ocean.
people sitting, walking around and some are in groups
Two people cooking bowls of ramen in a kitchen.
A boy in a plaid shirt holding an umbrella.
A frisbee will be thrown to a girl's dad in time
Kites being flown by a crowd of young children on a cloudy day.
A row of vespas parked next to a bunch of motorcycles
A group of young ladies sitting around a table sharing a meal.
Two elephants in a concrete enclosure at a zoo.
people standing at counters of booths being served
Looking down the length of a city street while cars pass by.
A baseball player hitting a ball on the field.
A person laying on a couch with a cat laying in their arms, covering part of the face.
A counter filled with vases, candles, and fruit.
The entrance for a subway on a city street.
a large crowd of people at an airport terminal
A small kitten is walking on a computer keyboard.
A bowl containing meat, lo-mein noodles and broccoli.
Yellow umbrella stands sitting on a beach with one chair.
Two purple flowers sitting in a green vase.
The bedroom is decorated for a female and includes a breakfast tray.
A young man kicking around a blue soccer ball.
A train is traveling down the tracks in the open field.
A woman holds her hand out to feed a giraffe.
A person who is all bundled up standing in the snow on skis.
A fire silver and red fire hydrant is in the grass near a curb.
Woman in shopping aisle with bear on her head
A woman holding a video game controller is playing games.
The tall tower in the middle is framed by two large buildings.
A man in a yellow jacket washing an elephant.
A man riding up the side of a pink ramp on a snowboard.
A tennis player hitting the tennis ball with the racket.
A small grey and white kitten stands next to a foot.
An airplane is flying high in the sky after taking off.
The car has two different shades on green on it
Two cats are siting right next to each other.
A man sits on a couch in a sitting room with coffee table with an open laptop on it.
A TALL VASE OF FLOWERS IS SITTING IN A WINDOW SEAL
a person holding a cell phone to a gerbil
a huge brown bear standing at the edge of a small hill
Open toilet, basin, and shower stall in compact bathroom.
A bunch of flowers in a clear vase of some sort.
A man looking at a computer game on the counter
A bot stands in front of a bus, while other men look on.
a group of people in a field playing frisbee
An animal leaning against a bare tree relaxing.
A side table with a lamp and books net to a home library.
A surfer in a wet suit carrying a surfboard as he walks into the water.
Man doing a skateboard trick while others casually watch.
A boy is cutting slips of paper with scissors.
a photo of someones living room complete with, bookshelf full of dvds, two leather chairs, a flat screen tv, fireplace, and a overly large decorative clock.
A woman sitting a table holding two hotdogs.
a snowboarder with a blue jacket walking up a hill
Woman and child watching people row in water.
A dour young man sits on a horse.
A room with several types of luggage against a wall next to a mirror.
This bathroom is decorated with wood and has several mirrors
Asian men at a a white board talking with a Samsung sign behind them.
A boy is in a courtyard on a skateboard in the air.
A large yellow dump truck parked and empty.
a close up of a giraffe and people holding a bucket
Animals eating grass in a hill by the ocean.
A banana tree filled with lots of unripe bananas.
The airplane is flying high above many clouds.
Three tall urinals and one short one in a restroom.
This picture shows the details of a red colored skateboard.
a close up of a vase near other vases and a plate
A bus underneath a large crane at a factory
A small airplane is parked on the runway.
People walking in the middle of a snowy street on a campus.
Two adults and one baby elephant walking in the wilderness
A dog sitting on a chair next to a soccer ball.
Two truck cabs facing each other on a road.
A man standing on a surf board with a paddle.
A man flying through the air while riding a snowboard.
A woman is standing over a stove holding a cup.
A flock of birds flying over a light house near the ocean.
A water fountain that has a pigeon perches on it.
people in a boat moving in the deepest place
A person walks a dog with little shoes on.
A group of sheep grazing in the field
A boat on the ocean with a grouping of birds flying around.
A man riding the waves on his surf board.
Plastic containers and a bowl filled with lots of food.
Young man in white playing tennis at a tennis club.
Miniature pizzas and skewered heart shaped pretzel bites.
two boats sitting on the shore close to the water
a cat that is laying down on a couch
A red double decker bus driving down a busy street.
A crowd watches two people at a tennis match.
Church cathedral with decorative arches, marble floors and high vaulted ceilings.
A man with his shirt off, is flying a kite.
A close up shot of a red apple beside an orange.
A man on a dirt bike riding on a dirt road.
A man riding a board on top of a skate park.
A dog is sitting in the back of a pickup truck.
Man painted in gold paint standing next to a horse.
A street scene looking at a clock on a pole.
A train traveling down tracks near a station.
Wooden benches in the middle of a forest.
A building is shown with tables in front of it.
A yellow commuter train traveling through a train station.
A long red train o the side of a field.
Two pieces of cake are arranged on a table
Two kids with joysticks and remotes seated on a couch playing a game
A man in a purple shirt trying to catch a frisbee.
some water boats bushes trees and buildings and a train
A young girl walking on a road carrying an umbrella.
The baseball players are about ready to take the field.
A person showing a selfie of themself to the camera
A couple of judges judging some sheep at a county fair.
a cow in a field on a very foggy day
a living room with two laptops and a tv
A clock between two archways on a castle
A group of men sitting around a laptop at a table.
Several people on motorcycles sitting parked on the road.
Four cats inside a caged in area, two yellow, two not.
A group of people sitting at a table eating.
It's hard to tell if these are tennis players from the thirties, forties, or fifties.
baby in pajama's sitting on the bed playing with an object
Horses are eating grass on a large pasture.
A bowl of vegetables is set next to a blender.
A long freight train crossing on a bridge over the ocean.
An airplane flying with dark and light clouds in the background.
A hot dog with cheese, mayo and a vegetable on it.
A woman stands in front of a neatly made hotel bed.
The woman sits at the table overlooking the pink and white cake with lit candle.
Assortment of shells and soaps displayed on commode with dental care products.
A person flies their kites above people by water.
two sheep next to a wooden structure behind a fence
A train bed with a blue sheet and various items on it.
A pitcher winds up for a throw at a neighborhood baseball field.
A family and a dog playing frisbee very near to the edge of one of the cliff of the Grand Canyon.
cows resting in the shade and relaxing for a moment.
Skateboarders are doing tricks as a crowd watches.
two brown bears on some rocks in their pen
A woman walking down a street in a dress with a bag.
A couple of zebra standing on top of a grass covered field.
Three men stand on a beach watching a kite fly.
many different sinks near one another with mirrors
A woman covering her face sitting next to a man on a log.
A snowboarder decked on in great poses for a picture.
a train moving on a snowy area and besides an ocean
A lady with a dog in the snow waiting to cross the street.
A police officer standing next to his motorcycle after pulling someone over.
A smiley face sitting on top of a wood table made out of fruit.
The bathroom contains a bathtub and shower, toilet and sink.
there is a male skate boarder doing a trick inside of a parking lot
A girl taking a picture of herself in the mirror.
The train is on a railroad track, under a signal light.
A train maintenance vehicle sits on train tracks.
There is a view of a bench and houses down the hill
A man a woman pose on a tennis court.
A couple of pictures of a cat sleeping on a hair brush.
A white toilet sitting next to a white bath tub.
A man and a woman surrounded by people.
There are red benches near the grassy area.
A police officer rides his motorcycle next to the protesters.
The lights and sights of a busy, populated city in Asia.
A group of men and women rowing a boat in the middle of the sea.
A young girl in pink snow gear on a snowboard.
A woman being pulled on her water skis.
An infant girl sitting in a shopping cart.
A women who is holding an odd shaped carrot.
A blue and yellow plate of food that includes rice and beans.
A baby zebra hiding among the tall grass.
A person with a camera taking a picture through a mirror.
A clock mounted on the face of a building next to an eagle statue.
A lady and a baby at a pizza parlor during the day.
A man riding a skateboard in a covered skate park.
A large brown and grey cat sits on top of a desk.
A train rounding a corner on the tracks.
Several crafting items laid out on a white linen.
The red city bus is driving next to a construction truck.
A man holds the string of a kite, as many kites fly in the sky.
The girl is wearing a jacket with fur and has a yellow frisbee.
a woman standing on a tennis court and holding a racket
Blurry shot of man at the intersection of busy street.
people holding a surfboard and walking down the beach
a cat with a big fluffy tail sitting on top of a car tire
Two birds are sitting in their respected area.
A cat standing close to and looking at two geese.
A small house stands in a small constraining carriage.
A fire hydrant spraying acroos an empty street
A person attempts to remove something from a large oven.
a close up of a stop sign with a sky background
Two elephants walking in a dirt field next to trees.
An old train makes its way down the track in the country.
Bunches of bananas hanging from wooden rafters by string.
Giraffes walking in their enclosure at the zoo
Three birds are sitting on the branches of a tree.
A dog that is playing in the snow.
A white SUV parked in front of a train.
A green and white bus traveling down the street
A street sign where there is currently construction.
a child is blowing out the candles on the cake.
A styrofoam plate with cats with noodles on it.
A hawk perches on a tree branch in a forest.
A gray cat is sitting in an empty red suitcase.
A man and woman walking past a fire hydrant.
A stuffed animal laying across the steering wheel.
a red plate a table drinks and a sandwich
A woman is swinging a tennis racket at a ball.
an elephant is eating grass and a bike is nearby
A bunch of surfboards that are on the ground.
A cat is sitting on the seat of a blue motor-bike.
Several men are standing or walking on a soccer field.
A cake that is well decorated with green stuff on it.
two surfers one in a white shirt and water
London Olympic games statistics statue with many tourists and visitors nearby.
a number of motorcycles parked near one another
A produce market displaying racks of fresh fruits and vegetables.
two kayakers enjoy the clear open water
a dog is siting behind a large window
A parking meter in a parking garage that has a lot of cars.
Horse held by two leads in passageway of large stable.
A plate topped with pancakes next to a cup of coffee.
an empty truck parked next to a building
A man wearing an american eagle tie in a suit.
A platter of donuts sits on a wooden surface.
Four men with smiles on their face, in a kitchen.
this is a close up picture of a giraffes head
a red orange double decker bus smoking on the road
a large building with words scrolled across it
The man holds a pig foot next to his mouth.
Paved highway with several cars moving past an exit
A small residential bathroom featuring oddly shaped furnishings.
A glass vase holding a flower on a wood table.
There are several animals in the grassy field.
People and sheep traveling down a long country road.
A man with a surfboard stands in the water.
A red bus is next to a curb and trash bag.
An old, small residential bathroom with blue curtains
A group of friends waering skirts and dressing are walking down the street.
A young girl appears to be enjoying a biscuit of some sort.
An Indian man and woman in the water on the edge of a river.
A dog is chasing along behind a cow in a field.
Several double decker buses driving down the road.
A picture of a bathroom with a large shower.
Two people on a long rowboat in a river or lake.
a shed with giraffes near it behind a fence
A bench sitting by some very pretty assorted plants.
A bath tub sitting next to a white toilet.
a man in a suit glares while standing outside
A metallic refrigerator freezer sitting next to a stove.
A large long train on a steel track.
A giraffe in the brush standing facing away from the camera.
A man standing behind another man helping him with his tie.
People sitting on a curb watch a parade and horses walking in the street.
Two gentlemen doing a show with umbrellas and colorful suits.
Two guys walking and talking in the room.
A woman and a man pass food between their mouths.
A table with a coffee and a salad on it.
Three large kites flying in the sky near the water.
A cat pawing at a television picture of some penguins.
Two women are sitting on a bench outdoors having a conversation.
There are two woman in bathing suits and a cat
A herd of elephants in their natural habitat.
A street is lined with people and buildings.
Man checks wheel on mule drawn cart driven by girl.
Boys standing in front of microphones outside in front of cameras.
Two people walking in the ocean away from a boat.
A pizza with olives is on a plate.
A street sign that is in front of a cemetery.
Two laptops next to each other are open on the desk.
A sign in front of the airplane warns that tobacco is not allowed in the area.
A person holding a blue umbrella in the rain.
A young girl adjusts her pink sunglasses in a park.
A grouping of luggage with tags and luggage trolleys.
A baseball game in progress with the player running the bases.
a dog sitting under a desk with a monitor
Many people ride on surfboards as one man catches a wave.
A big bird stands between a trail and some trees.
A bamboo bench with a backpack sitting on top of it.
Cellular phone displayed on display case with other phones.
The horses have made this patch of ground quite bare.
A HERD OF SHEEP GRAZING ALONG SIDE A HILL.
odd, four street signs on a hill away from the traffic
A pizza sitting on an outdoor table in the sun
two legs a toilet a stall door and white tile
A man holding a child on top of a skateboard.
A motorcycle parked in front of a wall.
A red stop sign mounted to a black pole.
A teddy bear sitting on top of a red plastic basket.
a large crowd of people at the park with some playing with a large kite
A woman tennis player in a black army shirt and tennis skirt, swinging a tennis racket.
People have set up tents near picnic tables on a beach.
many people and line of parked scooters and motorcycles at night
A horse stands near a colonial era stone furnace
Two young women are eating hot dogs while walking down the sidewalk.
A large neon sign at a market square
A little girl is playing with a hair dryer
Toothbrushes sit in holders arranged around a sink.
A young man in a suit, tie and glasses is smiling
Cows are trying to kiss the girls on the arm.
a bathroom with a sink, toilet , and tiled floor
A man wearing a tie, jacket and white shirt.
A girl alone on a beach flying a kite.
A large stack of trunks and luggage on a sidewalk with people behind it.
A painting of a vase with a polka dot gray background.
The plate of food has meat and cooked vegetables.
Tennis player with the teeth of a predator.
The elderly woman uses a video game remote near her companion.
The sleeping child is holding onto a teddy bear.
a little car sitting by a wall with a picture on it
A young blond beautiful woman standing on a tennis court.
A wall with lots of weird things mounted to it's side.
A MAN IS HITTING A TENNIS WITH A RACKET
A man that is on a snowboard that is in the snow.
The cow sticks it's large tongue out of his mouth.
There are street signs and a traffic light at a downtown street.
there is a woman sitting on the ground making food
A woman in a bikini laying under a red umbrella.
A giraffe standing next to a tree that it is chewing on.
Giraffes mill about in their pen at the zoo.
An elephant pokes his nose in the brush.
A large red bus on a city street.
There are two zebras walking side by side.
vintage black and white photo of old motorcycle
A large white bed sitting under two framed pictures.
A small orange train traveling down tracks near a station.
A sandwich is on a long bun in paper wrapping.
A HERD OF SHEEP GATHERED AROUND AN OLD BARN
There is a birthday cake with chocolate icing on the table.
The three teens are talking to each other on the sidewalk
Black and white image of a woman and a man petting a horse.
Three animals are standing near a body of water.
A couple of plates of food on a table.
All ages can have a good time using the Nintendo Wii.
A man in a suit poses for a picture with each of his arms around a boy in a suit.
A little girl on the beach playing with a frisbee.
A keyboard and mouse are sitting on a desk in front of a laptop and monitor.
All of the planes are flying in the same direction.
A white plate topped with three donuts covered in frosting.
a number of baseball players on a field
A personal single engine jet, on the runway
A building seen through a rain and fog covered window.
A man riding a wind sail over a large body of water.
Lots of colorful flower vases hang on a wall.
a woman on a phone is waiting for a bus
A cake with tow layer smothered in white frosting.
Carefully sculpted pieces of wood in a display case.
A black man opens his fridge and looks inside.
An airplane during takeoff ascends into the clouds.
A red light rail trains passes through a station
A little girl is sitting with an umbrella
a man in a suit grabs his head while screaming
Four persons skiing on snow clad mountains and slopes.
a small kid holds on to some balloons
a guy that is skateboarding on some kind of concrete
A person in skis going tightly around a flag.
The desk has a desktop computer and a laptop on it.
a man stands in a kitchen by a table
A person skiing in an open area of snow.
A number of pizzaz sitting on a wood table
The group of skateboarders is headed towards the park.
There are many seagulls standing on the ledge over water
A picture of a man swinging a tennis racket.
a parked air plane sit at a airfield
An old woman getting vegetables from a heavy loaded cart.
A girl looks in the mirror as she brushes her teeth
A wide variety of produce is for sale including apples, pears, and onions.
A commercial district street with a sign pointing where to stop for a crosswalk.
The teddy bear is posed as if he was working out.
A white cake with red designs and two cups next to it.
A horse on its back with a man watching.
a kid on a snow board stands in the snow
There are two males on a vintage red train.
A pizza pie with vegetable toppings and cheese
A bathroom with the light on and a painting hanging over the toilet.
Two suitcases that are sitting on a chair.
A laptop, Furby toy and books on top of a desk.
This is an image of a woman getting her hair styled.
A refrigerator has a note pinned to it with a magnet.
A small weiner dog that is cooling off in the pool.
A young man wearing black does a trick on his skateboard where he is almost parallel to the street.
A man standing in front of a flag holding a plaque.
there are many young men on the field playing soccer
A market area displaying various fruits that include plumbs and pears.
Some flowers in a vase on a table
A lady petting her dog and a man standing on a log
a polar bear on a field near many trees
A man getting ready to hit a tennis ball with a racket on a tennis court.
A man is sitting in front of a desk with a coffee mug.
A man helps his friend fix his tie before a photo shoot
a baseball player swings his bat at a ball
a dog that is on the lap of a women
The two cats are looking out the high window.
A small group of people on the sidewalk with a few holding umbrellas.
A man standing next to a woman in font of a tray of food.
A young boy is holding a baseball mitt in a grassy field.
a man is riding a board in the water
Two doughnuts sit on a plate with drinks surrounding.
The is train cross a bridge over water.
A child sitting at a table with a plate in front of him.
A woman holds a birthday cake as a man lights the candles while another man looks on.
A man in black shirt and apron in a kitchen.
A man standing in the grass flying a kite.
A person riding a skateboard up the side of a wall.
A man in a brown shirt is playing a video game.
A white and blue jet airliner docked at an airport.
A man tossing a frisbee on a lush green field.
The bed has mosquito netting hanging around it.
A man running after frisbees in a wooded area.
A black boat with a dog on it going down the river.
A person flipping a skateboard with his feet in the air
A multi colored dog jumping up to catch a frisbee.
A very big grassy field with a bunch of bats together.
clear vase filled with white and yellow flowers with water
A black sign with directions stands in front of the blue sky.
A dog in a mirror with a person in a room.
Yellow passenger buses ride side by side down a crowded street.
A black and white photo of old cars and a boat, all sitting in front of a lake.
A desktop computer with a note attached to the screen.
Two men have thrown their ties over their shoulders during a meal.
The woman wearing a coat stands near sheep behind a fence.
some pasta in a bowl sits on the table
A man holds up an x-ray and looks at the camera.
A man wearing a backpack pauses to talk on his cell phone.
Two tennis players consult with the referee during a tennis match.
A room with two pictures on the wall and a table with a computer monitor on it.  A wooden floor and a table with a yellow bowl and a grey and white rug.
a person cutting a small cake on a table
A traffic speed limit sign sitting in the middle of a road.
a bunch of cows are in a field
Several zebras eating together in a fenced in area.
A bunch of lumberjacks moving logs in the woods.
Many pots and pans have been hung over a kitchen bar.
A cell phone being held by someone is showing two women on the screen.
A sign indicating the historical site that is the Nathan Hale Homestead.
A completely shattered television lying on the sidewalk.
A very delicious sandwich with black eyed peas on the side.
Two men are sitting side by side as they are eating and smiling, they both are cutting their food with a knife.
Public transportation train with blue front approaching the station
Three people preparing to launch a small boat in a river.
FedEx trucks parked on the side of the street while cars wait in traffic.
a man skat boarding down a concrete pathway
an elephant is standing behind a wooden fence area
We are looking at a crowded city street.
a man standing at a table with wine in a supermarket
A sandwich with a drink and a bag of chips.
the woman is holding a cat with a hat on
A row of fire hydrants sitting on the edge of a road.
A clock stands alongside a busy street at night.
The two skiers are eager for the finish line to come.
The bench is chained to the outside door handle.
there is a woman sitting at a table eating
Two elephants in an animal sanctuary with trees
The back of a car that is pulling up to a stoplight.
A produce stand with a variety of fruits and nuts on display.
This public restroom has no toilet but instead a simple porcelain hole in the floor.  .
A white and black street sing covered in snow next to trees.
A woman holding a pizza box and a paper bag.
Many skiers are traveling along through the snow.
A red, white and blue airplane is high in the clear blue sky.
A group of people in a restaurant eating a meal.
A group of softball bats leaned against each other on a field.
A table with crafting supplies next to a cell phone.
A bird sits on a wire over a street sign.
A bird sits in a tree branch with leaves.
An old Boston baseball player sits while holding his bat.
A man that is inside of an elevator shaft
Inside of a bathroom with a sink and mirror.
A red kitchen with metallic appliances and paintings on the wall.
A man holds a bat at the base.
A man by a book case has a guitar.
Many bananas and apples are on the kitchen counter.
A cat is sitting alone in the middle of a large patio area with a historical building in the background.
A man wearing a yellow and white striped vest and hat
A person holds a bag while walking on train tracks.
A large sandwich with meat, cheese, and vegetables.
A man flying through the air on a skateboard.
Fancy standing clock sitting in a nice setting.
A bed with pillows where the blanket is slightly pulled back.
A woman in a bikini with a surfboard in her hand
Two cats are staring at a light spot on a floor.
A snow boarder boarding down a snow covered mountain.
A young man standing over a pan filled with food.
A women who is eating some food and looking out a window.
A man in skis holding a stuffed animal near a group of other skiers.
People with their faces blurred out play Wii on a mounted TV.
A collage of photos of cats and goats.
A soldier riding on the back of a black horse.
The man is a wet suit is catching a wave.
Street sign advising to turn left for Shanks Avenue
A woman holding a tennis racket swinging at a ball.
Three businessmen who are crossing a city street together
a brown cow standing next some other cows
There is a surfboard sitting next to a car.
There is a dog that is walking on the beach at sun set
And elephant behind a low log fence and someone leaning on the fence, taking a picture in another direction.
A little girl using a laptop on a table
An employee slices a large piece of pizza, pretzels hang bear by
The man on the skateboard and the dog are getting their picture taken.
a large crowd of people is outside a building
A tennis player stands before a net and waves while a camera man films him.
A man playing a game with a remote controller.
A suitcase, sitting on the floor, opened is full of clothes and a curtain is behind it.
Two cats standing under a windowsill with each other.
A person with an umbrella near a building.
A bottle of beer sits next to a gourmet pizza pie.
A guy wearing a blue shirt is skiing.
A man in striped shirt looking into an open refrigerator.
A red stop sign that is on top of a pole.
A bathroom sink with a facet and soap dish and three mirrors that reflect three sides of the sink.
A man that is standing in a kitchen near a bowl.
The front of a city bus rolls down the street.
The bicyclists have formed a train, and are being towed by the city bus.
A small giraffe with its head down, standing next to a tree.
A man in a black wet suit is about to stand on his surf board.
A man standing in front of a clock.
The person has fallen asleep while holding their skateboard.
A street with people walking on it and items on the sides of the street.
A bathtub sits against a wall with a sink and toilet in the foreground.
An expressway with street signs in Chinese.
This is a decorated red velvet cake on a red tablet cloth.
A kitchen counter that has various objects on it.
Two people playing a video game on a projector
THERE IS A MAN THAT IS ON A SKATE BOARD IN THE STREET
A little boy sitting in front of a computer keyboard.
Three birds are lined in a row in a grassy area.
A man wearing a helmet rides a skateboard
A WOMAN CARRYING FOOD ON TOP OF HER HEAD
a close up of a bowl of food with broccoli
A young woman in a gray, long sleeved t-shirt sits on top of a yellow structure looking at her cell phone.
A man holding a dog sitting outside looking down.
A fleet of airplanes rest at their gates at the airport.
Snowboarder displaying aerial tricks in populated urban setting.
A dog is in a living room sitting on the back of a couch.
A clock that is above a pedestrian walk way.
A pair of scissors next to some pieces of paper.
A ripe banana sitting on top of a wooden table.
A woman holding a cat in her arms in a car.
A small white bird standing on top of a dirt field.
a small plate that has some food on it
Sheeps and goats eat food in their pen
A man standing on top of a skateboard.
A man up to hit in the middle of a baseball game
A glider is flying over the beach on a foggy day.
Women walking down the street holding an umbrella
Woman in midst of a Wii activity, holding the remote and smiling.
A clock sits on an iron part with lights above it.
Two gray and white cats laying around a toilet.
A red and white train sitting on the train tracks.
A small inverted airplane flying in the sky.
Double decker bus in front of store on empty street.
Two men in a small living room are playing with the Wii.
Many people prepping large kites on a beach.
A guy in a hat skateboards across a ramp.
A big black bear lays down in a lush green open field
The uncooked pizza has raw tomatoes and lettuce on it.
A man walking on a tennis court with a racket in his hand.
A horse drawn carriage riding past a city trash truck.
A man leads a horse cart carrying four people including two ladies with headscarves.
A car parked in the street next to a parking meter.
Four men in military uniforms are smiling while holding an item next to a table as other people look on.
A black and gray goose standing in the sand
A boy doing a trick in the air on his skateboard
The back of the garbage truck has rotten bananas on the bottom of it.
A man and a woman walking past a bus with an umbrella.
The meal being eaten at the table is on a blue and white plate with spoon, fork and knife.
A cutting board topped with two sandwiches next to drinks..
The sink of a large modern bathroom is full of water.
A pizza with two slices missing from it.
a cat that is laying down on a bed
A hot dog on a bun with an abundance of yellow mustard.
A man beside a valley stands beneath an umbrella in the rain
A woman wearing goggles skies down a large hill.
A person on a snowboard in the snow.
a girl flies a kite near some other people
a female in a white top is playing tennis
The Asian market has a large quantity of pears available as well as other produce.
Looking up at a traffic sign and street light.
A woman looks at her reflection in a handheld mirror.
A man removes food from an oven with hot pads.
Street sign light on a traffic light pole
Adult men standing in living room playing video game.
A white bowl of tangerine slices on a wooden surface.
A pile of veggies next to meat covered in gravy.
A green backpack with a computer mouse poking out.
A person with a pair of scissors about to cut hair.
Dog laying down on the sofa next to a cat.
A cat that is standing on a bench.
Herd of cattle laying on a beach that has people on it
a bowl with liquid flavors in containers lemon orange banana and pineapple
A group of skateboarders riding down a city street
A white plate topped with a sandwich and chips.
A man and his son eating donuts at a restaurant.
A person on  a skate board in mid air by a rail.
Older style single engine airplane being displayed at air show.
There are several people walking in a street parade.
A bird perched on brick ledge with a hole in it.
A beautiful blonde holding a Nintendo Wii controller with another beautiful woman holding another Nintendo Wii controller.
Grey fighter jet, with pilot, on a runway.
A woman is pulling on a man's tie.
A city street with traffic caught in motion at night time.
A man in an inter tube by a boat in a lake.
a man riding the side of a wall with a skateboard
A skateboarder rides on the side of a large pipe.
A dog is looking out of the window of a car.
Cattle are crossing the road to a beach front.
A pair of scissors stabbed onto a wooden counter top.
Two photos of a tennis player rushing to hit a ball.
A flatbed truck carrying the remains of a crashed light airplane.
A black and white dog carrying a frisbee in a field
A person is laying tennis with racket in hand
Three guys at a table eating a giant pizza.
A bathroom being remodeled with toilet set aside
A man is taking a picture of his bathroom sink.
A man and young woman fighting over a frisbee.
Two bowls of food next to a pack of lemonade.
We are looking down on a market square.
A large boat is motoring toward the shore.
Two men drink wine with their eyes closed.
A small gray elephant  standing in an exhibit at a zoo.
a bridge with a train driving over some water
A woman taking a picture of the back of her top.
A desktop computer on top of a wooden desk.
The zebras are grazing in the open field.
The young man is talking on his cel phone.
A man holding up a phone and pointing to it.
a man in red is sitting on a barrel
A tennis player getting ready to hit the ball.
Several people who are skiing pose for a picture.
The dog is all dressed up and ready to ride.
A van and car driving down a street.
A clock is on a pole under a set of windows.
A person jumping up into the air for a Frisbee.
A young woman sitting on a rock under an umbrella
A skier performs a somersault on a ski slope.
Cars are parked on the street near a traffic signal.
A man drinking from a wine glass in a polo shirt
Tourists among taxi and double decker bus traffic
A computer is on a desk in a blue room.
The road sign is visible for all to see.
a man typing on a desk top computer at a desk
A baseball player is swinging his bat at a pitch
A man making a goofy face while sitting near a cake.
A man laying on top of a couch.
A huge bundle of bananas is hanging from a tree.
A man at the beach flies a red, white and blue kite.
A great shot of a very lit up city.
A close up of the edge of a table looking at a keyboard and a mouse.
A black and red train traveling down tracks.
A group of people sit on a dirty boat.
A man is brushing his teeth while a piece of tissue sticks out of his ear.
Glass and stained wood entertainment center, with decor and a flat screen television.
a couple of giraffes stand next to each other
A bird perched on top of a wooden power pole.
A sign is shown pointing two ways with a dog.
a man walking on the beach with a red surfboard
A zebra standing on a dirt road next to a bunch of deer.
A plate of food showing broccoli, fish, lemon and rice.
the man is leaning over taking a picture of another man
A sheep is standing in the grass near water.
A man holding a blue, red and green frisbee in his hands.
A man carries a bulky, stuffed piece of luggage.
A man holding a pair of headphones in his left hand.
A para sailor goes airborne over waves in the water
A high mountain of snow with a cross country skier.
A fighter jet flying through a blue sky with smoke behind it.
An airplane is mounted on a stand in a park.
this is a woman using her cell phone
A couch is looking quite dark with the blinds down.
A pig head on a plate surrounded by a bunch of apples
A skateboarder spreads her arms to balance herself as she circles the rim of a bowl shaped course.
A man in a kitchen concentrating on cutting an onion on a board with a knife.
Two attached train cars on a track.
Cross-roads sign for Jekyll and Hyde roads attached to top of stop sign
A man sitting at a table eating a sandwich next to a marker board.
a man hitting a baseball during a baseball game
a bathroom with some knobs built into the wall
A truck sitting in the middle of heavy traffic.
A large bus with several people standing out side waiting to get on.
Two women playing paddle ball on a sandy surface.
A computer desk sits in the corner next to a dresser.
a shadowy looking man jumping over a ramp
A tropical bird in flight on a sunny day.
A small ham and pineapple pizza on a plate next to a spicy pepper shaker.
A pizza with tomatoes, corn and a pizza cutter is laying next to it.
Herd of Wilde beast and zebra walk through grass by shore line
A bird standing next to a partially eaten apple.
A row of surfboards sitting on the beach near the ocean.
Group of people watching two skiers come down a slope.
Black and white photograph of people with bicycles and skateboards next to a ramp.
A salad bar filled with lots of different foods.
Little kid in a cap stands next to a fire hydrant
many red and white stuffed bears holding hearts grouped together
A boat sailing in the water near a beach and grass.
A baseball bat hanging to the side of a wall near a sign.
Two women sitting on a couch with remotes in their hands.
A very cute green city bus on a busy street.
a young boy about to take off his helmit after playing baseball.
A white sink and a shower in a room.
A man flying a kite stands next to a young boy.
A close up of multiple vegetables including broccoli.
A brown horse standing on top of a grass covered hillside.
A row of wooden shelves with lots of glass pottery on it.
The bears look like they are hugging each other.
A bed has no sheets or pillow cases
some white jets are lined up on a runway
An outfielder watching what is going on at home plate.
A bathroom that has a yellow floor mat in it.
Boy in midair while skateboarding on indoor course
more than one yellow public transit bus in the road
A person is skate boarding on a sidewalk.
A collection of apples and oranges in wooden crates.
A picture of someones bed and dresser in a bedroom.
Tulips about to bloom in vase in vacant room
a man rides on top of a race horse
A cat peeking into a room from a curtained window
A small puppy chews on a dog toy shaped like a pizza slice.
A table sitting inside of a room next to a window.
a man rides on a horse near a blue car
Asian vegetable stir fry dish with wreath of broccoli and assorted mushroom varieties.
A group of people are sitting around a wooden table.
Two motorcycles side-by-side parked in a grassy area.
A bunch of people on holiday at the beach.
A giraffe walking in grass on a sunny day
A baseball game where a player is running to 3rd base.
A tennis player on the tennis court in the middle of the swing.
A fritata on a plate with chicken and broccoli and tomatoes
A bald man plays an informal tennis game.
The tall vase on the table is holding small flowers.
A man and woman brushing their teeth and taking a selfie photo with a camera in a bathroom mirror.
A woman is sitting on a bench looking sad
A cat next to a windows behind cans and bottles.
A person typing and working on a hp laptop
A green bus is in a parking lot.
Man feeding a costumed woman's head chocolate cake.
there is a black cat laying on a desk next to a computer
A boat of some sort near a harbour.
A person who is working on a laptop computer.
A school bus sits in a parking lot with other cars.
A cat sitting next to a banana on a shelf.
Two men who are wearing suits and hats standing next to each other.
A person falling off a skateboard onto the ground.
A bathroom with a large square mirror over the sink and a brown shower curtain with circle designs.
A double decker bus passes a fellow motorist on the street.
A woman riding on a motorcycle inside of a show room.
A bagel, cream cheese and lox is served with fresh cucumber and tomato slices.
A person in a wet suit is parasailing.
Flowers arranged in vases on a shelf against a wall.
A young person stands on a beach with a kite board.
A tractor trailer is parked in a grassy field while people lean against it.
A family of giraffe walking around a stone filled hillside.
A train makes its way down the tracks in a wooded area.
A TV has a cartoon-like screen with a keyboard sitting idly.
A baby with bib on sitting on the floor putting an unidentified object in mouth.
A man standing in front of an open refrigerator filled with food.
Several senior citizens are at the table, posing for the camera.
A dog is sleeping on a couch in a living room.
The woman in the red shirt jumps up to catch a Frisbee.
A woman eating a doughnut and pointing at other doughnuts in a bowl.
A man with glasses is wearing a white shirt and tie.
two horses at the sunset in the field feeding
Four people carrying surf boards on the beach in wet suits.
A group of people on land looking at a flying boat
A man talks on a cell phone while holding a camera.
A man with a tennis racquet stands on a court.
Guys in the gym playing soccer with teams
A bowl of food and a spoon on a table.
A woman riding on a bike past a busy intersection.
A man outside in snow gear on a snowboard.
The edge of a bed and a closed window.
Michael Jackson hat and glove to celebrate a birthday.
A surfer is atop a wave with arms steadying from an upward position.
A lady in the dark holding a remote up.
A young man is waiting on a table of people at an asian restaurant.
A woman talks on the phone while touches a yellow cup that sits on he table.
Young baseball player up to bat poised to hit the ball
A bunch of plates of food such as fish, pork, watermelon, pasta salad and cocktail sauce.
two pieces of toast, bacon and potatoes on a table with a cup of coffee
Small dog sitting on covered table with orange toy.
Two plates of food  and two glasses of wine placed on a table.
A man in blue shirt holding two bowls full of ice cream.
a street sign attached to a pole on a street.
A black hair dryer sits in a tan chair.
A plate with a sandwich, fries, and a pickle are sitting on the table.
A person walks along the beach with some dogs
A large red double decker bus driving down a city street.
Two photos of a man sitting on a private jet .
A girl standing and holding a sweatshirt next to a stop sign.
A brown dog on wooden floor next to a window.
A closeup view of a clock on a Christmas tree.
two vases on a table with flowers in it
A baby holding a i phone sleeping in it's mother's arms.
A traffic light is shown next to a tunnel entrance.
A large white clock on the side of a wall inside of a building.
A photograph hangs above the tank of a toilet with a spare roll of toilet paper.
A large selection of fruit of different types in baskets.
A partially eaten taco pizza is in the foreground, while another type of pizza is in the background.
Three giraffes pressing their mouths to each others heads.
A man is laying in bed with headphones on.
A dish of pie on a wooden table.
A young man sitting in a car talking on a cell phone.
A gray elephant walking around inside of an enclosure at a zoo.
A small boy chewing on a blue and white toy.
Two microwaves sitting side by side on a countertop are marked with signs printed with the symbols for man and woman.
Someone using a cell phone while brushing their teeth
A pepperoni pizza and a bottle of beer
A dimly lit bathroom just has a toilet and dirty sink.
A stop sign on a street corner with building, crane, and blue sky with clouds in background.
A child holding up a baseball in a mitt.
The white bathroom is very sleek and modern.
A man who is eating a pizza and looking out a window.
A surfer sits on his surfboard while waiting for a wave.
People with red suitcases walk towards a large building.
A bathroom is shown with a shower and a toilet.
there is a stuffed animal that has a small stuffed animal inside it
A man laying on top of a couch in a living room.
There are two people holding glasses of orange juice.
A cup of coffee sits next to a keyboard and mouse.
Two women on a bus, one talking on a cell phone.
Three boys are playing soccer underneath a bridge.
an elephant picks up riders from a platform
Woman sitting with bananas in camp with people in background
Several people are on a lake with kayaks, boards, and boats.
A young man sits on a bed that is made-up with lots of pillows.
A person standing on top of a tennis court while wearing a white hat.
Two old suitcases, a blue one and a brown one, are stacked one on top of the other.
A man riding an elephant plays basketball while others watch.
A woman makes a crazy face over a plate of food.
A man holding a toilet seat on a square toilet in a bathroom.
Old photo of man with a beer sitting in the ground with others.
A bathroom sink under a mirror on top of a counter.
A girl plays with a cat on the ground
Two black birds sitting on the branches of a tree.
A subway train that is crossing over a river by a bridge.
A picture of a snowboarder jumping right into the air.
A computer on a desk with a bottle of beer next to it
A woman is posing next to a stop sign.
A man with a tennis racket jumping on the grass
a man wearing a back pack walking toward another man
A white airplane with two large propellers sitting on a runway.
A clock on a pole in front of a tree.
A woman standing on top of a tennis court holding a racquet.
A man crosses a street at a corner with a market on it.
a couple of people riding on some big elephants
a kid poses on a side walk as a baseball player
A bunch of zebras that are standing in the dirt.
Bathroom with glass shower door and art work hang above the toliet
A boat is in the dimly lit water by the city.
There is a bathroom with a toilet and a bidet.
A women who is riding a skateboard in the street.
A nighttime picture of Big Ben in London, England.
Here is an image of an outdoor place.
A couple of women are playing tennis on astro turf.
A kitchen is being installed with stainless steel refrigerator and glue is on the island.
A man giving a thumbs up while on a cell phone.
A group of young people throw a frisbee back and forth.
A brown bear in the woods under a tree.
A man is frowning while standing in an empty room.
Two giraffes graze on treetops in the distance.
A woman cutting a portion of pizza from a tray next to a bowl of fruit
A plane flying over a river in a rural area.
a man and a woman sitting at a table eating food
A man riding a bike on a dirt path through a forest.
An elephant walking down the side of a dirt road.
The hood a street motorcycle, that has the Italian color, the number 7 and ALITala on it
a brown bear is laying on a rock and some trees
A vase of flowers is sitting on a white table.
A sausage link is strung out on a board ready to be cut.
A colorful vase of flowers sitting on a glass table
She makes riding the waves look fun and easy.
A small restroom with a single toilet and wooden toilet seat.
The kitten is enjoying the treats on the plate.
A lovely cat have a cup to his face.
A group of animals walking in the grass next to a road.
A few men carrying some surfboards on a beach.
a small plate of cake on a table
A person that is doing a skateboard trick in the air.
a close up of a pizza on a pan on a table
A red umbrella with the ruins of a building in the background.
A large gray cat laying on the floor next to a couch.
A man standing under a blue cloudy sky.
People sit around a table full of hot dogs and fries.
A bus traveling on a road with other vehicles beside a large building.
A sandwich with eggs and cheese on paper
Skiers riding a ski lift and looking back behind them.
A woman in skies is standing in the snow
The young boy is walking with his glove on.
A man standing on a surfboard catching a wave.
Rows of Pullman bags for sale at a store.
An open door on a train at the end of a platform.
A cat climbing down beside a t.v. screen.
A computer sitting on top of a wooden desk near a window.
A large bridge spanning the width of a bridge near a tall building.
A car perched on a table looking closely at the television screen.
Three friends look past a bottle of wine to the end of the table.
An old building with rote iron railings and landings.
An extreme close up of an expensive gaming keyboard.
a young boy sliding down a snowy hill on a snowboard
A woman walks down an empty street next to a large street clock.
Assorted food items with paper wrapper ready for consumption.
A oven made of iron filled with pots and pans.
That building looks like the building downtown in Atlanta.
A picture of a man's face next to another picture of a person's arm holding a glass of wine and a remote control.
A woman holding two pairs of scissors next to a display.
Man in black shirt and jeans doing a skateboard trick.
A woman and black cat together in a bed.
an image of a man that is riding his bike up high
Shot of bathroom with bath on far side near toilet.
A dog is laying and resting on a walkway.
A young persons clean and orderly bedroom and desk.
a man in a blazer uses a cell phone
someone having a  chili cheese hot dog for lunch
some glass ware is on a wood shelf
A bathroom with a white toilet next to a shower.
A horse is standing inside a pen next to a smaller horse.
A girl has her pony by the harness.
A kitchen with an oven, stove, microwave, and refrigerator.
Image of a bathroom showing the vanity and sink area.
A large bathroom with a toilet and sink.
The person is deciding whether to try the skateboard trick.
A photo of a horse on the back drop of an ocean
A work desk cluttered with stamps and work supplies
A woman walks down the street with an umbrella.
A man is riding his bike down a subway area under a Clearance sign.
Flowers in a vase sitting on a window seal.
Black and white cats laying down in the green grass.
Pastel umbrellas hang above a garden in the lobby of a fancy building.
A zebra carefully walking around in a zoo pen.
Stud farm with horses and trainers in a vast ground.
a cat and a sheep are standing in a field
THERE IS A WHITE WII CONNSOLE AND GAME ON THE TABLE
A fish eye view of part of a bathroom
An owl is eating the flesh of another bird.
A white parrot standing next to a jungle covered hillside.
A group of people standing in the middle of the street.
Two women are cutting a heart shaped cake together
A farm stand selling plants and apples by the pound or quart.
Sidewalk under construction with safety cones by the fire hydrant.
A man sitting on top of a pole next to a fire hydrant.
A small tree is covered in snow from the storm.
A man is dressed like a clown magician while pointing at a picture on his cell phone.
A man is standing outside of the water observing the huge flock of birds.
Black and white photograph of people observing sheep in a field
there are many computers and lap tops on this desk
Smiling indoor tennis players and their racquets with a football
Some young children preparing for a baseball pitch.
A kitchen area features a silver refrigerator, stove and counters on dark, wood flooring.
Young woman dressed in black and white playing soccer.
A flower in a pot standing on a table.
a baseball player swinging a bat at a ball
A road sign next to a building and tree
A man dives in to catch a frisbee
A giraffe in an enclosed area eats from branches up high.
a very pretty kite is flying high in the sky.
An orange cat sitting in the passenger seat of a car.
An assortment of remote controls lined up on the table
A skateboarder who is jumping down a flight of stairs
A skateboarder performs a difficult skill in a skate park.
A bedroom has pink walls and a blue bedspread.
A stack of donuts sitting on a piece of paper.
A white boat on water with seagulls and umbrella in the foreground.
A train coming into the train station
A zebra is outside enjoying the grass before him
The bear is on the table in front of a glass of beer.
An intersection with a pole that has signs on it.
A crowd of people on the sidewalk and an airplane overhead.
A bathroom with a tub, toilet, sink and a mirror with red edging.
A photo taken from the ground of a person standing with their skateboard.
There is a stop sign in a field.
A group of people sitting around dinner tables.
A senior tennis player prepares to backhand the ball.
A woman standing next to a tree holding a pink frisbee.
A baseball player taking a swing at a ball
A woman standing next to a standing toilet.
a person riding skis jumping in the air
this is a woman on skis posing for a picture
an elephant with its mouth open and some bushes and trees
A dog sitting in a wooden rocking chair outside.
Close up view of a large glass of wine.
A woman talking on a phone while wearing glasses.
There is a hanging clock over a set of stairs.
An eagle flying past a group of green trees.
A bento box with chopsticks containing strawberries, carrots, sandwiches, broccoli, lettuce, and some other foods.
Interiors of a kitchen containing several household items.
A man standing in a kitchen with a large pan of batter.
A train traveling across a snow covered hillside.
Two traffic sings sit above a Parking sign.
A man riding a skateboard up the side of a ramp.
Baseball players at the pitch playing and a crowd watch
this is a man picking over bunches of bananas
A Muslim man is being interviewed on TV
an image of two urinals inside of a public restroom
Trains sitting side by side on a train track.
An empty intersection in a mountainous area.
A group of people eating food at a table together.
A man walking on a brick sidewalk with an umbrella.
A surfer riding a large wave in the ocean.
Neckties are tied together around the circumference of the pole.
A man holding a tennis racket up in the air
A man taking a selfie while brushing his teeth and looking in the mirror
A person that is trying to get a frisbee.
Commercial airliner flying  near mast on cloudy day.
Some pizza with toppings and some pasta on a plate.
an image of a rotten fruit and burnt hot dog
A young lady sitting at a table covered in food.
A horse standing in a snow covered field in front of some buildings.
A group of people sitting around a couple of benches.
A baseball player with one leg kicked up preparing to throw a ball
An empty kitchen is shown with empty counters.
a white bus driving in a parking lot with a truck beside it
A toilet in a bathroom that is being built.
A vase and lids are sitting on a table.
a small passenger plane sitting in a filed of airplanes
A motorbike with blue and silver bones painted on it
A baseball game with players in uniform and one player swinging the bat at home plate.
An empty bed with gray sheets and a small lamp
A woman carrying a pink umbrella wearing a blue scarf.
group of people on bicycles waiting at a stop light
a train going down a track by a platform with stairs
A kitchen table that has a vase on it.
A cake shaped like a bear has a sparkler and candle on it.
The large cat fell asleep in the chair when no one was home.
The animals look very skinny and unhealthy as they walk around.
A couple of zebras graze in their zoo habitat.
A woman is taking a picture of herself in the mirror with a camera.
A grey cat is being held by a woman at a cat show.
A bird flying over a small city with small buildings.
Two young children playing in a living room
A man with graying hair looks down at a stand full of yellow bananas.
A train on the tracks under the electrical lines.
This person ordered this dish at a restaurant
The dog is on the couch in the room with the large TV.
A large kitchen with wood floors and cabinets
Soldiers on a train saying, "goodbye," to nurses.
A bathroom with a toilet, sink, towel rack and paper roll.
A blue car that is parked on the side of the street.
A slightly knocked over stop sign next to a small empty road.
A line of buses parked in a bush lot with a fence.
A guy doing skateboard tricks in front of a crowd of people.
Giraffe and small dog stare at each other at the zoo
A group of guys playing Frisbee in a park
The retro looking living room has blue couches and pictures on the wall.
A pile of luggage is secured to the top of a small car.
A plate holds a large salad with broccoli.
Many horses are on the beach near the ocean.
some people are walking around a city with umbrellas
Two ladies wearing black texting on their cell phones.
Two men in a park playing a game of basketball
Metal street signs with street names and a stop sign.
A bird is standing upright in the water and leaves.
Two guys playing a game on the WII.
Young girl perched on rock about to rcieve thrown frisbee.
a person in a living room playing nintendo wii near a window
A tennis player is playing tennis on the court.
A sign on the street that lets you know where you are.
A cat is using the toilet to go to the bathroom.
a person on a beach holding a surf board
A white and black boat traveling near the Golden Gate bridge.
A boat sailing close to shore near a lighthouse.
Looking up at a dirt bike rider leaping over a jump
Elephants at the zoo holding each other's trunks.
A man and a woman that are sitting on a couch.
A very young girl brushing her blonde hair.
an image of a man drawing pictures on the sidewalk
That concrete is going to be hard on his body if he misses this skateboard trick.
Boys sitting on a bench at a baseball game.
A man riding on the back of a motorcycle down a road.
A row of red fire hydrants sitting in the middle of green bushes.
A table with a television and a picture of electrical gadgets.
A person wiping out on a surfboard on a wave.
A bus traveling down the street next to a bunch of cars.
A plane makes a landing at an airport.
A man standing on a tennis court holding a tennis raquet.
A woman standing talking to her cellphone next to a man in glasses.
A sink and tub with towels in a room.
A soccer player runs up to kick the ball while the crowd watches.
A pregnant belly with a teddy bear on top of it
A small table space that is in a tiny motel room.
a glass vase with some flowers inside of it
Blue and green passenger train passing down the side of the small valley.
A lone zebra stands under a tree branch.
a large pile of teddy bears in many different designs
A couple of men standing on top of a field.
Tourists photographing a steam locomotive pulling into a station.
An old style truck that is parked on the grass.
a table that has a banana and some ice cream on it
A horse is being led away by its bridle
A tall building with a massive clock on it's face.
Two people standing in a room playing video games
A covered dish beside a sandwich and other dishes of food up to the right.
One person cutting a cake while the other pulls out slices on a spatula.
A tie with the picture of a deer on it sitting on a shirt.
A woman is wearing sunglasses and holding a parasol.
Guys in the park playing Frisbee golf on a cold day.
a bathroom with cream colored walls and a broken counter on the floor
A horse or zebra in the middle of some shade trees.
a black and white kitty laying next to a chair leg
A light that is on a table next to a laptop.
A man rared back with his racquet on a tennis court.
some stuff blended up in a blender for some serious gainz
a person at a desk with a laptop and a note book
A man flying through the air while riding a skateboard.
a platter and assortment of different desserts and cakes
A group of people sitting at a table around a pizza.
Two skateboarders doing tricks at a skate park
A pair of skis are placed in the snow.
Cat carefully examining a skateboard on a hardwood floor.
Two girls in red chasing a white soccer ball.
Water spews into the air from a fire hydrant.
A street corner with trees that are covered in snow.
A horse is running down the dirt path.
Two young ladies are sleeping side-by-side in a subway station.
A picture of a man holding a  remote.
a person lays on the snow with their feet up
A large crane sitting next to a building under construction.
A red train leaving a train station with man watching.
A narrow lane runs between rows of parked buses on a rainy day.
A glass of wine and a smart phone sits next to a laptop computer.
A little girl that is sitting on a kitchen counter.
A hot dog, french fries, and a spread.
THERE IS A CAKE ON THE TABLE
A close up photo of a train set with a little train going by.
A zebra standing in a grassy field by a woods.
Near a wooden bench, a baby in blue places her rubber boot upon a skateboard.
An old fashioned train is parked as workers gather around it.
a refrigerator with stickers on it sits in a corner in front of a window
The young boy is standing and playing the game.
A family sitting at a outdoor table at a restaurant.
There is a truck pulling a camper trailer
The display of the Magic Bullet blender, with a price tag of 53.99.
Pale shelves with bananas and other items and a  black marble topped L shaped counter against a brick wall with cooktop, sink,  and various kitchen items, meet, leaving a small section of inlaid wood floor.
Small white bathroom with a black-and-white shower curtain.
A green couch sitting in between two lamps.
The skateboarder is performing a trick, mid jump.
A light colored dog chewing up a child's toy on the carpet.
Boy with legs out stretched taking a jump with a skate board.
A vintage photo of hurricane damage to boats.
A picture of a living room in a house.
a lady with a real colorful umbrella that is standing outside
Two women in white tennis outfits hold out their rackets as a crowd watches.
Girl on a skateboard texting by the beach
Breakfast for four with omelets, fried eggs, bacon, ham, french toast and pancakes.
A man dressed all in blue playing tennis.
A clear vase holding white flowers on a table.
A kitchen area with many copper pots and bowls on display.
Two people huddle on a bench under their belongings.
A cat is sitting on a couch while leaning against the couch's arm.
An upwards-looking view of a Stop Sign, an All Way sign, and a One Way sign.
A person riding on top of an elephant near a tree.
A man walking with a dog that has a frisbee in his mouth.
Two men pushing a full cart down the  road
The street is lined up and down with motorcycles.
a close up of a person holding food
A black horse in the middle of a field with a mountain in the background.
A bench on sidewalk below tree next to lamppost.
A computer desk with a monitor, phone, and laptop on top of it.
A couple of wine glasses next to some bottles.
There is a small window in a stone building
A woman in yellow raises her tennis racket.
Flying a kite on a wide beach with few people.
A man standing next to a hipster girl.
A man is taking a picture in a rear view mirror.
there is a man flattening dough on a tray
A group of people standing and sitting around a table.
A blue shelf filled with Chiquita bananas in  a store.
A man with sun glasses and wearing a hat laying on a bed.
A man in bright green prepares to serve a tennis ball.
Here is A tender moment among zebras this afternoon
A Muslim lady holding a child that is being fed a birthday cake.
A plate of food is arranged with fruit and vegetables.
Horse and rider walking on sandy beach at ocean.
Skateboarder and board in mid air at a contoured park.
Cat sleeping near the sun on bed covers.
a group of guys playing with the wii
A plate of food containing broccoli,cauliflower, celery and other foods
Puppy and full grown dog outside near some refuse
A sleeping black and white dog wearing a pirate hat.
A hot dog with toppings and potato salad
There is a large group of skiers standing on a wide field
Two girls walk along a path near a waterfront.
Small brown dog laying in between a person's shoes.
a beach covered with umbrellas and tourists relaxing
A group of people flying kites under cloudy skies.
Two old people in motion while playing a Wii.
A private airplane is flying in the sky.
A woman sits in a u-shaped bench with her legs elevated
A street with people walking about it and a kite above.
A red, double-decker bus drives through the town as dusk approaches.
A metal sink filled with many lemons and apples.
A man in a baseball uniform standing on a baseball field.
A man smiling for a photograph and holding papers in his hand.
a yellow long tailed kite being put into the air by a couple
Three lit candles on a chocolate birthday cake.
A baseball player pitching a ball to a batter.
A train coming on the track in a train station
A person is doing a trick on his skateboard.
A player chases a tennis ball while the umpire watches.
A person giving a thumbs up to a computer screen.
Closed toilet and shower in small, bright bathroom.
A zoo keeper on a scale holding a giraffe with a "me gusta face"
A woman and child on a silver motorcycle.
Brown and white cat sleeping on desk next to a computer.
a person on a snow board does a trick over a hill
A couple of guying chasing after a Frisbee.
A person in a suit and tie looking unhappy.
A dark colored river with several horses on the other side near the trees and brush.
A bicycle with a springs mounted under the seat.
Two pieces of pizza on a plate pepperoni.
A decorated room with no one in it has a table in the middle with various items on top.
Two trains driving inside of a train station.
A woman in a costume inspired by the White Rabbit from "Alice in Wonderland."
A baseball player in mid swing and a catcher ready with his glove.
A man biting into a slice of pizza.
Advertising and traffic clutter a busy city street
a crowd of people by a school bus and a girl holding a big blue bowl
Two people are in front of a deck, and about to go skiing.
a kitchen with a sink trashcan refrigerator and a heater
a building with a clock sitting near the top of it
a man in a red and gray snow outfit stands on skis holding his ski poles as he stands near other skiers and snowboarders.
There are different citrus fruits in the bowl.
A bed and desk in a small room.
A horse that is grazing around in the grass.
A tie rack filled with lots of different colored ties.
A couple drinking wine on a horse-drawn carriage ride through the countryside.
half empty bowl of cereal with a loaf of bread, a banana, and beverage
A bathroom with raised shower, sink, widow and mirror.
a very large building that has a clock on top
A stop sign out front of a construction site
A flat screen TV mounted to a wall over a lamp.
A car and a large truck on a city street.
A young man swinging a baseball bat on top of a field.
a big basket of bananas next to some people
A chocolate cake sits half eaten on a table.
a public restroom with a white toilet and toilet paper
A red couch behind a brown ottoman with a cat sitting on top of it.
A stove with a willet cooking banana and a moka pot.
Hot dogs and buns cooking on a grill.
A beer advertisement on the side of a passenger bus.
a man wearing a suit and tie standing in a room.
A group of surfboards on a rack on the beach
A red fire hydrant sitting next to a green plant.
a surfer wearing a wet suit is surfing on a sunny day
A plate with with different kinds of food on it.
A man and a boy playing Wii in a living room
A man dressed in white is on a horse.
SOME GOOD WAVES FOR TWO SURFERS IN THE OCEAN
A smiling man that has long dread locks in his hair.
The tennis player is swinging the tennis racket.
A zebra drinks from a pool of water in a grassy field.
A kid holds a sandwich and a big candy cookie.
A boy leans on a counter next an almost empty soda bottle.
a skier with a red jacket is next to some water and snow
A man is smiling while talking on his cell phone.
A zebra is grazing in an enclosure while an ostrich sits in the background.
The single train car is painted black, yellow, and orange.
A man standing in front of a fridge with a lot of magnets on it.
The interior of a public bathroom with multiple sinks.
A man and  woman standing next to each other with the woman holding an umbrella.
A parking lot filled with yellow school buses parked side by side.
A kitchen with a sink, coffee pot, refrigerator and shelves.
A person handling bread over an open oven.
A woman with a tennis racket is running
a tray covered with cheese fries, a corn dog and a hot dog
A train is coming down the track near old warehouses.
Two teams playing soccer with one team  kicking the ball down field.
Shadow from a street sign with a message written on it.
Several birds overlook the skyline of a distant city.
A woman poses with avocado sandwich lunch at an outdoor restaurant
A living room with chairs, a table, and painted walls.
Two women shaking hands at a tennis match.
A police officer mounted on a horse while two children pet the horse.
Lone giraffe lying in dirt area of enclosure.
there are two hot dogs on a fake paper plate
a black red and white double decker bus people and buildings
A boy grips his skateboard as he jumps the edge of a half pipe.
A couple of elephants roaming through the tall grass
A close-up photo of a white and brown cow.
A young girl standing under a window next to a toilet.
A tennis player stands by her equipment bag holding two rackets.
A mountain view with two birds flying overhead
A boy and woman in an open area in shopping center with three park benches.
A street identifier installed as part of a curb in the sidewalk.
A large elephant standing in a grass field.
A young man is riding a skateboard with other young men watching him.
A view of the city is very colorful.
The picture shows the underside of a jumping snowboarder.
A black and white train on tracks next to a station.
There is a cat giving itself a bath while laying on a luggage.
A woman with a dog talking to two people sitting on a bench.
Several children and some adults celebrating a birthday party.
A black and white zebra is standing in the green grass.
a orange cat sitting on a half rotted wooden bench
A poppy seed muffin with orange slices on a plate.
A man on snow skis traveling on some snow.
The three giraffes tower over the smaller animals.
Two beds in a tiled room, both with lime green bedspreads.
A pitching about to throw a baseball at a game.
A vase that is placed outside of a window.
Pumpkins sit under a spooky lit up Halloween display.
Two people roasting hot dogs outside on a stick.
two people on a beach with a kite
Small group of kites being flown nice day.
Several people in the heavy snow on skis.
A green and silver train passing by a building.
A group of people walking down a street on a rainy day.
Slices of pepperoni pizza on a baking tray.
People standing with sheared sheep inside a fenced enclosure.
a coupe of people sit on a couch while laughing
A young man kissing the top of a young woman's head.
Vases and figurines line a long piece of furniture next to chairs, a lamp, and a picture.
A bird walking in the grass with it's beak open.
A man holding a large bag of lime green luggage.
A baby sitting on a bed next to a large brown and little white teddy bear.
A white horse looking up for a photo at a fence side.
A stop sign that has been tagged with graffiti.
Two people in a group with one holding up a phone.
a couple of people that are walking on a beach
A man standing on dirt holding a pink frisbee
A baby is sleeping in a swing in a room.
a close up of a cat in an open luggage bag
Many people on the city street with umbrellas.
A man rides two brown cows across water.
A meal of a sandwich and soup sits on a wooden table.
a close up of a bunch of green apples
a kitchen area with a stove-top oven and sink and cabinet with a dishrack
A woman is blurry as she rides her bike next to shops in a city.
The horde of pigeons take advantage of the crumbs left by pedestrians.
A group of elephants is standing on grass.
a person pointing to what they are putting on their snadwich.
A man holding a snowboard standing at the bottom of steps.
A man walking along the platform next to a subway car.
A train covered in blue paint and graffiti.
A bus displays an In Service sign, traveling down a road
A woman holding a cat up tight against her.
A mother and her child sitting on a couch using laptop computers.
Smiling woman standing in front of refrigerator with wine bottles on top.
The giraffe is posing for the picture near the wooded area.
This train car features a variety of colors and carries passengers.
A zebra that is outside eating some grass.
A young man in black clothes holding a yellow frisbee
Group of children sitting at table eating pizza off plate
Museum with ancient artifacts and people looking at them.
Several chocolate donuts with decorations sitting on a pink mat.
Several street signs shown on a city street.
A kitchen that has various types of appliances.
A group of people are standing around a caged giraffe.
People play in the water and fly kites at the beach.
A person wearing glasses is walking away from a stop sign.
Person bundled up out for a ski in the soft snow
a green street sign surrounded by some trees
A city street filled with tall buildings and motorcycles.
A traffic light is displaying a green smiley face.
A pile of vegetables sitting on top of a wooden table.
A marching band stands in a street in front of spectators.
The blue bathroom is small, sleek and efficient.
a cup of coffee a laptop and a table
Three women on a couch talking to each other.
A locomotive train traveling across a train trestle.
A street sign of NE 5th st and the back of a stop sign
A water bottle with ear buds on it in front of a laptop.
A cat is looking at himself in the mirror.
A girl standing in a room holding a green Frisbee.
THERE IS AN AIR PLANE THAT IS FLYING IN THE SKY
Two planes are flying by one another and one is putting off pink smoke while another puts off blue.
A room with three old tubs and peeling walls.
The boy is curious about what is beyond the umbrella.
The plane is flying over the parked cars.
A woman and a black and white dog on the beach.
Several cars at an intersection on a city street.
Female tennis player in blue outfit returning volley.
A bowl of a kind of vegetable stew on a table.
An individual snowboarding down a snow covered hill.
two elephants in tall grass with trees in the background
A tennis player prepares to hit a tennis ball, while others watch.
Oranges hanging from an orange tree in an orange grove.
The little boy eats a slice of pizza.
People standing around the stove and counter fixing plates of food
Two laptops and monitor on a desk in front of another monitor.
A pulley is seen in a room with lots of stuff on shelves.
A fake bear that is standing in the snow.
A pool surrounded with chairs and trees.
Man riding a snow board down a long slick area.
Several kites are flying on the beach in the blue sky.
a living room with a tv a desk and another tv
a black cat walking into a kitchen
A group of motorcycles parked in front of a tall building.
A giant neon Coca Cola sign glows in the stadium during a baseball game.
A bathroom counter has purple orchids on it.
An orange fruit beginning to grow on a tree.
The city streets are busy this time of night.
A skier flying high in the air over a snowy hill.
People standing in a long line at a train station.
A round intersection on a surburban street with one floor homes.
a couple of people that are standing next to each other
A large group of people standing around in red, white and blue colors
A dog is sitting on a chair near a stuffed animal.
a group of people that are walking down a sidewalk
A man and a baby lying on the couch in a living room
a giraffe eating leaves from a tree with its butt to the camera
A woman holding a racquet and tennis ball on a court.
A man wearing a suit directs two men riding horses through a city
A young boy touching a cow through a metal fence
A black and white photo of a train system going down tracks.
An aerial view of a street corner with a STOP sign and a ONE WAY sign above it.
Two men playing professional soccer on a field.
A man riding a wave on top of a surfboard.
A woman in a dark cave holding two sheep
The room has two couches in front of a tv.
The ingredients are on the kitchen counter next to the blender.
some people at a table with a umbrella silverware and some drinks
A man that is wearing a suit and a pink tie.
A gray and white tiger striped cat sitting in front of a brickwall
The fried rice has vegetables and meat in it.
a bunch of people on a snow slope in the moutains
A toilet attached to a red and white brick wall.
A train traveling along side of a road.
A white table topped with plates and bowls of food.
a white keyboard sitting next to a white computer mouse on a mouse pad.
two stage coaches traveling down a snow covered trail
there is a clock on the side of the old building.
A group of people are lined up skiing.
A man and his son playing Frisbee in a park
Female soccer player maneuvering ball on grassy field.
Skiers skiing in the snow with their skis on the ski slope.
A man in a purple shirt doing a trick on a skateboard.
A British Airways airplane flying in the air.
A man is doing an upside-down flip on his motorbike way up high in the clouds.
A man does a jump on a skateboard.
A white Ecohopper bus driving down a street.
A PICTURE OF A WEATHERED YELLOW AND BLACK TRAIN
A bathroom with toilet, mirror, picture and tub.
A hand holds an old-style flip phone in the open position.
A boy on a skateboard at the top of a rise on a skateboard ramp.
A large wooden clock hangs from the ceiling in a store.
A person holding two ski poles while standing in the snow.
a dog that is rolling down a skateboard
there are many people on the road riding motorcycles
a group of baseball players standing in a field
Large bird preparing to fly from beach area.
A group of brown cows grazing in a field
A LOT OF MEN ARE ON HORSES
A child is playing in a recreational park.
A street sweeper machine parked against a tree by a street.
a living room with some book cases beside the fireplace
a lady wearing a red sweater with an empty plate
A teddy bear with multiple colors with a new tag still on it.
A man laying down in the snow with skies on
Passengers near a yellow and blue ski airplane.
A large display of apples at a market.
A red couch that has a laptop computer on it.
A bedroom that is cluttered and needs organization.
A MAN DRESSED AS A PIRATE AT A PARTY
a young man is performing a skateboard trick
A baseball field full of baseball players standing on a field.
A bathroom with a sink on the left under a mirror and a toilet.
To bananas sitting on two blue plastic bowls.
The yellow commuter train is pulling into the station.
Photograph of a public toilet as taken from above
A LADY IN YELLOW ON THE COURT PLAYING TENNIS
A woman is shown holding a pizza with zucchini
An elegant bathroom has a light up mirror, marble counter tops and dual sinks.
A dog sitting on top of a made bed.
A man trying to block another man with a frisbee during a game.
A yellow cat is sitting on a green blanket.
The plate of food has a salad and toast on it.
A snow skier skiing down the ski slope.
a guy that is jumping on a skateboard
A white kitchen with a counter in the middle.
A blurry picture of a bird sitting on a wire.
a motor bike parked on the side of a road across from cars
A man is looking at his laptop while chatting on the phone.
A man that is sitting down near a bird.
a bus in a city at night time stopped
A person jetskiing in the water and creating a huge wave.
A TV sitting on top of a counter inside of a store.
A parking meter on the side of a street
A man leading a flock of sheep down a street.
A bunch of stuffed bears altogether during Christmas.
one sheep is standing in some tall grass
a family is sitting down at a table to have cake
a cappuccino and a overripe banana sit on a table
A person rides horseback down a beach along the ocean.
Some men and women in white shirts and bow ties standing in a row.
Skiers lined up at the starting point for a race
A baby sleeps sitting up while clutching a teddy bear.
Multiple computers and soldering equipment on two desks.
The kitchen has a stove, and a microwave in it.
A baseball player takes a swing at a ball.
A man making pizza in an oven on a wooden board.
A woman sitting back on a couch holding a little white dog.
A person standing next to a tall giraffe.
The interior of a modern kitchen including an eating area
A cat sitting on top of a blanket on a bed.
A young man in a bathroom taking a picture of himself using the bathroom mirror.
The graffiti on this Stop sign denotes a positive impact.
A small pizza sitting on a decorative plate.
a bird that is sitting on a pole outsid
A cat climbing on top of a suitcase.
Man herding sheep down a street with a child in front of the herd
A bathroom with a small window and a odd toilet.
A person helping a child stand on a skateboard.
A baby girl on table next to cake and balloons.
A woman sitting up in bead looking out the window.
A blue and white fire hydrant on a lawn.
A person skiing down a snowy mountain side.
The purple city bus is noticeable against the brick buildings.
A bathroom with a standup shower, toilet and sink.
This person is laying in bed while reading a book.
A group of people standing on top of a beach.
A woman standing in front of a counter full of baked goods.
A pack of elephants stand in a grassy plain.
Several people are doing something with remote controls.
Sheep are grazing in a field in the distance.
There is a surfer holding on to a sail in the ocean
A kitchen with drawers, a stove and a sink.
A tall giraffe standing next to a tree on a grassy field.
SKIER COMING DOWN THE SLOPES JUST OUTSIDE THE CABIN
A guy and a girl are sitting in rocking chairs using laptops.
A large pizza sliced in half in a box.
a herd of zebras drinking water at a lake
A cat laying on top of a suitcase laying on the floor.
A pile of chicken, carrots, and brussel sprouts.
Two buses driving over a bridge with boats in the background.
The slightly overcooked pizza is inside of a pizza box.
A box of cookies sits by a wedding cake decorated with berries.
A laptop and an old computer display text while sitting near a window.
An airport with an airplane that has a red tale
a number of different doughnuts on a table
The couch is directly in front of a huge television set.
A home desk has a computer, lamp, and knick-knacks.
The black and grey cat is facing the other way
A bowl of vegetables on a wooden table.
a close up of a jet flying in the air
The man in the red cart held the reigns controlling a pair of obedient horses.
The plate has broccoli and an egg roll on it.
a bunch of computers that are on a desk
there are many surf boards laying on top of each other
Man in yellow shirt grinding down a railing with his skateboard.
Two men surfing in water next to a dock.
Three horse wearing coats walk around a large field.
A man looking downward holding a teddy bear.
The seat of the wooden bench is covered in snow.
A group of friends playing a motion controlled video game
Toothbrushes and toothpaste lay on the counter by the sink.
THIS IS A PHOTO OF A MAN WORKING ON SOME SORT OF CRAFT PRJECT
A man driving a carriage pulled by three horses.
A baseball hitter swings at the pitched ball
A sailboat is floating on a lake under a cloudy sky.
One person flies a kite near a crowded sidewalk.
Freshly shorn sheep eat grass in a mountain pasture
A girl looks into the distance, while holding a clicker.
Two men on a motorcycle pass through a crosswalk.
A little girl eating a piece of birthday cake at a kitchen table.
Horse and carriage going down the street in the city.
A ship docked at an empty harbor at sunset.
The cat is on the desk by the two computers.
Man lying on a bed in a furniture store display.
Female tennis player in a purple uniform ready to play.
A couple of sheep are in the grass by a barn.
A bench sits in the sun near a path and some water.
TWO POLAR BEARS IN THE POOL EACH ONE HOLDING SOMETHING ORANGE
Male and female intent while attending a function.
A surfboard standing in the sand near trees and the water.
a few small boats in a large body of water
A plate that has several sandwiches on it.
sheep standing next to building near a city street
A bathroom with a sink, mirror, and toilet and other items
The man is driving the bus full of people
A cat dosing off while lying on a chair.
Screen of an iPhone with German language text held in a person's hand.
a baseball player swinging a bat on the field
A kitchen with a pull out ironing board and refrigerator.
A street sign on the side a of cement wall.
some parked bicycles and two women on a bench and a book
A hideous bathroom that is pink in theme.
A man is playing with a frisbee on the beach.
a tall clock tower near a building with a dark background
A line of girls holding frisbees or plates outside
There are two streets signs attached to the stop sign.
Two red and white stop signs on a street.
a decorated vase is sitting on the table top
there are many people that are sitting on the benches
two trains on opposite sides of a railway platform median.
A girl on a surf board riding a wave in on the ocean.
A man wearing a blue shirt while eating a hot dog.
A parade float with people on top of it
A chair and contraption between a grandfather clock and a plaque on a floor.
The surfer is on the surfboard riding a wave.
A large clock tower on the side of the water.
A geese and several goslings in a pond
A zebra walking in the grass while other animals are standing around behind him.
A picture of a snowy street with a red fire hydrant.
Two cars parked in the grass as a train goes by.
A bird sitting on a bird feeder next to green trees.
A bathroom is shown in dim orange lighting.
this lady is using controllers and those men are watching
a toy train on train track next to a toy railway platform.
There's a computer monitor on a desk with speakers around it
A photograph of Key Bank with a clock under the sign.
A bicycle is parked in the narrow alleyway.
A double decker bus drives down the street.
A horse grazing in a pasture in a field, with mountains in the background.
A girl that is standing away from the camera and has a Wii remote in her hand.
A skateboarder performing tricks under the lights at night.
A large red bed with a black cat laying on top of it.
a black dog sitting in a white bathroom
A man standing in front of a train car door.
a person sitting on steps with a cell phone
A man who is holding up a parachute.
A man using a One Laptop Per Child computer, while another man uses a standard desktop computer.
An open cell phone next to a sprouting sunflower seed.
A young man jumps up to catch a Frisbee underneath his legs.
a big zebra that has his mouth on top of it
A man is sitting on the couch eating.
Various people eating in a restaurant at a table.
A man with a tennis racket at a tennis court.
Bright yellow furniture sitting in a living room next to a lamp.
A bathroom is shown with a door cracked.
A man without a shirt is brushing his teeth.
A man playing Frisbee on a beach on a cool morning.
A woman is next to a scooter and cat.
A small boat with people on the top.
Three boats sit on dry land, the nearest one is called Lauren Jade.
The motorcycle officer wearing a helmet drives near a crowd of people.
People making a for sale sign on a car.
A man riding a motorcycle down a street and surrounded by houses.
A toilet bowl with a bucket and trash can by it.
A brown vase full of colorful flowers in front of a mirror.
A group of giraffes stand in a large open field.
neatly made bed with blue sheets in a pink room
two bikes parked near a clock pole on a side walk
A couple of boys wearing ties giving each other a hug.
a living room with a table chairs and a tv
A tennis player at the match is returning a volley.
The last car of a train sits on train tracks.
A bathroom with vanity with sink, toilet and tub.
Two men toilets, one regular toilet, and a sink in a bathroom
A chair sitting next to a flat screen TV.
A group of zebras are next to a patio table.
A green street sign mounted to a white street light pole.
A child posing on top of a mountain while they ski.
A herd of zebra walking across a dry grass field.
A boy in a tie poses for a picture.
A small yellow room with a couch, table and lamp and wood flooring.
Elephants gathered in the corner of an enclosure
A long train coming down the railroad tracks.
Lion statue with a large structural clock in the distance
a lady that is holding a laptop sitting by a street
A woman is holding a racket on a tennis court.
Three dogs are following three women toward the entrance of a building.
Cattle grazing in partially snow covered ground in winter.
An anniversary cake on a table with a picture and glass of wine.
A young oerson is raiding a small fridge in their room.
A pesto and chicken pizza cut into eight slices.
A green double decker bus sitting on top of a parking lot.
An old bench on a porch of someone's house in the valleys.
Some cars at a red light at an intersection stopped.
A model kitchen is shown with white appliances.
A large blue bus on the side of a road.
A group of motorcyclists fly the Puerto Rican flag.
A woman and a man look to the left while the woman points
A photo dark room with the red light on.
A giraffe standing near a tree by a body of water.
A woman and small boy feeding some sheep
Plates of food cover a table and includes vegetables and potatoes.
The bird with the purple feathers is perched on the branch of the tree.
You are proudly witnessing a 360 Ollie in progression
A large kitchen with a table in the middle
A cat is standing on top of a TV trying to look out the curtains.
A pizza sitting on a table, with a spatula in the back.
A woman pushing a stroller and looking at a cellphone.
A cat laying in front of a computer next to a mouse.
A small blue and white plate sitting on a small runway.
A fire station on a street in a downtown area
A large white airplane parked on a runway.
A person is holding a tomato above a tray.
a smiling woman standing next to a baby in a high chair
there is a female tennis player serving the ball
Two baseball players and an umpire standing at home base.
She is going to nail that tennis ball.
A view of a train station from the parking lot.
Two white cattle standing in water next to some ducks.
A bathroom mirror over a marble sink with the lights turned on.
a lady with a knife laying down in a bed.
a bedroom with some posters a blue and white bed and some pillows
The restaurant platter piles  french fries high  with a juicy burger.
Man prepares to throw a frisbee in an open park.
A plate of food with a pizza on it.
A view ofa  bar from behind the actual bar.
A group of children playing baseball out side.
The commuters are busy while they wait for their plane.
A few apples and a banana sit in a dark bowl.
A person on an ocean beach flying a kite
A train is traveling past a grassy area with a foot path.
A couple of kids petting sheep inside of a corral
The sheep and the dog are on a race.
A group of people sit at a table with cake.
A man riding a board on top of waves in the ocean.
A surfer catches a wave on a white and green surfboard with another surfer in the water behind.
Very long Coney dog on a long buffet table in a ball room
There is a statue of a man's head next to a cat.
A two sided pizza is being cut by someone.
A jar filled with liquid sits on a wood surface.
Group of people riding their bicycles on a city street.
A worn and tattered pink and black bag.
A young giraffe leaning over a tall bush in a dry field.
A large table with a laptop and home computer.
A man dressed like Darth Vader is standing in a white bathroom looking at himself in the mirror.
Small dog in street next to a skateboard.
A small sofa and coffee table in an apartment living room.
Two females are walking down the street wearing boots.
Female tennis player in the motion of hitting a ball.
Two street signs show an attraction and street name
A bird perched on a log with a house in the background.
A food cart with trays of food on the shelves
Perspective-corrected photo of a large masonry building under a clear sky.
Man cross country skiing with a yellow lab.
People crossing the street and walking on the sidewalk in a city.
The black and white dog is lying beside a stuffed bear.
A guy standing in a living room holding a controller playing a video game.
A cow laying down in the sand on a beach, with the water in the background.
a bowl of food next to a keyborard
three guys sitting down eating sandwiches and smiling
A group of people on surfboards in the ocean.
A gang of bikers driving down a city street.
Man standing in front of a parking meter holding a folder.
People wind surfing on the water near a suspension bridge.
An old picture of a twin bed and radiator.
A television, couches, table and a remote controller.
A girl is flying a kite on a clear day.
A room with a tile floor containing furniture. a staircase and people.
a parked van with graffiti painted all over it
a young person standing on a chair in a kitchen cooking doughnuts
A small white cow and a big black cow walking in an empty field.
A woman in a dress and Mary Janes bends down towards a Frisbee in a fenced in yard.
Three people skiing together on a path carved into a hill
A person wearing a red tie pointing to it with both hands.
A person with six snapshots making a call and taking a beer
A bathroom is decorated with white tiles and white towels.
Two woman standing in front of a mirror near a sink.
A man in yellow shirt and black shorts playing frisbee.
a couple standing in front of a wishing well.
a close up of two people walking close together
A very tall chicken standing next to the ocean.
a little boy playing a game on the television
Passengers are standing in a line in front of the door of bus.
A herd of cows graze in a field behind a wire fence.
A couple of girls with tennis rackets in a room.
A person in pajamas laying on a bed reading book.
Batter at  baseball game waiting to hit the ball.
A picture of a cat that is looking out a window.
A man and woman cutting a white sheet cake.
A bench right next to some tall grass at the edge of a body of water.
A man is in the ktichen and the living room is painted blue.
the people are watching the animal drink water
Four cell phone on a wooden table with their screens on.
A kitchen with a red stove top under a framed picture.
chopped onions sit on a cutting board next to a glass of wine
A bicycle parked next to a lake on a cement floor.
There is a sheet of stickers that go on a keyboard.
Children learning to make their own kites.
A black bear perched on the top of a fence.
A woman in a robe is using a mobile device while holding a cigarette in front of a garage door
Two twin sized bunk beds in a room
A young woman in the water wears a life vest holds a water ski.
The intersection at Durham Court with forest in the background
Pots being displayed at some sort of exhibit.
A couple of red traffic lights next to a forty sign.
Five men are around a table with food on it.
A red train passing by bushes and a road.
Some people with cowboy hats riding horses on a trail.
A batter swinging at a pitch at a baseball game with a runner on first base.
Two cows stand in a pasture eating grass.
A stuffed animal is smiling while sitting on a bed.
A small white cat sitting on a  ledge.
A group of people in boats on a river.
A surfer on a surfboard riding a wave.
Cat covers it's face while sleeping by the window
Only one slice left of a fruit pie.
The clock is on a brown stand with a wall behind it
A man on skis sitting near the mountains
A man riding a snowboard down a snow covered slope.
An old couple is sitting down on a bench together.
A skateboarder is mid-air doing a trick on their board.
Clothes hanging on a rope over an unfinished patio.
A plate with chicken, broccoli and mushrooms with a bit of gravy.
A group of people pose for a photo at an event.
A baseball player taking a swing at a ball
Two kids in bunk beds reading while laying down.
A sign warning of snakes in the area stands on a pole.
A bathroom decor is in shades of browns.
A biker has his young daughter on the bike
A couple sit together for lunch on a street bench
The four engine airliner sits on the tarmac on a cloudy day.
A sailboat in the water with the docks in the background.
A photo of bananas, mangoes, and oranges in a pile.
An airplane is shown taking off into the sky.
A tow truck carrying a bulldozer on a trailer.
A half eaten doughnut sitting on the side of a road next to a  truck.
A lone zebra standing next to a tree in front of a fence
Five wine glasses sitting on paper on a table.
An amazing lunch spread with a beautiful salad, peaches, tomatoes, and sandwhiches
a group of people are traveling down a paved road
A group of people with umbrellas stand in the road.
Child watching kite as kite is flying in the air
A man riding a paddle board down a river next to a lush green forest.
A school bus waiting at a traffic light.
Bushel baskets full of vegetables at a market as shoppers walk by.
A polar bear plays in its habitat next to a yellow traffic cone.
A set of traffic lights over a busy road with cars.
Two guys passing each other on a tennis court holding rackets.
A person walks on a platform next to a passenger train.
A pizza with lots of mushrooms is seen here.
A person is laying in bed reading a book
Two birds standing side by side on a branch
Two men in baseball uniforms stand on the dirt.
A wireless keyboard and mouse are on the table.
There is a close up view of a giraffe.
there is a very high mcdonalds sign on this street
A group of people are standing around holding video game controllers.
An apple, watermelon and bananas are setting on the table.
a ship sitting out on the ocean not moving
A large metal tray of rice and some vegetables.
two men and a woman stand by a fence and pet a elephant
There are three birds by the grass by the water.
A woman sitting at a desk pretending to converse with a teddy bear.
Two men skiing downhill next to each other.
Several snowboards with people on them located in the snow.
A Stop sign and other street sign on a road
Bedroom with a bed, dresser, and small picture hanging on the wall.
A person in action on a field with some people watching.
A loft bed with various stuff being stored underneath it.
A grey vintage truck on street next to a house.
A large orange truck parked next to a woman.
Two boxes that have a dragon on the lid are filled with food.
A crowded street and sidewalk on a city street.
Woman in a living room with large screen TV and cloth-draped furniture.
A dog resting his head on the side of the boat looking out at the water.
A large cruise ship is traveling on the ocean.
A girl lying on a bed looking at the camera
A man is holding a piece of food with chocolate in it
A skateboarder dressed in pink and black at night.
A man is snowboarding off of a hill in front of a crowd.
Four sheep watching a dog peek through their fence.
A show shining station with a pair of boots on it.
A man standing at a train station near a pile of luggage
A group of people sitting down at a table to have a meal.
A city street with lots of blurry traffic on top of it.
A group of people standing next to each other.
A group of stuffed teddy bears sitting on top of a counter.
A group of birds that are standing in the sand.
A large Cathedral like church with a clock tower and people at the gate.
a close up of an electric blender on a counter
Digital painting of a tabby cat and large dog touching noses.
Two cows behind a fence on a farm
Two brown and white horses in an enclosure.
a cookie being held up by a woman
A group of people standing around a man with a cop in front of him .
A bathroom stall with a small trash can and a chair.
A Kinnaird street sign and Stop sign with the word Art in yellow painted on it and houses in the background.
A bed in a room that has a window open.
A group of people waling across a cement covered round.
A bed in side of a room with a small white mattress.
a black and white photo of a boy and girl walking a horse
Passengers waiting patiently for their flight at the airport terminal
A man and a cat sit on a sofa.
a sheep is walking around near a tree
A bunch of giraffe hanging out together as a pack in the outdoors.
A bathroom showing toilet, sink, and shower
THERE ARE A LOT OF PEOPLE WALKING AROUND WITH KITES
A woman brushing the teeth of a toddler.
A salad with lots of different greens covered in sauce.
A skate board rider flying off a ramp in a skate park
A young man playing on a skateboard at a play ground
a living room with red walls a chair and a television
A picture of a fire hydrant next to a plant.
A white refrigerator with the door open with a small amount of food in it.
A beagle is sitting in a chair with arm propped up the way a human would sit.
a person riding a skate board on a city street
A bathroom with four urinals and a drain on the floor.
An old motorcycle rests near a rundown building.
A young man is doing a trick on a skate ramp.
Sign with the number "eighty" set against bright blue sky.
A young boy playing whiffle ball in the grass
The contents of an open suitcase scattered on a table.
A large display sign outside of a ski resort.
Crowd of people in a field flying kites.
Green street signs sitting on the side of the road.
A woman lies in bed reading a book, and petting a cat.
Two large elephants walking across a shallow body of water.
A vintage photo of a city bank branch.
The cat is laying down while someone rubs it's head
A smoking jet going straight up in the sky.
A baseball player getting ready to hit the field.
The table is littered with a number of typical office items.
A Dominos Pizza with pineapples on the pizza on the table
A young man holding a piece of food in his hands.
a man with a tennis racket in his hand
A  white microwave sitting on the ground outside
A herd of animals grazes in a field while a zebra nurses its foal.
A group of two people waiting to cross the street under an umbrella.
A man holding a pizza on top of a pizza pan.
The man and woman are talking in the kitchen.
a man in a tie and a suit is indifferent
A doughnut that has several bites taken out of it.
A bus parking lot area with several buses parked and one multi level bus driving.
A man is swinging a tennis racket at a tennis ball.
A beautiful dinner of authentic pizza with fresh bread, a plate of mozzarella and tomatoes and a lovely red wine.
A cute little girl sitting on a bench alone.
Two adult elephants interacting near a stand of trees.
Little kid leans against the gate in front of train
Blurry silhouettes of people and a horse against an evening sky.
a polar bear pokes his head and one paw out of the water
a young person wearing a shirt and tie
an image of a man eating a slice of pizza
Two rows of teddy bears of various colors and sizes.
A man and woman wearing tiara while sitting at a table.
This toilet sits in a stall in a public bathroom
a skate boarder performing  a trick while others look on
A cat laying on top of a wooden computer desk.
Two street signs atop a stop sign under a clear sky.
A bathroom that has a couple of toilets, but no stall door for them.
A man wearing glasses skiing during the day in the snow.
A simple bathroom features standard toilet and tan sink with dark wood cabinet.
Four zebras, two warthogs and a giraffe in an open field
A tv sits enclosed in brick outside on the street
kids watching a smiling woman milk a cow
Boys play soccer in sand in front of a crowd.
A kitchen includes a refrigerator, counter, and sink.
ON ONE SIDE OF THE PARK BENCH IS TREE DOGS SNOOZING
A bird statue sitting on a bench in a library near bookshelves.
A POLICE OFFICER IS SITTIGN DOWN TALKING
A suit case filled with a magazine and a pair of shoes.
a man running on a tennis court with a rackett in his hand
A MAN IS SWIMMING IN THE OCEAN WATER
The three men are walking down the road together.
a small red train is parked at the station
Two people walking and holding umbrellas over their heads.
A man riding a skateboard prepares to roll down a ramp.
THERE IS A WOMAN SITTIGN AT THE TABLE WITH HER LAP TOP
A kitchen that has white cabinets and drawers.
A square in the city occupied by people.
Sunlight bounces off the green wall in the den.
a passenger train sitting by a platform and a fence
A bathroom sink with all the usual toiletries on it and a hand towel hanging by it.
A bathroom with a colorful rug, white towels, and a picture on the wall.
A group of chefs standing in a kitchen preparing food.
Some people sitting in the grass leaning against some wooden rest.
A man hols a surfboard as he walks a beach alone.
A person that is wearing headphones and glasses.
A lot of ties are being hanged on the rack.
An older large green and yellow trash truck driving down a busy street.
A pack of elephants are walking through the terrain.
A herd of cattle in a field covered with snow
A propeller plane that is flying in the sky.
A man standing next to a motorcycle on a street.
blue car wrecked against bus trying to before them
A white cat holding  a wooden baseball bat.
Colorful toys in front of a cell phone rested on its side.
A hot dog on a bun with mustard
Two giraffes standing by a tree with a forest in the background.
A red double decker bus is parked on the street.
A red, blue and silver motorcycle parked on the street.
A man in a tie getting up from a meeting desk.
A view of a toilet from the adjoining room.
On a beach, there is a clock in the middle of the sand.
A surfer is riding a medium sized wave.
A pizza has red and green peppers embedded in the cheese.
A zebra at a zoo stands alone looking at the ground.
A little girl in a red dress with a red flower in her hair standing at a sink.
A wood deck table has a glass of ice tea and a plate with BLT on a sesame sub roll and green salad on it.
A plane is flying through a cloudy sky
a herd of zebras walk in a caged area
A man fixing a street sign on a raised up ladder.
A person water skiing falls in a lake.
A large group of doughnuts sitting on the table.
A skier stands outside in the snow on their skis.
A tabby cat sitting under the back of an old blue car
Several toilets some without lids are sitting on the ground outside.
An adult and young zebra standing in a field of green grass.
The little girl whose name is Violet, is fast asleep in her bed
A white and black cat sniffing a banana on couch.
A man is holding a banana in front of his face.
a green field that has a man with a kite
Two dogs running and playing in the sun.
A large metallic refrigerator freezer combination in a kitchen.
Brown dog sleeping on a bed in a bedroom.
A sitting room with three chairs a settee a sofa and a fire place.
A person on a motor cycle in the street with blurry buildings behind them.
Closeup of a corner of a metal tray containing three hotdogs.
The large grey sofas have throw pillows on them.
The picture shows a snow skier skiing down the hill.
The large cow is wearing a blue tag around it's neck.
Group of people standing outside a farm holding vegetables.
A man riding a bike past another man without a shirt.
A man in black jacket riding on a motorcycle.
A zebra wagging its tail as it eats some grass on the ground.
Man wearing glasses brushing his teeth in bathroom.
Two brown horses tied up at a post.
A child on a surfboard floating in the ocean.
The glow from the lights are super blurry.
A tall man eating and drinking next to a lady
A surfer is riding a wave in the ocean.
A plate with chicken,carrots and mashed potatoes with silverware.
Two men cooking food outside with jars of food behind them.
A child tries to catch a frisbee in a park on green grass.
A sandwich is cut into triangles and served with a salad on the side.
A man who is going down a hill on snow skis.
A time-lapse photo of a guy doing a skateboarding trick, jumping over a curb.
A baby elephant standing under an adult elephant.
Lady posing with two horses standing on a street.
The fire engine is ready for any emergency.
A group of people riding on the backs of elephants in a river.
A man on a snowboard in the snow.
A red stop sign with two green street signs posted above it.
He needs to rethink his choice of shoes for riding a motorcycle.
A horse is standing by a wire fence.
A bottle of water sits on a table next to fruit.
A man playing tennis as people sit and watch from the stands.
A selection of wooden kitchen tools on a counter.
A stop sign on an empty, foggy street.
A black cat relaxing in a cat bed on the floor
Some animals are walking on the street and next to the car.
This is an image of the inside of a home with lots of pictures on the walls.
Two stop lights mounted on the same pole
A man and woman stand with bikes in front of a field.
A kitchen has a refrigerator and ice chest.
The view of the headlights, handlebars and mirror of a motorcycle
A man is riding waves with his surfboard.
A group of young students eat lunch in the classroom.
a very large teddy bear that is sitting on a chair
An old man sitting next to a graffiti covered wall while holding a music keyboard.
a pizza covered with assorted peppers on it
A bus at a bus stop sports a bicycle rack.
Two small black bears walking through a grassy area.
A man rides his motorcycle through the water on the beach.
A fire extended hose for fire hydrant in rural area
Two children playing with a toy in a park.
Two photos side by side of fruit in a basket, vegetables and basil.
A woman rushes with a handbag through an empty train station with a large clock.
A man walking with a skateboard towards a concrete ramp.
A large tub is in a beige tiled room that has two windows and one window is white while the other is brown.
A kitchen with a magnet-covered refrigerator and a pile of junk nearby.
a clock attached to a tree in front of some buildings
An empty room with a light is currently on.
A girl wearing a pink cap riding her bicycle.
A large white polar bear walking near a building
A person doing a trick on a snowboard off a hill.
A man flying a rainbow kite in a clear blue sky
A lone shorebird standing on the beach as a wave rolls in.
A man on a cell phone taking a picture of himself.
Four jets in the sky at an air show.
A man in a suit stands at the podium and speaks.
A man walks on a snowy trail in skis.
A herd of deer and a single zebra in a field.
A dark brown giraffe leaning over the short fence of an enclosure
a old black and white photo of a construction truck
A black streamlined train pulling into the station.
Four motorcycles are parked by the side of the road.
The man holds the umbrella for the woman as they walk through the wilderness.
A train drives passed a station as another pulls up to the platform.
a couple of different pizzas on a counter top
A man on a bike in the reflection of a car mirror
A man on a tennis court is playing tennis in front of a crowd.
Surfers bring their boards to the water on a crowded beach.
A bus driving down the road near a church and traffic light.
A PICTURE OF WAFFLES BACON EGGS, AND JUICE
a batter, catcher, and umpire on a  field during a game
a close up of a person holding a call phone
A fridge that is halfway open during the night.
A clock sitting in the middle of the city, in front of a building.
a couple of zebras stand next to some horses
A group of people looking at an elephant.
a couple of indian men riding down a road on elephants
there are two airplanes  that look old hanging and one looks spaceship like
A group of people cross country skiing in forest.
A picture of someones meal being served on a plate.
A herd of elephants walking along a lush green field.
Man holding a surfboard by the beach in his hands.
A young man catches a wave on a surfboard.
A wooden table with an empty pizza box and napkin.
A man in a green hoodie preparing to snowboard.
A woman wearing medieval clothing with a cell phone attached to her belt.
two women and a man holding a big white surfboard
A man that is sitting down holding a sandwich.
A group of friends posing for a picture together next to a pizza.
An orange truck parked next to a pink truck in a  forest.
A guy on a white and orange surfboard catching a wave.
A red plate topped with a cut in half pizza with an egg on it.
A giraffe inside an enclosure with families watching in the background.
A woman is dressed as a man and a man is dressed as a woman.
there is a man that is taking a picture of another man
Three zebras running along a path in a field.
Pair of zebra standing in open area of grass and trees.
a small child sitting on a women's lap at a dinner table.
A living room features a white couch and black loveseat.
a monorail going down the track as a bus parks by the side of a road
A white fire place sitting below a giant clock.
three cooked dishes positioned on a wooden platter
a plane flying by a red sky during the sunset
Smart phone sitting in a red case being hand held by someone.
A lady in a red shirt shows a man how to use a video game controller.
A fancy clock face is flanked by two angel statues.
A cookie is sitting on a plate next to a cup of coffee.
A woman sitting down holding onto a fork.
A man skiing alone in a snow-capped bush
A woman wearing skiis while riding a conveyor belt outside in the snow.
A GIRAFFE STANDING SURROUNDED BY TREES LOOKING TOWARDS CAMERA.
a lone zebra stands just before a small body of water and looks down
A couple of sheep standing on top of a lush green field.
A sign on the side of a building for the business of Tomasino's Cellar Ristorante.
A garbage truck travels under a stop light.
A shower with a curtain stands next to a toilet with the lid open.
A cat is rolled on its side while napping.
This is an image of a patrol boat in the ocean.
a man on a surf board rides on a big wave
An anime action figure doll on a computer
A person in a baseball uniform holding a baseball bat.
A group of men standing next to each other.
The sign in front of a French bar which indicates the location of the bar.
A person is riding waves on a canal.
A man in a green shirt is wearing a Christmas tie.
Two plates with sandwiches on them next to a bowl of vegetables.
A white horse leaned over eating something in a corral.
many difference stuffed animals on a shelf on a wall
Baseball player standing near home plate in stadium.
A stop sign over a pedestrian crossing sign.
Two horses that are standing in the water.
Two eldery people are wnjoying the view of a lake in this park
Two people in the living area of an RV.
A very large bathroom has a two toilets and two sinks and a very large glass bath tub sitting next to a glass shower.
A photo of a man standing with a ram.
The young men are playing a baseball game.
A giraffe with its head cocked walking about a sandy area.
Two giraffes are standing near each other in a field.
some people are sitting in front of desks
A man skateboards in a parking lot while his buddies watch.
A table set with plates and a cat.
The fragment of the burned plane rests on the ground.
A large bird in the air over a heavily forested area.
A man is surfing on his board in the ocean.
A family watches television in a small living room.
a man holds a glowing item while in the dark
This person is preparing a  meal in the kitchen.
Broccoli, carrots and a small amount of potatoes on a plate.
A mix of beef and broccoli stew on a white plate.
A flat bread pizza topped with green peppers, onions, and tomatoes.
A large clock is posted above a turquoise rail.
Room with a bed and a chandelier and double doors.
A red fire hydrant sitting in the grass near water.
A stuffed animal sitting in a Christmas tree.
An unmade bed and a turned on lamp.
An old man wearing a hat with a snake around it and a cellphone clipped to it.
Two children with tennis rackets hold their hands up.
A black dog standing on its legs and holding Frisbee in its mouth
A man standing in front of a microphone.
Woman sitting on floor next to commode with glass bottle on floor.
A heavyset adult is outdoors and is wearing sunglasses.
A baseball player preparing to hit the ball thrown by the pitcher.
Items of fruit and flowers on a wooden surface.
A bed topped with two red pillows and a head board.
The cat is standing on top of the microwave that is on top of the refrigerator.
A woman puts her head in an oven.
Sheep gather in a grassy field in front of a lighthouse.
A dog sitting on a couch under a blanket.
Several species of animals grazing in grassy area.
A young person wearing a jacket travels swiftly on a skateboard.
A dog in a bathroom tears up a roll of toilet paper.
The women was playing tennis on the court.
A guy sitting at a desk with a nice monitor by a window.
Two dogs are sitting a neatly made colorful bed.
a plane on the air flying very high
An old-fashioned safe and roll top desk in a green room
A surfer is riding on a wave in the sunshine.
Toothpaste,toothbrush,mouth rinse,tongue cleaner and other mouth cleaning things are kept.
People talking in a kitchen with a mixer on top of refrigerator.
Animals eating at the side of the road near mountains.
A living room with a couch, television, and a colorful rug.
A herd of elephants walking down a dirt road.
A stop sign in the desert near an empty road
A girls' soccer team poses with their coach for a team photo.
A blue plate topped with bread and a salad.
A hipster couple is giddy at a wine tasting.
A cabin in snow with people around it.
Many people are outside celebrating on a sunny day.
Two cats that are sitting in the bathtub.
A book with a train on the cover near a keyboard.
a person is sitting on a park bench outside
Shelves in a dorm room, with knickknacks such as a photograph, a lamp, and a lucky cat figurine.
A sculpture made up of several traffic lights.
The small kitchen has a black counter and wooden cabinets.
Some type of wooden shower in a bathroom.
A man stands by as a girl feeds an elephant
A laptop sitting on a small black desk.
the fully furnished basement looks clean and orderly
A doll with large eyes and blonde hair holds a teddy bear.
A kitchen that has a tea pot on the stove.
A small house with a large tower and a walkway leading up to it's door.
TWO GIRAFFES GRAZING IN THE TREES DURING THE DAY
nine blueberry muffins in a muffing tin
landscape of water with mountains on the horizon and a cloud filled sky
A baby laying on its belly in front of a laptop.
a goose is standing by a body of water
there are three people sitting at a table holding up pizzas
A family gathered around an outdoor table with drinks and menus.
A dog catching a frisbee midair as his trainer prepares to toss another.
A young person on skis lies in the snow
A pair of scissors on top of a piece of paper on top of a rock.
Jet plane flying high in sky on partly cloudy day.
A person flying a kite near a basketball hoop
A pink and white laptop and three computer monitors on a desk.
a white bowl and a blue strainer and some bottles
A small hotel bathroom has been well stocked
A city bus parked by the side of the street.
The parking meter is empty by the building.
A man flying through the air on a  skateboard.
A child in a colorful airplane tie standing against a wall.
A pitcher on the pitching mound in a "after pitching" position.
A young girl blowing out candles on a cake.
A person standing on a surfboard in the water.
A man swinging a tennis racquet on a court.
A young man wearing a dress shirt and a tie.
A woman carrying a surfboard on top of a snow covered ground.
A train pulls up to a platform with a line.
Assorted flavored donuts being grabbed by multiple hands.
A man in a safety suit walking along the edge of a dog where a cruise ship is docked.
two benches placed on a snow covered land
Some people at a table with some nice desserts.
Young boy with stuffed toys lying on bed.
An airplane is in the shallow blue water.
Guy and his small dog out in a motor boat amongst bigger boats
The moon overlooking the boats in the harbor.
a dinner plate with steak, vegetables, and a baked potato
There is an old fashioned blue refrigerator and ice chest in a kitchen.
a desk some books a speaker and a video game system
A hotel room with a bed, desk and chair.
Some people with rackets on a tennis court.
A small animal, maybe a baby sheep, is outside.
A bunch of fresh produce sitting on a paper towel.
Decorated living area with desk and cabinets with television.
Skateboarder grinds along planter in an outdoor plaza.
A group of snowboarders glide on the snow as a large snowy mountain stands in the background.
A woman walking her bike on a busy sidewalk.
A red traffic light sits on the street.
A man is riding an elephant that appears to be playing basketball.
A large grassy field filled with grazing cows
A bus with three people getting out of it.
A bird that is perched on some vines.
A young male baseball player is about to swing for the ball.
a man in a uniform standing on a pitchers mound
The plate is full of pizza with chicken and vegetables on it.
A group of teddy bears with princess crowns on.
A young man and women in a very short skirt and heels.
Four men standing next to a small airplane.
A plate topped with a donut next to a cup of coffee.
A refrigerator that still has its sale tags on it.
A couple of men riding on the back of an elephant.
An elephant standing alone in a wooded area
A red stop sign near two large buildings.
Multi-colored patterned pillows on top of a white in an empty bedroom.
A baseball player is getting ready to hit a ball.
A stop sign below a lamp post at night.
A male officer and another man looks at laptops
There is a full view of an outdoor area and it is nice.
A man on a tennis court with a racket in his hand.
A tall multi story building painted with colorful designs.
A plate with a very big and tasty looking sandwich.
A moving truck filled with furniture parked on the side of a road.
A plate with a sandwich on it and several pieces of silverware on the table.
A kid standing in the batters box, preparing to bat.
a bedroom with a circle purple bed with a view of a tv
A very odd shaped but pretty style clock.
Three elephants standing on a stool with woman sitting on their necks.
Man and woman standing under a red umbrella.
A stop sign obscured by the brightness of the sun.
A young zebra sucking its mother in the wild
The stop light has various blue directional signs,
a tennis player wearing a red shirt  is playing tennis
A man standing on the side of a court holding a microphone.
an image of a man going on the ocean waves
A statute built into the side of a building.
Pink flowers sitting in a flower pot full of water.
A furnished doll house with stairs to a second floor.
a small bird stares out of a window looking at the outside
A man in a reflective vest walks toward a parked airplane.
A cage filled with candles sitting on a table next to a vase and another candle.
A yellow banana sitting on top of a table.
A young zebra is between two larger zebras.
A beagle pads away from the camera across a reflective surface.
Man walking on a sidewalk that is sloping downhill approaching the corner.
Desktop computer setup with ergonomic keyboard and headphones.
A down hill skier racing down the slopes in a blue ski suit.
A little boy playing, eating and shopping while in a shopping cart.
a man holding a white umbrella in a wooded area.
A young man stands on a skateboard on a sidewalk.
A couple of giraffe sitting on top of a lush green field.
A street sign and some cars next to a building.
a lady taking a picture of a red bus
Several remote controls lines up next to each other.
a woman reading a book with another woman standing right behind her with an umbrella
A hot dog wrapped in tin foil covered in ketchup  relish.
Close up back and back of head of a cat in dark with two rectangles of light on ground in front.
A pile of vintage suit cases in the middle of a building.
A sandwich and a pickle with a bowl of food on a plate.
Young man with crew cut and dark denim shirt taking selfie in bathroom mirror.
A person with some skis posing in the snow.
A room with furniture, wood accents, and a fireplace
a blue tank of compressed gas near a house
A man sitting on a train next to a woman.
a girl with a game controller with a boy standing next o her
A motorcycle is parked in front of two people.
A person on a snowboard rides down the snow.
A woman crossing in front of a double decker bus.
A pot full of vegetables is sitting on a table.
A living room filled with blue and white checkered couches.
A woman sitting on the floor with a teddy bear
Two men cooking and packaging food in a kitchen.
A man walks next to a couple of horses loaded with supplies.
A cup of coffee is sitting next to a laptop
A few kids playing in the yard with a frisbee
A large passenger jet with it's landing gear down.
A dog wearing a collar standing next to the water.
THERE IS A MAN THAT IS JUMPING A RAMP WTH HIS SKATE BOARD
The view of an elephant's head through a display window.
A dog leads the way for two crosscountry skiers
Two hot dogs in wrappers on a table.
Here is an Asian standing by a yellow fire hydrant.
Two zebra standing next to each other in front of a cart full of dry hay.
The woman serves the tennis ball as a child watches.
A skateboard enthusiast doing a jump on a skateboard on concrete near a small tan brick building with tinted windows.
The four skiers chose to wear bright colors, standing out from the snow covered white mountain.
A city bus thats turning a corner with another at the intersection.
A couple of elephants walking down a dirt road.
A variety of fruits and vegetable on a plate.
The man is having to work outside in the rain.
An elephant walking draped with a colorful blanket.
Small silver cellphone sitting on top of a wooden table.
an ocean a white fence and a black thing on some rocks
Young boys and their coach playing baseball in the sun
A shop window with people outside on the street reflected on the suface.
a toothbrush holder is sitting on top of a bathroom sink
A blue and white KLM Asia plane being serviced at an airport.
An old ad is showing a retro kitchen.
Spacious kitchen with a center island and stainless steel appliances.
a bath room wit ha sink and a bath tub
A man with glasses and in a suit talking in front of a microphone.
A zebra brazing on green grass next to a pile of rocks.
Sheep and a woman in a field in front of a cityscape.
A raw cut of meat still on the bone being seasoned.
A man wearing glasses standing next to an airplane.
FOLDED ROBE TIED UP LIKE A PRESENT IN A HOTEL ROOM
a vintage photo of a man washing a lamb
A horse running by itself through a flat area of land.
A man wearing a stripe shirt and a yellow neck tie.
A woman taking a picture in a garden by a polka dot umbrella.
A giraffe that is standing near rocks while an ostrich stands behind it.
An open refrigerator with various fruits and condiments in it.
Assortment of baked pastry items displayed in case.
A woman standing at a table filled with red lobsters.
Long old train barreling through the mountainous countryside.
A closeup action shot of a person surfing.
A hand reaching out towards a standing giraffe
a table with some dishes with food on it
a tv near a closet and a book shelf
A group of teddy bears in glass cases.
Several people that are drinking beer together and talking.
A large group of cows on a field.
Green wooden shelves holding blackened bunches of bananas.
a number of small boats near a body of water
A bathroom sink next to a white toilet under a mirror
a black and white photo of a person with a cell phone
Carrots are being cut into pieces with a large knife.
A woman laying in bed with a powder puff girl pillow.
a small child dressed in adult clothing by a stair case
A baseball player taking a swing at a ball
Two men in suits shake hands outside of an airplane while others look on.
three young cows in a fenced pasture with a  short black dog following them
A windmill placed near several cows in a grassy field.
a shirtless man is skateboarding in a pool
a group of kids playing frisbee chasing it
A man riding a sled down a snow covered hillside.
Gray cat laying with head on laptop on top of couch.
An active computer monitor that is sitting on a desk.
A bathroom with a toilet, sink, tub and shower curtain.
A person standing outside on the beach looking at a Frisbee.
A man riding a blue two seat motorcycle wearing a helmet.
Two sheep are standing in a field next to a wall.
A kite surfer rides the waves of the ocean.
A black and white cat laying on top of a keyboard.
The man in a business suit has a bag on his shoulder.
A group of men with volleyball's in pink uniforms.
A female jockey riding a horse spectators in the background.
A baseball field showing the catcher, umpire and a person up batting.
some food is laying out on some dishes
Two cows that are standing in the grass.
a large air plane flying in a sky
A red plastic basket with two hot dogs on it.
A bicycle leaning against a pole outside of a coffee shop.
The unmade bed has three pillows on it.
A man with a hand bag standing in a room.
A computer, keyboard and framed photo on a wooden desk.
A beer mug that contains water and flowers.
A cow standing next to a brick building.
A variety of food is displayed on a table.
A baseball player extends his swing to hit a pitch.
A closeup shot of the insides of a squash.
A person holds an apple slice with peanut butter on it.
The people has there umberellas up for the rain
two ladies in a kitchen preparing some food
Two plates of food with vegetables and bread.
A man preparing to ski off a steep slope.
A variety of food items are displayed in dishes.
A yellow street sign warns of a hump in the road.
A kitchen and dining room table and chairs sitting next to a living room with a chair and couch in it.
A group of boats in a body water on a clear sky day.
A bunch of craft supplies and a pair of glasses.
A group of people that are standing with umbrellas.
A pizza with toppings and a missing slice.
tow pieces of a desert on a plate on a table
a couple of people stand on some dry leaves
a couple of people that are laying on a couch
A silver commuter train at a train station next to luggage carts.
A guy smiling while standing under a run for rights banner.
Three white flowers in a vase with flower images on it.
a field that has a bunch of cars in it
A baseball player getting ready to catch a ball with his glove
A cat sits between a window and a large birdcage.
People cross the street in a busy downtown city area
The Time clock is in the center of town.
Two paper plates sitting on top of a table covered in pizza.
a living room with several chairs and a small table
Two women are sitting on a bench reading a magazine next to a bike rack.
A group of men sitting around a living room in front of a tv
An Air France passenger jet is parked on a tarmac.
a person sitting at a table with a laptop
A lone giraffe standing next to a river.
A happy little boy with a banana in front of his face.
a man sitting in a chair with a cup  in his hand
A street scene with a horse and carriage and buildings in the background.
A baseball player swinging his bat in front of a crowd.
Some boats in the water outside of some industrial buildings.
A woman walks down the street alone late at night.
A man standing near a van advertising a movie.
He does have control of the motorcycle while pulling a wheelie.
A clock in front of a window on a winter day.
A man jumping off of a red skateboard.
A pair of youths pause for a photo on a ski slope.
A bunch of plates that are laying on a plate.
The Big Ben clock tower towering over the city of London.
Blue-and-white jet airplane sitting at an airport runway.
A large white sink sitting under a bathroom mirror.
A cooked pizza that has been placed on a table.
An old photo of a man on a motorcycle and cars in the background.
A teddy bear sits next to a mossy tree behind some green leaves.
A young woman in a bikini surfs a small wave.
Two men smiling in a grainy photo while holding a banana.
a black orange and yellow train on its track and some trees
a couple of bears are standing in a field
A white and green fire hydrant sitting next to a light.
A baseball player prepares to swing as a pitcher throws the ball.
Trains parked on rail road tracks next to a tractor.
An image with multiple photos combined in it.
A banana with a frownie face drawn on it is by a computer.
A toddler sliding down a snowy slope on skis.
Two people are by a railing feeding a giraffe.
A train inside a building going down the train track
a woman is standing outside talking on a phone
a group of people standing around in the park
A crowd of people standing below the Eiffel tower.
A trio of little kids in front of a birthday cake
a cat laying in some blankets on top of a bed
a man sits on the ground with a guitar
Woman holding a small baby in front of her computer.
Some art work with a man with a hat on and some fruit in a bowl.
A triple decker sandwich is cut into quarters.
A piece of pizza sits on a white plate that has gold accents.
Two children interact with a television video game, while a third person looks away.
two beds are shown as the light creeps in.
A group of baseball players standing on top of a green field.
A couple of boats floating on top of a river.
A group of people riding skis across snow covered ground.
A white toilet sitting up against a brick wall.
A ceremony for military men from US and China
Two women sit together as one of them dries her hair.
A wooden bench leaning against a blue wooden wall.
a plate holding a big pizza in the middle of the table
three buses are parked at the buss station
An elephant statue standing on top of a lush green park.
a giraffe rinsk soem wate rin a nice pond
A man in a harness holding a waterboard.
A couple of men lying on some couches with covers on.
A man jumping for joy in a field of kites
a fork thrust into what looks like a pan filled with potato chips
Some cows and horses are outside grazing together.
A fully stocked bathroom with a vanity mirror.
A man balances on one end of a skateboard.
Group of people with wine glasses standing near table.
a couple of kids stand with a toy
A girl places a white teddy bear in a container
An elephant tied up in a city park.
A circus elephant using it's trunk to hold another elephant's tail.
a man riding a wave with a surfboard
A father holding his little child upside down.
A woman and two men posing for a picture.
A baby elephant walking with two adult elephants.
A cat sticking its head out of a piece of luggage on the floor.
A table topped with a bird and plates of food.
A woman in yellow shirt and skirt with cats in grass.
many giraffes standing together as a group eat from a basket
A woman walking her cattle down the road.
An older man is flying a kite with a small child.
a man with a hat standing on a snow board
A guy is returning a tennis ball that was hit to him.
A person dressed in black doing skateboard stunts on a skateboard ramp.
A white and red helicopter above a grassy field.
Spectators enjoying a tennis game at the US Open.
People shifting the concrete being poured in the forms.
There is a display of trophies on the table.
A cute cat sitting on top of a couch cushion.
A man is kneeling in front of a large elephant.
Two giraffes stand in the grass by trees.
A bus that is on the side of the road.
A man stands in the living room and plays Wii.
A laptop with a green apple taped to its back.
Two laptop computers sitting on top of a desk.
This three people pose for a goofy photo
a room that has some furniture and a table in it
The two cows are fenced in the field.
A tiny banana with a woman peeling another in the background.
Two bulls who are walking on a street.
The great wilderness with a white lonely horse grazing.
A professional baseball player takes a swing in front of fans in a crowded stadium.
two people in a body of water with a wake board
THERE IS A VAN THAT IS DRIVING DOWN THE STREET
A cay laying on top of a blue couch arm next to a wall.
A clock is displaying the time on a tower.
Two bees on an apple hanging from a tree.
A young lady sitting on a couch in front of a laptop computer.
A glass shower door in a small bathroom.
One giraffe standing and another giraffe sitting in the grass.
The bathroom of this house is spotless.
Children dressed in snow suits standing in a crowded resort.
A red and blue small train is on the tracks.
a lady happy she got her tooth brush out of the holder
a man on a surf board riding a small wave
Trio of elephants walking past a large log
A man sanding next to an orange frisbee.
A man riding a skateboard on top of a road.
A plate of food in a dim restaurant, ready to eat.
A black and white cat sits on a red cloth that is over a television set.
a man is jumping in the air with a disk
A toddler wearing a ski outfit and a pair of skis in the snow.
The tennis player in the pink sport dress is holding a tennis racket and ball.
Two monitors with art from Akon albums on them.
Tea, a tea cup, a teddy bear, and a tea brewer sit on a countertop.
A couple of people in the snow on skis.
A picture of different types of herbs and vegetables available from the CSA.
Four people on a sailboat one is on the phone and three are sunbathing.
A little boy rolls in a wheelchair pulling a suitcase.
A bike attached to a car bumper with people with luggage in the background.
A group of people stand outside, exchanging items.
a bird standing on a plate of partially eaten food
Baseball game with batter and referee on field with crowd
A zebra stands with its head down in its enclosure.
A group of people on bicycles in middle of street next to trees.
an airplane with people standing under the wing
A table topped with lots of fruit and vegetables.
A row of table and chairs along side a street.
A plane taking off in the air, on a clear day.
An umbrella on its top laying on the ground in the sun
A group of boats are enjoying riding on the sea.
A woman is wearing a jacket and a tie.
A bed in a corner of a room next to two window's.
There is a horse standing by some grass.
A baseball player getting ready to swing at the next pitch.
A herd of giraffe walk through the tall grass on the plains.
A fighter jet flying over two parked vehicles.
A group of elephants moving in the middle of a river.
The man is drinking a glass of wine in his kitchen.
A pair of elephants standing in their natural habitat.
A dog that is laying down on a table.
Snowboarders walking through the snow carrying their boards
Many people in business attire are sitting around tables.
A boy stands among a row of red mopeds.
A woman with a nose piercing is holding and looking at her cell phone.
A group of motorcycles on a street next to grassy area.
A photo in an airport showing a backpack and a cell phone.
A man in a tie and backpack is drinking a beer.
Three zebras are standing near a gate in a wall.
Black and white photo of an old car on its side.
A airplane parked out on a runway by itself.
A man riding around on a scooter with luggage on his lap.
A woman brushing a girls hair on a couch.
A old photo of how things were a long time ago.
A close shot of a cat staring at the camera.
A woman is sitting in a garden tub while brushing her teeth for a window view.
a person leaning on a bank holding a remote in his hand
A boy is standing out by the water
A person that is in the snow doing a trick.
Two boys sitting,younger one is trying to read something.
A one propeller airplane is in an airplane hanger.
A man in a car wearing glasses and a shirt and tie.
two people playing with a frisbee on a foot ball field
A living area with a christmas tree in it
A zebra standing around in the middle of a field.
Cat lying on top of a shelf with its front leg hanging down.
A pile of paper towels is on the floor next to a toilet.
A man that is on a pair of ski's in the snow.
A man cutting up scallions at an outdoor table
A bird as it flies lonely through the sky
A dark skinned child getting ready to be pushed on a swing.
A hotdog is placed on a table next to some french fries.
A blue double decker bus that says Garage on it.
An red fire hydrant beside a grey fence.
The little girl is sitting in front of the computer.
Four chairs sit around a dining table with papers and shoes on it.
A couple of cows with wreath decorations on their heads.
A blue train stopped outside of a train station.
A brown horse grazing on grass in a field.
Two men and two women, all wearing flowers, are posing for a picture in formal wear.
A view of a bathroom sink and porcelain tub.
A Starbucks teddy bear sitting in a Starbucks.
A cat laying its head against a teddy bear.
A big dog is resting halfway out of the window.
A baseball player throwing a baseball bat from home plate.
A kitchen with white cabinets, black counter tops and a white breakfast bar.
a little table covered with paperwork, books and a laptop
A woman holding a yellow umbrella standing near window.
A cheese pizza sitting on a white tray on a table.
A group of people with wine glasses stand together.
A salad with broccoli, cheese and radishes is in a bowl.
a number of people sitting at a table with a cake
Man in a black plaid shirt eating food while standing up.
A pink flower sticks out of a narrow white vase.
A couple of men working on a boat that's docked at a pier.
A group of children are wearing school uniforms.
A red and white air plane is parked on the run way.
Grey dog laying down in black and white sheets.
A kid in a baseball uniform holding a baseball bat.
A commercial airplane is flying low to the ground.
a woman holding a wil controller with a steering wheel
A busy street with many people standing around and lights on.
A baseball game is in action as the catcher leans for the ball.
A herd of sheep grazing in an open pasture.
a bowl with an apple and some bananas and some books
Two men in suits and ties shaking hands.
A group of people sitting at a table with stacks of books
a bunch of vegetables and fruits sit on a chopping board
A disembodied hand holds up a cellphone to take a picture of something on stage.
Young boy on blue skateboard in parking lot.
Three cows that are standing in the grass.
A flip phone open to a test message
Working man sharpening scissors with electric circular sharpener.
A couple equipped with umbrella hats taking a break from walking their dog on a bridge on a rainy day.
THERE IS A STUFF ANIMAL WITH ONE PURPLE CLOSE WINKED EYE
A man standing on a tennis court holding a tennis racquet.
a hand is holding a single banana to eat
Pretty blue flowers sit in a vase in the sunshine.
A man's handicap restroom located in an establishment.
Various buisness signs and an ornate lamp post in the city.
A cow that is laying down on the street.
A guy on a snow board does tricks in the snow
A plate that has various types of donuts on it.
Surfer riding a large white top wave on the ocean.
Two people standing next to a statue that is an invisible man.
A person walking across a snow covered ski slope.
A baby sitting at a high chair in front of a table filled with food.
Three hungry boys pose with a loaf of bread.
A group of people walking down a street next to buildings.
Three white castle hamburgers sitting in a white castle food bag.
a couple of people standing on a beach next to surfboards
A large inflatable soccer ball with spikes floats up from a field.
A male getting ready to throw a pitch at a baseball game.
The yard is full of stuff such as a truck and a tug boat.
A wall that has a large number of clocks on it.
A man turns to smile for a photo while talking on the phone
A close up image of a type of salad.
two kittens sitting on a woman in a chair
A woman is flying a kite in a city park.
several sheep watching two sheep standing by a drinking tub.
A young girl holds up a pink umbrella.
A man in a baseball uniform standing with a bat.
A bench next to a tree in a park.
a bathroom with a corner toilet and a sink
A boy with a cast is kneeling by a skateboard.
A cat sits on a desk, on top of papers and in front of a computer.
Woman playing in a tennis match in a tennis court.
the fire hydrant has on the side of the road
An "on-deck" batter watching the baseball game from the on-deck circle
A man standing next to a brown piece of luggage on a floor.
a railroad bridge with an old  train crossing it
a group of animals graze on some grass
a number of baseball players on a field
A selfie of a woman taken looking into a car mirror.
A heron is standing on the edge of a body of water.
A parking meter sits by a brick wall.
a man getting ready to grab a frisbee as others watch
A cat sitting on top of a car outside during the day.
A little girl and goat standing in the rain while the girl holds an umbrella
The boy wearing green is playing tennis on a green court.
a fire hydrant in the middle of a large paved area
Some people walking on the top of a snow covered hill.
Two children struggle over a bat in their playroom.
There is a man with glasses that is letting a spider crawl on his arm
A dirty wok on top of a stove beside a dirty tea kettle.
Two people that are laughing and holding a kite.
A horse is trotting past a man on that walks behind him in the pasture.
A man wearing a brown hat and a uniform shirt is holding a cockatoo upside down.
A desk that has a drink in the middle of it.
A man is skiing down hill using both ski poles and the snow looks powdery.
A broken tv next to a brick on the street
a girl is standing on her bathroom sink
A wooden cutting board with a knife, plate and several different vegetables.
The huge truck is carrying a construction tractor on it's bed.
A couple of people sitting on a wooden bench.
Toiled in a dirty bathroom with a concrete sink and tiled walls.
Houses of parliament on the edge of the River Thames.
A motorcycle is parked on the side of the road.
An empty city bus travels down a city street.
A desk that has a laptop computer on it.
some people a bus and cars a street lights and buildings
A man is skiing down the hill next to a sign
a person is riding a motorcycle by a grassy hill
People at an outdoor table eating pizza while surrounded by a crowd.
A pelican strolls in the shallow water at the shore.
A dog is sleeping on the step by a blue door.
A man sitting on a bench next to a dog.
An elephant stands in a grassy area with words written on his body.
A bicycle parked next to a wooded area, with a large brown bird perched on the bike seat.
an old photo of three people holding skis on a snow background
An old dirty toilet and a sink in a bathroom
A zebra standing on top of a lush green field.
A ski instructor teaching a class of children.
A herd of sheep standing below very tall buildings.
A baseball player who is sliding into a base.
A little girl standing on the grassy area of a beach.
a person sitting on a bench near other benches
A purple, red, and orange  commercial airplane on a runway.
A chicken or tuna club sandwich made with homemade bread.
A sign on the side of a snowy road stating avalanche zone.
A black and white coin meter on the side of a road.
A motorcycle with a suitcase tied to the back of it
A cat in a bed hiding under the cover.
Three men wearing red standing on top of a ski slope.
A sink and a dining table in a kitchen.
a plate with a bunch of meat and vegetables on it
A triptych depicting skateboarders who are mid air.
A couple of horses standing next to each other.
A VERY TALL GIRAFFE AND A COUPLE OF PEOPLE NEAR IT
Closed toilet, sink, and mirror in a modern bathroom.
An assortment of pens and pencils is spread before a keyboard.
A white sign that reads no turns hanging from a traffic light.
The small bathroom has brown tile on the shower walls and floor.
a person riding a skate board at a skate park
Bus backing up and being loaded onto a truck outside
a conveyor belt holding some donuts after being deep fried
A man is waiting for the wave he wants to ride to the shore
A biker standing next to a motorcycle. near a garage.
A group of elephants walking down a street with people on them.
A man in a wetsuit surfing on a clear day
Female tennis player touching the US Open logo banner.
A young man swinging a racquet at a tennis ball.
We see a girl playing a game on her Wii console.
One boat on the beach with the water in the back round.
The kitchen is full of various gourmet ingredients ingredients.
Two young children sit in bed and play on computers.
One man looks at the camera while another looks away
two men in a kitchen making stuffed potatoes
Three sheep next to each other at a farm
There are several pumpkins being used as decorations.
A long nosed train on the tracks near a station.
a tennis player with a racket on a court
A road side with graffiti sprayed on it to alter its message.
A slow children street sign cutout is propped up next to a fire hydrant on the side of a road.
A park bench next to fence and trees by grassy field.
Sheep grazing in a wise open green field with clouds above
A man is standing on a carriage pulled by four ponies.
The tot is making a face to indicate a distatse for certain vegetables.
A desk with two computers on it.
Person in gray hooded jacket attempting t cross busy street.
A person holding a glass of champagne in their hand.
A man on a laptop on a coach in his living room.
A black back pack on the side of a dirt road.
There is a bacon, lettuce and tomato sandwich.
A woman holding a cell phone to her ear.
A few sheep eating and grazing in someone's yard.
That cake as fresh strawberries on the top of it.
pizza a knife and fork a bottle of wine and a glass
A group of young skiers pose in a line on a snowy slope.
A man in a wet suit crouches down as he rides a wave on his surfboard.
A plane flying by a runway on a slightly cloudy day.
A large long train going down a track.
Humans holds dog back in a swimming pool
A man riding a bike next to a bus on a street.
a man holding onto a rail in the middle of an empty parking lot
An empty wooden bench sits near a neatly trimmed lawn.
A young boy on skateboard riding on a ramp.
A man doing a jump on a skateboard
Bottles of Pellegrino are stacked on refrigerated shelves.
A person riding a white board surrounded by a group of people in the ocean.
a close up of street signs with buildings in the background
A person standing on the beach flying a kite.
Pair of electronic parking meters in front of a red truck.
A bathroom done all in tile that is clean.
A jeep that is sitting in a field with a large fire and smoke in the background.
A bus that has bags of luggage on the side of it.
A Volvo bus parked on a road near a hotel.
Pedestrians, a rider on a scooter and several bicyclists cross an intersection at a crosswalk.
Adult women standing at open refrigerator filled with beverages.
a balding man in glasses holding an umbrella and wearing a jacket with a very high collar around his face
A giraffe standing next to a fence near people.
A bunch of people walking down a street with open umbrellas.
A girl on a boogie boards catches a wave in the ocean
An old unlighted sign hangs overhead advertising "Open Kitchen Restaurant"
A man is flying a cat while a cat watches
A little kid skiing down a hill holding ski poles.
A person's feet standing and balancing on a skateboard.
A large tree situated next to a large body of water.
Large group of stop signs in the same area.
this is a group of parasails in the sky
a cow stares as it stands in a muddy area
A Japan Airlines passenger jet climbs skyward with its wheels still down after takeoff.
A woman standing in a kitchen cutting up vegetables.
A bunch of people waiting on the train platform for the train
A pile of luggage, helmet, clothes and mirror.
The people are walking to water to surf the waves.
A horse drawn cart is driving down the road.
Skateboarders at a park skating in an empty pool.
Many cattle are trying to find food on the desert ground.
A street is void of cars at night.
A tall building surrounded by a crowd of people
A train wreck near a river draws a crowd.
A very large room filled with a bunch of diners.
A kid that is swinging a baseball bat at a batting cage.
a woman standing around a bunch of clocks
a table with some plates of food and some glasses and cups
A black and white cat with curious look sitting on a desk.
This is three cows eating hay from their stables.
Soccer paying kicking the ball while others look on.
Some luggage against the wall of a hallway
a person in skies is standing in the snow
An asian woman with black hair and a green headband posing with a tennis racket in front of a man with white hair and a cigarette.
Jet airplane in flight landing gear extended down
A boy doing a skate-board trick on a ramp.
There is a food truck set up under a bridge.
A young child looks at a group of zebras.
A traffic signal at an intersection on a city street.
A man uses his cell phone to take a picture of himself.
A baseball player hits the ball as the crowd watches
An empty park bench in the middle of a tree covered park.
A kitchen shelf holds an assortment of pots, pans, and utensils.
Young man spinning green frisbee on finger along shoreline
A white toilet bowl with an electronic brown seat.
A group of Asian chefs stand by bowls of food.
Some animals are outside in the dirt in the daytime.
A bathroom with bidet, toilet, tub, and a checkerboard floor.
Two people who are standing on a beach.
a bathroom with a strange looking toilet in it
The train car is stopped and it is empty.
The big ben clock tower standing tall in the foreground
Scissors, a marker, and two other items on a table.
A guy playing the drums with a very intense look on his face.
A dog attache by his leash to the side car of a motorcycle parked in a parking lot.
Young girl on surfboard riding small wave in ocean.
A row of parking meters in front of a stop sign.
Two zebras stand in a field with tall grass.
A view of the outside world through a train's window.
A horse and a dog on a grass field.
A piece of cake is on a white plate.
A lone sheep surveys a fern and wild flower covered hillside.
The clock is sitting atop the antique building.
Two old fashioned black and white buses are parked next to each other.
a beach with a lawn chair and umbrella positioned on
A man leans down playing a game of tennis
Young guy playing tennis on a clay court.
A woman is sitting outside on her phone
Man jumping about to serve a tennis ball.
A band wearing costumes standing around talking.
Kids on a bike while a man is drive a horse drawn buggy.
A police motorcycle is parked next to a police car.
A large  two sided clock by a building
a couple of sheep are standing in a clearing
cook prepares dish by putting it into the oven
A skier carves his way down the snowy hill.
A herd of adult and baby black sheep in a fenced field.
A television that is showing a news program on it.
two children playing with a frisbee in a drive way
A view of a giant bridge during the day.
A man riding on the back of a brown horse through a lush green field.
A parking meter with an hour and thirteen minutes left to go.
A clock tower stands over a city landscape.
A small frame building with a large sign.
a group of pizza standing around a table eating pizza
A man going down a slope near a ski lift on his snowboard.
A man at a baseball game is holding his bat on the ground with is head on top it.
A gang of bikers riding down a street.
That baseball player looks like he may have done something good.
A couple cutting their wedding cake at their reception.
A picture of a boat marina full of sail boats.
A MAN IS ON A MOTOR BIKE SMILING THUMBS UP
a person riding skis on a snowy surface
People surfing on a white water river.
a couple of trays that have some food in it
A large tv seems too small for an enormous surrounding cabinet.
A pan of food is in the middle of a table.
A photo of a surfboard with a man in the background
A young boy throws a pitch at a baseball game.
A pizza in an iron pan on top of a table.
Black and white photograph of women walking towards an umbrella.
A family sitting down for a meal and conversation.
People on snow skis are by a wooden building.
A small boy playing tennis while holding a racquet in his hand.
The sun is setting near the clock tower that reads 945.
Police person riding a blue and yellow check motor cycle.
A giraffe is sleeping on bare dirt next to a dead log.
The airplane is about ready to land at the airport.
A large kitchen with a large center island.
Many sheep grazing on grass in a field.
A crowd of people lined up in front of a food truck.
Piles of unripe bananas sitting next to each other sitting on a floor.
The child is sleeping in the bed with his stuffed toy.
A couple of men and a woman sitting next to each other at a table.
Four different plates that have food on a table.
An old phone shows a horse and wagon on a wide street and children are in the forefront.
A black and white cow is standing on the grass.
A white toilet sitting next to a white bathroom sink.
A skateboarder riding down a ramp in black clothing
A woman wearing a mask holding a racquet
A person on a court with a tennis racket.
A boy riding on a skateboard in the street.
a room with a bunch of teddy bears in it
A surfer rides a wave on his board in the ocean.
A train driving past a lush tree filled forest.
a fork with a plate with carrot cake
a couple of kids are sitting on horses
An umbrella is attached to a bicycle frame with leather straps.
A group of people riding bicycles down a street.
A white tusked elephant at his compound at the zoo.
The clock tower appears very tall at this angle.
A desert dish has powdered sugar on it.
Two men that are playing a game of baseball together.
Inside view of terminal building with large sunlit window and a clock.
A sandwich and a cup of drink on a table.
A large jetliner sitting on top of an airport runway.
m mm m m m mmm mm m m  mm mmm m m m
A stainless steel toilet with the sat up
A lot of motorcycles that are in a window.
A pink house with a bunch of bananas outside
A large black cat sits by the front door.
a station wagon covered in foot prints and stuffed animals
a red and white sign and some parking spaces
A wine server holds up a wine bottle display for a man to look at.
A woman with ear protection on swings a bat.
A wooded table filled with apples, oranges, pomegranate, and cherry tomatoes.
A man wearing glasses and a green tie
this is a man riding a board down a rail
A woman and man hold a kite with two children nearby.
A woman is holding up bananas at a market.
A red stop sign sitting above an orange not dumping sign.
Back view of a female tennis player wearing orange shorts.
The backside of a travel bus on the side of the road.
A young boy is surfing in the ocean.
An empty living room  has a cluttered coffee table.
a train passing on the railroad in a grassy hill
A close shot of a mini fridge.
A woman in a black and purple dress poses in front of some tall grass.
A pile of fruit sitting on top of a wooden table.
A black and whit photograph of a boy tying a tie.
pans filled with assorted veggies, fruit and rice
The old style airplane is flying on a cloudy day.
A man reading a book in the park
a white bus is on a city street
A large brown dog laying under an open umbrella.
a lady riding a horse holding the other black horse
A stop sign in front of a large home
A group of elephants walking in a green and rocky area with many trees surrounding them.
A large white dog panting while laying down.
A man in white shorts stands near a large television screen with a remote.
Small multicolored airplane sitting on a landing strip.
An orange train rides through the rural countryside.
The electronic contents of a bag are placed on a bed.
A brown horse wearing a bit standing next to a wooden fence.
A waterway with many people on some small boats.
A man helping a boy on a paddle board in the water.
A man cutting a cake celebrating his 50th birthday.
The desert is on the table ready to be eaten.
a close up of a child and a dog
An elderly man sitting on one of three park benches which are positioned side by side.
a bunch of signs together on a line.
A blue and silver train is pulled up to a platform.
Two zebras are pictured but there is an elephant and other animals in the background.
a person holding an uncooked doughnut near other ones
A person holding a cell phone next to many others.
there is a man taking a picture with his cigarette in his mouth
A town square with several tall clock towers.
food on a plate that matches the countertop
A man riding on top of a surfboard on top of a wave.
A man playing tennis in the middle of a serve.
two catcher and a pitcher stand on the pitchers mound
A baseball player in a blue and white uniform holding a baseball bat.
people standing around a table filled with some plates of food
A high clock tower is brown and has roman numerals.
A woman is standing on a tennis court holding a tennis racket.
A sidewalk with various signage and many cars in the street.
A Polar airliner is parked on the tarmac.
many types of vegetables in the vegetable section of a market
The angle view of tower with a clock.
A group of people riding skis on snow covered ground.
They look like they are beginning a ski race in the snow.
A yellow firehydrant on the sidewalk near a building
A giant desert covered in chocolate sauce next to cups of coffee.
A bear is snuggling with a bear cub.
A beautiful woman laying in bed reading a book.
Traffic has stopped to allow four zebras to cross a highway.
A man holding a tennis racket in his hands while on the tennis court
three skate boarders and one is doing a jump
a tea pot is steaming on the stove top
A white plate of food on a table.
A herd of wild horses grazing on a green grass covered field.
A person manipulating the skateboard with his feet.
A person sitting in an old refrigerator on the sidewalk, drinking beer.
The two signs give directions to upcoming cars.
A bathroom with a toilet and bathtub and handheld shower.
A man who is performing a trick on a skateboard.
A nightlight is on over a kitchen sink.
Will the elderly women finish the wii game?
A plate of vegetables, chicken, and white rice.
Watercraft in a row, floating on the calm ocean.
Several stuffed animals and teddy bears laying on a bed.
A white plate filled with slice oranges next to a pile of bananas.
A zebra standing, its face down, grazing on dried grass.
An elephant strides through brown grass and trees.
A kitchen with hard wood flooring and a stove top oven.
A man surfing down a flowing river rapid.
A banana filled with melting chocolate on a grill
A little girl sitting at a wooden table in front of two bowls of food.
A woman at a desk with a computer monitor and CD case.
a small girl with sunglasses is hitting a tennis ball
A black cat sitting in a bathroom sink.
this is a yellow train riding the rails
The sheep is wearing a bell with a blue cord around its neck.
A group of five zebras stand in a field.
Elephant playing inside mud, with fences surrounding her
a lady on the beach flying a kite
A dog wearing sunglasses sitting in the front seat
A person on snow skis in the snow.
a number of people standing in a kitchen near one another
two apples and one banana lying in the shape of smile on a wooden table.
A group of people ride a double decker bus and hold black umbrellas.
A bathroom with a vanity sink, mirror, toilet, and bathtub.
Two pizzas and three cups of drinks sit atop a table.
there are several woman wearing bikinis and waiting for cake
A desk with several computers and laptops on top.
A full wine glass means the bottle has less in it for later.
A man laying on a couch holding a gaming controller.
some writting on a wall by a window
A woman with a bear in a photo with a sign.
A red buses on a wet paved road by vendors undercover walkway, and one vender on curb with an umbrella over table.
Two giraffes are standing next to each other.
The farmers are working the land with their animals.
A person is chasing sheep through a field.
A dessert with few carrots on a plate near two candles.
A group of bread slices with cheeses on them in a pan.
Many people walking in the streets holding umbrellas.
A bright patio umbrella stands out against the plain white building.
an image of street signs being crossed in air
A man is putting a pizza in a oven
A small dog being carried in a backpack.
A beautiful marina with many boats docked in it.
A plate of food and drink on a table.
A ram standing still in an empty pasture.
A person is hanging in the air near a building.
a public males restroom with two urinals that are based on the floor
A cat sitting in an orange chair in a bedroom
Guys riding motorcycles through the path in the park
A red stop sign with a no left turn sign.
A man flying into the air as he catches a frisbee.
A photograph of the inside of a public men's restroom
This boy is practising in a play ground
A baby zebra getting a drink from mama during the day
A man enjoys a quick ride down a ski slope.
All way stop sign at the intersection of Prairie Street.
two men are riding in a train in hats
A man holding an orange frisbee on top of a green field.
A man in tie standing in front of a table.
Two white bears on the rocky shore of some water.
People are walking along the beach and people are skiing on the water with parachutes.
A bearded man riding a skateboard on pavement.
Two women on snow skis on a hill
A large patio area with many table and chair sets covered by large umbrellas.
People watch as a couple of people are skateboarding on ramps.
A group of people riding a boat on top of water.
A view of some snowy mountains from an airplane.
Two women stand at a store in front of a cooler containing various alcoholic beverages
A little boy that has a spoon with food on it.
A black stuffed cat with fangs is hanging on a rack with others.
Two red trains at a train station, with forest in the background.
A person is sitting in a chair on a sidewalk while a bus drives by.
closeup of a white horse with someone riding it
some food is sitting on a green and white wrapper
A bull and a dog charge across the field.
a ball player running toward home base by a bat
A slice of pizza that is sitting on a table.
A tiled floor bathroom with a red and black shower curtain.
A display case of different types of doughnuts in it.
A close up image of a bag of Broccoli florets.
Two brown sheep huddle near the back of a large plastic cage.
kids playing Frisbee in a park on a bright day
A person riding a bicycle on a street near a building.
many bananas hanging above some people in a shop
A man rides a horse while driving longhorns down the street.
A person flies a large and colorful kite at the beach.
A black and white picture of a large house is shown.
a guy painted yellow with blue overalls holding a banana
a row of boats are lined up in the water
The couple scoots around town on the motorbike.
A white sink sitting next to a bath tub.
A chair is made out of stuffed pandas attached to each other in a clump.
A bunch of people riding in an odd looking vehicle.
Tower clock designed with two western shooters for entertainment display
A couple of women holding game controllers in their hands.
A laptop on a plaid black and white blanket.
A woman sitting on a rock holding an umbrella.
A man riding a skateboard on top of pavement.
A person with a tennis racket on a court.
A bear peaking over a log in front of a rock wall.
Variety of fruits being placed into a blender.
a clock on a walk with a bike parked near by
Cows on display on top of codling with people far down below
A cat is sitting on a bathroom counter.
A person leaving a trail of snow as he glides on his skies.
A horse stands near a fence during winter time.
A half eaten dessert and half empty cup.
A young man standing on a beach holding a bat.
A memorial bench with a can of liquid sitting on top.
A clock tower with a clock against an overcast sky.
A group of people getting onto a bus.
A group of people standing in front of a Inn.
A group of zebras stand in a field.
A parking meter on an empty street at night.
Young boys with alien and spider facepaint tattoos
Two men in a showroom for snow skis.
a bed with a red and white bedspread and pillows
a group of children and adults gathered together on a snow covered bank
Six well-dressed men drinking beer and eating pizza.
This is a sink and mirror of a hotel bathroom.
A room filled with lots of clocks on it's walls.
Two dead birds covered in wires sitting inside a outdoor plant.
A man on a horse near a dog and two cows.
A young girl wearing a baseball cap eating a hot dog.
Bike left outside next to the bench in front of the river
there are many people in this living room playing a video game
A woman standing next to a fire hydrant wearing  a backpack.
A woman passing a bear mask on a market tent.
A task force of drug dogs monitors an airport corridor
a couple of women sit on the ground next to each other
three geese on some grass by a pond
A boy batting during a little league baseball game.
a woman is working with something over a book
A man and woman that are standing next to rocks.
The woman wears a hat and has flowers in the basket.
a girl on a board riding along a boat in the water
A guy is posing for the camera with a medal around his neck.
A view from above two men working in a kitchen cutting fish.
Oven light on in a kitchen with wooden countertops.
A stop sign on a snowy day in the daytime.
A television sits on a dresser by a window.
A young man is taking practice swings on the field.
Close up of a plate of broccoli and stems.
A stop sign centers an upside down street image.
A baseball game is in action as a batter swings.
A woman is holding a sawed off bat while wearing lingerie.
a small bird on a tree brand near fruits and leaves
historical fighter plane on display in an air hanger
A person eating lunch and using a computer in a cafe.
A picture of a subway shuttle bus traveling down a city street.
A very cute small child holding a big umbrella.
an image of two planes that have just landed
there are two stuffed bears sitting on a toy horse
a bathroom toilet with a carpteted seat cover and floor rug.
Three double-decker buses are parked in a lot.
A street sign is near a lamppost and trees.
A laptop that is sitting on a desk.
A truck hauling a large load to a job site on a winding mountain road.
A series of steep stairs lay next to a lake
An elephant at a water hole spraying water into his mouth.
A woman with something in her hand in a decorated picture.
Two tool boxes sitting next to each other on a  table.
A white and orange colored cat laying on a bed with its eyes open.
A person jumping in the air on a snowboard.
Two people ride horse beside dogs near a meadow.
A hot dog sitting on top of a bun covered in toppings.
A young man has his foot placed on a pole while another looks on.
The two men are riding their horses on the road.
A cat is laying on cozy white sheets.
THIS IS A CLOSE UP PICTURE OF A STUFFED BEAR AND MONKEY
A picture of a person throwing a frisbee.
A man in tuxedo posing for a photo.
Closeup of a baseball glove and a black ball hat.
a truck sits next to a big plane
A man standing in front of a parking meter about to put money in it.
Two young children laying in bed next to each other drinking from bottles.
A herd of zebras where one of them is biting another.
A dog sits with a frisbee at its feet.
A wooden computer desk with a computer sitting on top of it.
A red stop sign sitting next to a wooden electrical pole.
A man is putting a pizza into an outdoor pizza oven.
A cat that is sticking its head in a green bowl.
A clock on top of a post shows the time
The face of a cat that is sitting in a sink.
Two kitties playing with toilet paper next to the toilet.
A man who fell asleep with phone on face
A bathroom with a sink and a bathtub
Locomotive parked under a brick bridge in a secluded spot.
A bunch of bananas sitting on top of a wooden table.
A man in a swinging position holding a tennis raquet while on a court.
A person that is laying on a bed.
large semi truck with steel front end parked in grass
A young boy smiling on his skis in the snow
A woman in white dress playing a game of tennis.
Woman in blue outfit taking a swing during tennis match.
Two girls kicking a soccerball on a soccer field.
A young woman holding a baby with a teddy bear on her lap.
A man lying in a field flying a kite.
Man in baseball uniform playing shortstop waiting between pitches.
A man with a knife in his belt and a beer in his arm enjoys a sandwich.
An older woman with white hair and glasses, seated at a dining room table and another person in the kitchen area.
A table is set with a full dinner.
A small kitten fits inside of a gray sneaker.
Group of horses in race near canvas fence.
A man pushing a girl on a swing.
A green plate of food that includes rice, broccoli and meat.
Three horses grazing in a pasture in front of a house.
A toilet sitting outside a building in an alley.
a bento box filled with different types of food
Kitchen photo with window over counter and a bowl in the middle.
a large vase with a big colorful boquet sitting on a table
A bathroom with full vanity and wall mirror.
A group of motorcycles parked in front of a white church.
Groups of skiers near a ski trail in the snow.
A white sink that has a necklace, a rubber ducky, toothpaste and some beauty items laying around them.
A person skiing on a snow covered mountian
A man is enjoying surfing in the water.
A clock on an ornate metal pole in front of a shop.
A train traveling over a river on a bridge.
The dog has a frisbee in his mouth in the snow.
A cat and a dog sit in a colorful bathroom.
A bathroom area with a tub, shelves and a sink.
A dining table and a lamp are beside a fireplace.
A plate full of couscous with mixed vegetables
A young woman with tattoos using her cell phone..
Two beer trucks are parked beside one another and unloading.
A person is looking at a pair of scissors.
A colorful illustration of an old train and stormy skies.
a big airplane taxiing on a wet runway
A man riding a bike next to another person on a bike.
A couch and rocking chair are in the small living room.
A couple of people sitting on a beach watching an assortment of para sail chutes.
The open faced sandwich contains a meat in casing.
A man wearing winter gear snowboards while several people snowboard behind him
A plate contains a meal of meat, potatoes, eggs and fruit.
A white couch in a living room filled with Christmas decoration.
A silver suitcase on a wood floor with a pair of black and white shoes next to it.
an oddly tied tie on a pink shirt
A young child learning how to ski down a slope.
many elephants are walking on a trail and some trees
a cat is sitting on a couch in a room
a big white bed with a dresser and lights new to it.
A sign is displayed on a traffic light.
Fans sit in camping chairs along a fence to watch a children's baseball game.
A group of people are eating near a wooden bench surrounded by trees.
Man installing an OS while giving the "devil horn".
Some people playing a wii video game in front of crowd
A man cuts a bowl of greens with scissors.
A group poses in ski gear in front of Olympic rings.
A baby in an adult's arms is gnawing on a toothbrush.
two people sit parked next to each other on motorcycles
A car sits on the side of a road with letters written on it.
Woman sits on beach with laptop pondering what to write.
A person is giving a piece of crust to a dog.
The group of cows stand in a river drinking the water.
A skaterboarder is doing tricks at a skate park.
A person surfing on a surf board on some waves
An assembly line with doughnuts moving through an automated fryer on it.
A baseball player hitting a ball with a bat.
A dark street with signs and buildings on the side.
a older male with his mouth open wearing dusct tape.
A blurry photo of people watching a bunch of horses.
A white plate with a hot dog in a biscuit next to fried potatoes.
A fruit cocktail with banana, oranges, and various other fruits
A blue toilet is sitting in a blue bathroom.
a bunch of colorful items on a black plate
An old man with eyeglasses stands next to a giant screen
a toilet a shower a tub a sink cabinets and a mirror
A man on a snowboard rides on the snow.
two people in a green field playing with a frisbee.
The person is skiing at the bottom of a steep slope.
A mountain view with a plume of smoke in the background
A bathroom has a sink, toilet and an orange bucket in it.
A dog jumping to catch a thrown Frisbee.
A group of giraffes stands around near a watering hole
A man in a suit and sunglasses drinking from a paper cup.
A smiling woman eats outdoors with a group of people.
A very cute little bird on a green leaf.
A guy is cutting something out of a piece of paper.
Guy plows the field behind two strong horses
Many containers of food are on the table.
A woman with an umbrella walking her dog who also has a smaller umbrella.
A lighted mirror in what appears to be a bathroom.
fifteen different varieties of doughnuts in a display case
A mouse swimming and another climbing out of a river in a wooded area.
A man standing next to a truck near a forest hillside.
A person sits on a bench with the skyline in the distance.
a giraffe is crossing the road in front of a car
A group of people standing near a bus
The man stands on the beach prepared to enter the water with the green sail.
A basket full of white biscuits on a table.
A bed with five pillows under a hanging print.
a man on a giant bicycle rides by a tall pole in front of an empty, large field in front of some mountains
A train sitting next to a train station near other tracks.
Keyboard with iPod shuffle in front on desk
A giant chair with a horse statue on it
A cement elephant on the other side of a fence.
Box with picture of a hand holding a Nintendo wii remote.
The cat is sitting on top of the black suitcase.
A group of people is standing in a driveway.
Four people sitting at a table with a large pizza and cans of soda.
A picture of a families living room with nice furniture.
A tennis player who just hit the ball to their opponent.
Two newspaper stands with a fence behind them
a brown white and black animal and two people on a motorcycle
An older lay sharing a birthday cake with some little girls.
A woman standing on the top of a snow covered slope wearing skis.
A simple hotel bathroom with two sinks, free mini bottles of shampoo, and a hair dryer.
there is a man riding a bike and waving
A man and a woman sitting on a couch.
A bear sits on the rocks by a pool of water in a wildlife exhibit.
A woman holding a colorful umbrella with writing on it.
A man on a surf board rides a rough wave.
Two giraffes and a zebra with several trees.
a silver and black motorcycle is lying in the dirt
A tiled shower, molded plastic bathtub, shelf, mirror, wooden vanity, lamp, and sink make up a beige colored bathroom.
A hazy sun over chairs and an umbrella on the beach.
A lawn chair sitting on top of a beach covered by an umbrella.
Several monkey figures hang on a bedroom wall.
A pair of scissors, a tape measure, and a spool of thread sitting on a piece of folded fabric.
A flat screen TV sitting in a living room next too a shelf.
White parrot sitting on a ledge eating a seed pod.
A street corner with a stop sign and it's wet from rain.
A woman stands holding a white controller near some chairs.
A black cat is nestled among indoor plants.
A tennis player hitting a tennis ball in a professional game.
A fat cat laying in a bathroom sink.
A woman stands in a dimly lit kitchen at a gas range.
A man in grey shirt jumping on a skateboard.
There are parking meters alongside of the railroad.
Old worn red truck parked in a driveway near a cactus.
a lot of people standing in the middle of the road with red stop lights
A professional tennis player holding a tennis racket at the US Open.
People on a shoreline are flying kites on a clear day.
A commuter train leaving the clean subway platform
A giraffe is standing tall next to a tree.
an old photo of a miniture pony pulling a cart
A man serving a birthday cake to a woman
A kitchen has a washing machine in it.
A cat sits next to a laptop on a desk
a white and green street sign and a traffic light
A man reading a magazine and sitting on a toilet that is outside on the street.
A large building with stained glass windows and a clock.
A cat laying on its back with paws up in the air.
A man rides his motorcycle through an alley way.
A flock of doves and a man sitting in a park.
This is an image of a cake with a bear surfing.
A man with a gun standing in formal dress.
Three people riding horses together down a trail.
She is serving the tennis ball pretty high.
A bridge over water near several buildings in a city.
a man that is skateboarding on a ramp
A herd of zebras standing in some algae covered water in front of a sandy plain
there is a green vase with a plant inside of it
A man holding a bowl with an open oven
A man who is snowboarding down a hill.
A black and white cat sitting in a chair.
some veggies are in a small cardboard box
In the dark two street signs are glowing.
A surfer sits on a beach next to some surfboards.
A large bus on a open city street.
A bathroom that has some open windows in it.
A man standing on a beach holding a surf board.
Three geese that are standing by a pond.
a guy on a bicycle and a guy flying a kite
A photo of a person being taken in this picture.
People in a large body of a water using surfboards.
Many pots of marijuana plants growing in a greenhouse.
a toilet sitting in a tile covered floor in a single room
A large white bed in a red room
Sheep perched atop knoll on green countryside with rocks.
a large group of people walking on a city street
A plate and vase on display in a room.
A man laying on top of a white bed between two lamps.
A woman reaching for a frisbee as another defends her.
A man holding a Nintendo Will controller in a living room.
A tow truck at a traffic stop with vehicles behind it.
A horse drawn carriage riding across a snow covered field.
An older later sits and drinks from a cup.
A sink sitting in the middle of a bathroom.
A table that has a silver tea pot in the middle and several plates around it with desserts on the plates.
A truck that is driving down the street.
A boy in a red jersey throwing a baseball.
A man on a horse is herding animals down a trail.
An elephant standing next to a lake on a beach.
A griaffe walking on a road with two cars approaching.
A clean passenger bus driving in a city.
A small black cat laying on top of a couch.
A cat is trying to squeeze through a door.
The transit train stretches down the track under the power lines.
A man is standing near some graves, water, and a bus.
A man playing a game in an RV with a remote controller.
A black and white dog with a frisbee lying on the grass.
The tennis player is returning a strong serve.
Two young children play in the grass with a kite.
Two trains, side by side, waiting at the train station
One cow attempting to mate with another cow in a pasture.
a living room with a shelf a coffee table and couches
Trophies and cell phones are on a table.
a person on a beach with a frisbee
Two soldiers taking pictures of a group of soldiers arranged for a photo.
There are three men with fishing poles at the beach.
A BUS STANDING AT A TRAFFIC SIGNAL IN A STREET.
A zebra and an ostrich up close with other animals in the background.
A lone horse in the middle of a grassy field.
An elephant spraying water onto his body with his trunk swinging backward.
A pretty cat with both front feet in someone's shoe
Close up on meal food with three items side by side chicken with barbecue sauce, broccoli with shredded white cheese bits on top, and a bean and pasta or grain mix.
Three images of a brown and white dog sitting beside a doughnut.
A black and white kitten stands atop a laptop computer
People are flying a kite in an open area.
a person skiing by a start sign above them
A woman giving a man a haircut in a barber shop.
A wooden table topped with the contents of a woman's purse.
A brown fluffy smiling teddy bear with big paws.
A man and woman wearing skis on a ski slope.
The slice of pizza has tater tots, green beans, and cheese on it.
a hamper with compartments having a cup clothes and two bears
A small table has many foods and drinks on it
A Photo of a man on skis gliding on flat snow .
A MAN IS ON THE BIKE WITH A USA FLAG
a couple of sandwiches are on a white plate
Two views of bright objects floating through the blue sky.
Two men on a tennis court playing a game of tennis.
a giraffe standing in the foreground with an ostrich behind
a white plane is being prepared to board passengers
some giraffes are in a green field and some trees
a large in ground swimming pool near tents
A large number of teddy bears are sitting at tables with fake food.
Fighter jets flying together in close formation leaving vapor trails.
Group of giraffes in high brown grass looking to feed.
a person with a horse and a car in the background
A man that is standing on a surfboard in the water.
A flock of birds are flying in formation.
A huge heard of sheep are all scattered together.
A man and a dog on a motorcycle.
A giraffe leans over while another walks away from it in an outdoor area.
A goth man sitting on top of the floor near a store.
A child throwing a ball towards a batter during a ballgame on a field.
The person is walking on the sidewalk alone at night
Two cats lay together on a messy bed.
A woman in equestrian clothing on a horse
Some people by a long row of motorcycles parked together.
Large long tailed kite on string above rural town.
a giraffe in the distance in front of a tree.
A group of chefs prepare food in a restaurant kitchen.
A skier rides their skis down a snow covered hill.
A blue clock spire next to buildings and cars.
Several wooden cages with white cloth tops and sides.
An old man is sitting on a bench.
A couple of sheep laying on top of a pile of dry grass.
A white pick up truck driving down a road behind a line of elephants.
A dog behind the steering wheel of a car.
An L-shaped couch in a living room with a coffee table.
A urinal seperated from a toilet in a bathroom
A small elephant stands next to a tree.
The white and blue boat is floating on top of the water
A plate of two slices of pizza and a cup of juice.
A herd of zebras graze on an open grassland.
a room that has a bunch of chairs in it
A female tennis player is about to make a serve.
an image of a plate of meat and vegetables
A jumbo jet plane running along a runway.
A room with a couch, chairs, television and a table.
A group of people outside at a park playing softball.
A photo of two plans with water and birds surrounding it , one plane in the air one one the ground.
A herd of giraffes grazing on a tall tree stalk
A man holds a glass of wine on a patio by a vineyard.
There are two zebras standing in the desert.
Shimmering lights inside a living room with a dog on soap
A baseball player at the plate just after swinging at the ball
this is a little girl playing in the beach
a bride and groom are cutting their wedding cake
2 giraffes one of them is doing the splits
A group of girls on seats in a tour boat.
A man wearing a helmet on a bicycle in a street that has a guard railing on the side of the walkway.
There is a small toy elephant sitting on a wall
A skateboard on the walkway in an old bus.
A red headed skateboarder sips on his drink.
A table that is filled with hot dogs, and a hamburger.
A duck can be seen in the water with high rise buildings in the background.
A young woman on a surfboard getting ready to ride the wives.
a horse in a field of tall grass
A man appears to be giving snowboarding instructions to a woman.
man brushing his teeth in blue and white tiled room
Three people in green and black snow suits with ski equipment on a ski slope.
The modern church has a clock on it's steeple.
A cook is holding a wooden cooking utensil in his hand.
A living room with hard wood floors and a tv over a fireplace.
A train stopped at the station to pick up passengers.
a woman is riding on the back of a horse
two zebra standing in pen and grazing side by side
A picture of some food in a bowl together.
A man sitting at a table with a cake in front of him.
people getting on a public bus at night
Tractor passing a statue of a dairy cow wearing a lei
Woman with blue streaked hair sitting cross legged on bed.
A stop sign with graffiti written on it.
A professional horse back rider is getting ready to take a shot while the crowd looks on.
A hotel room with a made up queen sized bed.
A boulevard has been pictured by someone driving by
The empty bench is sitting in the nighttime street.
Airline employees by an aircraft parked at the gate
Some animals are standing together in a pin
Small piece of cake sitting on a plate with cherries on it.
A woman sitting outside her house under a fruit tree.
baby in a highchair with bib and cake
A red trolley train riding along the tracks near trees.
An array of lights on some sort of machine
A man sitting inside of a car on the street.
A man is being pulled on his skateboard by two dogs.
A horse sticking his head out of a doorway.
there are several clocks that look like they are hanging from the ceiling
A kid is doing a trick on his skateboard.
a man  with a suit stands in front of a brightly lit drapes
An orange cat and a black and white cat both laying on a bed.
A man in skies is going up hill.
A platter of food that includes eggs, hot dogs, and cheese.
A living room with a couch, chairs, television and a child's high chair.
A white bed sitting in a bedroom in front of a TV.
Roses and other flowers are siting in the vase
A person sitting on top of a rock over a river near a city.
two public transit buses parked near one another
A man is standing in a cluttered room.
A young women sitting at a picnic table eating a meal.
A crowded city street filled with traffic and bicycles.
A bird with a large crest standing on a branch
A giraffe lying on the ground in a zoo pin.
A shower and a toilet in a bathroom.
Three grey birds in a tree with blue backdrop.
two cars parked on the sidewalk on the street
A woman standing next to a modern style parking meter.
A chili dog, onion rings and chili fries.
This rider takes a brown horse across dirt
A large dark colored spoon sitting on a rack
The player who is up to bat next is getting ready for his turn.
A man on a beach getting ready to throw a frisbee.
Two old people sitting on a bench before a wooded lake
A small dog wearing a sweater and holding a Frisbee in its mouth.
there are many airplanes stop at the airport.
A woman checks her phone while holding her hat sitting on a bench, with a bicycle in front of her and a hedge behind her.
THERE IS A PIECE OF CAKE ON A PLATE
The person fell off of the horse and into the water.
A large truck can be seen in this picture near a bridge.
A small old street sign hanging on a building.
A man watching another man on a skateboard
A man sitting on a large bench talking on a cell phone.
Parking meters stand in front of parking spaces in an empty lot.
A man on horse back and a truck watching a herd of sheep cross a road.
A man in wetsuit on surfing on white surfboard.
A dog and a cat underneath a desk.
Tower clock made of rock set against a cloudy sky.
Two children who are standing next to a white fire hydrant.
A bathroom with a white counter top and white towels
Two snowboarders are standing with one foot strapped into their boards and one foot out, at the top of a mountain.
View of a partially shaded city street with autumn leaves.
A laptop computer sits on top of a messy desk.
A pair of sinks in the middle of a kitchen counter with a wooden countertop
A group of bicycles that are sitting on the road.
Several people are swimming and surfing in the ocean.
a herd of giraffes standing around a bare field
Two young men at sunset juggling a soccer ball on a beach
A dish with meat and vegetables set on a bed of rice.
A brown teddy bear sitting on top of a wooden bench.
The sheep are grazing in the grassy field.
A blue train on the tracks at a train station.
A cat sitting inside a toilet bowl looking alert.
A woman who is standing near a clock.
a man with a tennis racket plays a game of tennis
Two small black bears stand near a tree.
A group of people holding Nintendo wii game controllers.
The people ride the bike near the water.
A piece of luggage with a rainbow strap and a ticket on it.
A pizza is shown being cooked in an oven.
A man holding a frisbee in the field with grass
A street sign is posted to watch for senior citizens.
A large intersection that doesnt have much traffic
People in a park trying to fly a large purple kite that looks like a fish.
many different cup cakes on a grill on a table
A man taking a swing at a tennis ball
Snow skiing at night presents unknown dangers without the lights.
A bathroom with the curtains drawn down and the lights on.
Two old-fashioned bicycles parked together on a beach.
bananas and apples sitting next to each other on a counter
A white and blue truck driving down a mountainous dirt road.
Backpacks line a boardwalk to a beach surrounded by trees.
A bird that is standing on a concrete ledge.
A group of sheep sitting next to a stone wall.
A large pizza with tomatoes, basil and cheese.
An Asian man is sitting in a cubicle with a near a computer near other people working in cubicles.
A white seagull standing on a white column by a pier.
A yellow and metal train traveling down train tracks.
A pair of giraffes grazing through a wire fence.
A board full of chopped vegetables near a computer
a giraffe standing in between some brush as a bird flies by it
He is intent on another bite of the sandwich.
A dim runway has an airplane on it.
sheep cross the road next to a white barn in the rolling hills
A man sitting in front of a computer with a bottle of beer.
White and blue plate of two glazed donuts by two glasses of orange juice.
Flowers are in a vase on a shelf.
A group of young people brushing their teeth.
A grey cat with yellow eyes looking innocently at the camera.
There are giraffes that are standing g yogeter
The Asian woman is trying to sell her food on a local beach.
A zebra standing next to a van door.
A man riding a skateboard across a crosswalk.
a group of young people playing frisbee in a field
A table topped with cut in half Twinkies on top of cupcakes.
The woman is on the tennis court playing a game.
A picturesque view of a small town during winter.
A woman riding a wave on top of the ocean.
A zebra walking next to another animal across a dirt road.
Two horses and a man are on the beach.
A yellow plant with green leaves in a glass vase.
Headphones help her to hear her cell phone.
A picture of someone typing on a laptop.
A cat curled up asleep in front of a laptop computer.
A very nice looking room with a big bed.
A man holding a baseball and a catchers mitt.
People looking in the shop windows with a bicycle parked against the window.
A person kicking up on their skateboard at the top of a ramp.
A large bunch of green bananas hands from a tree.
Man wearing a bandanna trying to catch a frisbee
A woman preparing to serve a tennis ball.
a couple of people are standing in the shade
a telephone pole with a sign stuck on it
A man talking on a cell phone while walking down a street.
A boy in red shirt swing a baseball bat.
A  man getting ready to hi a ball in a baseball game.
A pie filled with white creme next to a yellow banana.
A man outdoors jumping to catch a frisbee.
A person on a snowboard catching some air over a hill.
A zebra laying in the dirt looking away from the camera.
A double decker bus parked next to a brick building.
A cheesy pizza with red peppers is in a box.
Elephant holding onto the tail of another elephant with its trunk.
A dog and it's owner sitting in front of a desk.
A picture of a surfer as he catches a wave.
A young man sitting under a tree with red leaves.
A woman sitting on the grass with a computer outside the Brown Library.
A pizza on a pizza pan with two pieces removed by a serving ladle.
A blue subway train pulls into the subway station.
a woman whispers into a mans ear with a suit on
A shirtless man is on top of a man on a couch
All the contents of a video game console have been unpacked from a box.
white sailboat docked with other white sailboats
A man swinging a racket on a tennis court.
Stuffed toy bears on display on shelf in large room.
A red stop sign sitting above a no parking sign.
A living room filled with furniture and a purple couch under a window.
Airplanes sit parked on the runways of an airport.
a person jumping a skate board in the air
A passenger train speeding down a track in the countryside.
A boy playing baseball is winding up for the pitch.
A dog and cat sleeping together on a dog bed.
A living room with different living room furniture.
A player prepares to run to first after hitting the baseball.
Three adult giraffe stand at a grove of trees.
A person wearing red gloves grilling a pizza.
the man is riding a skateboard down a ramp
Several sailboats sit in the water in front of some trees.
Looking up at a building with a large face clock near the top.
A large jetliner flying over a row of runway lights.
A clock on a tower as seen from a roof.
A stuffed animal hanging from a post in a field.
Two nicely dressed men standing together next to a flag.
A refrigerator filled with food and drinks next to condiments.
People are riding the waves on surfboards on their stomachs.
A few deer laying down in the grass near a bunch of trees
A dog is sitting on a covered couch with some light.
A couple of dolls are standing on a table.
Two children eat fresh vegetables from a skillet.
a person holding a coffee cup with a watch on his wrist.
a large air plane flying in he sky
a person on a skateboard is doing a jump
An elephant with tusks eating food behind a fence.
A man riding the waves on a jet ski.
A person on a horse that has a decorated hat on its head and covering it's ears, with another horse next to it that has a mask covering it's eyes.
Large blue metallic public transportation bus with Aubaines written across the back.
Large pizza with cheese, olives and tomato sauce
a couple of people that are staring into a icebox
A man standing in front of a restaurant with a skateboard in front of him on the ground.
A silver Sport Tourer BMW motorcycle on a sidewalk.
Seen through a wire fence, is a stadium area with watchers and many vacant chairs, a dugout with a railing and many men leaning on it, and a playing area with a lunging, uniformed batter with a catcher and an umpire behind him.
a woman with a cell phone and another with a large bag
A white table topped with two desktop monitors.
A sneaker and a paw are seen on the grass.
A train traveling through a grass covered park.
A red fire hydrant sitting on a brick sidewalk.
A very colorful mix of grilled vegetables looks delicious.
a large cat laying across a table next to a monitor.
An antique motorcycle restored to like new condition.
Baseball player barely delivering hit to ball during game.
a zebra bending over eating grass at the zoo
A person is cooking something on the stove.
a black vanity top sink toilet and mirror
An old fashioned bench is sitting on the sidewalk.
A herd of cows grazing on the grass.
A man prepares to swing at the tennis ball
A white toilet in a bathroom next to a trash can.
A boy that is jumping in the air with a skateboard.
A couple of people by a boat in the water.
A stop sign on the side of the road.
a pink and yellow sign is hanging above the street lights
Tennis player holding his racket looking ahead of him.
a woman with eye glasses sitting at a table covered with food
A skateboarder is performing a round about handstand.
A child is playing with a frisbee in the park.
Blue plates are stacked on a wood countertop.
An old time car is parked at the curb near a stop sign.
A trailer truck hauling with a crane hauling logs.
A furry dog playing with a green apple on the carpet.
a boy sits sits on top of a horse in front of a jungle forest
A couple of brown horses standing next to each other.
A motorcycle parked next to a stairwell behind a plaque.
A cat sitting in front of blinds in a window.
A young man is skating around white cones.
There is a woman playing a game of tennis.
An old sign with trees in the background filled with fall colored leaves.
A young boy in motion while holding a remote.
A fighter jet is flying through the air.
The young children are playing a game of baseball.
The fat grey cat is wearing a red satin tie.
The large boat has nets extended on both sides of it.
A man swings a racket during a game.
A baby is sitting on a wooden bench.
A man, woman, and two children laying together on a bed.
People watch a baseball game in a large stadium.
A very tall building with a massive clock tower.
A man flying through the air while riding a pair of skis.
A boy putting his leg back to kick a soccer ball.
A stuffed bear head and paw on a laptop computer.
Several dark colored cats laying together on a piece of luggage and a duffle bag.
A woman laughs as a man brushes his teeth in a public location.
a man shaving in a bathroom while looking in the mirror
A fruit salad with cantaloupe, kiwi, and bananas.
A colorful train is waiting on the tracks at the station.
A woman holding a tennis racquet on a field.
A bowl with a plant, a large vase, and two cups on a table.
a person that has a lighter in their hand
Man leaned back with his mouth open, sleeping on a bench
a bunch of knobs on a large metallic stove
A zebra is eating while standing next to some hay.
A group of people on a field playing with a frisbee.
A girl laying on a bench reading a book.
A hand holding a mug of green liquid next to a pile of fruit.
A pair of photographs of a dessert with a vase of flowers.
Two zebras are standing outside near a tree.
some brown and black horses a table umbrellas and a person
A child in striped shirt sitting on the top of a bench.
A young baby that is brushing their teeth while sitting down.
Group of signs on top of each other on a pole.
A man and a woman outside next to an old truck.
Passenger train crossing a bridge next to a grassy field.
A clock tower made of stucco with an arched window.
A women holding a fork while looking at a cake.
A restaurant with no one in it has several square and round tables.
a crowd of people standing around and sitting watching surfboard
A man in grey shirt riding on a skateboard.
Street traffic light that is on blinking yellow.
A man laying in a bed with tubes attached to his check and mouth.
Two bird flying low across a body of water.
A baseball player posed to hit a ball.
The man is putting his feet up on the desk.
A cat sits in a glass window by a stuffed toy.
A fireplace with a mounted flat screen tv above it
A person's hand on the back of a black cat that is diving into a bathtub filled with debris.
Two girls playing a game of tennis on a court.
The front edge of a well used skateboard.
Man juggling three balls at the same time.
a woman is standing in a green field playing tennis
a giraffe bending down to eat grass off the ground
Two tennis players on the court and waiting to play.
Two children smiling and eating small personal pizzas.
Old refrigerator open in an abandoned wooden building
A painted postcard of the clock tower and bandshell at the Daytona Beach, Florida.
A couple on a bike are riding on the sidewalk alongside a bus.
Bed in room of some home with windows.
a really big elephant that a man is on
A person in a ski jacket next to a train
a red sign is hanging on a pole outside
A orange cat sitting in a piece of green luggage.
This kid who is Pinoy is  skateboarding over an ollie jump
The man poses for a picture while holding a snowboard.
A beautiful young woman talking on a phone.
Two ladies on a road with an umbrella
A gathering of people playing a video game.
A small office with a desk and book shelves
Several planes are flying high in the air together.
A large elephant walking next to a man
A pair of young men stand in a field playing with a frisbee.
A kitchen area features white appliances, counters and a white floor.
Cars parked on a dirt road near airplanes.
A cat is looking at the side of a laptop computer.
A green wall in bathroom with white and chrome fixtures.
there I a motor bike that is pakrd on the street and one with something on it
A couple sitting on top of a bench under an umbrella.
A flat screen tv on a wooden tv stand.
Two young men playing a motion controlled video game
A cat sitting on the edge of a table.
A woman sitting at a table eating a donut.
A table contains a large square cake decorated with a flower.
A set of blue bleachers sitting in the middle of a dirt field.
A pair of rusted scissors stuck in a stone sculpture.
Man standing in front of a television holding up a Wii controller.
A man playing a game of tennis and people in the crowd watching.
Three blenders with colorful tops and bases, two of them matching, stand in a row.
a room showing a microwave and a cooker also an oven
a man wearing a hat while riding a surfboard
A man brushing his teeth in front of a mirror.
Two bears in an in closed area with trees and stumps.
A man holding on to a parasail over the ocean.
A white toilet sitting next to a white sink in a bathroom.
A blue sea anemone living on a coral reef.
a long haired white dog is eating some cake on a plate
A batter practicing his swing in the batters cage.
A person riding a skateboard down a metal railing.
A snow skier standing at the top of a snowy slope.
A chocolate cake with decorations and a knife
A large white jet airliner flying over trees.
Flock of sheep eating grass on a mountain.
A surfboard resting on the sand of a tropical island beach.
An overhead view of a group of people sitting at several tables.
a sandwich with some fruit and a drink
A parking meter on the side of the road.
A pile of luggage sitting on the floor.
Fruit stands with bananas, pineapples, oranges, and other fruit.
The three giraffes are walking together on the grasslands.
The hand is reaching out in hopes of catching the flying disc.
a paper plate with some pizza on top of it
female skier, skiing slowly thru cold white snow
There is a computer monitor with a graphics program open sitting on a wooden desk.
A giraffe laying on the ground in the grass.
An dinner of pinto beans, broccoli, a roll, skim milk, an apple, and something unidentifiable.
Spectators watch a professional tennis player serve the ball
A meal on an airplane of cereal, milk, and fruit.
This is a child holding a remote to a game console.
A young skier in a red jacket goes down hill
Three big horn sheep are in an enclosed pasture.
A group of people standing around outside with their bicycles.
A toilet with jelly fish and star fish on it
there are two pieces of bread on a yellow plate
A large bear ornament hangs on the Christmas tree.
A small bottle of liquor next to a whole orange and an orange half.
A woman holding a few bread sticks and a glass of wine.
A view of an airplane wing flying over a mountain range.
A tennis player holds her racket during a match.
The cat is angry while sitting on top of a pillow.
a blue and yellow train engine and some people
a group of people that is surfing on some water
A man riding skis down a snow covered slope.
A plate of food that includes a sandwich and shoe string fries.
The dinner plate has three smaller bowls next to it.
The yellow bird is waiting for its mate.
Two empty stone park benches placed up against a stone wall.
An airplane sits on a stand for display.
a close up of a person holding an electronic device
A airport tarmac filled with a jetliner and trucks.
A piece of cake and for that are on a plate.
A man skateboarding through an obstacle course with cones.
a white toilet and many rolls of toilet paper
a four-legged animal grazes on the side of a hill in a forest
Two young ladies petting a young calf on a farm
The woman in the colorful dress is holding a video game remote.
A wing outside of an airplane window high above clouds.
A person on a surf board surfing a wave.
A small kitchenette with personal items displayed attractively.
A man with glasses sitting at a wooden table with a lamp.
A desk area with a computer monitor, keyboard and mouse.
A girl that is sitting down with a cell phone.
A great full shot of the bathroom with wooden floor.
A cat lies in an empty fruit box amongst other fruit boxes.
A man is outside grilling some hot dogs.
The truck is traveling down the road in really bad weather conditions.
A field filled with lots of white sheep next to a river.
There are a lot of animal heads laying on the bed.
A women standing on a bike backwards .
a woman sitting on top of a horse standing on a beach
A man in a blue shirt holding a piece of pizza.
A bed with two pillows under a window.
A white plate with slices of meat and veggies.
A young man is eating a hamburger while a young girl watches and laughs.
A parking meter reads .90 cents as a silver car is parked behind it.
A man looks down at a dog sitting on a chair outside.
A picture frame with 2 pairs of scissors dangling from top and a painting sitting in front of the frame
A woman that is leaning over a pizza.
There is a suitcase which appears to filled with foreign snacks.
Two women in a public place playing on a Wii system.
two people on a tennis court playing a game
A large crowd in a grassy area with the capital building in the background.
There is a plate of pasta with a fork in it.
An old mattress lying amidst overgrown brush and leaning against a fence.
A person on a surfboard in the water.
A refrigerator sits next to a counter in a kitchen.
An elephant is stretching its trunk on the ground.
A kitchen has a vintage gas range and yellow walls.
A man with a glass of wine in his hand.
Red fire hydrant with blue top on downtown street.
A person standing on top of a snow covered slope.
A group of smiling police offers on brown horses.
A media center with gaming consoles and a television
A traffic light that is currently a green light.
A white plate with meat and broccoli on it
The yellow earth mover sits in the field in front of the pole.
A man holding a surfboard in a hotel room
Looking up at a traffic light next to some street signs.
there are many train engines and cars
Dessert pastry with apples served with an autumn theme.
A red stop sign next to a road in the middle of nowhere.
A kitchen with a sink, counter, cabinets and a dish rack.
there is a woman riding a brown horse on gravel
A noodle and vegetable dish is displayed on a plate.
A snowboarder holds a snowboard for a photo.
Motorcycle decorated with an American flag and reindeer
A person on a skateboard on a ramp.
a man with a tennis racquet serving a tennis ball
A young man is in the middle of performing a skateboard trick.
A young man playing a game of tennis against an opponent.
A man with a pan in his hand walks by pizzas in a oven and on counter tops waiting to be baked.
some kids are watching as giraffes walk around their zoo exhibit
A young lady holding a bat behind her shoulders
A couple of cats sitting on top of a couch.
Kid sits on the edge while another jumps over riding a skateboard
A group of people walking across a crosswalk.
A group of three men standing with their backs against a fence.
A couple stands smiling next to a sitting older couple.
A large white table with chairs surrounding it.
A snowboarder and her child in the snow.
A male tennis player holding his racket in the air.
A red and blue train on a bridge during a cloudy day.
Brown horse on the sand at the ocean.
poles full of signs in front of a skyscraper
People reaching into a broccoli garden and picking broccoli.
An old fashion looking bus is sitting idly.
Homemade cheese and red sauce pizza on a plate with flour and dough on the wood table.
A desk with a bunch of paper on It.
A cat being offered water in a glass.
Three loaves of bread are in an oven.
A teenage couple dressed up and smiling in a aprk
A kitten is held and fed with a bottle.
A woman stands on a patch of dirt holding a tennis racket.
A woman taking a swing at a baseball
a big bear stand next to a river stream
A table topped with construction contents on top of a wooden table.
A young man wearing a tie and sunglasses is looking away.
A baby sitting in the grass looking at the kites in the sky.
A fork and knife on a plate with pizza
A trio of people stand near two elephants in a covered area.
A girl playing frisbee in the backyard of a house.
A doorway leading to a dining room area.
A child is holding on to a rod while he rides a boat.
A stainless steel pan with a pizza cooked on it.
Graduate talking on cellphone with people behind him.
a painting of the president sitting with his hands folded in front of his face
Automobiles stopped at a traffic light at night on a busy street.
A street sign sitting next to parked cars and motorcycles.
A train in a subway that has a few passengers.
A lot of people walking in the streets and on the sidewalks.
A baseball player swings a bat at a thrown ball
A motorcycle parked next to a green grass covered field.
Many swans in a lake are overlooked by a cow.
A herd of cattle that are sitting on the grass.
two giraffes and a man in a brown shirt is feeding one
A smaller kitchen with a very decorated fridge.
white cabinets a sink stove refrigerator and a window
A woman walking down a rain soaked street with a red umbrella.
a military vehicle and a smoking tow truck on a rural road
A black and white picture of a child on a skateboard the street
A number of grizzly bears sitting on tan rocks.
A group of people sitting in a field eating together
A man is cross country skiing through a forrest in winter.
A buffalo is looking at a bird from a distant.
A cardinal standing on an empty wine glass.
A chocolate cake being sliced and served on plates
sliced tomatoes on a plate and a bottle of wine
Two people stand next to each other holding cell phones.
Large pink boat on wheels parked on the side of the road.
A bathroom with a unique double sink and round mirror.
A polar bear walking along a snowy, rocky ridge.
a man on the phone looks angrily at the man
Different sized and styled teddy bears on display with pictures and information.
This is a train traveling down the train tracks.
Two women riding an old motorcycle with a side car
Woman jumping on bed caught in mid air.
A person on a surfboard high up over the water.
There are two monitors and one laptop on the corner desk.
Photo of kitchen being remodeled with a new stainless steel stove.
A surfboard is laying flat is the sand beside a palm tree.
A sign that is on top of a pole.
A group of three people sitting next to each other  on a bench.
A table with an assortment of items such as a keyboard, phones, pens, snacks, keys, sunglasses, a water bottle, and more.
A woman playing fetch outside with a dog.
A man reflected in the mirror in a washroom.
A man in black shirt holding a large striped flag.
Two children riding a horse in front of their home.
A pair of buses sit next to each other on the road.
A group of skiers entering a tunnel through the snow
Afternoon tea in a living room of a home in a hot climate
A male ostrich runs through the grass in front of the trees.
A man posing for a picture, in a kitchen.
a wooden desk with two monitors and a keyboard on it.
Someone is holding their tablet connected to a surge protector.
A woman walking next to a train, pulling a suitcase.
A couple leaving their wedding ceremony in a shower of rice.
A stuffed teddy bear sitting on top of a bench.
A huge dump truck is fenced in in front of a neighborhood.
Two little girls sitting on a bench at a softball game
Someone is enjoying a small slice of pie.
A group of people riding horses down a sandy beach.
an image of a man that is drinking wine
People in uniforms playing baseball on a baseball diamond.
A bathroom with a double vanity and round mirror.
Light colored cat lying on woven rug next to checkered shoes.
A modern sink and shower stall are visible in this photo.
A passenger bus that is driving down the street.
there is a broken tree log on the ground
A mini keyboard attached by USB to a laptop.
The raw material of meal preperation including Broccoli is kept on the table.
Two apple computers are on a white desk
Man sitting on a step in a run-down part of town.
Two people standing in the grass under a cloudy blue sky.
Two toilets sit outside on the pavement next to a yard with many decorations.
A man high in the air mid trick while snowboarding.
a bathroom that has a white toilet in it
A fire hydrant is across the street from an Asian restaurant.
People waiting at a bus stop with a bus parked.
A group of men playing a game of basketball on court.
a fridge filled with assorted foods and condiments
a guy in a half pipe gets ready for his trick
A small boy is holding two pizza muffins.
A display case in a bakery with decorated cakes and cream rolls.
Zebra crossing a dirt road by itself in daytime.
A long yellow train traveling past a train station.
A lady staring lovingly into her pizza.
A brown teddy bear sitting on top of a pregnant woman's belly.
An old black and white photo of a man with glasses in a suit and tie.
A man on a skateboard with a woman filming
orange, pear, and apple are all in a row.
This is a bus with a Titans themed advertisement for Coors Light on the side.
Little bird looking out from the tree it's standing in
Everyone is waiting in line to purchase tickets.
A small model train traveling around a small track.
Two forks on a plate of cake and cream.
A doll house living room filed with furniture and a persian rug.
A man standing next to a hipster woman while holding a beer in his hand.
Two tall birds stand together on a grassy spot next to a large rock wall.
A group of people standing on a beach next to the ocean.
Someone sauteing broccoli and onions with wooden spoon.
A woman posed for a picture while eating.
A woman looks through things on a desk.
A train car with purple and grey graffiti covering windows
A small bathroom has a sink and a storage rack over the toilet.
a group of men trying to get an air blaoon  working
A large dim kitchen with light coming in from a window.
A skateboarder is gliding along a paved walkway.
A group of elephants in sandy area next to trees.
A bird sitting on a fence and looking around.
A person windsurfing with the sky in the background.
A stop sign and three street signs attached to a pole.
a black television is on a white table
A picture of a red prop plane parked in a field.
A woman sitting in front of the Eiffel tower near pigeons.
an image of a closed mcdonalds taken in a parking lot
A man is shown, with headphones around his ears.
A man is waiting for a bus on the side of a city street.
A kid up to bat in a baseball game.
a church with a clock built into the side of it
A clear vase full of purple flowers sitting on a table.
Two women play singles tennis outdoor surrounded by trees.
A stop sign is leaning a little bit.
A cat sitting inside a piece of luggage on a vehicle.
A white bathroom with pedestal sink and small cabinet and daylight window
A dog picks up a Frisbee out of the grass.
An open laptop computer on a wooden night stand.
An old man holding a bag walks down a street.
A laptop computer with pictures of giraffes on the homescreen.
A person riding a skateboard on the sidewalk while holding a pole.
A person laying face down and balancing himself on four yellow poles and a fire hydrant.
A laptop computer and a desktop computer sit on a wooden desk.
A woman is holding her daughter in front of a birthday cake with candles while another lady stands nearby
A sheep with long horns wearing a purple bit.
Pink glasses are inside a clear plastic bag with bananas.
a bed that has some material items on it
Sheep resting under a blue boat foundered at low tide.
A bed and mattress store front with open doors
Three men sitting at a table in a restaurant eating.
A very clean bathroom that is made out of wood.
A small airplane flying in the air near land.
A man and a giraffe are greeting one another.
two horses standing in the snow inside a fence
A bathroom with a separate tub and shower
A large machine digs up a side walk at a construction site.
The yellow train is running along the tracks.
A chili cheese dog in a travel box on a table.
a women that has a carrot in her hand
People are sitting on a cart pulled by a horse.
A little girl standing in front of a pile of surfboards.
A close up of a modern motorcycle on display.
A person rides a snowboard in a forest setting.
This is a cow on a grassy plane with a mountain in the background.
A large display filled with bananas for sale.
A woman is sitting outdoors at a table with a sandwich.
An individual is taken in this very picture.
A photo taken over a water way with a clock tower in the background.
some men riding horses down a mud track
A catcher throwing a ball at a baseball game.
A mixer in the process of mixing foods.
A busy inner city street with cars, a bus and a biker on it.
A bathroom stall with two toilets and a plunger.
A group of three men and one women are holding Wii controllers in a living room.
The boy is on his boogie board in the ocean.
Vases with flowers are setup against a pink backdrop.
A women walking down the street while holding an umbrella.
A couple of cows laying on top of green grass covered field.
A dog lays on the bed with a remote.
A woman hitting a tennis ball with a bat.
A dog lying on the floor on some clothes and a remote
A tennis ball sitting on a tennis racket.
A long row of scooters stretch down the length along a sidewalk.
A white and yellow plate holding three bananas.
a living room with a couch and two low wooden tables with floor cushions in a log cabin.
A cat watching birds flying on a Sony TV screen.
A person standing by a field with a large chair.
The man is standing by the table using his phone.
a person at a table with many plates of food
A wooden ladder stands over a toilet in a tile bathroom.
A toilet with a sink and a towel dispenser in a bathroom.
A photo of Thomas the Train coming down the tracks.
two elephants in a field behind a fence near many trees
A very tasty looking dish of food with some broccoli.
Three giraffes on grassy field next to trees.
A town with buildings, vehicles, and street lights.
A side view of a building on a street corner are shown.
The decrepit bathroom features a brand new toilet.
Individuals are there commending and having a ton of fun of their life.
the person is putting something into an oven.
A man in a kitchen handling food, with another man in the background.
A clock that is sitting on a wall.
A giraffes head in front of a metal grid
A young man in formal dress is standing.
Custom made pizza sitting on a plate ready to be cooked.
A pedestal sink and a toilet in a bathroom.
A man dicing carrots with a large knife on a cutting board.
Graffiti on a French street showing a man holding a red umbrella.
A zebra is standing next to a fence.
The plains with zebras and gazelle around a watering hole
A lonely bird sitting on a white bench.
A man and a small girl standing next to a glider.
A coffee cup and a plate of food.
a display table filled with assorted carrots and cauliflower
A person on a skateboard riding on a street.
A man that is on a skateboard in a concrete bowl.
A crowd of people holding umbrellas walking down a sidewalk.
A person's feet in the bed with socks and shoes on.
Two toilet paper rolls sitting next to a toilet.
A young woman carefully touches a giraffe's long tongue.
Several people ride down a dirt road in a horse drawn carriage.
A foggy street with lots of traffic driving under traffic lights.
There is some food sitting in the pan.
A view of many different scissors on display.
A woman gives the peace sign at lunch with her friend
A fighter jet flying through a blue cloudy sky.
A vase filled with peacock feathers sits in front of the window.
A man in a long sleeved hoodie holds a cup
Red jello with fruit in container in microwave.
Motor vehicle traffic on a paved city road.
A school bus parked in a parking lot next to a building.
Two men watching three horses running down a path.
an image of two zebras in the wild
a person jumping over a curb at a corner in front of a liquor store
A man with piece of cake and a spoon sticking out of the top of the cake.
an image of a zebra in a field
A woman with a cell phone sitting on a couch surrounded by a red white and blue border.
A boy in a safety vest lays in the snow as another boy on skies stands near by while a man in a red jacket kneels in the snow by the boys.
A vase filled with white flowers sitting on top of a wooden table.
A person sitting down a bench in front of the ocean.
THERE IS A MAN THAT IS SITTING ON A BENCH READING
A red truck driving past tall buildings on a paved road.
A young Asian boy holding a tennis racket.
A woman standing near some steps at a river's edge.
Coordinated bedding pulls together a full size bed and a set of bunkbeds.
A person on a court with a tennis racket.
a fire hydrogen that is sticking out of the ground
Old and young men sit around a table with laptops.
A batter, catcher and umpire in a  baseball game.
A skier with a huge black pompom on his hat.
A bathroom is shown with a mirror and a sink.
A motorcycle is parked near a quiet river.
A man is holding a partially peeled banana in his hand.
Railway train on tracks traveling on beach next to ocean.
two people sitting at a table with many wine glasses
An empty refrigerator has its doors open as it stands next to a kitchen sink.
A person with purple hair and and tie.
Plane sits on a bridge above the water
This is a man and woman on a ski slope.
this is a fire hydrant sitting on the sidewalk
A black and orange cat sitting on a wooden counter top.
The service man is putting meat on the tray.
There are giraffes standing together near the trees.
A girl sits at a dining table set for three with food on the plates and cups and a candle on the table.
A young woman decorating a cake with a frosting bag.
a pair of scissors and other knitting supplies are on a table
A fire hydrant is spraying water onto a city street.
A man on a bicycle is looking at a semi truck.
A dog tied to a pole, with a bike behind it.
A bedroom is shown with a suitcase on a bed along with several clothes on it.
A girl with curly hair and a teddy bear on a bed.
A man in ski gear skis down a slope.
A train pulls up to a platform at a station.
A kitchen with a stove and microwave above it.
Two stuffed bears sitting next to each other.
The surfer wearing a wetsuit is riding the wave.
A woman making pizza at an outdoor event
A bowl of cereal and a glass of water are sitting on a table.
A plate with some food on the top.
a mom and a kid in a green kitchen
A large artistic clock is posted on the side of a building.
The man takes a look at the food in his hand with the door to his fridge sitting open.
A man waving at a school bus from his driveway.
three baby lambs laying on a pile of hay
A crowd of people crossing across a street.
The brown cat has big round brown eyes
Bike riders travel next to a passenger train
A group of turkeys feeding in a field.
a young man is holding a 2000's style cell phone up in front of his face.
The display of products for sale at a motorcycle shop.
The man is sitting on the beach with a head and sunglasses on.
An older couple in a boat float past ducks on an open river.
A lone skier on a snow slope with some areas of dirt expolsed.
A man on a surfboard riding a wave.
An older man sitting at a wooden table with a plate and a drink.
A mural on a city wall with a women walking down the sidewalk.
Two women wearing bikinis on surfboards in front of beachfront hotels.
The kids are playing tennis on the courts for physical education
A snowboarder does a trick beside a Hilton hotel
A large passenger jet sitting at an airport.
A couple of brown horses grazing on a green grass field.
The inside bow section of a narrow metal boat floating on blue-green water.
A street sign has multiple street names on it.
A woman with short ginger hair has a book open as she lays in bed.
a white and orange cat sitting on wooden table
A tennis player runs towards the ball during a match.
A bathroom sink sitting next to a window covered in curtains.
Fresh baked pizza being served at a restaurant.
A walkway along a river that looks out at a bridge.
A man hits a tennis ball during a tennis game.
A sigh advertising a dancing club is present.
A train pulls up to an empty platform.
A person who is riding a wave on a surfboard.
A double sinked bathroom has circular twig wreaths hanging above.
a close up of a bird with a blue head
The cheese bread appeals to a variety of people.
one lady is on the computer one is digging through a backpack there's a  man on the phone and another man on a computer
A baseball player running down a baseball field.
Man on a skate board holding himself up.
Some plates and containers hold a variety of food.
A person in glasses makes a funny face while eating.
a white plate of meat and carrots and a side of brocolli
A group of children sitting at tables working on laptops.
A person in a wetsuit on a surfboard on a wave in the ocean.
A man is looking inside a fridge with only four items in it .
a bunch of umbrellas are in front of a house
Two horses in an enclosed area during the day.
A plate of wild bananas sitting on a patio ledge.
A couple of yellow school buses driving down a street.
This object has a long cord attached to it.
A batch of sweets as well as oranges.
A brown teddy bear sitting next to cup cakes and then sitting on a couch.
A cluttered living room with a laptop computer.
A person in a tennis outfit holding a racquet.
A young girl playing tennis on an indoor court.
A yellow fire hydrant is next to a tree.
A motorcycle with cheetah print, parked on a curb
A polar bear that is underneath the water.
A boy is sitting in front of table filled with apples.
a  small dog laying and a cat laying on a sofa
A sandwich with chopped vegetables sits in a cardboard container.
a man riding a snowboard down the side of snow covered slope.
A large horse standing next to a smaller horse.
A boy running to catch a frisbee in flight.
a cat sits on someones lap and looks at a plate of food
An orange that has been placed next to a beer.
A close up of a large, white plane with someone standing beside it.
A brown cow standing next to a black horse.
Many colorful kite surfers over an ocean cove
A airplane that is sitting in the grass.
A view from an airplane of mountains with a partial snow-cap
A man and a woman sitting on a motorcycle.
A bun has carrots and parsley on it as it sits on a green plate.
A batter swinging at a ball with a catcher and umpire behind the plate.
a man that is jumping a small skateboard
A man is sitting next to a computer system with two monitors, keyboard, and mouse and a desk that has many figurines and dolls atop of it
A black train is stopped on the tracks.
A professional baseball player holding a ball during a game.
A double decker bus driving down a street.
Two men playing a game while a boy watches.
A man wearing a green shirt on top of a tennis court.
a airplane that is parked on a  runway
Boys on different teams running for a basketball.
An empty street with a red double decker bus in the distance.
Man and woman standing up while playing wii.
A pile of pieces of dark green broccoli.
Guy sitting at the front of the bus typing something on his laptop
Man with glasses in suit leaning over to blow out candles on a cake.
Two men with backpacks and skis standing on top of hill.
main street of a slum with cars and people
A bed with sheets, a chair and wall hangings
A group of paddle boarders watch the beach.
An adult smiles while skiing with small children.
A street with a bunch of street signs and a building near the street
Two benches and a garbage can sit on a beach.
A cathedral with clocks set in four directions in the clock tower.
a person jumping a skate board in the air
Three big rigs parked in a row in a field.
The white and black horses are grazing near mountains.
An airplane with the word navy pained on the side is sitting on a runway and people are sitting inside of the plane.
Two horses hold their heads near the short grass.
The woman is sitting at the table in the restaurant.
Man and woman exchanging words on stage with horse
A person standing on a skatebord on some grass
red suitcase and two black suit cases on pavement
A bus traveling on a city street near pedestrians and buildings.
a bunch of travel bags sit in front of a television
A boy sitting on a bench looking at a cellphone
A table set up with flowers is in a farm type area.
A group of kids that are sitting in front of a table.
A jetliner flying low as viewed between two skyscrapers.
A white three tier wedding cake decorated with roses.
Three dairy cows in a grassy paddocks fenced by bushes.
Teddy bear tucked into bed in a bedroom.
A person holding a rainbow colored umbrella near a crowd.
A large sink with three silver faucets on it
A cat is playing with a backpack strap.
A woman holds a Wii remote in her hand while making a face of concentration.
Motorcycles and mopeds line the street of an asian shop
A young girl on skis and holding poles, posing in fake snow.
A baseball player holding a bat on a baseball field.
A vase filled with an orange reddish flower.
A PERSON IS ON A HORSE ON THE BEACH SHORE
a group of men work on a air balloon
Bicyclist riding on a city street at night.
An apron is flying in the air next to a tree.
a motorcycle with a bag on the back of it parked in the road
A young man talking on a cell phone with a stuffed animal on his stomach.
A baby on a brown horse next to two people.
two people in a kitchen preparing food
A table of doughnuts with light showing on them.
A women who is wearing snow skis and performing a jump.
A man with glasses and a tie stares straight ahead.
three cats a gray one a black one and a brown and black one on a bed
a backpack and luggage on a car seat
A man with a backpack holding a bottle of beer.
There is a woman sitting in a boat drinking something.
A zebra and a "part zebra" eating grass.
some signs on the road showing the street and direction
A nice setup of stuffed bears having a picnic.
A man is surfing a small wave in the ocean.
THERE ARE DOUGHNUTS THAT ARE ON A PLATE
A woman and a baby walk on a grass field where kites are in the sky
A black headed woman skiing in the snow.
Two laptop computers sitting side by side on a wooden desk.
A couple of sheep walking across a lush green field.
There are a lot of sweatered teddy bears in this pile.
A doughnut with sprinkled sugar and icing on it.
A young smiling boy stands holding a set of Wii motes.
A group of elephants is walking across a grassy field.
A large breakfast omelet, english muffin and fruit
A zebra standing in a stall with its mouth open showing its teeth.
A small elephant walking around in its enclosure
A man is talking to children about surfing on the beach.
A green street sign next to a neon sign on a building.
A man leans back in a chair with a beverage.
Woman posing in front of two pints of beer.
cabinets a sink dishwasher and stove and a window
A woman riding on the back of a white horse.
Two white swans and grey ducks  in a grassy area.
A plate filled with an assortment of food
The toilet is broke and sitting on the grass.
A blocked off street that is ready for a event to happen.
a red and white firehydrant sitting in some grass with cars and trees in the background
Cows are in a pasture with one glaring attentively for a photo.
Two people are on a bike together traveling down the road.
A couple of people walking across a beach with a surfboard.
a chef slices jalapenos on a cutting board.
The plate of food with a spoon on it has broccoli in it.
A contemporary light-rail train seen from the front is stopped in a station.
A bike and some people on a street.
Baseball player on the ground at home plate while an umpire makes a call.
a woman holding up a smart phone while smiling.
Woman surfer in the river catches a wave
a close up of a pair of scissors in a scissors pouch
There is a man standing in the kitchen.
The man is reading the paper on the bench.
A man riding on the back of a motorcycle.
A stove with the clock set at 1159. There is a spice rack on the stove.
The cab of an eighteen wheeler in a parking lot surrounded by trees.
A clock is standing in the middle of the grass in the middle of the afternoon.
a stop sign and no right run sign in a big city.
a person crossing the street in front an orange trolley while holding a garbage bag.
A woman standing in a living room with a Wii remote.
A man is sitting on a chair playing a guitar.
A toilet sitting under a metal bar in a bathroom.
A large dog laying on top of a bed in a bedroom.
Two women and a man sitting at a table.
an image of a cat walking in the kitchen
A mini pizza with an egg in the middle.
Three women play with Frisbees in a shady park.
A woman and a child are hiding under the covers.
A topless girls sittinng on a bed holding a bear and leaning on a suitcase.
a man gets ready to catch a frisbe
a female lunging after a tennis ball holding a tennis racket in both hands
this is a green fire hydrant and brick street
Plastic containers filled with food including fruits and vegetables.
a couple of guys are sitting at a table
A table is set with plates with pancakes and bowls of fruit and a bottle of syrup.
Outdoor art piece of an elephant covered in paint being displayed for sale.
A woman and her two children walking in the rain while holding umbrellas.
A woman standing next to a giant refrigerator freezer.
A clock tower sitting in the middle of a parking lot.
There is a tea pot on the gas range.
A large long train on a steel track.
A green and a pink bus are next to a store.
A very nice looking motorcycle parked by some trees.
A sink of a bathroom with things on the counter
A person on a surfboard rides a wave.
a person and a child playing with a kite
The stop sign is across the street from a bridge.
A girl is holding a umbrella.Someone shorter than her took the picture. She isn't smiling.
ball bats standing on end leaning against each other
A train drives under a sky walk for pedestrians.
A lady is paying the Wii at the store while a man looks on.
A man is on the snowy hill in his ski gear.
A very long blue and white bus pulling out of a parking lot.
Four girls and two boys sitting in the back of a parked white Ford Super Duty pickup truck.
A red traffic light at a street corner with vehicles near it.
A woman is riding the horse while the crowd watches.
A teddy bear sitting in a window holding a cell phone.
Dishes with strawberries and walnuts are set on a table.
a police officer riding a bright yellow motor cycle.
A man and a woman riding a scooter past a church.
Military float plane flying overhead on cloudy day.
Assortment of sliced pizzas in yellow cardboard boxes.
there is a small pizza that is on a white plate
A man standing on top of a base on a field.
Male tennis player standing on a court holding a racket.
A man in the living room plays a game on a game system.
A close-up of the face of the horse with a woman on the back.
An open face sandwich and a pile of potato chips on a plate.
The man is outside skiing in the snow.
Several color fruits and vegetables, all unprepared on a concrete surface
A small bathroom, with a commode, and a bathtub with bath toys in and around it.
A large boat floating on top of a lake surrounded by a forest.
A person carrying an umbrella walking on a path next to water.
What a funny picture of one giraffe hanging on to the neck of the other.
A man standing in front of a laptop computer.
a male surfer in a wet suit some rocks and water
A cat sitting underneath a bed in a room.
A stop sign is at the bottom of a four way stop.
A blue, metallic parking meter with a yellow number six.
A brightly colored food item is on a white plate on a black table.
Man skiing down a slope just beneath a lift.
A girl holding a yoga mat riding skateboard down the street
A laptop computer sitting on top of a wooden desk.
a couple of treys with some food inside of it
The chocolate cake on the plate is topped with strawberries.
A woman is standing in front of an old train.
Black and white photograph of animals and horses in field.
Two zebras graze on grass by a dry creek bed.
A man standing in a kitchen preparing food.
A rabbit on its hind legs in front of pigs.
a man sitting on a wooden box in front of a mural
A partially eaten pizza sits on a tray on the table.
There is pizza topped with white sauce and broccoli.
White planes lined up in a parking lot.
A young man sitting on top of a skateboard.
Black and white photograph of a man on a skateboard.
Red Regio buses parked close together in a line.
A group of people waiting on a train with items balanced on their heads.
an empty park bench sitting among the trees
A gravestone with a vase and stuffed animal on it.
A couple of people on a sidewalk holding umbrellas.
A man with black sandles standing in a dress store.
A crowd of people standing next to a vending machine.
A slice of a  banana on the table
Monkeys eating through the peels of bananas.
Three zebras are standing in the grass, while one stares at the viewer and other two stare off to the right.
The boat makes a big splash in the water.
Group waiting to take their turns on the ski jump.
A jumbo jet flying in the air during the evening.
A dog lies on a floral rug near a living room window.
A baseball player swinging a bat over home plate.
A person with a snow board posing for a picture.
A small glass table with vases on top sits by an open window.
several zebra are walking together at a zoo
A woman in a hair net in a bakery holding a box.
A train covered in graffiti sitting on top of  train tracks.
A young professional is working at his laptop while his coworker is reading material.
A bed, chair, drawer and a wall hanging
A kitchen counter covered in pots and pans and appliances.
A giraffe eating leaves from a tree near a forest
a small child has a brush in his mouth
Trees and a street sign  next to a street.
A dog is poking its head out of a vehicle window.
Police officers ride horses on a city street.
Trees mark the far side of a fence that encloses a large environment space with man made rocks and two giraffes, one close up and very large, the other small, and seemingly far away.
A plate contains skewed meat with a side of vegetables.
A red stop light across from a brick building
Table set for two with pancakes and syrup.
The brown and black cat is laying on a computer laptop keyboard.
A picture of a person walking down the street.
Three zebras that are standing in the grass.
A train traveling across a bridge over a river.
A donut that with sprinkles on half sits atop a Nautilus jump rope.
a transit bus parked near a building near a cart
A dirty toilet in a small bathroom.with items on top.
Bathroom with radiator, sink, lighting, shower curtain and decor items.
A young man performing a trick on a skateboard,
A white plate filled with pasta and broccoli.
A little boy holding a Nintendo Wii game controller.
A group of zebras watching in a field.
A black bird with spiked hair standing on rocks
A red double  decker bus in front of a white one.
The tattooed man is talking on a cell phone.
Luggage on a tiled ground and people sitting on rows of chairs in the background.
A very pretty bird perched by a tree.
Farmers markets have become popular destination points in metropolitan areas.
A man in a grey suit with a blue pixelated tie leaning against a wooden podium.
Two women in white dresses playing a game of tennis.
A blue and white bus that is parked next to another bus.
Three young boys are cavorting along an old sidewalk.
A beautiful blonde girl standing next to a blonder.
A man and two girls sitting on a couch with a dog.
A brick wall with a sign giving directions and a clock on top of it
An African-American man, wearing a shirt and tie, glasses, and a cap, is looking downwards.
A small grassy hill with three sheep at the top and a fence along the side.
Lots of people walking on a city street with Chinese stores on both sides of the street.
A white bowl is filled with broccoli garnished with crumbles of cheese.
a man in glasses holds an umbrella with a brief case
Parents at a chain link fence watch a Little League baseball game.
A dog riding on to of a yellow surf board on a wave.
A young girl sitting in a chair covers her face.
A jockey on a horse jumping over a hurdle.
Large open living room with black leather furniture.
A red stop sign sitting in the middle of a road.
A bird sitting in a shallow pool of water observing something.
A old photo of a pitcher on the pitchers mound.
A large quantity of banana's piled in a fruit stand.
An old fire hydrant sits on the grass in a park.
An old truck sits parked in an empty grassy lot.
This desk has a computer paper, water bottles, and a rolodex on it.
An airport has several planes on the runway.
A man rides a surfboard down a wave.
A worn down stove and oven sitting in a parking lot.
A stadium full of people watching a batter hit the baseball at a game.
A man with a wrench turning off a fire hydrant.
A blue and white train raveling past a rusted out train.
The man is covered with a net and sitting on the ground.
A patio table with two dinner plates of food and two bowls of salad.
An oriental style room with tatami floor coverings.
A kitchen with gas stove with four burners and a sink.
A pile of red and green apples sitting on top of each other.
A person on a snowboard jumping a snow covered hill.
A woman smiles while eating a pita sandwich.
Crowd of people walking in snow in front of buildings
A woman and man dance while smiling.
A bird perched on a grave in a graveyard.
A man with long hair and a beard smiling with his arms outstretched.
A large cow laying on top of a sandy surface.
A woman with green hair standing beside a brow and white horse
A row of planes flying in sky with smoke coming from their tails.
A boy with a tennis racket bouncing a tennis ball in the air.
A skateboarder is standing, wearing a helmet and holding their board.
A man on top of a ski slope on skies posing for a picture.
A glasses wearing woman with a hotdog sandwich to her mouth.
A child sleeping with a teddy bear on a bed.
A table with plates of food that include corn and fruit.
an Amtrak train with eight cards beside a field
A Honda motorcycle parked next to a grassy area.
a close up of two pizzas on a plate
Two giraffes in a field between multiple trees.
A little zebra playing around inside an enclosure at the zoo.
Some black kitchen machines used for cooking food.
An older man holding a plant with banana bunches.
a person that is standing in a kitchen
The man in the colorful shirt pulls two luggage bags behind him.
there are many people awaiting a train at the station
A sign next to a stone wall stating the road name.
A giraffe walks on grass looking for something to eat.
a close up of a plant of bananas
this kitchen is large and has wooden cabinets and a granite island
A display of teddy bears are on an outdoor blanket.
A soccer player kicking soccer ball around opponent.
A plate of pasta and a bowl of spaghetti
a stop sign sits on a street corner
Men play a soccer game on a dirt field.
A pepperoni pizza is bigger than the child sitting in front of it.
a group of small boats in a body of shallow water
A person skis down a snowy hill while others watch.
A long train going down the train track.
A sink and some shelves are in this small bathroom.
A small car is pulling a man on a bicycle.
a person standing in front of a mirror with his reflection in a different pose
A bus that is standing next to a building.
three friends hanging out on a snowy hill
Half eaten berry filled dessert on a white plate.
A living room with several books and paper on the floor.
Part of a ship sits in the shallow end of the bay next to a city.
The letters of a laptop keyboard are sitting on a wooden table.
A sign with many different stickers placed on it.
A young boy hitting a ball in a yard with a bat.
A small airplane flying through a blue sky.
People in a field are looking up at a kite.
A girl stands holding the string of a flying kite.
Herd of sheep resting in the shade of tree in open area.
cat sitting on top of a red and black motorcycle outdoors
A toddler getting help from an adult to brush its teeth.
A stack of orange solo cups near scissors.
An abstract photograph of a moving train on its way to New York.
A group of young people sitting around a piece of luggage.
A man is surfing the internet on his laptop.
A bunch of people in a metropolitan area with umbrellas, walking on a sidewalk next to buildings.
A man is riding a horse in a fenced enclosure.
This woman and man are holding a gold bag.
A man riding a motorcycle with a woman on the back.
There is a bear walking across the grass.
A young boy playing video games on tv.
A number of people are in a building with many colorful items over their heads.
A large passenger airplane sitting on a runway.
Two police officers riding motorcycles down a city street.
A very neatly organized display of many items.
A person with a red umbrella and a dog on a walking trail.
Three children are in the bathroom brushing their teeth.
Signs directing traffic in front of two several story buildings.
A desk along a wall with book cases over head.
A boy is performing skateboard tricks in parking garage area.
A cabinet holding several oriental vases and lamps.
A dog laying on the ground between someone's legs sitting in a chair.
Small piece of cake on china, with a fork.
A group of people standing by a white and green train.
Paperback book about Mother Theresa on a pillow
A clock and some books in a room.
A woman raises her tennis racket on the court
The man is holding a large bird outside.
A blue sign suspended above a street with cars driving under it.
People walk down a rainy avenue carrying umbrellas.
Two snow skiers pose on a snowy landscape.
Lake with boat in grassy fields with cows.
A female tennis player readies for a hit
Broccoli and chopped carrots sit next to each other.
A large bed with pillows and a blanket.
a small airplane that is just lifting off into the air
Three cats surrounding a stuffed bear holding the sign that says help.
A group of people sitting around a wooden table.
The child smiles next to a stack of donuts with pink icing.
An old western town miniature in the backyard of a house.
A train is coming down the tracks next to a field.
Sepia photograph of a stop sign next to row of mailboxes.
Shirtless man in white shorts writing on top of a skateboard.
two ladies riding horses there's a reflection of one of them in a mirror
A man in a yellow jacket that says police is looking across the street at a crowd of people and has his hand on a wooden structure.
A little league baseball game showing a batter, catcher and umpire.
Two sheep graze in a grassy field at the edge of woods.
cars parked on a city street with buildings in the background
I am not sure what kind of food this is.
An airplane flying over a beautiful ocean shoreline peppered with sailboats.
A train pulls into a station constructed of brick, rock, and metal.
A large passenger airplane stands at the gate, near cargo vehicles.
A hot dog has ketchup, mustard, and mayonnaise on it.
A fire hydrant is decorated to look like a dog.
Three girls biting into a piece of fruit.
Two blue bowls of food next to a bottle of cinnamon and sugar.
A man is water skiing in the ocean.
Yellow commuter train at station near industrial area.
Cat overlooking keyboard as seen from above in lit room.
Street sign at intersection is written in English and Arabic.
A pizza sitting on top of a blue plate near a salad.
A tower clock on a building in the city
A person wearing gear sitting on the side of a fence.
A plate of baby carrots, mashed potatoes and tuna set on a table with a cup and utensils.
A man talks on his cell phone in front of toothpaste advertising.
A car carrier truck with a car loaded on it driving thru a city.
A street sign in front of a gated parking lot area.
Two men on a sunny beach flying a kite
A young man about to kick a soccer ball on a green field
a young boy smashing a toilet with a little sledge hammer
Round vases sit on tiny shelves against a white wall.
a train that is on a train tracks that is a model
A young man is standing in the surf on a surfboard.
a bench next to a tall thick tree
Three people in skiing gear posing with trees in the background.
A wooden clocks sits above a shelf holding several books.
Two elephants walk in an open savanna with dried grass.
A fire place with a clock above it on a mantle.
A large number of pots that are grouped together.
Four pictures of people skiing and snowboarding on a snow covered slope.
A man is playing frisbee by herself on the beach.
Black and white photo of person walking with umbrella.
Buses are lined up in a single line along the curb.
A young man playing tennis with his hat on backward.
a rainbow umbrella some bicycles a fence and some grass
A group of people riding skis across a snow covered slope.
a hand a cellphone a laptop and a beer coaster
a little plane flying across a blue sky
a couple of women sits around a counter top
A train passing down a station, in the middle of the day.
A city bus leaving a bus stop on a residential street.
A group of people are gathered around a large pizza.
A young boy posing with a baseball bat for a team photo.
A cut in half sandwich sitting on top of a table next to a foam container.
An air plane landing on a landing strip.
Paintings are hanging on the walls in a living room.
A food store is architecturally designed to include a clock.
A horse is walking in the sand along the water.
A cow lying down in the grass with a cowbird next to it
Cows walking on grass and a dirt road
A sales person showing a customer different phones.
a child holds a spoon of rice while a woman offers the rice on chopsticks.
A boat and lighthouse are in a wavy, stylized painting.
a number of cows in a field near one another
A woman standing next to a man in a living room.
A child in diapers standing on a bed.
A city bus driving over a bridge under an overpass
Protesters gather with signs on a street corner.
A man in grey shirt with red tie and red baseball cap.
A soldier is mounted on a horse as a small dog walks near.
A small bathroom with shower, toilet and vanity.
A very tasty looking sandwich and fries waiting to be eaten.
A man uses both hands to swing his tennis racket
A picture of someone carrying video equipment in a bag.
A baby sitting on a bed holding a book and smiling at the camera.
Decorative event banner at a field full of flying kites.
A woman on a tennis court holds her racket as she finishes up her swing.
Two boys who are playing soccer against each other.
Two women and a child flying a red, white, and blue kite.
A little boy looking at his birthday cake.
A young boy eating cake with his fingers quite messily.
Two busses on a street next to sidewalk and trees.
a man throwing some kind of frisbee toy
strangely colored luggage stands out in a line of passengers
A large open room has an overhead book shelf.
One of the two children is smiling as they pose next to each other.
Four bicycles with baskets parked under a tree.
Young girl spilling water into canisters in the park.
A picture of a small bathroom taken outside the bathroom.
A light brown teddy bear sitting up posing.
A boat sitting on water next to a red bench.
There is a cat sitting on the back of a motorcycle.
Young boy carrying white Frisbee with toy stuffed monkey on back.
a cat napping on the laptop while on firefox
A female tennis player holding a ball and a racquet
Three slices of cheese pizza and a quesadilla are on a plate.
a number of people standing holding white boards
a man leaning over as he plays a video game with a wii mote
A desk has different peripherals, computer, and a binder beside a shelf full of books.
Two females stand in a modest dorm room
A miniature wooden toilet in a doll's house bathroom.
A man that is standing on a surfboard in the water.
a women grabbing onto a statue holding an umbrella
Sheep and lambs grazing in a pasture behind a hedge.
A woman is holding a baby next to an elephant.
A tray filled with fresh vegetables on a wood table.
a large group of snow skiers out side of a ski lodge
A group of skiers has gathered at a red fence
A living room with couch and fireplace in it.
A pitcher wearing a red shirt and red cap throwing the baseball.
The side of a motor bike and side mirror.
A young child standing next to a large box.
Black and white bathroom with large shower stall.
four kids holding wii controllers in a living room
Four stuffed animals, a leopard and three teddy bears, in a row sitting on a stone ledge with grass and trees behind.
Two people riding on the back of a large elephant.
A toilet that has an open lid with water in it.
A bearded man holding a wire whip and a Wii controller.
A flower shop has a wall full of differed colored vases.
A baby sitting at a chair by a computer desk.
A man in a grey t shirt holding a purple frisbee
A stop sign painted on a wood pole.
Two zebras in a open dead grassland, one is eating
A glass block wall in a bathroom is shown
A black and white picture of a traffic signal in a city.
An old military plane on a runway with wings folded.
A young man riding a surfboard with a large wave behind him.
IN THE BATHROOM THERE IS A TUB TOILET AND SINK
A small bird perched on the windshield of a car
The train car has been vandalized on the outside.
a elephant walks through a vegetation area next to some trees
Some very big commercial planes all parked in a row.
Young women are playing frisbee on the grass.
Boat in the lake looking for spot to dock.
An alpine skier leaning forward while jumping through the air.
A diner with large pepsi signs on the front of it.
A truck pulls a construction truck on its back.
several female soccer players engaged in a soccer match
Young man holding a skateboard and his helmet.
A person with a snowboard, sitting in the snow.
A young boy is holding a frisbee with a picture on it.
Three adults sitting on a couch looking at their laptops.
A tall brick clock tower with a clock on each of its sides.
A bride and groom are cutting into a cake.
Box of various doughnuts on a wooden table.
A pizza with different toppings sitting on a plate at a table
A man and a woman are flying a kite.
Four teddy bears outside sitting on chairs on a sidewalk.
A young girl combs her hair with a yellow comb.
A woman with an umbrella in front of a crowd
A cluttered desk, containing a laptop, blue water bottle, and many other items
There is a coffee sign below the stoplight.
two brown and white cows in a forest
a street sign on a light pole on a city street
A parking meter sitting beside an empty street.
a small child in a white shirt  and a bowl of cereal
A sandwich with french fries and cole slaw.
A train on tracks with power lines and buildings in the background.
A red bird sits in a bird feeder in a tree on a sunny day.
A woman posing in front of a batch of apples.
A cat chewing on a packaged pink toothbrush.
The back end of a semi truck driving on a divided highway.
a couple of people that are cutting a piece of cake
monitors are hanging over people who are sitting down
A young baseball player gets ready to field a hit.
A teddy bear sitting on a ledge of a building
Black and white birds walking in the grass near water.
two elephants together standing on a dry plain
A counter top with a plate with a fork and few scraps of food and a teddy bear lying on side with arm outstretched on plate near fork, with another plate with an apple and two bowls with produce, a canister and some metal objects.
Street and stop signs direct traffic in the proper direction.
A lady is holding her tennis racket for the crowd.
A BOY IS PLAYING WITH A FRISBEE IN HAND
Woman pushing a cart of luggage in a transportation terminal.
A nice looking store on a sidewalk near some other stores
A train is making a turn past a closed station.
A cat is sitting on the desk by the mouse.
A man sits on a crate with bananas nearby.
A automobile with multiple bicycles on a roof rack.
a small dog is standing on a motorcycle
A cat wearing a colorful hat over its head.
Child sitting in high chair with plate of food, stuffed animal in buster chair and bottle of ketchup. Another hand holding a fork and a partially filled plate.
A cluttered room contains green counters, a brown table and windows.
A horse attached to a carriage on a street.
a computer, a keyboard and a mouse and a bottle of wine on a table
A large long table full of many laptops.
some people and signs a bicycle two horses pulling people in a cart
A dog suspended in mid air catching a frisbee.
A man jumping up to catch a frisbee on  the beach.
A stop sign sits along a road next to a shore
A pizza sitting on top of a white plate covered in cheese.
A sink underneath a mirror inside of a bathroom.
A bunch of yellow and orange fruit in varied sizes.
A room filled with lots of toilets and sinks.
A man riding a surfboard on a wave in the ocean.
A woman standing on top of two pieces of luggage.
A parked motorcycle on a dirt road in front of an old building.
a small bird on a fallen branch near other trees
a pole holding a couple of street signs beside a building
a rusty brown train track with just one train on it
Pears, cheeses, cornichons, and other delicacies are artfully displayed on a dish.
Young man picture of receipt with the phone.
A cat laying on top of a refrigerator.
A WOMAN IS GIVING A MAN A HAIR CUT
A zebra standing on a dry dirt lot.
A girl sitting on a couch is adding something to her mug while other people stand nearby.
A train traveling down railroad tracks next to a train station.
A lot of toys that are on a table.
A horse and foal galloping through the woods
A person skiing down a snowy mountain side.
A cutting board with fruits and vegetables that include broccoli and blueberries.
A giraffe is walking through a wooded area.
A large truck turning onto a road in a city.
A display of coffee and sandwiches on a patio table.
A pitcher in a baseball game pitching a baseball.
An empty beach dotted with straw umbrellas awaits tourists
The colorful bird is perched on the branch.
a couple of birds swimming in a lake
A hot dog that has some cheese on top of it.
A red train parked on a train track.
Young girl in sunglasses standing in a lawn, holding a frisbee
A kitchen scene with yellow walls and a checkered floor pattern.
Two children and an adult ride in a horse pulled cart.
A woman is laying in her bed playing on her laptop.
A bird flying through a cloudy sky over a body of water.
A man is holding a surfing board on a beach.
Two brown horses inside of a steel fenced corral.
two adults dressed in ski attire and skiing in snow in an open field
A laptop that has a picture of outside a window.
some people are riding horses at the beach
a picture of a sign post for a bikelane at the corner of Hancock ave.
A man that is standing in the dirt with a baseball bat.
A mother and daughter smile as they eat their meal.
The old plane is now hanging up as a decoration.
An elephant with tusks curling its trunk upwards, standing behind a fence in the sand.
Replica wooden sailing vessel with passengers in a harbor.
a little girl sits on a swing with a stuffed animal
A cat stands on a bathroom floor alone.
Giraffe trying to reach some leafs on a tree.
Two slices of bananas next to ice cream on a plate.
A bunch of biker dudes being led by one on an orange bike.
A pizza that is not quite shaped correctly
there is a small plane that is very close to the ground
A large living room with a cat on the rug
Four glasses of wine sitting on a bar are half filled.
A man flies a kite at an event from afar.
Half of an airplane jet over a snowy mountain range.
A man holding a computer mouse next to a glass of water.
Young child enjoys a deathly meal for dinner
The pepperoni pizza is served from the restaurant.
A car driving on the road near a road sign and a bird.
Two draft horses pulling plow, color, under cloudy skies with trees and other horses in background.
A kitchen has wood cabinets and white appliances.
BASEBALL GAME WITH BATTER UP, READY TO SWING
The blender is full of some type of beverage.
A crowd of people standing around a pole with three fire pits attached to it.
an animal behind a fence next to a tree
Four cup cakes with sprinkles on a plate.
A man with a large remote controlled hobby aircraft.
Major League Baseball players practice throwing on the field between innings.
Arms and hands holding onto the bars of a bicycle.
A man talks on the phone at the table.
A pigeon is standing and eating in the street.
THERE IS A PERSON THAT IS WALKING WITH A SUIT CASE
Woman of African descent in mid tennis backhand.
A cup cake in one photo, an empty wrapper in the next photo.
A man in a suit eats a banana in his car.
Three zebras stand together in a field of grass.
two city buses one  following the other
Several sheep grazing in the grass on a sunny day.
An orange RV and white mini-bus is parked in an adjacent lot from a building.
Man standing in a yellow room holding some kind of remote
A young man holding a foot long hot dog covered in pickles.
a bunch of sheep are standing in a field
Dark haired man making a serve at a tennis match.
A large number of people riding motorcycles down the road.
The two people are ready to serve the variety of donuts.
A green broccoli plant with lots of green leaves.
A guy with a cap holding a blue surfboard.
Adults and children sitting on a bench at a park.
A person on a surfboard riding it in the water.
A white plate with two slices of cheese and a whole banana unpeeled.
A woman flips a tortilla in the kitchen from a skillet.
Two  beige plates with thick sandwich and mustard.
a aircraft flying above a snowy mountain
A view of a pizza from a table, with a man behind it.
Public bus travelling down road past apartment buildings.
An Air Canada airplane is waiting at an airport.
Animals outside a shelter grazing in a pasture.
there are two yellow empty school buses
a big boat that is floating in a body of water
A skateboarder rides his board through a skate park.
Two people engaging in water sports in the ocean on a cloudy day.
A pink room with two urinals near a door that says Catering Staff Only
Quail walking in tall green grass near a fence.
Two young boys read in bed using a lamp light.
A black cat on a wooden table in front of a laptop.
A little dog is staring at a herd of sheep grazing in the field.
Two aeroplanes with two sets of wings flying in a clear sky.
A surfer holding a surfboard straight up on a beach in front of ocean waves.
Man in all black doing a trick on his skateboard.
A male skateboarding over steps in front of other people.
A group of people who are serving a cake.
Several older men sitting in front of a library.
a pizza sits inside of a box on a table
A suitcase is sitting in a hotel room
Some people standing under an arch which has a fancy clock on it.
A bed with a blanket underneath a window.
2 baseball players in the field prepare to catch the ball
A family of zebras in an open landscape.
a bunch of bananas hanging near a blue wall
A couple of bears standing next to each other.
a man holding a cell phone sitting in a car
A stand on the side of the street with political tones.
Two guys at a skate park having fun.
A large plane with airport terminal in the background.
A tray with a glass lamb next to a pot of flowers.
A small, dirty bathroom has peeling yellow walls.
Two bowls of food on metal plates next to a fork and spoon.
a close up of a dog with its head in a bag
A couple sun bathing near their bikes on a bay
Cars and people on a street traveling under a traffic light.
A green walled building in the middle of a brick wall.
An Apple desktop with an animated figure on the desktop
A street scene of a bus coming down the road and dark clouds in the sky.
The woman waves at another surfer also carrying a surfboard.
A sheep with lots of fur on a fence in the field
Closeup view from front underneath of a commercial airliner plane in the air with wheels down, against blue sky.
A woman is taking a picture of herself.
a close up of a plate with a doughnut near a cup
A man about to hit the ball with the tennis racket.
A marina full of boats nearby a seaside town
A large brown dog running across a grass covered field.
A silver tray on a counter serving pizza.
A desk with a cell phone and two computers.
A group of people sitting around each other in a room.
A small yellow car with a driver sitting on the right side of the vehicle.
A skateboarder, holding a skateboard in front of the camera.
A bride and groom cut the cake on their wedding day.
A truck driving down a road along side of train tracks.
A stuffed teddy bear sitting on a green bed.
The famous Suzuran Street in Tokyo during the day
A street sign for Rodeo drive is seen in close up.
A kitchen with a sink, mirror and window.
An older man surfs in the large waves.
The two young children are playing with a plastic chair.
Three laptops with faces on the screens on a bed.
A group of boats sitting in a water cove next to some buoys.
A Dell laptop on a desk is surrounded by cords, books, and papers.
A blonde girl in green shorts playing tennis.
A man is sitting on top of an elephant.
A white and orange train traveling down train tracks.
Two trucks with workers in the extended baskets.
A man wearing a snowboard is standing on his head.
American airlines commercial jet sitting on a tarmac.
A yellow fire hydrant sitting on the side of a road.
A woman holding a plate with a pizza on top of it.
A person is holding a banana that is dressed in a costume.
An old black railroad car parked on the tracks
A man that is standing on a tennis court with a racquet.
A cupcake that has a ribbon on it.
A young man standing in front of a white plane that a young woman is standing in.
Many skiers are on the snow covered mountain side.
The glare of the sun cuts across a wave and a wet-suited surfer coming in on the tide.
A woman with a bat hitting televisions that say Comcast Doesnt Care.
A traffic light monstrosity shaped like a tree sitting in a parking lot.
A little boy flying a kite up in the sky on a beach front.
A herd of zebra standing along side of a river.
The tower of the building has a big decorative cross on it.
Bottle of red wine and red wine in a wine goblet.
A crowd watches a player pitch a ball in a baseball game.
Several people waiting for the train to arrive.
A man is next to a boy on a surfboard catching a small wave.
A man stands in a train station as a train passes
Two horses in grassy area with fence and house in background.
a clock in the center of some plants and bushes
Several men are all trying to catch a Frisbee.
A woman and a little girl approaching a train on the tracks.
A toaster oven that is heating up on a table.
Many animals sit on the beach next to the ocean.
A person riding skis across snow covered ground.
A bus is going down the road at night.
A person on the street with ear phones near a parking meter.
A laptop computer sitting on top of a table.
A horse that is standing in front of a carriage.
Two people wearing jeans sit on a bench with their legs crossed.
A sign with a button for crossing on a street corner
A herd of sheep with a  man standing next to them.
A bunch of animals being held during a competition.
The man is holding a teddy bear wearing a hat and scarf.
Statues on the second floor of a building, sitting below a clock.
A man is holding his cell phone overhead.
A yellow traffic light hanging over a city street.
Crowd of attendees among colorful display on banners.
A woman across the table puts her hands over her mouth and nose
A motorcycle full of gear parked on a gravel road.
A boy is on a tennis court carrying a tray of balls.
a kitchen next to a wood floored living area
Two elephants with their trunks raised are at a log rail.
A tray that has various plates, with various foods.
Sandwich and greens on a plate with a glass of water.
A bathroom vanity with a his and hers sink.
A woman with a blazer on has her hand up to the side.
A chair sitting at a fire hydrant near a road.
A person who is wearing glasses holding food in their hands.
The two people are talking about items on the computer.
A home office with a cat sitting in the middle of the desk.
A granite counter with a plate of food and a drink.
A chocolate cake with chocolate frosting and zebra top
A person that is in the water having some fun.
The jet airplane is parked near a field of tall grass.
There is a double decker bus that is red and beige
a laptop on a desk with an extra keyboard
A kitchen in industry with empty everything
A group of five posing for picture on skis.
Four older men sitting on a wooden bench.
A picture strip and a pair of blue handled scissors.
A kid with a large umbrella on a street.
a group of zebras under a huge shade tree in the middle of a grassy field
A farm along a river overlooks a wind turbine.
A man wrings his hands while observing a tray of pizza outside
A kitchen sink sitting under a kitchen window.
A boy in blue striped jacket playing with a toy.
There is a mirror and trash can and a mirror with two cats nearby.
A bunch of goats are eating out of a box
A woman smiling while holding an open umbrella.
A little boy stands outdoors on a rainy day with a pink umbrella
Hotel room with a pair of beds and a sliding glass door.
Two giraffes in a pen, one walking, one standing still.
A Seattle Mariner's baseball player is up to bat at a baseball park.
A woman standing on a bridge holding an umbrella.
a close up of a keyboard on a desk
THIS IS A PHOTO OF A BLUE MOTORCYCLE
a close up of a person cutting a piece of cake
A vintage clock from the 19th century tells the time.
a kitchen with counters a door and cupboards
The white head of an animal sticks out from a field of green grass.
A woman in a red jacket sits astride a white horse.
The people sit at the bar next to the motorcycles.
The children are getting ready to enjoy a piece of cake.
People on a motorbike near a vehicle loaded with food.
a man on a pitchers mound lunging forward delivering a ball
A giraffe walking near a tour vehicle in the grass
A person spoons macaroni and cheese into a bowl.
A very big messy bed filled with many items.
A plate of food with various items on it.
A refrigerator plugged into the wall of a kitchen.
a brown and black acoustic guitar and an orange frisbee
A zebra follows another zebra through a park.
A large vase contains an assortment of flowers.
A man in a wet suit is surfing in the ocean.
a person holding onto a partially eaten donut hole
A cat that is sitting on a motorcycle.
Blue commercial airplane getting loaded at the gate.
A man riding a motorcycle on a race track.
A dog's face is partially showing and being blocked by something.
A shot of a baseball player about to throw the ball.
There is an old yellow train coming down the tracks
Two vehicles cross under several street lights at night.
A cat on a toilet seat in some dirty washroom.
A group of people stare up at something out of the frame.
Meat and mashed potatoes smothered in gravy with peas and carrots and bread
A series of two pictures with a small dog wearing a fruit hat.
a man goes down the street on a skate board
A large elephant standing next to a pile of dry hay.
A woman and a horse standing in a corral.
The picture is full of many suitcases with tags.
A woman walks across the street at the intersection.
Image of a bedroom featuring a modern style bed and other furniture.
A man wearing skis and holding a handle leans toward a sandy plain.
A group of men in bathing suits next to an airplane boat in the water.
A man is standing outside a store at night time.
People stand by a truck near a street filled with vehicles in a city.
the bench is completely covered in snow so is the tree
Closeup of row of yellow hats and baseball mitts.
The small refrigerator holds several different types of drinks.
A wine glass with wine next to a wine bottle.
A man riding a motorcycle down a road with a POW - MIA flag.
a number of people standing flying a kite
A police officer on a motorcycle patrolling a protest.
some bowls of food, one with broccoli, the other with some chow mein noodles
Two people, a woman wearing a hat and carrying a paddle, and a man, both hold umbrellas.
A white plate topped with meat and two types of veggies.
A dog lying down on the beach.
Two girls holding Wii remotes and nunchucks while standing up
a woman is holding a tennis racket on a court
The motorcycle is sitting beside of the people.
Headless statues show off clothing beneath a colored background.
Public transit bus traveling past brick large building.
An intersection during a cold and foggy night.
A couple of people riding on the back of an elephant.
A man and two dogs stand near a park bench.
A woman is jumping on a hotels bedding.
A golden vase filled with flowers on top of a table.
A few pieces of pizza sit on a skillet.
A rusty bicycle filled with mangoes and bananas.
A red fire hydrant sitting on the side of a road.
A motorcycle is parked on the grass while people look
A very colorful old style train engine on the tracks.
a toilet sitting underneath a big window
The man and the girl are flying a kite at the beach.
Two double Decker buses on a two way street.
The street signs are clearly visible for all to see.
A male chef holding up a knife in a cooking area.
a small kitchen with stainless steel appliances and a large window
A white toilet sitting in a bathroom stall.
A cat standing on a woman's shoulder in a bathroom.
A teddy bear cake with a candle and sparklers.
A passenger rail train leaving the train depot.
A man swinging a tennis racket at an outdoor court.
A white-and-black cat sitting on top of a laptop.
A snowboarder is doing a trick mid air.
Four beautiful women in red posed around a motorcycle.
A camper brushing his teeth standing on a stairs brushing his teeth.
a close up of a bird flying thru the air with people in the background
Plate of food with green vegetables on top of bread.
Business is slow at the local bathroom sink shop
a close up of a person bending down feeding a dog
A large brown wooden fence near a wooded area.
A skateboarder skating up the ramp at a skate park.
A stainless steel stove that is in a kitchen.
Two sumo wrestlers and referee with people watching.
A man sits in a chair and pets a furry dog.
A small clock sitting on a bedside table
The dog is in the car with his head out the window.
A sole person sits in the front pew of a large church.
Two guys talking while standing near a parking meter.
A white bowl filled with a caramel chocolate dessert.
A flock of birds are flying near a body of water.
A lady sitting on the bleachers looking at her cellphone.
A bare kitchen has light wood cabinets and counters that appear to be granite.
A person riding a skateboard and doing a trick in the air.
Some food and bread on a plate on a table.
A piece of cake on a plate with cream filling next to a fork.
A smiling woman pressing her head against a mans head.
A large white church with a bus outside
some brown and white oxen laying in some dirt and cars
A large living room with a kitchen in the background
A table covered in fresh produce and a book called "edible San Diego."
A group of three women standing around each other near surfboards.
A person with feet propped on top of a desk.
A bowl filled with food sitting next to two pieces of bread.
A toy train set with flowers and house
a desk with a cross on it and candles
A mixture of random tools sit on a metal tray.
A girl is hold a new white and black racket.
A laptop seems to have the infamous "blue screen of death" on the desk.
suitcases sit on a dressed up stage and bags on a dressed up table
A modern motel room features oak storage and casual accessories.
A laptop sits precariously on a desk, with a second keyboard in front of it, and windows behind it.
a cooker and an oven well cleaned in a kitchen
A small white bathroom with a colorful tile accent.
People at a gathering with some hitting a beach ball into the air.
A surfer prays while standing on his board.
A chef is in the kitchen wearing a white apron
a doll sitting by a plate with a sandwich and fries on it
a male in a white shirt riding a bicycle and some signs
BEAUTIFUL SCENE OF THE RIVER AND ALL THE BUILDINGS FROM THE BENCHES
a church with a tower and a clock built into it
A black motorcycle sits on a paved surface.
A city street filled with lots of traffic.
Chicken and broccoli are in a skillet on a stove burner.
A cat sitting in a leather office chair.
a person in uniform riding a horse
The young woman is looking at her cel phone.
A young soccer player is preparing for the kick.
A group of young children sitting around a long table.
A white plate and metal fork on a plate of food
Mom cuts the birthday cake for her daugher
Two giraffes standing by a pole in a grassy field.
A giraffe sticking its head over a fence.
A view of two computers sitting on a desk, with a man on the cell phone behind them.
This is a display of teddy bears and snow globes
The surfer expertly crouches to finish the ride.
A small boy with blonde hair sitting in a rocking chair and holding a baseball bat.
People standing in an over cast ski looking out to sea with surf boards.
A smiling blue-eyed boy toddler chewing on a plastic object.
A bird sitting on the branch of a tree.
A cellphone with a strange rainbow screen saver
A smiling shirtless man laying on a bed.
Young men are gathered together while enjoying drinks.
A toilet, sink, and shower are located inside this bathroom.
Students in a classroom watching a lecture on television.
A man sits on a bench and plays his guitar.
giraffes, zebra and bulls in zoo habitat together
a living room with an orange couch and green decorations
A lot of people are sitting on the bench.
An elephant that is putting something in its mouth.
A street with many buildings is lit up at night.
A woman on a phone with a book with peoples photos
A large platter full of colorful food product.
Three zebras standing next to each other with heads together.
A couple of people standing in a room.
Older woman and two young guys stand against the fence posing with tennis rackets
Two old ladies with rackets playing tennis at the court
a person skiing while holding onto some wires
Ostrich in enclosed area next to a giraffe.
Giraffes statue displayed in indoor room at commercial business.
An elephant is standing in front of his food at a zoo.
An old brick clock tower with a metal roof
It is very dark in the room and there are pillows on the floor.
A woman walking on a sidewalk talking on a cell phone.
The building is a piece of art.
A guy skateboarding on a street at night.
A blue painting dominates a living room with a brown coffee table.
a person riding a motorcycle on a city street
A stuffed holiday bear decoration in a garden.
View of the underside of a jet airplane passing overhead.
Lettuce, a knife and tomato slices sit on a cutting board.
A lady is sitting in a restaurant while talking on her phone.
A living area with various furniture and a bicycle.
Grown men playing an indoor soccer game on turf.
Two vans are parked next to each other.
A table has two plates of desert on it.
A man and two girls sitting at a restaurant table
An unmarked van with trailer in tow is pulled over.
A small teddy bear with a pink bow sits of a bed
A soccer goalie unsuccessfully jumping for the ball
Some flowers are in a clear sealed tube
A sign on the side of a building.
A cat sitting on a shelf in a refrigerator.
A lady is walking along side a blue train.
A red bench in the middle of a city street.
A black motorcycle with a gargoyle painted on it.
This is a modern living room with great natural lighting.
A toilet sitting in a bathroom next to a scale.
A white coach travel bus sits parked on the street corner.
Two people skiing a snowy trail lined with trees.
two cow grazing in a field with a tree beside them
An egg sandwich and other food on a tray.
teddy bears dressed up in clothing sitting on a loveseat together
Snowboarder and skiers on a bright sunny day.
A man is talking to a horse which is inside a fence.
A woman balances her surfboard atop her head on the beach.
A man in a "nun" costume riding a skateboard in a parking lot.
Three people standing at the waters edge on a beach with a blue surfboard.
there is a apple and two oranges and a stuffed animal on the bed
Many different fruits and vegetables are laying side by side.
This is an image of a cat sleeping on a table next to houseplants.
A police car next to a pickup truck at an intersection.
Looking down at a cup of coffee and a piece of cake
A child with an umbrella walks down a store aisle.
A small white dog tucked into a persons backpac
a couple of people are playing with a flying disk
A close up of a woman eating a hot dog on a street.
Two people are playing the video game while the others sit at the table
A sign on the side of the street with religious meanings.
Wooden pole in sub urban area with intersection and trees nearby.
A fire hydrant is in front of a wall which says Fire Hydrant.
A tennis player in action on the court.
A man riding skis down the side of a snow covered slope.
a baby wrapped up in a blanket laying next to a brush
A couple of plates with sandwiches on them sitting next to an open can of spam.
A girl is eating a piece of pizza.
A child is on top of a boogey board in the water.
An orange sign that says the right lane is closed ahead.
A small baby with a kite and other people playing with kites.
A man gestures over a microwave as he leans on a chair.
pair of women standing on sidewalk at roadway pedestrian crossing area.
The little league player swings a bat at the baseball.
People chopping cucumbers while a third person watches.
A large umbrella open wide on a pole.
A woman standing in a  room holding a Nintendo Wii game controller.
there is a train that is about to go through a tunnel
a truck by the water with a boat attached to the end of it
A plate of sliced bananas, melon, and orange slices.
A pie with a fork and knife place setting and a bottle of beer to drink.
A stop sign and fire hydrant on a grassy corner
A person sitting in a chair in the living room.
A donut sitting on a plate next to a cup of coffee.
A slice of pizza on a paper plate.
A modern residential bathroom with a shower over the tub
four wooden benches under the shade of a tree in the park
A large bird swoops over the waves of the ocean.
The plain is taking off from the airport.
a public transit bus on a city street
A city bus traveling down the street next to a truck.
a person riding a horse close to the water
Elephants are hitched at this post like horses in an old west town
Crowd of people at outdoor gathering on grassy field.
A woman rides on the back of a prancing horse.
A horse peeking out from behind a hedge
A little girl in blue shorts standing on a tennis court.
The tiny bird is flying next to the flower.
A group of men standing around a batting cage.
A bookshelf is packed to capacity with books.
A man with his hand on his skateboard as he is about to come down a ramp.
A dog laying on a couch in a living room.
A clock tower next to a building with a painted mural on it.
A broadcast editing room with numerous video monitors and audio mixing stations.
A double decker bus going down the street.
A boy does a skateboarding trick next to a building.
a person in a field flying a kite
A picture of a sun that is over a street.
a man is wearing headphones and eating food
A yellow and blue motorcycle parked next to a stage.
A cat stares at a television, which is turned on.
One zebra lays in the dirt while another walks away.
Large buses and cranes on the wet parking lot of a commercial building.
Two zebras in an enclosed area during the day.
Calico cat sprawled stealthily in the grass in an alert manner.
A man and woman posing all dressed up.
a close up of a cake on a plate on a table
Lunch plate with grilled sandwich, carrots, cheese, bananas, and lemon.
An old fire hydrant sitting outside in the grass.
Two women at a long table working on some urns.
The motorhome is parked outside the red brick house.
A person surfing on large waves in the ocean.
a couple of people that are under a umbrella
Two people sitting on a ski lift, one posing for the camera while wearing a colorful hat.
A man and a woman standing next to a table fulled of lettuce.
A herd of three zebra standing next to each other near two giraffe.
Chopped and sliced ingredients atop a cutting board next next to a bowl partially filled with grated cheese.
a  large building that has large clock on it
A smiling woman showing off her pizza topped with olives.
People are standing in a field under British flags.
A red bike in front of a statue and cannons
A woman in tennis whites playing tennis on a professional court.
Two men in black aprons stand in a kitchen tent area.
A baby sitting at a table with a plate of food.
A very cute dog with his nose in a big red circle.
Three elephants, one a baby appearing to be holding it's mother's tail, in wet land, but arid hill in background.
A street sign gives directions to numerous major streets.
A hotel room with a bed, chair, desk and an end table.
a person sitting on a bed reading a book
A man riding skis on a snow covered summit holding ski poles.
A big sign in front of Lake Kawaguchiko.
a young boy holding a baseball bat with a baseball helmet on
a male tennis player in a red shirt is playing tennis
A man putting his time card into the time card machine
Two boys with their faces painted hold stuffed animals.
Bathroom counter with lighting on over mirror and sink.
A giraffe standing in a field by some zebra's passing through.
An orange monoplane is tied down on the tarmac.
Rainy camera showing a car driving down a street.
An octopus vase with three roses in it
A young boy is playing with a red soccer ball.
People prepare to fly a kite with an image of an American President.
A bus that is sitting in the street.
A cellphone sitting on table with papers in the middle
A man on a surfboard is riding the wave
a couple of people that are standing in a field
A man stands by his bicycle with long horn handles on the sidewalk of the beach.
A boy holds a cellphone up to the camera.
A tall elephant standing next to a man next to other elephants.
A young girl throwing a softball to a team mate.
a close up of a building window with a sky background
A double-decker bus is going down the street.
Toddler boy sits on the stairs holding a tennis racket.
A black and grey dog in the passengers side of a truck.
Stainless steel fridge in the kitchen of a home.
A baseball player that has just hit the ball.
The train looks as though it needs to be fixed and washed.
A kitchen and a living room are situated next to each other.
A girl sitting at a counter with a piece of pizza.
Two giraffes that are together in an enclosure.
A man in a suit does a dance pose near a young child.
A boy on a skateboard going down a rail.
A black and red locomotive sits on the tracks.
The woman in the black and white dress has a colorful tattoo.
A solitary man walks through a crowded parking lot with his striped umbrella.
two giraffes sitting on the grass outside of a stone enclosure.
A skateboarder making a big jump in a parking lot.
Two trays of pizza are on the racks of an oven.
there are two small bears embracing each other
A man in a suit talking on a phone
A small child at a table eating some food.
A herd of sheep on the side of a road with trees to the side.
A young man is doing a trick on a skateboard at a skate park.
The cow is all alone in the brush.
Bench sitting on sandy area with lighthouse structure in background
A horse is pulling two people in a carriage on a street.
A man sitting in a chair in a kitchen drinking a canned drink.
A steam train parked next ot a 1950's commuter train.
A picture of someone's dinner. Steak with carrots and greens on the side, on a green plate.
A train that is sitting on a track.
A baseball player pitches on a dirt floor.
There are people that are flying kites in the air
A family of elephants standing in a watering hole
A bowl has fresh fruit and a toy fish.
A black bird standing among blades of grass.
The stop sign is near a fire hydrant on the neighborhood street.
there is a large piece of food and a knife on a cutting board
An office desk with keyboard, monitor, mouse  and lava lamp on it.
A small bathroom with a toilet that has buttons on the side.
A man standing in a carriage hooked to some horses.
A woman is laying on the bed with her feet in a suitcase.
A very spacious and well organized kitchen witha wood floor.
Bird cages with birds in them inside a pet store
A view of the ceiling of a kitchen with several light bulbs.
this plane has two large fans on its wings
A crowd of people shopping for fruit in a farmers market.
A group of people riding boats in the middle of an ocean.
A group of zebras and giraffes standing by a bus.
A red truck parked in a parking space.
A group of people that are standing in the snow.
A sign warns of a 350 fine for honking a horn.
A colorful bird is perched on a branch.
a field that has a bunch of people flying kites
A child with a backpack underneath an umbrella.
a picture of a large clock tower in a city.
A spot with a few materials that is agreeable.
The motorcycle riders are taking cover from the rain.
Three slices of tuna lie on a plate with garnishes.
a man is wearing yellow and blue in skis in the snow
A zebra leaning over to eat some hay in a field.
A giraffe standing in the grass and bushes, next to a bare tree that has one bird perched at the top of it.
The clocks are on display in the room.
A group of people standing on top of a sandy beach flying kites.
A man wearing glasses using a laptop computer.
A man is standing on a kitchen counter painting the wall.
A sign shows various directions through an intersection
A person in a wheelchair walking a dog looking at a horse
SOMEONE HAS THERE FOOT ON THE COFFEE TABLE WHILE WATCHING TV
A surfer is in the ocean riding a large wave.
two brown animals and one is laying down the other is standing
It is raining, a male jumping and so happy to take this picture
A residential house next to some trees and a field
Nicely decorated train has a red smokestack and gold trim
This is an image of a pug chewing an empty water bottle.
A sign for Madam's Organ Restaurant  Bar hangs on the side of a building.
a black and yellow bus driving down the bus with a double decker bus behind it .
A young child sitting in front of a TV watching the Flintstones.
a toy set of a bear sitting at a desk
A man standing in front of three toilets in one bathroom.
a few boats that are out in the lake
Personal toilet in a portal potty in a very confined room.
two men standing next to each other one on the phone .
Plane seen on the horizon above the boats
A person in a dry area with a sail high in the sky
A couple of trucks and a car driving down a highway.
An x-ray machine in a hospital next to a bed.
A skateboarder is partially kneeling on his skateboard.
The huge delivery jetliner has three turbine engines.
A colorful lady flying a colorful kite on a sunny day.
A child sitting on a horse holding a flag on a field.
A plate with fruit and nuts and cookies.
A woman is holding a young girl up to look at a horse behind a fence
Small girl laying down on top of a board on the beach.
A woman is hitting the ball at a tennis match.
A woman hits a tennis ball with a racket.
Two tall television monitors are next to chairs and desk.
A collection of painted boxes stand in a courtyard.
A street with a wall with graffiti and plastered paper.
a toilet next to a sink in a bathroom
A small herd of sheep stand still in the snow.
A person is on a skateboard performing tricks off a wall.
a beautiful white bathroom with one huge mirror.
A party with people, some in costume, standing around something not shown.
Lots of crew people in a large building working on an airplane.
Two donuts are on a plate on a desk.
A young man ridding a skateboard down a rural street.
a close up of a cat laying on a dog laying on a bed
There are airplanes parked in a lot at the airlines.
A giraffe is caged inside a building at a zoo.
a person in mid air on top of a snow board
Blurred view of an intersection and metro area.
Several cows standing in the grass near a few buildings.
A sandwich, carrots and strawberries in a lunch box.
Laptop computer with keyboard and mouse displayed on white surface.
A man in grey baseball uniform swinging a bat.
The boy is wearing a suit and a tie.
A boy is skateboarding on a city street.
Skiers waiting to ski on a busy mountain slope.
A table topped with a pizza next to a salad.
A man with a drink stands by a woman in a white hallway at a house party.
A beautiful woman holding two skis while standing near a wall.
A muffin on a plate with a cup of tea.
a blue vase with blue flowers on a sink counter top.
A person on a motor cycle poses on the road.
A picture through a porthole of a bike on the boardwalk.
Motorcycle riders are approaching an intersection by a bridge.
a street a fence people cars and traffic lights
a man departing a bus onto the street and another man standing next to the bus from the sidewalk.
A man holding a motion controlled video game controller
An over ripened banana and a cup of coffee.
A dark alley with an umbrella in it.
a orange sponge cake, with something square around bottom.
A little boy chewing on a tooth brush that is still in the wrapper.
A wall mounted grandfather clock mounted to a wall.
A man and a group of kids on a field.
an image of a professional baseball game being played
An airplane landing strip area and apron area with several planes parked on it.
A giraffe and five wildebeests roam in the Savannah.
A herd of zebras is running through the grassy landscape.
a long body of water lines with boats and trees.
A large bed sitting inside of a bedroom next to a lamp.
Small children holding up white controllers on a couch.
An old sign hangs on an old building
A bird sitting on a hand that has a glove on it.
A plate with three doughnuts on a table.
Three woman holding vegetables outside on a cloudy day.
A motorcycle parked outside in a parking lot near the beach.
An adult talking to child while cross-country skiing.
The man is racing his horse on the race track.
The dog is being fed with a banana.
A cake shaped like a stuffed and roasted chicken.
A man in a blue shirt with a red beard, laughing.
Many people are sitting under black and white umbrellas.
Two cats laying on the floor playing with toys
THERE IS A DESK TOP COMPUTER ON THE TABLE
A man swinging at the ball in a game of tennis.
a person sitting on a curb operating a cell phone
a keyboard an orange and white cat a desk and a monitor
The mirror is near the view of an ocean beach.
A large brown dog walking next to a wooden table.
there is a male baseball player that has swung for the ball
There are two street signs on the pole.
A stop sign is on the side of a school bus.
A bunch of scooters sitting a room with themselves.
A girl in boots on a skateboard and a man teaching a boy to ride a scooter.
A man is swinging a tennis racket at a ball
four different pictures of men making homemade pizzas
A white bathroom with sink, toilet and tub.
A young boy walking through a living room towards  a cat.
A homemade focaccia is ready for the oven.
A man with black hat and glasses holds a cup with drink
A small child is cooking in the kitchen
An enclosed shower with a window and bathtub.
A row of kites in the sky and girls are walking on the road.
A large commercial airplane parked on the runway
A desktop and laptop computer sit side by side on a desk.
Three Starwars action figures playing in a blender.
There is a little dog next to the driver.
A man is turning on a fire hydrant.
a yellow and blue train riding a track by some trees
An elephant with its calf standing inside an enclosed area
A herd of elephants are by the water.
a row of skiers skiing on a course
A truck carrying a golf cart follows behind a motor home.
A group of people out enjoying a trail ride on horseback.
A shiny metal train is traveling down the track in front of a sport's stadium.
a person on a small boat in a river
A man wearing a toothbrush for a moustache.
Several people sitting at a table working on their laptops.
A small table set with fruit and drinks in front of a wide window with brown chairs.
An old fashion with a red truck with someone walking towards the front.
A bear stands in front of a large fallen tree.
A man taking a swing at a tennis ball
Two dogs and a cat laying in a big bed.
A man in the middle of a busy city street displays nearly the same colors as an approaching Volkswagon bus.
The building has a large clock displayed on the side.
Two people ski down a large snowy hill.
A close up of two doughnuts on a plate.
A group of people ski down a hill
A large giraffe standing in a grass field.
A young boy wearing a powder-blue baseball uniform poses for a picture of him holding a bat.
A woman is watching a girl ride a horse.
A white plate topped with mint angel food cake.
A plastic cup filled with two tooth brushes and a tube of toothpaste.
Four skaters in speed suits are racing down a curved street.
A bathroom with a white toilet next to a shower.
A black and white image of a bird flying over the lake.
A kitchen is well lit by three hanging pendant lights.
A back of a truck with doors and two windows.
A spinach pizza sits on a plate next to a class of wine on a table.
People mill and gather about a vintage military airplane.
A group of children with frisbees are standing in a field.
Skier skiing down a hill near a guard rail
There is a little boy standing in a base ball uniform
The skateboarder has fallen off is the board.
A silver BMW motorcycle being posed for a picture.
A man puts his feet on a desk with a laptop, a PC, books, and work papers.
Someone flying a kite while on the beach.
Man holding a tennis racket and ball on the tennis court.
Small boat moving along water with orange objects hanging off end
A dump truck that is driving on a dirt lot.
Two giraffes standing next to each other under a group of trees.
The bananas on the tree are not ready to be picked.
Two horse pulling a wagon with a load of hay with children on top.
A toilet is sitting in the grass by the trees.
This is a picture of three buses parked together.
A woman standing at a bus stop with an umbrella
A young lady holding a black umbrella in front of green bushes and trees.
a couple of giraffes walk next to some trees
A very steep snowy hill filled with skiers and a lift.
A group of people flying kites in a blue sky.
Two children sitting on a couch eating food off of plates.
A RED NOSE PIT BULL PUPPY SHOWING HIS TONGUE.
A few people are off their surf boards in the water.
A shopping center sign right by a road and a big red building.
A plate with a cupcake on top of it next to an orange.
A kite with happy pictures on it is flown on the beach.
a person in a field with a plane shaped kite
a person on a surfboard riding a wave
A waffle iron, and the ingredients for waffles are displayed.
A woman riding on the back of a motorcycle with a child.
Floor level view of woman with dark stockings and high heeled boots in crowd.
A cardboard garage sale sign stapled to a post.
An elephant performing tricks on a stool in a circus.
A cat standing near a dead bird with some words on the picture
A group of men playing a game of soccer.
The windmill is sitting in an open field.
A man in a suit talks on a cell phone.
Series of clocks with lights in them on a city street.
A woman and a man are cooking food in a kitchen.
A man is standing on the sidewalk talking on a cellphone.
A chicken sandwich with tomato and lettuce with onion on the side.
a bath room with a mirror and a sink
Several people standing outside in the evening, some carrying umbrellas.
two street meters attached to the same pole on the road
Five adult sized giraffes grazing in a field.
A boy on a body board with a surfer standing in the water behind.
An old teddy bear stuffed into a iron railing on a balcony.
A man with a large bear wearing a brown hat.
Two pizza rolls on a tray with a sign up
A man holds up a small banana in his hand.
a hat on a table near a cake
A motorcycle rider is near a crowd on the sidewalk.
A green city bus pulling out into the street.
a group of zebras graze on some grass next to an antelope
A view of a bunch of birds flying around purple flowers.
An individual is in the open view in the picture.
A group of zebras standing close together .
A photo of a bedroom with two beds.
A man in riding armor poses in front of a motorcycle.
two small children playing next to a fired hydrant and holding a balloon
A smiling man in a striped shirt playing a video game.
a line of skate boards sit in front of a wood plank
The bathroom has a wall sink, medicine cabinet, toothbrush holder, and bare walls.
An instructor is teaching the little girl how to surf.
A crowd of people walking down a street next to tall buildings.
A giraffe standing in a dirt filled area.
The train is crossing the bridge by the water.
A suitcase sitting next to a bottle of champagne.
A lady is observing three other people in the background.
a kitchen that is empty with just a sink and some wine bottles.
A pizza with no meat overflowing from a plate.
Horses, a pony and sheep all grazing in a green field
Large collection of cakes shaped like hearts on a display.
A toddler stands next to a No Trespassing sign.
A white plate topped with meat and veggies.
A Frito Lay delivery van parked outside in a parking lot.
An assortment of donuts on a plate.
A person is standing with their foot on a skateboard.
A little girl sitting by a bunch of bananas
A dalmation dog sitting in the drivers seat of a bus
a tall and old brick building with many windows
A soccer team in purple is watched by a crowd.
a woman and a little girl with an orange shirt standing on a skateboard
A woman is cooking food at a restaurant.
A cat is sitting on a pink chair near a computer.
an image of two people that are each holding kites
A cook standing in a kitchen in front of two bowls of food.
Two horses roaming the fields during the day.
A young man riding a skateboard down the side of a ramp.
A man sits down around the bunches of bananas
Two giraffes in the trees, one standing up.
A red fire hydrant with a motor scooter in the background.
Fresh vegetables and smoked sausage on a bread tortilla.
Man in black business suit on street corner.
a bathroom with red walls a shower a sink mirror and toilet
Two people wearing life jackets on a watercraft.
a cat playing halfway under a straw hat
There is a huge crowd of people in an area sitting on the grass and watching.
Large clock on post displayed near overhead display of commercial enterprise.
Three cats are relaxing on a tile floor.
A flock of birds flying over water and sand with a volley ball net on the sand.
A white sink sitting next to a toilet.
On a wide street are people walking, on bikes, or in trucks.
Two older men that are preparing a table full of great eats.
A number of people moving about on a snowy ski slope.
A hitter, catcher, and umpire playing a baseball game.
A white jet sitting inside of a hangar next to other aircraft.
A trio of elephants stand in front of a watering hole.
A man in a t-shirt flying a box kite
A man in a blue jacket is traveling on snowshoes through snowy woods.
A white bowl filled with rice and vegetables.
a child and another person a refrigerator and a silver cup
a yellow pink white and green vase and two other vases
A man taking a selfie with his smart phone.
a man with a bat swings at a baseball
A woman standing between a motor bike and a striped wall over a river.
A large clock mounted to the side of a building.
A small child is lying in bed with a baby.
A man is on a laptop at a table
The pitcher is starting to deliver a pitch on the mound.
People lined up on the sidewalk with pizza boxes laying in the snow.
a bride and groom are cutting their wedding cake
a bunch of bananas are on a table
Short rain as view from above either from over view mountain or air craft.
Seven doughnuts on a wooden plate over a doughnut pan.
two people riding horses on a city street
A man this is putting a bowl inside of a microwave.
A cat is lying on top of several shoes.
A few friends are gathering for dinner in a restaurant.
This is a nasty bathroom located in an undisclosed area.
a group of people sitting close to each other all using cell phones
A junk pile of broken porcelain toilets in front of a wall with graffiti on it.
Seven suitcases, stacked on top of one another, in front of a booth.
A large donkey standing in the middle of a grassy field.
A red bus diving past a fountain in a city square.
Bald man in black and red shirt playing baseball.
An analog clock set in a class case.
A LARGE AIRPLANE THAT HAS LANDED AT THE AIRPORT
A sign attached to a light pole on a street.
A man talks to a plane full of smiling people.
Small child signing a document next to two men.
A small child holding a piece of broccoli up to their face.
an iced  birthday cake with a number candle on a table with a pink tablecloth.
The kitchen with green oven atop white tiled floor.
A airplane coming in for a landing with a full moon above it.
three men sitting in a row eating a sandwich
A cute child is dressed up standing by a door.
A table with plates containing an assortment of cold cuts, cheeses, and vegetables.
A man flying a kite in a parking lot by a lake.
A boy riding a skate board down a stair rail.
A group of horses grazing in a green field.
Pens, scissors, markers and other assorted clerical tools.
A man taking of photo of himself in a mirror with a cell phone.
The white devil slavemaster puts a bat in the young black girl's hands and trains her to attack Mexicans on sight.
A train rolls down the track through rural tree lined scenery.
A man dressed in red riding a horse through town
a plate of bread and a bowl of fruit
Two black and white horse standing next to each other with gears.
A man holding one frisbee and throwing another.
A small breed dog looks up while laying on a couch.
A man standing in front of an elephant.
some people on skis go through the snow as people watch
Several empty boats floating on the river on a cloudy day.
The view of a clock in the distance of a building.
A motorcycle parked in an intersection with cops on motorcycles going past it.
Various foods are sitting on the large and small plate
A person pouring batter into a donut maker.
A row of horses tied up on a rope rail.
A man kneeling over a laptop computer on a table.
An old fashioned styled kitchen has a microwave.
A woman covers her face, as a kitchen, flowers, and a laptop computer are also visible.
A gentleman in a suit is standing near a wall.
A car is driving down the road near some road signs.
A man in uniform standing next to another man wearing a suit.
A dark room with a tv playing spongebob squarepants
A bus is on its way to the station.
a person is standing on a skateboard outside
A double decker red bus is driving down the snowy street with the headlights on.
A person's hands are opening a laptop beside another person
a close up of a clock on a pole on a city street
The yellow fire hydrant is at the side of the road.
Plate of food that includes chicken, beans and a pickle.
A couple of people riding horses with Saint Patrick's attire on.
A man is in the middle of swinging his bat
a boy is looking at a train made of candy
Two buses driving by people in a city.
A street with vehicles, pedestrians and detour equipment.
a black and white photo of a building clock and people and trees
Simple bed in room with pair of nightstands and lighting.
A black dog standing in front of a door.
This is the outside of a building with chairs and benches present.
A KITCHEN WITH A STOVE AND LAP TOP
An old style cook oven with multiple pull out compartment
Man skateboarding on rail in front of a building.
a picture of some vegetable meal and a plate of what looks like chicken and a side bowl of rice and curry.
Man with a courier bag on a mobile phone on a crowded street.
Calico kitten lying on a backpack on a wood floor.
A table set with pizza and a bottle of coke.
A zebra that is standing in the grass.
Woman with dark hair in a multicolored bathing suit is flying a kite.
A small bathroom has a sloping roof with a window.
A set of five train tracks in front of a graffiti covered wall.
A herd of elephants walking across a grass covered field.
A girl by the side of the road selling flowers.
Man and woman in airport lobby saying goodbye.
A decorative church has several rows of pews.
Diners at a cafe overlooking a sandy beach.
A crowd of people standing under a clock ina  train station.
A big cute black dog in the air with a disc.
A group of friends siting at a table enjoying pizza.
Four cows eating grass on a sunny day.
A woman getting food from a tray with fruit, cereal and juices.
Two people on horses ride through a field.
One large sheep and small sheep next to it in a dirt ground area with a stone wall structure next to them.
A child with a backpack looking at a polar bear.
A stuffed bear that is in a backpack.
A red double decker bus is riding down the road.
A man in a wet suit walks across a crowded beach on a sunny day.
A table filled with a big bunch of assorted veggies.
A barber with a big mustache trims a man's hair.
A plant in a glass vase sitting on a window sill.
there are old cabinets in this kitchen along with a microwave
Fruit flavored donuts lined up in a glass fronted cabinet
A sandwich ,pickles and cookies  are for lunch
A person on a snowboard in the snow.
A boy in yellow shirt playing a game with a Nintendo Wii controller.
Two goats on the road surrounded by trees.
A man holding the reigns while riding a horse.
A toilet sitting in a grass yard out side.
Several giraffes wander around their enclosure at the zoo.
A herd of sheep stand in a snowy field with a cloudy sky in the background.
A view out a bus window of people riding bicycles.
Young girl acting silly in the waiting room.
A table full of food and chair with no one there.
A plate of ries and a drink is sitting neatly on the table.
Some goats are looking up at the camera.
The skier is competing in the winter Olympics.
the ball is coming toward the batter and the catcher is ready
a close up of an animal with something over its head
A basket filled with donuts covered in powdered sugar.
A young boy in a red shirt flies a kit high in the sky while a girl in a t-shirt watches.
A policeman roller boarding in the street with another man.
A skiier is preparing to ski on a snowy hill.
A young male laying on top of surf board.
A woman making some food inside her kitchen.
Lots of luggage is lined up on the sidewalk of a busy city.
The kitchen is clean and ready to be used.
A metal sculpture of two birds and two poppy seedpods.
A male skateboarder skateboards on a wall in an enclosed area
a pizza with a bunch of tomatoes on it.
a woman getting ready to hit a tennis ball with her racket
The dog lays down to scratch his itch.
A giant Amoco sign sitting above a gas station.
A man in a fuzzy hat is talking on his cell phone.
an image of a bowl of tomatoes and a flower
A dog is standing on the sandy area.
A train track scene with one train on the tracks.
Some fruits and vegetables and a ghost are in an orange container.
A young girl is sitting on her bed, talking on the phone, with a laptop on her knees.
Man in a uniform talking on a phone at a work desk.
Small  stuffed toy rests on leg of teddy bear.
A woman standing next to a red and white truck.
the toilet is white and the cabinets are brown in this bathroom
A person holding a phone to their ear and working on a computer.
A man holding a tennis racquet on a tennis court.
A herd of sheep grazing on a hill next to the ocean.
A pizza with meat, cheese and tomato sauce.
A train traveling down tracks next to a rural country side.
A pitchers mit with a ball inside laying on some bleachers.
A group of young men sitting on steps in front of the ocean.
Airport security drives past airplane on the runway
An umbrella laying on the ground next to benches.
A white counter top topped with a ripe banana and three coasters.
an edited picture of the same boy doing several different tricks on a skateboard
A large clock with a red second hand is attached to a modern building.
Truck on an urban road hauling a lot of corn.
A boy and girl play paddle ball in the grass
An room that has been broken into smaller work areas by a divider.
A modern jet liner taking off at the airport
A laptop computer on a shelf above a stove.
A young child riding on the back of a brown horse.
A train engine on the tracks with a side rail beside it.
Looking up at a stone and brick clock tower
A woman is taking a picture of herself in a bathroom mirror.
This decorated cake has a horse with a fence on the top.
A tennis player is making an effort during a match.
Some small boys standing near a floor drain on pink tiles.
A boy riding a skateboard in the street.
A traffic light is red for people on horses.
A baseball player slides his body into home base.
A taxi van in the street with pedestrians, by the corner of a building.
A man partaking in a water sport in the ocean.
One surfer riding with the wave in the ocean, and another surfer on his stomach riding into the wave.
A young girl is playing Wii boxing by herselg.
A person wearing combat boots sitting on a kitchen counter.
Two women with open umbrellas walking down a street.
Luggage including a trunk and a guitar stacked up by a wall
Group of zebras standing in a fenced in area with shade.
The man wearing  the animal puppet makes it cut the boy's birthday cake.
A bathroom single sink vanity with a large mirror.
This is a cityscape of a skyscraper in front of a large mountain.
Group of skiers posing for photo on foggy day.
Large green truck parked at the outside of stadium with group of people walking past
The cutting area of a sewing room containing scraps of fabric
Many cats lounging on a couch in front of a window.
The truck sporting graffiti  is parked on the street.
a blue and yellow bird is sitting on a branch
A bird sitting on a branch next to some berries.
A beer can and mug are shown with a rib plate.
A man stands on his skis on a flat patch of snow near a fence.
A stop sign with grey paint over top of it.
A bus being loaded with bags of luggage parked in front of a building.
A person that is in the grass with a kite.
People going up a snowy hill on skis.
A train is blowing steam as it stops at a train station.
A pan on top of a stove with pizza dough and tomato sauce.
A concrete bench is in front of the water.
A person on a snow board performing a trick on a ledge.
A skier has fallen down in the very deep snow.
two street signs with one pointing towards the right next to a building.
A man guides a dog to herd sheep.
A fire hydrant in front of bushes with a glass face on top of it.
Red bus coming down a street next to a red cab.
A women wearing a tennis outfit, swinging at a tennis ball.
A painted fruit bowl with different fruits in it
Two giraffes are eating leaves from tree branches.
A man riding skis down the side of a snow covered ski slope.
Fruit and vegetables are cut up and placed in small containers.
A computer desk with a computer and three monitors and a black chair sits in between them.
A man is swinging a baseball bat at a game
two zebras standing next to a tree
a woman wearing a helmet and holding onto a baseball bat
four poster bed and bedroom furniture in a bedroom
two big chairs sitting close to a fireplace in a living room
A collared dog standing between two potted plants
A cook dishes a stew from a pan onto a plate.
A couple of guys that are standing in front of a plane.
A hipster emo woman sitting on luggage in the middle of a road.
A person standing in the snow with a snowboard.
An all glass building showing the reflection of another building.
A meal containing soda, salad pizza and rice on a table.
Display of about 100 vintage wall clocks.
A beautiful young lady hitting a tennis ball with a racquet.
A bathroom area with three sinks and a towel dispenser.
A old picture of a building with many people out front
A microwave, bread and rice are on this counter
Two buses wait at a red light along a city street.
A refrigerator with its door open and contents showing
The table is set with 4 boxes of different, delectable  donuts
The cat leaves paw prints as he seat on the car.
A woman twirling a floral print parasol umbrella.
Flowers in a vase placed on a table.
A woman in grey shirt standing in room next to a dresser.
A room filled with dining tables and chairs.
A man on skis heading down the slope
View of bushes next to traffic lights and moving cars.
A table full of different types of donuts.
Two dogs sitting in the front of a car.
A man sitting at a desk with a cat on his lap.
Two people sitting on a bench in front of a statue.
a living room with a tv a book shelf and plants
Two guys on a mechanical lift next to a building .
A large bird perches on the seat of a bicycle.
A stuffed animal that is laying on a carpet.
the train has lots of cars on top of it
A Twins baseball player holding his glove walking on the field.
trees in fall colors and a stop sign to the right.
A group of people on a side street with umbrellas and awnings.
A train door opened with passengers sitting inside.
Two elephants outside, one being fed, one standing.
Window display a different pastries on a city street.
A composite image of an office desk, cars and buildings.
A family of four is posing for the camera near some flying kites.
Kids sitting at a table eating food.
a white plate holding onto a sandwich and a salad
A man in a vegetable shop holding a green vegetable.
Several grassy tennis courts with five tennis players.
A woman holds her tennis racket ready to hit the ball.
Two children sitting at a table that has two cakes on it.
A man wearing a tie holds his chin as he reads a document.
service man in uniform throwing a ball on a baseball field
A white trash can on a beach under two palm trees.
INFRARED PICTURE DEPICTING THE SHAPE OF A HUMAN BEING
Young skateboarder on pavement in rural populated setting.
A little girl wearing glasses taking a selfie.
A clown face made of yellow squash for the eyebrows, cucumber slices for eyes, a cherry tomato nose and a carrot smile.
The airplane is about ready to take off on the runway.
A man riding on  a horse drawn carriage next to a red brick walkway.
A red car parked on the street in front of a parking meter.
A man polishing a horses' horse shoe while another man holds the horse.
A woman holds a tennis raquet during a match.
Pizza on a metal plate sitting on table near phone.
A girl holds her arms out to a Frisbee while a boy kicks his leg.
A man wearing a helmet on a blue motorcycle.
A very dimly lit kitchen with a nice window.
A motorcycle is parked in front of two cars.
A man wearing a black suit on talking into a microphone.
Two boats floating in the ocean one has a crane on top of it.
A picture of a lot of people in the snow.
meat with onions and sauce on a plate next to potatoes and broccoli
Two children plays with a kite in the field
a piece of bread with some vegetables and met on top of it
A man and woman getting married on the beach.
A guy that is using his cell phone while in a park.
A knife and fork sit on a plate with vegetable pizza.
The woman in red and black is skiing down the slope.
A variety of sheep and goats drinking from a pond and eating.
some baseball players playing baseball and people watching
two cup like things with a bird and a wolf painted on them
A young woman sits at a picnic table with her laptop.
A herd of elephants are walking among the desert.
A baseball player is batting with a catcher and umpire behind him.
See picture of a lot of bicycles in the street.
An old suitcase on the sidewalk next to the road.
A woman in a blue dress with no shoes, seated with her legs crossed on a chair in the middle of a room.
A young girl with a nice booty standing in a living room.
A boy in a jacket and tie looks at the camera.
An older couple with helmets preparing to go on a motorcycle ride.
Boy wearing a helmet riding a skateboard down a street.
A young adult looks at a computer screen while doing homework.
An adult and a baby giraffe stand gazing over a grassland.
Stuffed animals sitting on a counter with cups in front of them.
Three square slices of food and sauce at an oriental restaurant
A flock of birds looking for food in a field.
An airplane is parked at a terminal in an airport while luggage trucks unload the aircraft.
A delightful pink frosted doughnut and a cup of coffee.
The purple flower with a yellow center is near a car air condition vent.
An orange and gray bus parked next to a sidewalk.
A group of adults standing by a table with wine glasses on it
people with their head covered on a motorbike
A batter, catcher, and umpire are poised for a baseball.
A birthday cake is shaped like a teddy bear.
A young child smiling while sitting in the grass.
a bird in the branches of a tree
A gang of bikers riding motorcycles down a road.
A tennis player swinging his racket with both hands to return the ball.
Plate covered with french fries and opened hot dog sandwich
Sheep are locked up at a farm and feeding
people in the ocean standing on water boards and wind surfing
A bilingual directional sign to the Hyatt on the Bund.
Edible food items displayed on table with receipts.
Two cows grazing in a pasture by a stream.
A line of baggage in a lobby with several people.
A group of men playing a game with Nintendo Wii controllers.
2 towers stand connected, a large clock in between them.
A man posing for the camera with a red tie on.
A man riding a horse over a red and white striped pole.
The zoo visitor is looking at the giraffes.
A woman that is standing in the rain with an umbrella.
A woman seated looking at her lap top
Three different types of clocks propped against a wall.
A bookshelf with books and other knick knacks
A white plate topped with two slices of pizza.
Three giraffes stand in the grass by a dirt pile.
A young man is holding a giant sandwich in one hand.
two zebras eating grass in a very big field.
A small group of Zebras drink water from a pond.
an image of a snow piled on the ski slope
A fighter jet with missiles flies through the air.
A man leaning on a building talking on a cell phone.
Four individuals on skis headed in the same direction.
The baked potato has sour cream and lots of other condiments on it.
A man flipping a skateboard on top of asphalt.
A man holding a racquet hits a tennis ball.
A close up of a toy squid riding a small bicycle.
Four players posing for a picture on a tennis court.
A bike sitting on a sidewalk in front of a bus.
A public restroom with a urinal installed in the floor.
a person stretching to hit a tennis ball
A street sign that says C have you paid?
The zebra stands underneath the branches of a tree.
Two people riding on the back of an elephant through a lake.
Woman with life jacket and dog in rowboat near shoreline.
A living room with wood flooring and furniture.
Two Dell mouses that go with a computer.
A person standing on a beach and flying a kite.
The woman is learning how to use her new ski skates.
A man and woman sitting a a table with pizza in boxes, in a room with a piano.
A yellow table sitting on top of a hardwood floor with boxes on it.
a herd of cows walks down a city street
a person riding a snow board on a snowy slope
Man with large orange and black kite in park area.
A young boy standing on the beach with a colorful kite.
A picture of a naked women who is using a laptop.
A table is set colorfully with a pepperoni pizza.
A black parking meter, that is next to a bunch of cars.
Man standing on a tennis court holding a racket.
Two people in an open field are playing with a frisbee.
Two people eating slices of pizza while riding bicycles on a city sidewalk.
A woman using a smart phone while standing next to a building.
A jet in the air flying in a dark sky.
Two people sitting on the couch with a guitar in front of them.
some black and white cows in a green and yellow field
A man holding two cell phones in his hands.
Passengers getting ready to board a small aircraft.
A guy riding his skateboard in a small town street on a chilly day.
a man and a woman walking across the street
A rusty bench is near the steps outside.
Someone is skiing in the cold white snow.
A red light that is on a pole.
An old elephant with a long trunk at the zoo
A man plays in the water at the beach.
A skier going downhill with snow flying up.
a zebra has its head down in a field
Yellow lounge chairs and an umbrella are reflected in a pool.
A woman holds a tennis racket in one hand and a tennis ball in the other.
a public transit bus in a city street
a train depot with several trains stationed in it
An all way stop sign at the intersection of two streets.
A view of some alcohol with a glass filled.
A great shot of a mountain near the ocean.
A man sitting in an office chair looking at his cell phone.
A BIG BATH RUBE IS IN A CLEAN SPACE
Men in suits with umbrellas walking through open area.
A bird with food on its beak is sitting on a branch that holds a bitten on apple.
A traffic light and street sign in a large city.
A bike is inside leaning on a white shelf.
A graduate wearing a blue cap and gown holding a cell phone and papers.
An umpire is catching a baseball that was missed by the batter.
A large nicely set dining table displaying a cake and other pastries.
Three people posing for a picture inside of a grocery store.
The contents of a pantry in a house.
Two women are in a kitchen baking together.
A  city street that has police walking along with people, and some are carrying umbrellas.
a bird sits on a wheel next to some plants
A messy kitchen that has the drawers open.
Giraffe leaning over to nibble buds off a green bush.
a boy following a man holding a surfboard in the water
A red baseball player sliding into a plate.
A man and a woman cutting up a big sheet cake.
Traffic light and street light for Belmont Avenue
A person doing ski tricks on the slopes at night
Two giraffes leaning heads down, one with head in feeding trough
a man riding a bike with a cart attached to the front of it
A little girl laying down holding a bear and a kitten.
an image of a cat sitting on top of the desk area
The dog is in the kitchen sink and pizza is on the counter.
A close up view of two men in a large assembly hall.
A person jumping in the air on a skateboard.
A teddy bear wearing a blue sweatshirt sitting on a bed.
An outdoor bench sits empty and covered in water.
Three cows grazing on a hill overlooking a harbor.
An adorable cat laying back on a chair while it sleeps.
A man sanding on a walkway covered in a long green jacket.
A bowl of cherries are shown with a bowl of oranges.
A man near a baby elephant by the water.
A building with a clock that is on top of it.
A white toilet and white pedestal sink sit in the bathroom with newly laid tile.
A person spraying water from a hose, onto an umbrella being held by a child.
An egg, cheese and sausage biscuit sandwich on a plate
A full plate full of delicious food sets on top of the table.
A woman holding an umbrella while standing on top of a wooden deck.
A girl holding a tennis racket in front of her face
A couple of skiers that are at the end of the run.
Three people sitting on their motorcycles near a building.
A decorative propeller plane flying in front of a wooded area.
A small Coast Guard boat meeting a personal boat on the water.
A person swings a bat with a helmet on.
A man on a trailer by trees with a dog.
A man playing a game of frisbee with another man as they gaze into each others eyes with man lust.
The plane is ready to board passengers for their flight.
A view into a living room containing several pieces of furniture.
A guy riding his skateboard down a paved path.
a group of people that are sitting in some chairs
On a snowy area, a man is holding a young child with skis near several people, sleds, and mountains.
A man in blue jeans, has stepped on a banana peal.
Two zebra grazing on an open ground full of grass and trees.
A airplane that is flying in the sky.
A desktop computer sitting on a wooden desk.
two man sit at a table in a restaurant
A group of people are outdoors playing with Frisbees..
There is someone at a table cutting ie ed of paper
A cat plays on a laptop while watching a video.
A teddy bear is seen looking out the window.
A cat sits behind a person on a green revolving chair.
a person flying a kite on a beach with a person near by
A view of a bunch of seagulls flying around the beach,
a street sign sitting between two benches sitting by a sidewalk
A man on a motorcycle drives down the street
A cat standing on top of a car trunk next to a parked motorcycle.
a person taking a photo in a mirror
Chocolate cupcake with a monsters face frosted on top.
A white Nintendo Wii game controller sitting on top of  a table.
A herd of cattle and zebra standing next to each other on a  field.
There is a woman riding on top of the elephant.
An empty bed in a bedroom in front of a small TV.
A woman is eating a personal pizza with a friend.
A girl flying a kite in the sky with her hands.
Three stuffed teddy bears dressed in period clothing.
A school bus by a crane and truck with a mountain view in the background.
A stone oven with many kettle pots, baskets and bowls.
A microwave has a container of food by it.
The dog is wearing a red scarf and is being petted by the woman with red shoes.
Some people on snow boards high up in the air.
A large airplane is sitting on the runway.
Pigeons gather atop the rails on the lighthouse.
An empty park with mature trees and a backless bench.
A person is in the air skiing in the snow.
A dressed up teddy bear is sitting in a corner.
A young man riding on the back of a black motor scooter.
A bowl contains a variety of chopped vegetables.
Orange cat sleeping on a small laptop computer.
One horse trails behind another during a race.
A cat is laying on a lap top on a desk.
A man and woman sitting on a train using laptops.
The girl stands behind the line and waits for the ball.
The skateboarder is checking his technique in the mirror.
Three teaspoons of instant coffee poised over a mug.
A woman pinning a flower to a man's suit.
A motorcycle parked in the middle of a crosswalk on a busy street.
A display of vegetables is set up in front of a pickup.
An airplane at an airport at a jetway.
A person standing next to  a box on the ground
A person about to throw a Frisbee in the park.
a small plane flying by on a cloudy day
Bare feet atop a skateboard on a concrete surface.
A surfer is riding a wave in the ocean.
an image of a man that is by a bench on the phone
A zebra laying down in the grass resting for a while.
An adult giraffe places its head on a young giraffe.
A man on a skateboard passing a bus while posing for the camera.
A baseball player swing the bat at a baseball.
Graffiti has become and famous part of the art industry
A woman in a bus with cars ahead
A cat hangs out in a bathroom sink by a bottle of Method soap.
A man in white and green jersey looking at a cellphone.
Zebras standing in the shade of a fenced off enclosure.
A man jumping a grey horse over three rails.
Overly ripened bananas are being skinned into a pot.
Two cars are parked across the street from a sidewalk bench.
A beach with many empty blue and white chairs with umbrellas
The man is enjoying a snack at the park.
an image of a man in the water waves  with a paddle
A birthday party with a cake is being held for a dog.
A batter hits a baseball with his baseball bat.
A woman stands beside a baby in a high chair a table is set with a birthday cake and champagne.
A woman sitting in a chair laughing while another person holds a cellphone up from behind an overturned table.
A large bed on a wooden frame in a bedroom.
a couple sitting outdoors with some wine glasses
A person with an umbrella is walking down a city street.
A woman swinging a racket at a tennis ball.
A counter is full of platters with different pizzas.
A CITY BUS IS ON THE STREET COMING THRU
Tow cakes resembling the engine of a train.
A dog is in the air catching a frisbee.
A woman having a meal in a restaurant and using a cell phone.
A long white plane resting on a run way.
A down the counter view of a very messy kitchen area
A baseball player holding a bat near a ball.
a clock on a white tower in front of a clear sky
A man wearing a neck tie with a golden clock on it.
Competitors on skis are racing around the course.
Aerial view of a group of people flying heart shaped kites
a box with some big doughnuts inside of it
Two adult elephants interacting behind some trees and bushes.
A food processor with a chopped mixture in a plastic bowl.
Hot dogs lay on an orange plate while hot dog buns are on a grill.
People are riding down a street on skateboards.
Sailing boat tied up to a deck chair on the beach
Two large trucks parked next two each other next to the building
A young man playing with video game controllers.
A Boston Red Sox pitcher stands, holding the ball in his glove at his waist, prepares to pitch to an Oakland A's batter.
A baseball player swings at a pitch during a game.
A young man walks the beach with a surfboard under his arm.
People assembling teddy bears on a table
A sandwich on a plate in front of condiments.
a person in a field with a dog
a green military truck sitting in a warehouse
A number of rose flower sticks in a bundle
There is a old tower with a clock in the center
A luxurious living room with chandelier, bar, and couches
A large elephant is shown walking through the terrain.
this is a man holding a kite in the air
A person standing under an umbrella with other people and lights in the background.
a peacock on a wooden table looking for scraps
A refrigerator that has items on the outside.
A group of elephants on grassy area next to rock and trees.
A man stands on a surf board and rides a wave.
a small boat on a small body of water
Many sheep stand in a large grassy field
A man and a woman are standing by the street
A female tennis player raises her racket to hit the ball on a tennis court.
there is a woman with glasses eating a donut
a baby cow standing by its self in the grass
A man riding a snowboard down a snow covered slope.
A food bowl with vegetable and chicken salad.
A lunch of salad, fries, a sandwich and a drink.
An elephant in an enclosure approaching a body of water.
A bunch of birds sitting in a bread basket.
Wine and desserts are served on a table.
The large kitchen has an island in the middle of it.
A tall giraffe standing next to a tree filled forest.
a picture of a bunch of train cars colored red.
A man is standing partially inside an open refrigerator.
The dog is at the dog wanting to get into the house.
White dog playing in grassy field with red disc.
A pile of fruit sits ina  clean bowl
a laptop computer sitting on top of a homemade machine with wheels
A black cat sits on a bench beside a wooden letter K.
Several boats at a pier in a bay ringed by mountains.
Three giraffe standing near trees in a grass field that appears to be a zoo.
The happy couple cutting their wedding cake together
A woman is holding a tennis racket on a court
Two men sitting on steps and selling goods in the fog.
the small cat is sitting inside a suitcase
The little boy is holding an umbrella over his head.
A Skyteam airplane taxiing on a snowy runway.
A small aircraft is beginning to lift up off of the tarmac.
A herd of deer in a field down a hill from a house.
Twins are smiling with the same attire on.
bedroom with pink patterned headboard and matching curtains
A road is winding in the distance in-between trees.
A boy in a red hat playing with tee ball set.
A motorcycle is parked in front of a cafe.
Skateboarder riding in a concrete with a large cross in the middle.
A gray teddy bear sits on a doily near a card.
A group of people on small bikes on a street.
A girl shows a banana to the camera.
A table set with wine glasses and plates.
Tagged cows are standing in an open field
A bird stretches his wings at the beach.
Little boy with toothy grin talking on a cell phone
Cacti can be seen in a large clay pot.
Black and white horses are standing next to each other.
Boys are playing Frisbee in a yard.
Someone's living room contains a bookshelf with lots of books.
Some young boys are playing with video games.
A tennis player reaching up to hit a tennis ball.
A group of people in a room with remotes.
A man sharing a hot dog with a black and white dog.
a fence that has a bunch of surfboards on it
A man and woman that are standing on ski's in the snow.
A bird sits on a branch in a tree.
There are some vegetables, herbs, and other seasonings and a knife on a wooden cutting board.
A stone tower is has a clock on the side.
A photo taken from an airplane looking down at the mountains.
A man surfing waves on his surf board
A young zebra is nursing from it's mother on a grassy plot near some shrubbery and a mountain in the distance.
A man shows the screen of his phone to the camera
The stuffed bear is next to a toy doll.
Several horses grazing in the grass near some hills.
Baseball player getting ready to catch ball as many fans enthusiastically watch.
A bathroom that is done in checkered walls and flooring.
A group of people that are standing under umbrellas.
a close up of a cat laying on a laptop
An air plane is flying over the roller coaster.
a young black man lying down on a bench outside resting
a close up ofa clock on top of a shelf
people holding a skating pole on the snow
Two women make faces as they stand at bathroom sinks.
A man lying in bed with a cat next to him.
A guy in a blue shirt is surfing.
a person riding a surf board with a parachute
A LOT OF PEOPLE ARE ON THE BOARD WALK
An antique car is parked on a city street next to two others.
A group of people loading the back of a pickup truck.
Two  men in business suits shake hands.
A brown horse standing in the middle of a flower filled field.
That seems like a very small sink for this kitchen.
A group of people skiing down a snow covered slope.
A laptop computer sitting on top of a wooden chest.
A dog and a sheep separated by by a fence.
Several different kinds of vegetables on a counter.
a woman is sitting outside with a blue umbrella
The fridge is full of food and goodies
A man playing Wii while others watch
An older man stands behind a younger woman sitting on a park bench.
A pair of scissors with orange string on a spool leading to the scissors.
Two guys on laying on surfboards riding a wave.
Large Elephants and small Elephants are walking in a line.
Plates assembled near each other with silverware on right.
a clock on the wall saying it is 241 in the afternoon
The leg of a pair of glasses is stuck inside a clear vase.
a white woman in a white tennis outfit playing tennis
A person with skis down a mountain in blue pants and black jacket
A salad with side vegetables and dressing are positioned on a wooden tray.
A man is on the beach with a brown horse.
A flat-bread pizza with melted cheese, and a few vegetables sits on a black tray on a wooden table.
A bathroom has blue walls and a large mirror.
There are many chefs here in this kitchen cooking
A man is wiping down the elephant in the water
a tennis court that has a man on it
Two men with tennis rackets with one racket holding balls.
A cat lies asleep in the middle of a mattress.
A collection of yellow fire hydrants on the street.
two male baseball players in uniform with long hair
A pair of scissors, a crochet hook and a sewing needle are ready to craft.
People are walking through a subway terminal.
A baby girl brushing her teeth with a pink tooth brush.
The person is holding a pastry in their hand
some big and little bears walking across the street
A young boy wearing camouflage sitting in a  doorway.
A warning sign for high water is on the side of the road.
A bus with two levels and a hostess ad is traveling on a street.
A grey cat with green eyes and a pensive look on its face.
A broken cell phone laying on carpeted ground.
A lot of horses grouped together walking down a road .
A home with rooms under construction of them
A zebra standing next to a tree in a field.
Four photographs of a woman in denim shirt next to white plate of food.
animals grazing on a straw field bordered by water and mountains.
a baseball player throwing a ball with a glove
A man wearing a hat riding his skateboard in a skate park.
Various signs written in either Chinese or Japanese and also a sign of a man walking across a street.
A man standing over a griddle in a park.
A galley kitchen with white cabinets and fridge and a wooden island feature.
A man standing under a ball on top of a grass covered field.
A young man with a surfboard is surfing in the water.
A person carrying a surf board on the beach.
Lots of toasters sit in the floor near an oven.
Boats on the water with mountains in the background.
A man riding a surfboard on a wave in the ocean.
An old, dirty toilet in a small bathroom that is falling down.
A fat orange kitty sitting on a black chair
a man standing in front of some tall trees
a close up of a motorcycle with parts missing
A close up of a bicycle  parked on a train platform.
A sandwich on a toasted roll sits atop a green leafy salad with tomatoes.
Dog in the air to catch a frisbee while a man lays on the ground.
A giraffe is walking along a paved walkway.
An elephant is standing on a cloudy day.
A man with a bicycle in a train station walks past it as a train approaches
A bicycle leaning against a street pole in the snow
A little boy holding a bat over his shoulder
Two beautiful women riding horses in the ocean in bikinis.
A bathroom sink with a large walk in shower.
Kitchen table ready for party with beverage cups, citrus fruit, and alcohol bottles.
A child takes berries from a table full of fresh garden produce.
A bathroom with blue walls and a pink tub, toilet, and sink.
Soup and a sandwich on a metallic plate.
A group of people hanging around holding umbrellas.
A desk with a laptop and jars and candles.
Onlookers watch an elephant stop for a drink of water
A train car moving down the track at a crossing.
A blue city bus putting over at a bus stop.
Two individuals posing with funny faces, one holding up a wine glass.
A vase with an elephant head holds a bouquet of flowers.
an image of two benches in the park
Several purple flowers are shown growing with bamboo in the pot.
A cat looks down from on top of a dresser.
A baseball game in progress with the picture about throw the ball.
An old airplane flying above a large city.
Cooked broccoli in serving dish sitting on cloth hot pad.
A person that is laying on a bed with a bag over his head.
A bathroom with a phone mounted next to a toilet.
A train travelling above ground near bushes and trees.
A clock that is on the side of a wall.
Clowns ride an antique firetruck down the road in a parade.
A woman stands next to a parked city bus.
Man in business suit skiing in the snow
Six snowboards are leaning against a red wall.
a baseball player getting ready to swing a baseball bat
A woman on a surfboard surfing a wave on a beach.
Small child in white shirt holding a white controller.
A red and white plane in on display in a field.
two woman playing tennis on a court in front of a crowd of people
banana slices sit on top of toast on a white plate
A man is sitting in a boat on a river and drinking a bottle of water.
a group of sheep standing around while eating some grass
A full view of a nice kitchen and counters.
A man is standing among pink and zebra feathers and a zebra.
Remains of various deserts are situated on a table.
A red fire hydrant standing across the street from two silver vehicles.
a man doing a skateboard trick on top of pool
A boy in white shirt flying a kite on beach.
A man is eating food with a pair of chopsticks.
This is a dirty urinal in a bathroom.
Carrots are laying on a cutting board with a knife.
A woman that is next to a surfboard with a dog.
a photo on mountains skating wearing very warm clothes
A large bedroom with big windows and a patio.
there is a black and white dog standing in the bath tub
Two little girls playing with a kitchen set.
A young man riding a skateboard down a street.
A man wearing a white shirt, plaid tie, a grey hat and glasses smiling with his eyes closed.
A man leaning up against a boat that is almost finished being built.
A group of professionals at a business meeting.
Three giraffes eating leaves off cut tree tops.
A pizza covered in cheese and toppings on a plate.
A tall clock tower flanked by two trees
A young girl tasting food from her bowl
Sunlight streams into the living room through two windows.
A cluster of small boats in shallow water.
A crowd watches a baseball game being played.
A dog that is sitting down in a backseat.
A pair of blue scissors sitting on top of a paper and a container of note cards.
A group of planes are flying through the air with smoke coming from their tails.
A man eating a slice of pizza next to food stands.
A fan is featured in a yellow room.
A fenced in area off a sidewalk with posted signs.
A white plate topped with salad and onions.
A plate that has food on it with a glass next to it.
A woman holding a blue frisbee over the top of her head.
A man cutting a cake with a knife.
View pointing upward of a skyline in a city
There are military people serving others hot food
White bowl with tomatoes and greens on counter top.
A bus with a few bikes on the front
A green traffic light  and telephone wires
This is the sign for the Bart ba building.
A bunch of birds flying over some waves.
Handmade vases, all the same size but all different colors.
A woman reaches down to pick up a video game control.
Dog laying on a green sofa in a living room of an apartment.
There is a man swinging a tennis racket.
a man with a camera is filming some baseball players
The little kid is flying a kite on the beach.
I am unable to see the image above.
A man in a suit and tie standing with a cellphone to his ear.
A black keyboard is hook to a cell phone on a table.
Black and white of two adult zebras from shoulders up playing.
A man hitting the ball during a tennis match
A man standing in a park looking at trees.
an old black and white photo of a man near a plane
An intersection with traffic lights and lots of traffic.
Small floor model refrigerator, so new it still has its manufacturer's sticker.
A crumby chocolate dessert on a plate with a large knife.
A dog on standing on a surfboard in the back of a truck.
A delicious plate of churro with chocolate sauce.
a toilet with a black lid and the tank in the air
A herd of sheep grazing on a lush green hillside.
A black train engine on tracks next to buildings.
a person riding a skate board on a street
Two zebras are staying away from the sun as long as they can
A stop sign with graffiti about the Red Sox
a man handing an elephant a stick in an enclosure at a zoo
a close up of a cat on a window sil
A man sits on a boat cleaning a fish.
A group of zebras and other animals grazing in a field with a rainbow in the background.
A plate of green salad and pieces of tomato.
A couple of cats laying on top of a brown chair.
Little girl walking down a road holding an umbrella.
A wooden desk with a laptop sitting on it.
A scooter parked in front of the door of a stone building.
A giraffe is coming up close to people
there is a an standing on top of a mountain
A kitchen with and island and several counters in it.
A person walks on a bridge with a kite.
Three skiers pose in the snow in front of barren trees.
A table topped with two bowls filled with fruits.
A very large pizza covered in cheese and toppings.
A woman and man riding on the back of an elephant along a river.
A motorcycle racer leans into a turn during a race.
a zebra is standing in its pen and some green plants and grass
a white black and brown cat on a table
a big bathroom with a sink, toilet and bath tub in it
The man in a suit stands next to a woman in a pink dress.
Wooly goat stands near gate with others on the other side.
A book shelf with a large clock on top of it.
A man walking his dog in the park.
a person holding an open umbrella in some bushes
A little girl gets help brushing her teeth.
A motor cycle procession down a wet street.
Snow boarder sliding down the hill after falling in the snow
There is chicken, couscous and vegetables on the plate.
A man holding his tennis racquet on a tennis court
A zebra and smaller brown animal are running in the grass.
A dog crossing a pavement path near motorcycles.
The man in the yellow checkered hat is flying a kite.
A couple of men riding horses down a street with tall buildings.
Boy attempts to hit a baseball with his bat.
A BLACK AND WHITE PICTURE OF A MAN SITTING LOOK
A person in the water being pulled by a kite.
two very tall and white storage towers in a room
Looking past a snowboard in the snow to a city beyond
An Alaska airplane is reaching up to a greater height.
A man swinging a bat as he plays in a baseball game.
A child wearing a red helmet holding a skateboard.
A man having fun in the rolling ocean waves.
a person that is  on some dirt on a baseball field
A plate several cookies and a small sign on it.
A partial view of a formal living room.
a big window that has some birds out front
A little girl is eating a hot dog and riding in a shopping cart.
A grocery store filled with lots of fresh produce.
A guy blowing on a hot piece of pizza.
Two children reading while lying in their bed
A horse and buddy come down the side of a road.
A photo of someone's meal at a restaurant.
there is one orange laying among five bananas
A woman putting icing on a homemade cake.
A couple of people at a counter near plates of food.
A coupe of road signs near a downtown area or highway.
A boy walks along the beach carrying his surfboard.
People and buses are sitting still on a city street.
A young man holding a basketball on top of a court.
a couple of beds  that are in one room
A dog sleeping on the floor in the corner, a man looking down at him.
A large group of men are dressed like Santa.
A couple of people standing on top of a beach with surfboards.
Giraffes and babies are in their habitat in the grass.
A blurry image seen through a rainy window of a person holding a light blue umbrella.
A toilet and a trash can in a room.
Two birds sit on the back of a bench made of logs.
A very large bear sauntering in a zoo type environment
A teddy bear that has been buried in the sand.
A blue and cream tiled bathroom with a stand up shower
This table has three kinds of donuts on it.
a horse pulling a carriage down the road
a man wearing a striped tie holding a microphone
A car driving down the street, some people are watching it.
A man standing on a  tennis court holding a racquet.
The man with the umbrella is looking up.
A door is opened to the inside of a bathroom.
A person on a surfboard rides a wave.
An orange and white cat is sitting in an easy chair.
A man in chain mail checking his cell phone.
a person is skiing down a snowy hill
A refrigerator with a microwave on top of it.
A green and blue fire hydrant sitting on bricks on the side of the road.
A man stands and airs up his bike.
a plate with a small dessert and some fruit
A table topped with vegetables and a pitcher.
A locomotive train on a set of railroad tracks, with tanker cars attached behind it.
The woman in the kitchen is tending to her food.
a table that has some glasses on it
A white basket filled with ripe and unripe bananas.
A fireplace mantle has an ornate clock sitting on it in front of a large mirror near a teddy bear.
A small round clock atop an ornate old building.
A pile of luggage, boxes, towels and other items on a carpeted floor
Small airplanes are parked on a grassy field.
Two bear cubs are playing together in water
A bunch of luggage is on a car in a bathroom stall.
a hot dog covered with some chili, mustard, adn ketchup
two pans of dinner rolls baking in a large oven
Six sheep standing in the grass beside a house.
A calendar with some apples and oranges and pears in it.
a group of people that are getting out of a boat
Some very big commercial planes over the water.
The man is riding a bicycle next to a train.
A brown and white horse standing in front of a red wall.
A field of wooden structures in front of a mountain.
Horses peek through the windows of a small utilitarian horse barn.
An airplane jet flying through the air against a blue sky.
On this table there is bowl containing a bottle and glass vase containing rocks and leaves.
A dug out filled with baseball players next to baseball equipment.
A man taking a photo of an elephant as the elephant stands inside an enclosure.
A dome shaped cake that has lit letter shaped candles on it, and people in the background.
A young child brushing his teeth in the bathroom.
Man playing Wii video game with group in background on couch
A man taking a close up picture of a motorcycle.
A baseball game is being played in a city park.
A small water landing plane is on a lake near a neighborhood
A beautiful young woman holding a tennis racquet on a tennis court.
Two kid touching food that is on a kitchen counter.
A woman holding a tennis racquet next to a tree.
A traffic light with two street lights hanging from it's side.
A very nice looking dining table by a bright window.
A picture of a person in the air on a skateboard.
A man walking next to two horses on a dusty road.
A whit plate topped with chicken and vegetables.
A black and white picture of a lady getting off of a escalator holding an umbrella walking into the city.
Several balls of yarn are sitting on an oven top.
A bunt cake sitting on a red plate covered in icing.
A statue in the middle of a park near trees.
A Stop sign is slightly covered up by a tree.
A group of people and an official player soccer.
A couple of benches next to a street.
A kitchen with a sink, dishwasher, microwave and refrigerator.
A blue and silver fire hydrant on a sidewalk.
a herd of big cows on a wide farm
The hand is holding an open cel phone.
A row of motorcycles parked on the side of a busy street.
A computer monitor, a laptop and some other electronics sit on a tan, wooden desk.
A little girl is jumping on a hotel room bed.
A blue and white bus parked in front of a motorcycle.
Person stands and poses with skis next to a ski lift.
Take-out food in a basket on a wooden table.
A group of people are on the grass playing Frisbee
a big building with a clock built inside the top of it
Some books that have been piled on top of each other.
a person sitting at a bench near a bush
People sitting at a table with plates of food and beverages in front of them.
A man sitting in an overstuffed chair in a living room.
A long passenger train that is going quickly down the track.
THERE IS A DESIGN OF AN ELEPHANT ON THE SHELF
A pair of scissors with white handles sits on a white piece of paper near several sheets of flannel.
Two yellow trains are entering a train station.
some female hands holding a sandwich in a car
This person is about to eat a banana.
A table set for tea reveals finger sandwiches, tea cups and a cream pitcher all on a red and white table cloth.
The snowboarder is performing a jump at the top of the slope.
A white bath tub sitting next to a white toilet.
A banana with a sticker on it, with a person holding it.
A cat sitting on top of a bag of luggage next to a TV that is showing a store about Giant Rats.
Snowboarder rounding top of sloped edge in ski area.
A large type pizza with cheese, spinach, and sauce is on a silver plate.
A group of large red birds that are perched in a tree.
A vase that has flowers inside of it.
A group of people on a field with a Frisbee.
Man and woman standing close together smiling into the camera.
Large polished black truck sitting in a parking space.
Densely growing trees and a low fence frame the top part of a shot showing a tight huddle of grazing sheep on a section of sloping terrain with cropped grass and a cat at some distance behind them.
a couple of phones that are next to each other
A pile of different fruits sitting next to each other in a  bowl.
A couple of motor bikes parked on a beach.
A white beat up bus going down the street .
A bird flies over an island area of a river.
A yellow fire hydrant is shown on this street.
Small child standing in the center of a crowd smiling.
A person sitting at a table eating a doughnut.
The frame of a bench is metal and the seat of the bench is wood.
A large bear walks in front of a rocky formation.
A wooden cutting board with several vegetables sits on a counter.
A woman is playing Wii with sunglasses on.
A red, white, and blue plane is in the sky.
A person holding a surfboard while wearing a wet suit near the water.
A zebra stands between several small trees in tall grass.
PERSON ON SNOWBOARD UP IN THE AIR OVERLOOKING NEARBY TOWN
A view of a street corner in the middle of a city.
A grey black and white cat laying in a chair.
A cute dog lazily sleeps on top of a pile of clothes.
Lots of donuts being processed through a machine.
a woman and child are looking at an elephant in its pen
A group of people in the snow with skis.
A flock of birds standing on top of a wet beach.
The little girl is blowing out her birthday candles.
a kitchen with a small window in it
A bus is stopped on a street surrounded by trees.
A wooden park bench under a tree with long spiky leaves.
Young girl with racket with dog on lap
A black pan filled with mushrooms and vegetables.
The man is holding an extremely large pizza with a lot of stuff on it.
a large window with a city in the reflection
A living room has three televisions set up.
A couple of men walking along a snow covered hill side.
A yellow doorway with a clock above it.
A bedroom with a picture on the wall and a lamp on the side
Many people are sitting at round tables with dinner plates on them.
An elephant swinging its trunk inside of a pen.
A woman wearing white playing tennis, about to serve.
a pro baseball player is swinging a bat
a man on a bus and a man looking over his shoulder both smiling
A gang of bikers riding motorcycles down a street.
Four bowls containing fruits and vegetables arranged decoratively
A person on some skis in the snow.
a girl wearing a fuzzy vest and a girl wearing a flowered top
a toilet and a urinal in a marble tiled bathroom
Two people on skies posing for the camera
a man that is jumping his skateboard on some bricks
A dish features breaded meat, lemon, and broccoli.
An intersection of a regulated entrance showing the stop sign
A group of four people are riding a ski lift as they ride over the snowy mountain.
A living area with a television, coffee table, couch and other items.
Two plates with small, rustic looking pizzas on them
Five snowboarders doing tricks on the snowboard course.
a microwave is sitting on a wooden shelf
A mom duck with a big bunch of ducklings swimming down a river.
A dog sitting on a couch in front of a table with a laptop remote controls and glass on top.
woman taking picture of herself in the mirror
A lady in a winter coat talking on a cell phone.
Zebra, antelope and other wild animals at a African National Park.
Horse drawn chuck wagon followed by Jeep and cattle.
Two boys who are playing soccer against each other.
a horse is standing near a large lake
Teddy bears are dressed in clothing and stand in a window sill
A yellow fire hydrant surrounded by pebbles near a fence.
A snowboarder about to move down the slope.
An elephant sticking it's trunk up another elephants rear end.
A LOT OF PEOPLE ARE ON BOATS IN THE WATER
Two tennis players sitting on a chair holding racket.
a blue frisbee sitting on the beach with dog paws next to it
A semi oval looking bathroom that is in someone's house.
A group of people fly kites over a sand covered shore.
Hundreds of sheep walking in the water and a ranch.
A person is cutting up some fruit on a cutting board
Two young children playing with each other on a bed.
A cutting board with slices of peeled apple and a knife next to an apple and apple peels.
A busy street with busses and cars merging together.
a black and white sign is by the road
The man sets up the ball to serve it.
Three giraffes standing together inside a fenced area by white buildings.
A man and a couple of women sitting on a colorful seat.
a person walking across an odd looking pavement carrying an umbrella
A yellow train parked next to a train station near a loading platform.
A street is blocked of for a festival.
A man sitting at a desk in front of a laptop computer.
A woman on a beach on a cell phone.
A street sign showing the intersection of Beacon Ave and Stevens St.
A man with his arms crossed is sitting in front of green couch with remote on it.
a lone black and white cow standing on a large field of grass
A double decker bus is shown driving on street.
A man wearing sunglasses wearing a green shirt.
this bathroom has two pictures of dogs in it
A piped canopy bed with a wood headboard is dressed in neutral bedding.
A kitchen with counter tops filled with lots of clutter.
Commercial jets lined up at an airport terminal.
The view from the commercial airplane includes the wing and mountains and water.
A desk with two computer monitors and a laptop.
A pair of adults escorting children skiers up a hill.
a fire hydrant on a city street near a pole
Several suitcases sitting next to a chair outside
A crowd of people sitting around a dinner table.
A large teddy bear is wearing a dress.
a woman with a nice little suit case
A snowboarder soaring above a slope looking out on a mountain range.
Two pieces of pepperoni pizza are on a plate.
A young girl is eating cake with her fingers.
A giraffe amongst tall, slender trees in an enclosure
A computer keyboard is shown on a desk.
A train that is riding on rail road  tracks.
A man in a shamrock hat is playing a video game.
A kitchen scene looking at all the pans of hot dogs and sausage.
The bench at the tree offers a respite and a scenic autumnal view of a grand valley
A row of elephants standing next to each other.
A bathroom with a black and white pattern on the wall.
Two young woman walking by a fire hydrant, one talking on cell.
a person riding skis on a snowy surface
A young child is jumping high in the air.
A person on a surfboard, riding a wave and leaning to one side with one hand up in the air.
A full view of an airplane taking a shower.
A close shot of a BBQ pulled pork sandwich.
The skateboarders seem very relaxed as they wait for their turn to ride.
Man with broken surfboard standing in waves in ocean.
Brown leather couch in wood floored living area.
The man and woman are holding tennis rackets.
A man wearing a neck tie and a white shirt.
Group of young adults eating pizza and drinking beer at a restaurant.
A man with a yellow tie and white shirt holding a yellow sweater round his neck.
A man standing in the street on a cellphone.
A red bus is leaving and some people in the background.
Two students are playing games at a party
A soccer player kicks the ball in a soccer field
a cloudy sky during a day with some overcast
A plane with drawings on the side waiting for people to board.
A  young boy holding an umbrella on a deck
A sheep is minding its business near a body of water.
Long-haired male downhill skier flying down the slope, negotiating a turn.
Several people street skatingstreet luging on a road.
A female snowboarder riding down the mountain slope
A man with a baseball uniform on with a baseball and catcher's mitt.
Two cows are standing on a sloped green hill.
A man in suit and tie has a cane and cigar.
there are six jets flying in formation
A group of elephants in grassy field with mountains in background.
A bed sitting in a bedroom between two lamps.
A person sitting in a chair with the ocean in front of them.
A man surf sailing out on the ocean.
Simple silver remote being held out in front of a television.
a brown piece of cake is sliced and on a brown table
A woman is holding a tennis racquet preparing to serve the ball.
an air force jet flying with a sign attached to the back of it
a group of people ride atop of an elephant
A dog sitting on a rug watching television.
A tall giraffe standing in the middle of a green field.
A pair of men sitting at a table in a diner.
A woman sitting on a bench with a bunch of suitcases
A baseball game with the pitcher in his follow-through and the batter preparing to swing.
A brick building with a clock on the outside.
A dog lies in the grass next to a Frisbee.
A kit has markers, a scissors, and other plastic objects.
A stop sign topped by two green street signs.
Numerous parking meters along the side of a street.
A bowl of pasta salad with onions and olives.
THERE IS A WHITE PICK UP TRUCK DRIVING DOWN THE HIGHWAY
A flock of sheep grazing in a big grassy field.
An old pickup truck sits outside among other classic cars.
a bunch of cows eating out of a food trough
This is an image of two bikes on a beach.
A large bus on a open city street.
air force members consulting near airplanes, while a man is near the planes.
A living room with a lighted floor lamp, sofa, wooden coffee table and end table.
A fire-hydrant on a street and near a van.
A truck with trailer for hauling rolls down the road.
A group of people standing in a field flying a kite.
Bananas and other fruit on a white plate.
Man with teenagers at outdoor setting enjoying food and drink.
A very cramped room with a couch and a desk.
several old fashion planes stilling in a field.
A cat sitting on top of a hard wood floor.
A man seated in front of a pizza.
A man riding skis down a snow covered slope.
Carrots, quash, green onions, and parsley all on one piece of paper.
A baby boy holding a stuffed bear animal in his hands
People sit at a table for a party.
A man that is sitting in a train.
Some apples and other fruits at a store
A man flying through the air while riding skis.
Two youngsters in orange tops have catchers gloves and are playing.
young boys in uniform playing baseball in a packed baseball field
A public restroom with several urinals, a black floor and red and yellow walls
A group of farm animals standing in the shade under a tree.
A glorious sunny day at the beach and a man sitting on a bench taking it all in.
A living room with a fireplace and an artificial tree.
A man is surfing in the water in a really big wave.
A large long train on a steel track.
A large number of people outside near some flowers and a road.
A woman is looking at a fire hydrant.
Several people with backpacks waiting to get on a bus.
A bathroom with a large mirror and walk in shower
a small cat watches a cheetah run on television
there is a female surfer riding in the water
A woman is reflected in a mirror as she works on her laptop computer.
A large passenger jet sitting on top of a runway.
two tennis players on a tennis court with a sky background
A group of people standing outside while some hold posters.
A young girl standing on top of a grass covered field.
a yellow and brown fire hydrant on the side of the road
A plate of food containing a sandwich and a salad.
A little girl eating a donut in her left hand.
A small private plane that is coming in for a landing.
A cat sitting on the awning above a stove
A person traveling on a crosswalk on a bike.
A gathering of people around a large table eating.
A group of people sitting in the snow while attached to snowboards.
A large clock sitting in front of a building beneath a tower.
A dog sleeps on the lap of his owner.
some guy standing on a beach with a surf board
a couple of street signs that are by some bushes
A man with a shaved head lights a cigarette.
A man in a pizzeria putting the toppings on a pizza.
A girl with a cast on her arm stands in a bathroom.
A person holding a red bowl filled with cake.
a person riding skis on a snowy slope
THIS IS A BEAUTIFUL PICTURE OF FRESH VEGETABLES
A cat taking nap on top of a pair of shoes.
Two photos are presented white people talking on their phone.
The sheep are grazing on the hill side.
A fire hydrant is alongside an empty road.
A child playing on his skate board at a park.
AN ELEGANT ENTRY WAY WITH ARCHED DOORWAYS AND GLASS AND A CLOCK
A bowl of mixed fruit on a decorated mat.
Red double-decker bus parked on a city street.
a few people on horses are riding down the dirt.
A slanted picture of a woman waiting to cross the street.
a vet is trying to check a dog's teeth
A man holding the strings of a kite on the ground
A woman holding a baby while she has something in her mouth.
A plate that has a glass and food on it.
A bookshelf full of cookbooks, bottles, and magazines next to a microwave.
A child is leaning out of his bed to touch a gadget.
A laughing man is holding a baby with a plate.
The snowboarders are taking a break in the snow.
a bathroom vanity and shower door with towels hanging on a towel rack
A clock clamped inside of a rusty vice.
a living room with couch, fireplace, tv, chair, and window
Three different vases containing several red tulip blooms.
A small bird is perched on top of the branch
A baseball player holds up his bat while a catch squats.
A young woman walks along the beach near the water.
a child and an adult pose for a photo
Roses and other flowers arranged nicely in old-timey vases by a shop window.
A man skateboarding in a skateboard park while another waits their turn.
A man spray painting a fire hydrant on a street corner.
Closeup of the head of a white cow on road.
A woman who is holding a tennis racket.
a baseball game with the batter catcher and umpire
a person in glasses is using a laptop
A cherry pie sitting on top of a piece of tin foil.
A dog leans out of the window of a car.
A shelf with pileed hats next to a teddy bear.
A woman on a cell phone near a man.
A Fiji Air Pacific plane is flying through the sky.
People are walking around a plaza that has a sign that reads "Spring in the City".
Two men trying to get to a soccer ball in a soccer game
A man plays video games in a cluttered living room.
there is only one horse standing on a large empty field
A young lady laughing in a kitchen with a cake in front of her on the counter.
The girl is surfing a small wave in the water.
A laptop computer and mouse sitting on a table.
A group of women cooking and preparing food in a kitchen.
A cat next to a box full of lots of trinkets.
A person on a yellow motorcycle is turning around a street corner
People dresses as zombies boarding a bus at a bus stop.
A man and woman holding up cellphones near each other.
The brown  bench is in the woods
The woman is posing for a picture while skiing.
A male surfer carrying a white board exiting the ocean.
the side of a passenger train at a train station
A man riding a surfboard in a wet suit in the ocean.
A dog is in a living room lying on the couch.
A man wearing a cap, walking alongside a bicycle.
A kitchen filled with black appliances and a table.
A man playing tug o war with a dog over a white frisbee.
A plate of food sitting next to a glass of orange juice.
An Olympic competitive skier furiously rounds the corner.
A man adjusts his tie as the subject of a graphic.
A plate of food including, grilled meat, baked potato, carrots and lima beans.
There are some bananas on a dinner table
A plate filled with broccoli chicken and fried rice.
The view of a crowd of shoppers and vendors at a market.
A man that is standing in the dirt with a bat.
A person on a field flying a kite.
A pretty young lady kneeling down to pet a cat.
A little girl in a red shirt and blue dress standing on a road.
A few skateboarders performing tricks at a skate park.
A type of bread is on a plate next to a variety of sauces.
The living room has an old style fire place in the corner.
A woman standing in the living room with a couch and t.v.
A spindled bed sits inside of a wall papered bedroom
A baseball player pitching a baseball on a field.
A man on skis with ski poles has just descended the mountain.
A man riding a wave on top of a surfboard.
Giraffe relaxes in the shade in the park
A sign cautioning the likelihood of cattle crossing.
A group of giraffes is standing next to a fence.
A harbor in a city is full of boats.
A street filled with blurry traffic and traffic signals.
A tennis player is about to hit a ball in front of a crowd.
a small airplane sits empty on a runway in the mountains
A wooden table topped with four white bowls.
Trays of pastries and sandwiches beside a bowl of soup.
some people some snow and some trees and one person is taking a picture
A messy bedroom with items covering the floor.
Two large bags of luggage in a hallway.
The train is going down the railroad tracks.
A woman who is in a field of dirt flying a kite.
Three women posing for a picture in a dining hall.
A bench sits between two trees in a flooded area.
Two people with bicycles standing in front of a field of flowers.
A giraffe walks near the gate as people look on.
A group of wine bottles sit next to a glass.
A vase filled with lots of different colored flowers.
An orange has been sliced in half and placed in a red bowl.
A man with a suitcase walking through a crowd
A couple of zebra standing and laying on a dirt field.
The cars are parked on the side of the street.
The steak and broccoli is next to a bowl of soup.
A bedroom with windows with bright lights flowing through.
A man and a child who are in the snow.
Three male skiers standing on a ski slope
A man on skis hovers over a series of small hills covered with snow.
a bird that is sitting on a branch
The person is riding on the back of the multi-colored truck.
a person wind surfing on a large body of water
A small black and white dog sitting on a yellow davenport.
a young horse and its mother graze in a field
A large group of people are sitting at a long dining table set with plates and wine.
This is a long red bus behind another one just like it.
a double decker bus stopping to pick up a passenger
Two uniformed men posing while holding pastry items.
The side of a train showing the entrance and two doors.
Three giraffes lounging around in a grassy zoo enclosure.
Several people are sitting around a lit birthday cake that is under construction.
Small child playing the Nintendo Wii on carpet
A herd of cattle grazing on a grass covered hillside.
People watching a big blue kite on a cloudy day.
there are several bullet trains on the track
A woman poses for a picture while eating
A water-stained cathedral clock-tower enveloped by various green vines.
A large balloon on a beach with a black and white dog looking at it.
A small set of silver scissors used with electronics.
A cat sleeping in a sink next to a faucet.
Three zebras are huddled together in an enclosure.
A cake says Happy Birthday with an image of a horse.
A large vase sitting on top of a wooden table filled with flowers.
A clock sitting on top of a street sign.
A baby that is laying down wearing a tie.
Several people are getting ready to enter the water for surfing.
Sea beach with a bench. Four ships are seen in the sea.
A person in snow gear skiing down a snowy hill.
A baseball player holds a bat across his chest
A desk with art work and photos displayed on it
A woman that has a racquet on a tennis court.
Some dogs stick their heads out the car window.
a large living room filled with a lot of furniture
A large, white cow walking through the streets of a small town
A toilet connected to a wire, next to a speaker.
A yellow bus is driving alongside a small white car.
a baseball player with a bat on the field
A man is skateboarding on equipment specially made for it.
A man holding a tennis racket about to hit a ball.
A man who is playing video games by himself.
A big pretty rainbow over a long empty road.
doughnuts stacked on top of each other in a bowl
Two microwaves stacked on top of each other in a kitchen on a counter.
there is a very tall giraffe standing under a pole
some snow skiers are posing for a picture
A computer monitor that is in front of a keyboard.
A gold clock that is on the table.
a woman sits on a bench and talks on her cell phone that is weighted down with key rings
A snowboarder goes airborne with a mountain in the background.
A truck that is sitting in the street.
a young boy holding onto a harness for a cow
A table topped with a bowl of soup and a plate with a corned beef sandwich.
The traditional white sink features two faucets below the mirror.
A man being assisted with a tie by a lady.
People are in a field playing with a frisbee.
A white laptop computer lays on a carpeted floor and a gray and black with white footed cat is on it.
Cattle with horns and red hair standing against a fence.
A jumbo jet on the runway waiting to take off.
There is a table covered with various displays of cupcakes
A living room has guitars, shelves, and a painting.
Someone has drawn a face on the yellow fire hydrant.
a large black giraffe that is outside by some kites
A metal rusted bed frame in a dilapidated room
A pile of apples lying underneath a tree on the ground.
A white bowl filled with meat and green broccoli.
A two floor bus picking up some passengers at a bus stop
Racer riding a dirt bike on a race course.
A pizza on a pan with a spatula.
Many sheep graze in a grassy pasture in a valley.
A man standing on the railing of a boat near the shore.
Group of people in for a group training session
A man is sitting on a bench next to a statue of man with dog licking his face.
Steam rising from a manhole cover in the middle of a street with a yellow fire hydrant in the background.
A teddy bear sitting on a bench in the shade
An eighteen wheeler with a patriotic paint job sits in a parking lot.
A very tall clock tower with weird arches hanging off of its sides.
A toddler, a girl, and a man pull a ribbon in the grass.
Horses walking through the yard toward a barn.
The skateboarders are practicing their tricks on the stairs.
A man on the beach kicking the sand.
A kitchen with a standard stove top and wooden cabinets
A couple of women standing on either side of a man wearing glasses.
Two men retrieving their Frisbee from the creek.
A sign that warns of speed bumps ahead.
Cat relaxing on blanket, appears to be stretching
a mobile phone, tv remote, game controller and chips on a blue table cloth
Two giraffes are eating grass in the plains.
A man that is jumping in the air with a racquet.
Two dogs on a bed in an RV.
A snowboarder soaring through the air on a sunny day.
Two female cows looking forward outside in the grass.
Orange seats on a train with Yellow doors and lime green floors.
An intricately decorated bathroom with a peacock light lit.
there is a blue and silver train that is stopped on the tracks
A black and white photo of two birds standing on seaweed.
A commuter train sitting at a station while passengers stand on the platform.
A person lying on the ground with a suit case on top of them.
A fruit that is still hanging from a twig.
Two pieces of french toast with syrup on a plate.
Several umbrellas and chairs sitting on a beach.
A hotel room showing a bed, desk, television bathroom.
Girl moving while holding a Wii remote in a living room.
A man in a suit and a tie with a cell phone.
A kitchen refrigerator covered in various colorful stickers
A wide angle view of this hotel suite
a bunch of people are standing near a bus
Four people are in a room using four laptops.
An elephant standing in the middle of a rocky environment.
A group of children sitting on a bed together
A close shot of a plane flying in the air.
A blender full of smoothies and two glasses on a kitchen counter.
The decor in the house is very elegant looking.
A view of a bathroom showing vanity, toilet and shower.
A baseball player is hitting the ball on the plate.
A bed in a purple bedroom with a wooden dresser topped with a mirror.
Some very fancy looking cocktails with fruit and veggies.
A round clock on a colorful tower near a harbor.
A dog is wearing a Santa hat for a portrait.
A yellow freight train is traveling on a track
A man and boy sit in chairs and enjoy breakfast.
a close up of a street sign with trees in the background
A young man with a skate board standing in a graffiti covered area.
Home library area, bookshelf in background with several laptops, notebook PC and two VDU monitors and keyboard for desktop in foreground.
A man sitting by produce while another man points to it
There is a plane sitting at the airport.
a covered table with fish on a table
A woman sitting in a bathtub wearing a bikini.
A young child in a snow outfit and goggles with skis on in the snow.
A bear climbs through some plants and onto some rocks.
A sheep standing on a green grass covered pasture.
A pile of books sitting on a table underneath a clock.
Four men carrying a long board that narrows at the ends
A stainless steel microwave with something in it
A red fire hydrant on a city street.
An older photo of a woman on a tennis court posing with her racquet.
Japanese food of meat and vegetable are on a plate.
People at a table with food and wine.
A young girl with a helmet stands on a skateboard.
there is a red stop sign and a white truck behind it
Young girl on large grassy field attempting to fly kite.
A group of people playing with a green disc in a grassy field.
A cat laying down in a bathroom sink.
Two animals are standing on a mound of dirt.
Group of seagulls flying around a fishing boat.
Man and woman walking over a bridge in the rain and high wind.
A couple of vehicles that are sitting in the street.
A train driving past a building pouring out black smoke.
A birthday car with a picture of a black bird on it.
A young man running along a beach next to the ocean.
two little kids playing soccer battle over the ball
A bride and groom cutting into their wedding cake together.
Towels on a towel rack of a bathroom and a towel mat on the floor.
A close up of two time expired parking meters
A bedroom with a bed, nightstand, windows, and dresser with a television atop it.
MAN STANDING IN GRASS WITH LOTS OF MOUNDS AROUND AND A FRISBEE COMING TOWARD HIM
A runway that has a jet plane and a truck on it.
Zebra walking on road and other animals on grass.
A light post with a no parking sign posted on it.
A man in a tie and shorts standing outside of a house window
A surfer stands on their board as another surfer watches.
A dog laying next to a large brown teddy bear on a wooden floor.
A pizza that is cooking in an oven.
Firefighters gather around a badly burned moving truck.
Two adults and one dog standing on a snow covered road.
A cat that is curled up on a laptop
a airplane that is parked on a runway
A man wearing pink underwear is sitting on top of a stove door looking surprised.
A tower with a clock on its face stands in the sky.
A table topped with broccoli, apples and other produce.
A public restroom with focus on three urinals.
A pile of tiny sandwiches without crusts sits beside a pile of crusts and various sandwich fillers.
A train approaching a station where people are waiting to board.
A bus sits next to a tree and sidewalk.
There is a pizza that is on the table in the room
A man taking a swing at a tennis ball
A group of people watching a boy skateboard.
A man on a cellphone using a water hose.
The young woman is making a face at the horse.
Four giraffes are in a grassy area with several trees.
One boat sailing next to one canoe in a body of water.
Girls competitively playing Frisbee in a green field
Doofy young man shares his umbrella with an Asian woman.
hounds running in front of a horse must be a fox hunt
A man flying a kite on the beach next to other people.
A bathroom sink under a mirror and lights.
a train on a track above a body of water
A person looks at the camera while holding a black cat.
a stack of suitcases out on the street
Boats are moored near a city that borders a large body of water.
Kids playing tennis on a clay tennis court.
animals in a field of tall grain near a tree
Tennis player returning volley during match play on grass court.
Giant dolls sitting in giant beds next to a man wearing an orange safety vest.
A person covered up in warm clothing sitting on a bench, with two bags next to them.
A bathroom sink designed as a bowl next to its reflection in a mirror.
Woman walking on train platform as train filled with passengers prepares to leave.
A commuter train going through a tourist area.
A dog with a pink object in its mouth.
some zebras standing on a hill while eating some grass
a car with a cargo full of steer and symbols painted on the side.
very many teddy bears with their price labels
Hamburger on a bun with ketchup and onion.
A guy riding a surfboard on the water.
A person holding a toothbrush to their face in a crowded room.
A person on a skateboard riding next to a road.
there is a small bird that is standing on the branch
A couple walking in the snow while under a purple umbrella.
A male on a snowboard on a rail in the snow as five time-lapsed stills in single image.
The fire hydrant has been made into a fountain.
A green candle and a vase on a table with one chair
A giraffe is standing near a fallen log.
A cat sitting underneath a wooden stool next to shoes.
a track moving on the road with two people
A plate of pasta and bread sit next to a beer bottle on a table.
A dog is sniffing a chew toy on the floor.
A living room filled with furniture and an old fashioned TV.
A lady in a blue life jacket skiing.
A couple of large gray elephants standing next to each other.
A bird sitting on snow covered ground next to a statue.
A yellow hazard sign sitting on the side of a road.
A grilled hotdog with mustard and relish is sitting on a white plate.
A cat staring at a camera laying on a floor next to a shoe.
A small kitten is playing with the tv set
A large cabinet in a corner next to a picture.
A living room with a chair and large couch sits in front of large bookshelves with computers on top.
A road sign that says reduce speed for motorcycles.
A man riding a skateboard over a stone block.
The man is holding a pink iced doughnut.
A fire hydrant covered in leaves sitting in front of a tree.
A black dog sleeping on a yellow and white striped comforter on a bed.
A view of a kitchen with a  very elegant look to it.
Three people riding skateboards down a hill next to grass.
Local fresh fruits and vegetables displayed for sale in a market
a man and woman stand in front of a cake
A bathroom sink and shower separated by an open doorway.
chopsticks holding broccoli and noodles in a white dish
Women are selling bagged and fresh bananas under a colorful umbrella on a street corner.
A giraffe that is standing on all fours on a dirt surface, in a fenced in area.
there is a bird that is sitting on a branch
A man sitting on a horse while rubbing him and kids are rubbing him also.
A plate topped with rice, broccoli and meat.
There is a dog sleeping on a couch in a cluttered room.
A baby holding a busted up umbrella while sitting on the ground next to a pile of garbage.
a woman sitting on the back of a pink scooter in the road
Two sheep stand in a field with mountains in the background.
A small room with a television screen monitor.
A cute little girl eating a hotdog almost as big as she is
A Wii remote and nunchuk that someone's hand is holding on to.
A woman posing for a picture in a kitchen.
A large pizza sitting on a counter next to a glass of beer.
There is a man smiling with a banana in front of his mouth
A boy in a chair with a teddy bear dressed in a railroad outfit.
A large group of people protesting outside in a parking lot on a sunny day.
two people walking in an open field with a sky background
A desk with two computers on top of it
a man is at a snow slope jumping with a snowboard
A red and clear small glass filled with candy on a desk next to a green plant.
A man and woman are preparing pizzas on a table.
The girl is running through the grass in a costume.
A market with a variety of fruits and vegetables.
A young person riding the waves on a surf board.
a truck is parked with some rafts by the water
A white cat sticking his head out through something.
Two women with loads of green bananas on dirt ground.
A wooden swing hangs above plush, green, grass.
a female in a black jacket is riding a brown and white horse
The train is travelling down the tracks of the road.
People are using a boat to travel through a flooded town.
A plate of food with pasta, mashed potatoes and broccoli.
Two girls involved in some sort of a game.
a group of zebras standing around by a fence
A large tall tower with a clock on top.
A traffic light sitting on the side of a road.
people sitting in the grass with some of them checking on their cell phones
A woman is about to hit a tennis ball.
A man and his child holding onto their skis.
A man in a shop that sells bottled liquid tapes up a paper bag
A CARGO AIR PLANE IS PARKED ON THE RUNWAY
A man looking back while standing in a market below a clock.
A boy stands on artificial grass holding a Frisbee.
A female professional tennis player dressed in white.
A large white stove sitting against a wall in a kitchen.
The baby girl is sitting in a high chair playing with broccoli.
Three pizzas with nontraditional toppings, a statue and a bottle of wine.
The young boy is sitting on the couch playing a video game.
A subway sign at night beside Big Ben.
A woman holds a Weiner in each hand.
An old photo of four men in a boat with a bicycle
two people using clear umbrellas that have fringe on them
A man standing in front of a kitchen counter using a laptop.
A zebra chews a flower in a fenced in field.
A painting of a horse drawn carriage traveling through the country.
This wall oven has just cooked a homemade pizza.
Crowded market street filled with pedestrians holding umbrellas in the rain
A beaver sitting on top of a tree stump.
Two skateboarders practicing their flips on a wooden ramp.
Two people in the water on surf boards on a wave.
A glass plate holds crackers, cheese, and vegetables.
A man pointing at something in front of a bus.
A woman sits in the grass talking on her cell phone.
A girl smiles at the camera while making candy.
A cat standing in front of a television screen with a picture of a fish.
A toy model of a kitchen that has a refrigerator, stove, oven and baby playpen.
Tennis player prepares to play with racket in his hands.
A man smiles as he sips wine in an outdoor restaurant.
woman wearing a black coat and boots sitting on wooden bench
A man cooking on a grill with a fire.
Three Zebras and a Giraffe in an enclosed area.
The eight lane street is packed with cars in traffic.
A woman serving a bundt cake with candles to a child.
A person in black shirt walking on sidewalk with an umbrella.
Commercial plane taking off from a runway with water in the background
There are plates of cheese, crackers, and sandwiches on a table.
A group of men are waiting for their bags to be unloaded.
A zebra eating grass by the barn gate.
A person holds a cell phone inside a car.
Three birds are looking around while on the ground.
A puppy is laying on a blanket with toys.
a family sits down to eat at a lighted dining room table
This suitcase is full of CD's and apparently they are for sale.
Large flowers are sitting inside of the vase.
a zebra drinks out of stainless steel tub
A guy and woman dressed up for Halloween.
A man is playing a game of tennis.
There is a bathroom with green walls and a white sink and toilet
A pair of shoes with a baby kitten inside one of the shoes.
A person on a snowboard jumping in the air.
A bunch of hot dogs in a bowl with beer being poured on them
A giraffe surrounded by a group of zebra in the grass.
A monitor and keyboard sitting on a desk.
two cows eating the grass on a urban area
Some trash sits at the side of the road at an intersection.
A group of people watching a woman jump a horse over obstacles.
a steam engine train driven down a rural area
A man looking out of an airport window at planes.
Empty benches in the park after a storm.
A group of young children sitting next to each other.
A traffic light is sitting next to a pole
A very cute little cat standing on a desk.
There is a plate with one slice of bacon a half of orange and bread
An empty bus is parked on a street.
A woman preparing a young boy's lunch in front of him.
Three well dressed people are standing and laughing together.
A group of people looking at stuffed animals lined up in a street.
There is some trash in a kitchen sink.
a cupcake with a blue umbrella in it
An outside table and chairs with a pink lamp.
People are standing in a field flying and watching kites.
An empty kitchen with wood-paneled cabinets and black appliances.
A man is flying a large kite in a field.
A man that is standing up holding a surfboard.
a girl is sitting on a horse outside
A woman sitting in a chair while holding a purse.
A giant inflatable shaped like a spiked ball placed on a field.
A group of people carrying ski equipment while walking on snow covered ground.
Tennis players in action on a court with shadows.
a person holding a doughnut up to their mouth
A marble table with plates of food and utensils.
many people are trying to avoid the sun by holding umbrellas
An Emirates airplane flying through the sky
A bird that is standing on a keyboard.
A white plate topped with two pieces of steak and a salad.
A group of people watching something with one man looking off into the distance.
A woman crossing a city street while carrying groceries.
A sign post with signs that read "Maciel Ln" and "Wonder Stump Rd".
A bike with a box on its back wheel is parked
A lady in a blue dress is posing for the camera in front of her plate of food.
a couple of men that are sitting at a table
a baseball player swinging a bat on the field
A family watches two boys singing into microphones.
Grey and white cat sleeping on a pillow and a sweater
A man holding a tennis racket raising his arms up in the air while two women clap.
A puppy rubbing its face on a pair of shoes.
People that are sitting on the grass eating food.
A woman walking down a street at night holding a red sheep umbrella.
Two men in  suits taking a picture together
a close up of many fruits on a table
A couple of giraffes that are blocking  the path of the safari.
A pair of giraffes stand under a canopy together.
There is a full view of an outdoor area and it is nice.
A small dog standing next to a table with a white plate on top of it containing two chocolate donuts.
A group of people are standing or sitting around a table taking pictures and looking at a phone.
Motorbikes and other vehicles move along a one-way city street.
The little girl is petting the horse in the barn.
a blue and white plane is on a runway
A cat laying on a desk with a book and laptop.
looking up to a clock on the side of a building
A person standing on a white square playing a video game.
Two women on cellphones laughing with trees in the background.
A baseball cap with sunglasses sitting on top of a baseball glove.
We are looking at a propeller plane flying in a cloudy sky.
A group of sail boats on a small pond.
A toilet with many buttons is sitting with the lid up.
Child standing in front of a stop sign on a suburban street corner.
A metal bicycle on the top of a wooden bookshelf.
A lady is standing outside in front of a bus station.
The plate is piled with rice next to a whole apple.
Man stands inside a building talking on his cellphone.
A rather large herd of elephants, including a baby.
Two wine glasses and wine bottles sitting on a wooden table.
A peep hole view of a man biting a sandwich.
This tennis player is watching the ball after hitting it
A young woman that is sitting on a couch with her leg resting on some pillows.
a yellow and white bird is closing its eyes
A local bar has appetizers and tapas to enjoy the game in the background
Multiple boats are docked on the water by a pier.
A snowboarder lies face down in the snow.
A tall stack of suitcases arranged largest on bottom to smallest on top.
A child reading as his mother and dog look on.
A man wearing a cowboy hat, riding a horse in a parade.
A boy laying on a small wooden bench with an umbrella held up over him.
A variety of luggage is stacked in a compartment.
A group of zebras standing in tall grass
A giraffe walking in the grass past trees
A giraffe bending down to drink water from a pond.
a man helping a young girl walk on snow and ice in snow shoes
A lamb that is around a group of people.
A girl is posing by something that was just taken out of the box.
A jet and a small unknown aircraft are flying in the sky.
A spoon and a blender on a counter.
A bowl of bananas being placed in the middle of a table.
A large black cat sits on a desk near a laptop.
Noodle bar near cookie on plate near glass of milk.
A close up of a not so happy white kitty.
Two slices of pizza sitting on a ceramic plate beside a box of cheesesticks.
there is a small glass vase with white flowers in it
A person skiing on a mountain, in the snow.
Woman laying down across her personal bathroom with her feet hanging over the tub.
a cat looking out a window as one sits by the laptop and looks at the camera
Trees in a park are in front of some parked buses.
A man and woman traveling on the subway with surfboards.
A girl smiles from the backseat of a car on the phone
A young girl standing on a surfboard ride.
Two young ladies that are dancing in the room together.
A woman looking at herself while brushing her teeth.
there is a man wearing a red and white uniform that is at bat
A woman who is sitting on a horse.
This kitchen has a black stove, stainless steel refrigerator and white cupboards.
A person riding skis down a snow covered slope.
A man standing next to another man wearing headphones.
A clock is built high into the side of a tower.
There is a man talking on a cell phone.
An old man sells a variety of kite string spools.
A little boy sitting on a suitcase on the floor.
A black and white picture of an overturned truck in the middle of a street.
A large billboard that has words in a foreign language.
Kids running in the grass after a soccer ball during a game.
The back end view of two zebras standing at a fence.
A woman is on the tennis court holding a racquet.
a large pizza is in a black pan
No image is being shown on the page right now.
High speed train stopped at a station underground.
A boy performs a trick on a skateboard
A batter holding a bat at the home plate.
A herd of sheep standing on top of a lush green field.
An Indian man straddles a horse beside a stone building.
There's a sideways traffic light next to a building.
A group of fluffy sheep in a big grassy field.
A stir-fry wok is filled with cooking vegetables.
A baseball player standing on home plate with a bat.
A woman leading a horse inside of a building.
A parked white, green and red double-decker bus.
A kitchen with brown cabinets and plenty of space.
Baked pizza with meats and vegetables displayed on table.
Steam rises from a bowl of colorful food, while a glass of juice sits on the sill in the background.
people on a table at the beach eating
a black seat on a white toilet in a restroom
Skateboarder jumping down concrete steps outside on his board.
A clock tower is seen in front of a tall building.
A colorful glass vase sitting on a table.
A snow covered area with a car with its brake lights on in the distance.
A giraffe in front of the doorway of a building looks around the corner, casting a shadow on the building on a sunny day.
A yellow motorcycle is parked on a road with many bystanders
Many zebra and one wildebeest on a savanna
Man placing a white container into an oven.
A stuffed animal sitting in a pizza box with some slices of olive and cheese pizza.
A blue and white vase filled with flowers.
People are getting ready to fly kites in a park
A crooked one way sign pointing into the ground
A skier is seen riding down a hill.
top shot of a boy sitting on the floor eating pizza
A kitchen with all the appliances such as a fridge, microwave and stove.
Three riders race around a track on dirt bikes.
Two rows of bicycles parked side by side on a sidewalk in front of a building.
A tennis player is on the court preparing to swing.
A tennis player holding her racket in the air.
A bench is on a deck overlooking the water
there are two brown bears that are playing together in the water
A large clock below a flagpole with a flag on it.
a united jet liner loading passengers before take off
Vase with water holds a bunch of flowers in front of window
A typical living room with all the furnishings.
a sandwich laying on paper and on a table
A person takes photos of sheep laying down.
a white plate topped with a piece of chocolate covered cake.
A family of elephants is walking along a dirt road.
a person riding a surf board on a wave
A bathroom that is all off-white with a mat in front of the shower.
A bathroom has a custom bathtub with no curtain
A computer desk with an old pc and lots of clutter.
3 adult elephants stand with a baby elephant behind a fence.
Two partial pizzas with cheese, olives, green peppers and tomatoes.
A couple of zebra standing next to each other.
A tennis player with both feet off the ground leaping for the ball
A kid standing at a table eating some food.
A zebra that is putting its head on top of another zebra.
An old man is getting ready to blow out some candles.
Several buses are lining up on the street.
a small dog tied to a bench on a leash
a woman is sitting down talking on a cellphone
A woman sitting at a table with a glass of wine.
A girl in a white dress at the beach with two surfboards.
A light brown bear sitting down near large logs.
The taco pizza has a lot of olives on it.
A table with four bowls of food on the top.
A man sits at an outdoor restaurant table eating a soup with chopsticks.
A plate filled with a fish, potatoes and broccoli.
A fluffy cat laying on top of a white laptop computer.
People riding their bikes down the middle of the road.
Woman walking with a horse near a standing man.
A couple of black bears standing on top of a rock area.
A horse in a stall with three people.
Three very different giraffes at a big zoo
A cat curled up in a shelter made of printer boxes.
RED DOUBLE DECKER BUS WITH CARS IN BACKGROUND
A steam powered train pulls out of a busy station.
A cow is standing in a field in front of a building.
An old looking two level bus in a parking lot.
A slice of cheesecake sits beside a fork on a plate.
a bunch of people watching as two people play video games
A man on a scooter sits beside a stop sign.
There is a woman holding and playing with a baby.
A boy is playing frisbee golf in a park field.
White decorated porcelain vase in front of others.
A woman at a product show holding a cell phone
a large kitchen that has a stove and a dishwasher
A couple of cows walking submerged in some water.
Young, tagged calf looking through a barbwire fence.
A little girl in a pink snow suit on her skis.
A train is pulling into the station.
A baseball game in progress with a full crowd.
Sheep stand outside of a wooden building on a snowy day.
A makeshift tent is constructed at a camp site.
A woman posing with a stuffed bear in uniform.
A bus stop with a slightly damaged bench.
The young woman is sitting on the bed fixing her hair.
a close up of a young baseball with a glove
A woman walking her bicycle with dog walking beside her.
a shower door a sink a mirror and an outlet
sliced orange and a knife resting on the cutting board
A large bus that is sitting in the road.
Person sitting on elephant walking in a muddy river.
A mother bear following her cub across a meadow
a person sticking a knife into a cake on the table
A man standing with his arms folded while smiling.
A woman holds up an electronic cigarette underneath an umbrella.
A model of a kitchen with a sink dishwasher stove refrigerator.
two small birds on a bench with a blurry background
A man on a snowboard caught in each phase of his trick.
A man, a lady, and a youth together and enjoying a pizza.
A person holds a skateboard and stands on the sidewalk.
a group of men holding a long surfboard on the beach
A wooden street sign in a residential neighborhood.
A skate boarder rises on the crest of a concrete wall
Two gentleman in formal suits, one of them is adjusting the collar of the other.
A shower has a removable shower head and a glass door.
a woman standing at an outdoor display of assorted fruit
a close up of a dog on the ground
Animals grazing on a lush green hillside covered in grass.
a close up of a persons hand holding a large knife
A pizza in a box in a drawer of a motel desk where a TV displays "Inspired By A True Story"
A red and green bird on a perch eating.
An office area set up with multiple monitors
Surf boarder finding waves in a river designed for surfing.
A glass vase that has dried flowers in it.
A man is standing in a river with a cow.
A garden with vegetables planted in it
Banana bunches hanging at an open air market.
A large blue bus parked at a bus stop in a city.
A pizza with an assortment of toppings such as lettuce and radicchio.
A empty bench on a snow covered beach.
This man is standing in a kitchen eating food.
A modern sink is on top of a bathroom counter top.
Little girl with black hat sitting on a pony with two girls beside.
A group of people sitting around a dinner table.
a cat walking on a floor next to a contruction area
Food is shown in a display case at a deli.
young child is eating a powdered doughnut at the kitchen table
A counter to an office area with an orchid in a flower pot next to a balloon sculpture of flowers.
Cars and a motorcycle waiting at an intersection.
An empty street and stop sign at night.
A clay rendition of roses in a pot are displayed.
a person standing on grass holding a large box of pizza
A zebra and foal are standing on the ground.
There are two boys preparing food on a table.
An old picture of a building and trucks outside the building.
A gray cat standing on top of a black car.
A plate of food and drink on a table.
A young woman holding a green baseball bat on a field.
A small backyard garden with freshly grown vegetables.
a plate filled with pepperoni and mushroom pizza
A man is racing a black motorbike around a race track.
Man in black and white outfit swinging at a tennis ball on a court.
An airplane flying in the blue sky with some clouds.
A woman is approaching a tennis ball with her racket.
A man flying through the air while riding a skateboard.
a short tree in front of a pink wall
A group of people at a busy restaurant and a close up of a restaurant dish on a white plate.
Some young children are looking at the black device.
a street corner in a town all bright from lights
A man standing on a tennis court holding a racquet.
The hotel room is clean and ready for guests to use.
A large bowl of food is sitting on the table.
A girl is holding her phone and looking at it.
A large body of water next to a shore filled with clutter.
Several zebras are walking across a dirt covered area.
A horses head handing over a iron fence.
A close of on an entree containing meat and vegetables.
A bathroom has two sinks and a bathtub in the middle of the room.
A group of people on the green grass about to catch phrase.
A tennis player gets ready ready to hit the ball.
Several elephants are walking up a dirt hill.
A wet window blurs the image of an apartment building beyond.
A group of children playing ball in a field
young man catching frisbee right arm under left knee.
a jetblue plane sits on the tarmac at an airport
A picture of a street during the night.
A stuffed animal that is next to some hot dogs.
A motor bike parked on a city street.
The rhinoceros lays down next to the zebra in the safari.
A person sitting at a picnic, eating some food.
A man on a skateboard who is performing a trick.
A polar bear walking along on icy ground
A pizza with fresh mozzarella and basil on top
some one with a glove on holding a sparkling drink in the cold
A plastic horse standing on top of a chair.
A woman unpacks a picnic basket with her teddy bear.
there is a cat that is laying inside of a sink
A banana and a yellow apple in a woven basket
A bicycle parked next to a motorized sitting scooter.
Cat laying on top of someones arm while using the computer.
A city train stopped at a boarding station.
Two purple teddy bears one with pink bows sitting in a shopping cart.
An open white toilet next to toilet paper in a bathroom
A train parked inside of a train station next to a person.
A golden colored Shar Pei dog and a dog of indescribable heritage sitting on dog bed.
Teenage kids playing a Wii Game, while others watch.
A teddy bear lying face down on a bed on a pillow.
A pizza covered with vegetables is on a tray near plates.
A dog sits on a couch with a book.
A very large elephant standing near two younger ones.
a baseball player swinging a bat on a field
a bunch of cows lay down on some grass
A child eating a sandwich with relish on it.
a stadium of people watching a tennis game
A man is playing Wii in his office.
A bedroom with a bed and small tables on each side
Two sheep standing in a field against the sky.
A man is performing stunts on a skateboard in a parking lot.
Fans watching a baseball game on the field outside.
people standing around with some holding onto to what look like drums
A snack truck in the street in front of a building.
A group of men on top of horses playing a game of polo.
A person cuts grass in a yard using a small pair of scissors.
A red flat bed truck with a load of lumber on the back.
People are watching a man cut a birthday cake.
A notebook computer by a window with an image of the same window on the screen.
two people on a tennis court at night
there are two pictures of a small black and brown dog
A group of people skiing around a snow covered slope.
a man riding skateboard down the side of  a hand rail.
A kitchen with an automatic dishwasher and window.
A snowboarder is seen from below while jumping.
Three giraffes resting under a shaded area at a camp.
A white toilet sitting in the corner of a bathroom.
Home plate at a professional baseball game, batter not quite ready.
Some people sitting around at various tables, with a railing dividing them
A pizza in a pizza box cut into eight slices.
Two men stand besides an elephant and gesture toward a crowd.
An old blue iMac with a sad Finder face wallpaper
A woman holding a smart phone at a table.
A polar bear in water puts his paw on a cage.
A woman with a cup of coffee and a donut smiling.
a big train that is on a rail way
A bird perched on a power line looking at a house
A white and brown horse pokes his head out of a stall.
A train moves along train tracks in a grassy landscape.
A man holds and gestures toward a sandwich
a skate board being picked up off the ground by a person
A person has a stuffed bear on their wrist.
A puppy sitting on top of a sneaker.
a uellow and blue bus is driving down the street
Two zebras are on a grassy brown field.
A young man sitting next to a young woman both of which are holding Nintendo Wii controllers.
A blue and yellow train is sitting on some railroad tracks.
A train bears the numbers 4790 painted on the side.
A group of three men standing next to each other without shirts.
a couple of people are flying kites on the beach
A plate with chips, salsa and a burger, on a table with a glass of beer.
A palm tree is on one side with an evergreen tree on the other side and snow capped mountains are in the distance.
A small herd of zebras walking past the camera man.
A collection of teddy bears bearing Swiss flags
a man putting some cheese on top of his pizza slice
Chickens are feeding on the ground while horses hover above.
Black and white photograph of a man using a cell phone on the street
a modern flush toilet in a bathroom with tile.
A photograph portrait of a male teen in coat and tie.
a motor bike is parked outside on a road
A bus driving down a street with a bears face on it's front.
MAN ON SKIS STANDING STILL POSING FOR A PICTURE
a dog poking  its head out of a car window reflected in another car's rear view mirror
Two elephants are chained to the outside shed.
A toddler has a baseball and a mitt and going to throw the baseball.
A wooden object placed next to a tree on the side of the road
A lot of building on each side of the road, with a very curvy road in the middle.
an image of a boy that is lying under the bed
A large yellow double decker bus driving past a guy riding a bicycle.
a bathroom with a toilet a stool and a toilet bowl cleaner
A surfer wearing flippers skims along a wave
Red and yellow train cars hold gravel on a train track.
A mattress top on a bed in a small space.
A bare loaf of chocolate cake sitting on a counter.
An arched doorway leads to a furnished living area.
a close up of a person talking on a very old cell phone
a man is sitting down as a child pretends to be cutting his hair with fake scissors
A group of tourists riding a tour boat down a river.
several sheep graze on grass near a tree with a protector around it
LARGE DIMLY LIT BATH ROOM WITH A DOMED CEILING
A picture of the president standing at a podium
a close up of a person with a large sandwich
A replica sculpture of a baseball player holding a bat ready to swing.
A man stands in a screened in area with a cell phone to his ear.
A batter prepares to swing at a pitch during a game.
Small bathroom with lights on above the sink.
A person that is standing with objects in their hands at the beach.
Some giraffes and ostriches in a grass field with trees.
close up of a thin crust pizza with tomatoes
A giraffe's head is framed by the posts of its enclosure.
a purse a pair of shoes and a horse behind a display glass
a man sits at the table an leans over to blow out the two candles on a cake
A man snowboarding near a frozen pond and a tree.
a cat laying on the couch next to a remote and a pillow
A plate of food including broccoli, sweet potato, and pork.
plat bread pizza with BBQ chicken on it
A zebra standing on dirt area with fence and bushes in background.
A fighter jet is flying through a clear sky.
A young boy with his tennis racket in hand is waiting for the ball.
A large teddybear float is on snow skis.
A picture of a person sitting down under an umbrella.
A collection of trunks are piled against a wall.
A plateful of meat, fruit, vegetables and bagels
Two men and a woman are standing in an elevator.
Several people going down a snowy street in skis.
A man is taking a big bite of a folded pizza in a cafe.
A man is playing catch with two children and a dog.
A man standing next to a large brown horse.
A towel hanging on the bar in the bathroom.
A kitchen has black appliances, wood cabinets, and a large window.
A woman stands in a room that has two small beds in it.
A chocolate dessert slice sitting on a clear plate accompanied by a fork.
Horse drawn carriages lined up in the street.
A train parked at a train station next to a loading platform.
A bike is shown hooked up to a rack.
A man with no shirt on a skate board.
Elephant standing in an exhibit behind a fence with a park keeper.
Two Pug dogs setting on a green park bench wearing harnesses.
Two zebra standing on a lush green grass covered field.
A happy sun is painted on the building behind the bench.
A snowboarder jumps very high above the snow.
This is the side of an intersection with a red sign
An empty bench looking out over a bay with numerous boats on it.
a young child shows off his smile after brushing his teeth
A woman riding a horse with lots of purple flowers.
A train soaked street lined with lots of street lights.
A close-up of two ducks swimming with fish.
A computer desk containing a laptop and computer monitor with a printer located on the left side.
A red train is traveling undergroud on the tracks.
He loves the thrill of snowboarding down the slope.
A man and a dog standing on a dirt path in the woods.
This looks like a McDonald's in a Chinese or Japanese community.
A picture of a plane that is in the air.
Two adults and two children sitting on a couch.
A green light is shown on this busy multi-lane street.
A group of girls celebrating  as they leave the field
A couch and a television in a room.
A couple of boats on the open water.
A half full glass of red wine with food arranged on a table behind it.
A red umbrella hangs from an ornate stair rail.
The orange and white fire hydrant sits on the edge of the street.
A man sitting at a wooden desk using a laptop computer.
A group of men sitting by tables working on laptops
Two people stand in front of a bunch of elephants
Two women sitting next to each other on a boat.
A cat that is laying down on a bed.
A young  baseball team sitting on benches together
A bowl filled with fruit on top of a green table.
A little girl standing in a forest holding a black umbrella.
A model of a beach front scene showing the parking lot, beach and the sea.
Several skiers ski by a direction sign and a fence.
two men looking a little boy beside a table
A bedroom with a lamp, bed, and dresser.
A truck parked next to another truck near a building.
Two buses in a downtown area,, near a boat dock
a man that is surfing on some water
A woman decorating a fancy cake in her kitchen
a table with a shake and some fruit on it
Skiers at the base of a mountain, one is fixing bindings.
A tennis player readies to swing as they await the ball.
A man with a hat getting food from the refrigerator
The man is on his surfboard in the water riding it.
A girl is pulling back a sling shot on her fingers
A group of boats sit on the shore line.
a person using a laptop in front of a television
A group of Zebras grazing in a field.
There is a toilet in a bus or plane stall
A kid is touching an elephant's trunk near a fence.
A photo looking down at a parking area with garbage and old vehicles.
A crowd of people standing outside of a bus.
A man is holding the waist of a woman as they both stand and smile together and look straight ahead.
A herd of animals walking across  a grass covered field.
A rectangular vase is displayed, surrounded by flowers.
A bear sleeping in a tree, with the branches hiding its face.
A meal with meats, salad and eggs on a plate, a cup with soup, and a dish with something in it.
A display of a variety of fruits and greenery.
Two giraffes standing on bare ground in a zoo.
A small bathroom with a patterned tile wall.
A group of people in a small boat in the water.
A group of people is sitting in the living area of a loft apartment.
A horned cow  standing in a green grass field.
The interior of a bathroom with a toilet and soiled floor.
A child standing in the snow with pine trees surrounding him.
A box that contains a cooked pizza in it.
A gray and white cat near a black goat outside of a barn.
A person playing baseball with foot up in the air.
A giraffe head sitting next to a branch.
A nigh time elephant parade or show in a street
A group of people on a court with a tennis racket.
a person skiing down a snowy slope
A black and white image of some electronics near a pen and cup of coffee.
A little dog balancing itself on a surfboard.
The plate has two sausages, noodles, and broccoli.
A mother bird sitting with her baby birds.
A woman is showing a white teddy bear to a man.
A man and baby are holding their arms up while at a dinner table.
Two giraffe standing next to each other near a stone mountain.
A man riding on the side of a wind sail.
a woman riding down part of a snowy hill with a snowboard
a small bathroom features a tub, large vanity and mirror.
Serving dishes of fruit and cheese sit on a table
two large air planes on a run way
A traffic light suspended over a snow covered road.
a cat is sleeping next to a laptop
Man looking at cell phone while on another at a game.
a computer PC monitor and a keyboard and mouse
there is a man standing in the field with two cows
Cute teddy bears with flowers lying around together
There are people on an outside platform waiting for the approaching train.
A kitchen with a stove, refrigerator and dishwasher.
A cat laying in a wooden chair with a patterned cushion.
a woman on the tennis court playing tennis ball
A man holding a white and yellow frisbee.
A person feeding a kitten from a bottle.
A young man is on his skateboard going down the road.
three baseball players and one is hitting the ball
Sheep on a grassy hillside overlooking a river.
The guy is skateboarding while walking his dog.
A teddy bear with a red hat sitting on a bed with fluffy pillows.
A stop sign has street signs crossed on top of it.
there are many people that are standing around this building
A hot dog sitting on top of a plate with a salad.
Red train giving tour crosses a beautiful bridge
A couple of birds sitting on a tree, with a blurry background.
Bundt cake with icing sitting next to another decorated cake.
A batter has just hit the baseball in this small-town baseball game.
Sandwiches
Displayed for sale at a shop by keeper
A man in a suit carrying an umbrella walks across a tight rope while a woman in a gown waits for him on the other side.
A person takes a picture of people holding different pink umbrellas.
The cat is looking inside of the open backpack.
Men and women are playing a softball game.
A man skateboards up the side of a wall.
Four boxes that have pizza on them in a row.
The small bathroom has a metal toilet and railing.
A colorful public restroom focused on the sinks.
Decorative clock with three owls for the framing hangs on wall next to mirror
A plane flying low over a snow capped plain
A train is traveling down a road with buildings.
A clock tower with the American flag on top.
A ripe banana is sitting on a table with a cat key chain on top.
Happy people sledding down a snowy slope together
A wet rain soaked street surrounded by buildings and trees
A woman presenting on a computer to a large group
Two cats resting comfortably on a double bed.
A busy street has many cars parked on the side.
A black cup with a spoon sticking out next to a folded pair of glasses.
An airport filled with jets next to a parking lot filled with cars.
The manager is having a conference with his pitcher and his catcher.
Home base of baseball field with an umpire and catcher squatting down, and  a hitter bent legged, holding a bat against shoulder.
A pizza clock mounted to the side of a wall between two windows.
Two kids using an electric toothbrush at Christmas time
three zebras in the foreground and wildebeests look around
Woman holding a banana over her face in the guise of a smile.
A computer monitor sitting on top of a desk.
A living room filled with furniture and a window.
A room filled with luggage sitting next to furniture.
Two men skiing on snow in the woods
A toddler on counter top eating a banana near the electric stove
View down a city walkway and street, with grass, pedestrians, trees, cars on street and parked on side of street, a bench, and some buildings in distance.
Small toilet with tiled wall and patterned flooring next to it.
A stuffed toy is packed in a bag.
a counter with cleaning supplies ice cube trays and racks from a fridge and a drawer missing
A man is seated at his computer desk and looks at the camera.
Two people on a beach throw a frisbee.
Seating area with many benches outside a building.
A man in a black cap is purchasing a bottle of Aquafina water at a grocery store.
two buses and a streetcar on a busy street
Two large horses stand nose to nose in an open field.
A modern kitchen with recessed lighting, appliances and an island with a marble countertop.
A kitchen with a large stove and hanging pots.
A warthog and a zebra running in a grassland.
a person doing a trick on a skateboard
A bare bathroom with a sink and toilet.
A woman trying to eat a donut tied on a string
Kitten laying in a brown loafer stretched out
A group of planes near a large wall of windows.
An apple and orange resting on a table.
A red pick up truck parked on a field next to another truck.
Several commercial jets lined up at the gates at an airport.
The clocks are built onto two sides of the building.
a person that is standing on his head with a skateboard
there is a red bus that is parked outside
The cows are standing on the hay in a meadow.
A marina filled with lots of small ships.
A skate boarder takes flight on a high jump.
A small dog carries a frisbee in its mouth
A toilet with a full roll of paper and plunger.
a bath room with a toilet a sink and a mirror
An umbrella obscures a person sitting outside a store.
Two giraffes walking through a fenced in enclosure.
Two very large pizzas sitting on top of wooden cutting boards.
A young girl squishing her body into  a suitcase
an image of a woman sitting on the bench
People sit around low tables eating pastries, drinking juice and coffee.
Burgandy colored train coming around the tracks in wooded area
A herd of sheep grazing in a grassy field
A young girl brushes her teeth with an electric toothbrush.
A half eaten bunt cake sits on a white plate.
A group of people standing on the side of a ramp.
A man is holding a skateboad and a pepsi.
a blue and white plate with a sandwich on a wooden table
A fluffy cake is on a metal cooling rack.
An asian girl taking a photo cuddling with a teddy bear.
A fire hydrant painted to look like a soldier.
A couple of people with ties in a room.
a woman in a black top on a couch with a brown black and white dog
A decorated garden with a sheep standing in it.
A gold plated Chopper Motorcycle on display at a convention.
A fishing troller boat docked next to a lighthouse.
On main street is the Wisconsin state fair presented by U.S. Cellular.
A boy with his baseball mitt and ball.
A street plaza with horse riders and onlookers.
A baby is sitting on a potty chair.
A gray cat is laying on top of a suitcase.
A motorcycle parked in front of a brick building.
a person holding a paper sheep beside a busy subway car
A bunch of bananas hanging on the tree
A hotdog with relish in a basket with a receipt.
A kitchen with wooden cabinets and a gas range.
Two birds are standing on a very tiny rock island.
A Tennis player getting ready to hit the Tennis ball.
A man is walking his poodle as the poodle stops to rest against a bench.
A bedroom with the drapes open, and a television on.
A plate holds french fries and a sandwich.
A BLACK CAT LAYING ON A WOOD BENCH
a kid in pink is holding a stuffed animal
A fire hydrant in a garden on a suburban street
A parking meter on the side of a city street.
The man in the tuxedo is also bald headed.
A man that is standing up with a cellphone.
A white cat with a brown head sits in the window sill of a brick house.
A small dog lies on a pillow near a toy banana.
A white, black and green plane cake that is decorated.
This is a babes room that has a crib and a small couch and a dresser
A bicycler is stopped at an intersection waiting to go.
A woman pouring some wine into a glass.
a polar bear siting on rocks near a body of water
A few men ride on top of elephants while they carry large pieces of wood.
A giraffe showing his head to the camera from an enclosed area.
A Zebra and a horse are together in the wild.
Plums and bananas are in a glass bowl.
Two people sitting in chairs under an umbrella  in the water
A black and white picture of people in a park, flying kites.
A water hydrant on the sidewalk with plants nearby
an image of the wilderness with a brook
A dog lays in a bed and looks a little sad.
An adult elephant and baby elephant loving on one another.
Afternoon at a dock with seagulls flying overhead.
Several white chairs lay on a grassy field while cows mill about them.
A yard and cars on the street covered with snow.
A small red couch in a living room with a coffee table topped with a flat screen tv
A passenger bus that is parked in a parking lot.
A woman with her pants pulled down on the toilet.
There is a large window over the kitchen sink.
A man riding a skateboard down a sidewalk.
Two giraffes standing in a wide open area.
A person leaning back holding a tether while water skiing.
a tall tower with a clock on top with a sky background
A little girl in fashionable rain wear is walking under an umbrella.
A boy asleep in bed with his Christmas teddy bear
A table with crusty bread and cheese platter on it
Several horses standing on a hill while grazing.
A skateboarder doing a kickflip in a skatepark.
A couple of beds sitting next to each other.
A giraffe is drinking water from a pond.
Several people standing together with a red stoplight behind them.
Giraffes are standing in an enclosure peering over a fence.
A computer sits on a desk with a red chair in a bedroom,
Some cute small kids sitting and playing a video game.
A young man holding a doughnut sitting at a table.
A group of men holding up a bunch of bananas.
A man in a hat riding down the street on a skateboard
Some cattle are walking on a dirt trail
A man in a carnival outfit posing for a picture.
A calico cat taking sun bath in a window.
A woman holding a plastic utensil passing out a piece of cake.
A group of five zebras walking in a grassy area next to a rhino.
A motorcyclist stops on the road to allow a pedestrian to cross.
Recyclable material in garbage bags are left outside.
Toilet with blue rug and blue rug cover saying please do not use. Sorry!
A skateboarder plants his board at the end of a bowl.
Little boy swinging a plastic bat at a ball in yard.
A man holding a racquet toward a tennis ball.
a dog going for a frisby with a house and vehicles in the background
two men in womens pajamas playing on the wii game
a grizzly bear is standing in some grass and brush
An old city with canals filled with water.
The airplane is flying really close to the tower.
a close up of a toilet with a device over it
A microwave mounted in a shelf with the microwave door open
a man with a tie and headphones sitting at a table
A white toilet seat in some lavatory somewhere.
A dog sitting on top of a bed under a window.
A couple of people in a room with remotes.
A cloudy day with two airplanes getting ready for take off.
A few people are doing something at this point that is darting.
A brown long horn cow standing on top of a field.
these are three giraffes on the grass outside
A male tennis player jumping and swinging a tennis racket.
An orange cake with whipped cream frosting sits on a plate beside a book on the table.
A black and white photo shows a man hanging out of a plane.
the double Decker bus is not in service
a couple of skiers on top of a snowy mountain
A baby in a high chair at a table.
A pizza oven with a baking pizza inside it.
Three giraffes walk together across a field with trees behind them.
a close up of a bike at a train station
The woman is sitting at the table and eating pizza.
Man with glasses and a mustache standing in front of a door.
This bathroom has a handrail in the shower.
PEOPLE WAITING IN LINE TO GET FOOD FROM A FOOD TRUCK
A hand holding up a cell phone that is taking a picture.
Several motorcycles are parked outdoors facing each other.
THERE IS A TOILET IN THE CORNER OF THE ROOM
A man and a woman are standing besides a parking meter on an urban and colorful city street.
A man and a woman pose for a picture at a party
A panda bear rolls around looking ridiculous.
A cat has made itself comfortable on the chair.
A bathroom that has different posters on the wall.
A sign for a pizza place rests on the ground.
a couple of birds that are on a branch
a yellow and white concrete truck next to a bus
A man doing tricks on a skateboard on the street.
A large bus and some people on a road.
a close up of a cow near a wooden bench in a field
an airplane is taking off from the runway
a large pizza is laying on a table
A girl holding on to a large, white teddy bear.
A skiier posing in front of a mountain range
this is a park with people flying kites
A green passenger bus is boarding passengers near some water.
many people sitting at desks near one another
a stop sign with the red color looking all cracked
A living area with couch, cabinet and many windows.
A ham and chili sandwich is close up.
A white bowl filled with vegetables on top of a wooden table.
a black and white god with blue frisbey
Cropped up carrots, onions, other vegetables on a on a cutting board
Is this a Honey Dew donut or a bagel?
An airplane beneath a cloudy sky flying over a bridge.
a person cooking meat on a grill
Very large TWA plane sitting on the runway with passengers milling about
a tall giraffe standing on top of a dirt field.
a close up of two people holding a video camera
A large elephant walking towards a watering hole.
A man is sitting down at a table, eating his stew and tortillas
A tall clear glass with a very pretty flower in it.
A man riding a wave on a surfboard.
A trainer picks up his horse's lead rope.
These two cats are playing in a room that has a large TV and a laptop computer.
an orange bathroom with a sink toilet and mirror
A man is balancing on a skateboard while others ride and stand.
an area with snow and lots of skiers and orange cones
A man sitting at a table about to enjoy a healthy meal.
A man that is sitting in a chair by a skateboard.
A group of men and emergency responders surrounding a table.
A man wearing blue jeans and a white shirt is on a skateboard in a skate park.
A bird sitting on top of a log in a lake.
A passenger plane sits on the tarmac awaiting passengers.
a bunch of bananas and apples for sale
A small black dog sitting inside of a car.
A passenger bus that is driving down the street.
An empty living room with a charred fire place.
A person laying under the sheets watching television.
A big bear sits on the ground and grabs on to a guy's leg
A view of a airport with people towing luggage.
A white refrigerator next to a counter with an orange box.
A modern bathtub, with a water hose next to it.
a cat sitting by a person using a laptop.
An adolescent giraffe near the fence in its enclosure.
a close up of a person laying in bed next to a book
A man riding a surfboard on top of a wave.
A glass shower door near a sink counter.
The food is ready to be eaten on the table.
A bathroom with a toilet, sink and red tile flooring.
People at a ski lift, with people off to the side one leaning down in the snow.
A flat bed is on the floor with blue blankets.
A herd of zebras drinking from a watering hole.
A young bear and a mother bear foraging for food.
A man holding a Wii controller in his hand.
Motorcycle parked on road waiting for train to pass.
A brown table holding a vase and three flowers.
A cat walking past a bicycle on a rock path.
A large group of people are having a pizza party
A hot dog covered in toppings on top of a container.
Red and white flowers in a vase on a table
Man directs two horses on an open field.
A kid buying ice cream at a truck
An outdoor patio with chairs and tables made of wood.
People are typing on their laptops in a room.
Someone who is cutting a cooked pizza with a pizza cutter.
A little girl hits a tennis ball over a net while a man stands on the other side of the net.
Man wearing a blue shirt and pink tie posing for a picture sitting by a window.
Woman flying a kite on walkway next to water.
A kitchen with lots of black counter top space.
A man in an arena rides a bucking horse.
A city view with buildings, bikers and walkers.
A beautiful white horse pulling a green carriage.
A small girl sitting at a table with several foods.
A mix of broccoli and other items in a pan.
A living room with a brown sofa, chair and coffee table.
a living room filled with furniture and a dog
A white plate with a small piece of cake and a cup of coffee.
A white toilet and hanging towels in a small bathroom.
A stuffed teddy bear is sitting on the sidewalk next to a street.
The man is riding his horse on the land.
People swimming in the ocean on a clear day.
A clock tower with a blue sky in the background.
A group of people in a wine cellar.
An elephant is standing in a grassy field.
A train is coming down the tracks near a building.
A bus driving on a rain covered street
Two laptops sit next to a tv on a tv stand.
A desk with three computer screens and a desk chair.
A vase with flower on top of a table
A man in a vest is eating a banana.
A small horse is standing in the grass next to a larger horse.
A white vase with some cherry blossoms in it
A vase full of roses on an office desk
very many trains  at the railway station to their directions
a old train that is on a train track
Two elephants stand face to face as if conversing.
A man sitting on a park bench holding paper
A cake on a plate next to some oranges.
One slice of simple cheese pizza on a paper plate.
An electronic device that is available for free.
A large truck next to people on a scooter.
Table top with two sharpie markers and pair of scissors.
a close up of a dog laying on a couch
An old building with clocks at the tower.
Lots of silver and black remotes sit stacked on top of each other.
A woman is talking to a man and holding a plate with a piece of cake.
Several plates with snacks and sandwiches in a display.
three people sitting on a bench holding plates of food
Two guys are playing with the wii together
a bird is standing in a patch of dirt
A white bear is laying out on the rocks
A large herd of animals drinking at the water.
A bus is parked on the road next to a building.
A baseball player slides into the base, as the opposing team waits for the ball.
A small kitchen with microwave and fridge.
A man standing in front of a shelf filled with supplies.
The man grins in a restaurant holding a glass of wine.
A black and white photo of a person swinging his tennis racket towards the ball.
A snow field outside of a ski resort.
A man standing in front of a motorcycle on a driveway.
A small yellow bird sits atop a hanging water supply.
a bathroom with a toilet a sink and a mirror
A cup of liquid with a fancy design on top of it.
This is a photo of a building with a large clock in the front of it.
a man standing next to a big red truck
A baseball player looks up and drops his bat.
A man on a tennis court holding a racquet.
The lofted ceiling features two white ceiling fans.
Two people holding remotes in their hands standing near a couch.
A computer monitor, keyboard, and tower with peripherals and plugs sit on a desk.
The bears are at the water, along with a seagull and another sea bird.
A white bus that is sitting in front of a crosswalk.
A white bowl with shrimp, broccoli and rice.
Street signs with trees and rocks in the background.
A photograph of a tiny bird on top of a tree branch.
Three couches are in a living room arrangement.
A man wearing a hat, standing on a snowboard in the snow.
A toilet near a wooden stool with a container on top.
A sloppy joe being displayed on a plate.
An elephant statue painted black, blue, white, red, green and yellow
Two adorable dogs enjoy a nap on a bed together.
A narrow room with various luggage and two men.
a person holding a carrot with a bike in the back ground
golden delicious apples, coffee beans, and blueberries are in the foreground of this photograph, in the midground is a banana, and in the background are varieties of cookies.
a baby zebra nursing from an adult zebra
The multi-colored cat is standing on the roof of a car.
A dog sitting on the floor in a room.
Small children in red and blue uniforms, kicking a red soccer ball.
A cat is sleeping on a wooden chair
A man flying through the air while riding a snowboard.
A couple of giraffe standing next to each other.
A man in leather and a dog with a hat and sunglasses on a motorcycle with people walking around them.
A giraffe is standing erect on a dirt path and grass and trees are in the background.
Many cars are parked at the curb or are traveling down the street.
a small child in a black top a kite and some grass
An empty chair at a desk with a computer
A antique style bedroom with hardwood floors and accessories.
A man riding a wave on top of a surfboard.
some girls playing a softball game with some people watching them
P.O.V. of laptop with people walking by on the path
Two horses standing on a grass covered hill.
The  man is driving the horse fast
A bus parked at a bus stop letting passengers get on.
Two people are attempting to catch a Frisbee.
Lighted urban street at night with cars and buildings.
People milling around a row of two story busses.
A plate with a sandwich and a salad next to a pickle.
A group of people flying kites in the national mall in Washington, D.C.
A plate of food with carrots, green beans, brussel sprouts and sauce.
An old white boat sits in the port.
A stoplight with street signs on it
A cat eating food off of a wooden floor.
Tour bus parked on an empty street in a tropical city.
A skier is carrying their skis and poles in the snow.
A woman with yellow gloves on looking at herself in a mirror and covered in blood.
A model set has boat in water going under a drawbridge
A plate with two grilled hotdogs noodles, macaroni and cheese and corn.
A blue dish filled with steamed carrots and broccoli.
A person on a surfboard on the water.
a well made bed in a hotel room with a window
A woman poses next to a statue of a giant piece of luggage.
a mushroom and broccoli stir fry on a bed of rice
A desk topped with snacks and electronics with office supplies.
A man in white shirt doing a trick on a skateboard.
A woman in shorts waving to a teddy bear mascot.
A woman sitting on a bench outside holding two donuts.
A group of people in a kayak rowing together
A woman that is standing holding a remote.
A horse is on a brushy hillside on the gravel.
A man cross country skiing in the snow under dark clouds.
A dog standing in the grass as a frisbee flies pass him towards the bushes.
the men are in the middle of a tennis match
THREE MEN STANDING NEAR A PARKING METER, ONE OF THEM PUTTING IN THE MONEY
A white dog running across a field with a frisbee in it's mouth.
a school bus having a colorful shirt sale
Some very cute small boys at a table with food.
a woman staring and some do nuts in a plate
A man on a cell phone sitting at a booth table with books.
A toilet that was set outside and a small part of it was broken.
A woman brushing the teeth of a baby in a bathroom.
Two giraffes in the savannah with buses in the background.
A bus pulling into a bus stop in the city.
a gray fire hydrant with eyes and a girl with a backpack
A woman in a white tennis dress playing a game of tennis.
A cat is looking at two pigeons perched on a ledge.
A group of people walk through the middle of the street.
A man is using his large laptop in the living room.
A young man skateboarding on the rim of a crater
some kind of cabinet that is in a building
a number of people in a field with many kites flying above
A TV showing two men in hats and women.
A sandy area with an elephant made from sand.
A small red and white airplane sitting in an airfield with a wooded area and a mountain in the background.
A person holding a hot pizza on a pan.
A baseball game in progress with the batter at the end of the swing.
A bathroom with a glass shower door, toilet and a rug.
a wooden desk with a computer keyboard on it
A boat in still water at a harbor at dusk.
A horse tied to a post next to a tree
A man with a helmet that is sitting next to bananas.
A plate with a sandwich and chips.
A zebra grazing in some very brown grass.
A plate is full of a vegetable medley with a spoon in it.
Man in full red winter gear on skis in the middle of snow.
Two orange sheeted beds in small room with desks.
A triangle sign with an English and foreign warning
three zeebras all walking together in a row.
A man looking at the bed in this lamp-lit bedroom
There is a man walking through the snow
A man preparing to hit a tennis ball on a court.
There are several skaters at a skate park skating around.
A feathered bird is sitting on a tree branch.
A white table topped with tubes of tooth paste and tooth brushes.
A counter with carrots, onions, peppers and other assorted vegetables on it.
Men with horse and buggies pose in front of a train.
A zebra that is looking at the ground.
A black cat is sitting near a mirror and a picture.
A kitten on a laptop sitting on a desk.
Two mean are playing tennis and both are wearing sunglasses.
A sheepdog at work herding some sheep in an enclosure.
A bus that is sitting in the street.
an image of a truck driving down a dirt road
a man sits on a bench next to a dog
A plate wit some very tasty looking treats.
A woman glides over the water while standing on a glider.
a person riding a skate board on a city street
A kitchen with dim light in the evening.
a small boat on the ground tethered by a rope
A person in a mirror in a very small rest room.
A large jetliner flying over a forest in a  blue sky.
A man and woman beside a red motorcycle.
A group of people walking and cross country skiing in the snow in the middle of the city.
A very pretty lady touching a cute bow tie.
A bathroom with a toilet, sink, tub, towel rack and a window.
Two empty motor boats floating in the water.
A women wearing a top hat who is riding a horse.
A group of people at some sort of function
A pizza sits on a plate on a table cloth.
A couple of cows and some people near a bike.
There is a baby sleeping on the bed.
A bleeding cut on a thumb near the nail.
this is a dog looking through a arear view mirror
a group of birds eating on some pizza
A hand holding a donut with a grassy field in background.
A room with a bed and some furniture.
People are skateboarding by the ocean at sunset.
A train drives past a station during the day.
A small girl is smiling next to a large pizza.
A closeup of this Giraffe shows his interesting head.
A white plate topped with different types of cake.
A living room with a large decorated Christmas tree.
Two hands that are holding a rose next to a tie.
Three women sitting at a table with drinks.
a couple of boats sit parked on a beach
A train stopped at a train station with passengers standing next to it.
Bottles and other items on a counter top.
Several different kinds of julienned vegetables in a bowl.
Two men playing tennis with one man preparing to hit the ball.
A yellow taxi cab that is parked illegaly parked in front of a fire hydrant.
There is a little toy hanging on the key chain.
A teddy bear sits on top of a sandwich board with writing on it in front of a cafe with outdoor seating.
A tasty looking slice of pizza with some toppings.
A freeze-frame series of a baseball player making a pitch.
A living room with a couch that has blankets on it.
a giraffe walking on a green field next to trees.
A cat that is laying down on a couch.
A table with a lamp next to apples sitting on top of each other.
A lot of different size trees in the woods.
Mangos, strawberries, and other fruits being prepared in a kitchen.
There is a road filled with busy traffic including odd buses.
A red and white, beige and pink moped parked on the street.
a man in motorcycle gear standing next to a motorcycle parked near a tent
A pile of band aids and medical supplies.
A small elephant standing underneath a wooden structure.
A man asleep sitting up on a metal bench.
a couple of women pose for a picture
A bird bath with three birds amongst some greenery.
A man and a woman sitting on a bench with laptops.
A man standing next to an older man near a plane.
A zebra is standing in the middle of a field.
A small boat sailing along in the open ocean
A man is trying to ski down a small hill.
A woman holding a cell phone is standing near a person on a bike.
A city street filled with heavy traffic flow.
a drain on the floor next to a trash can
A tan dog rests on a public bench in a city at night.
A couple of giraffe standing in the middle of a forest.
A dog sits in a car, looking out the window.
A street sign in the city giving directions to several intercity areas.
Blue bench in front of a large sandy beach.
A herd of cattle standing on a lush green hillside.
A large truck on the side of the road.
Young men playing with Frisbee in sports like competition.
A computer, coffee cup, and books sitting on a table.
A group of people on a grass field with kites.
A plate of chicken, green beans and mashed potatoes.
A giraffe and another animal are standing on the grass.
two males are on some grass playing frisbee
Two large blue bird with red heads walk on a grassy area near a body of water.
Two brown bears are wrestling in the water.
two cats laying on both ends of a bed
a table with fresh vegetables and some dressing
A group of people waiting by a large clock.
A picture of a street sign on the street.
A keyboard and monitor are sitting on a desk.
A herd of elephant gathered at the edge of the water
a bed room with a bed and a window
Three men with skateboards standing above a ramp.
A rooster and a hen are standing on a bed of hay.
A damaged, leather suit case sitting on a dirty sidewalk.
There is a clean bathroom with a blue floor.
There is a veritable banquet of fresh fruits on the long buffet table.
An area in front of a building has fountains, trees, benches, and people.
a young female standing in front of a large cupcake sculpture
a beautiful horse and a lady is standing by it.
A happy woman sits on the couch while holding a glass of wine
A woman with a tennis racket on a tennis court.
A fire hydrant that has been colored red, white, and blue.
The cutting board has apple slices on it.
A group of zebras walking in a grassy savanna.
THERE IS A WOMAN THAT IS PLAYING WITH WII
A zebra standing next to a parked car.
Three different dishes of food on a wooden table.
there are a few ducks that are sitting in the river
Sandwich on a bun in a white plate with a blue rim.
an old photo of an elephant near a body of water
Several people watching a snowboarder grind on a rail at night.
A woman standing outside with her umbrella open.
a black and gray pigeon some windows and a building
A group rides horseback down the beach.
two giraffe standing side by side next to a group of trees
A fence separates three people seen inside the dugout.  One young boy is in the batters box with his bat ready and another one is standing behind him and a man has his hand up to block the sun and is looking off to the distance.
A public restroom that is kept clean for it's customers.
Some elephants walk together through the grass.
A person holding an opened umbrella walks down a wet street.
There are four cows in the field together.
One man playing the wii by his lonesome.
Baskets of oranges are lined up on a table at a market.
Several woman are at a table while one of them slices a cake.
The kitchen needs to be cleaned before we can use it.
A wine glass sitting on top of a glass sculpture.
a close up of a stop sign and a street sign
Someone placed bananas, strawberries and oranges in a blender to make a smoothie
Woman walking a small white dog behind her.
Two riders on horseback cross a desert landscape.
A person is riding down a hill in the snow on skis.
Party of people in canoes going down a river while site seeing.
A man on a snowboard is performing a trick.
A corner of a bathroom showing the sink, medicine cabinet and small window.
Picture of what might be a TV remote control and a distorted picture in the front.
This double-decker bus is headed for White City.
A kitchen with green walls, white trim, and a refrigerator.
A Captain Jack Sparrow look alike plays tennis with school children.
A person is carrying some luggage near the train.
A pita and some fries sit on a large, white plate.
A giraffe is stretching its neck above trees.
A small dog sleeps in a basket on a computer desk.
A kitchen filled with furniture and a painting on a wall.
A street sign warning cars to keep clear of a driveway.
A bathroom with gray walls has a fan in it.
A man and a woman are throwing a Frisbee.
a person eating a hot dog, with a basket of toppings.
A street pole with multiple street signs pointing in different directions.
A Vietnamese woman stands on a boat laden with produce.
A display of ceramic items on a street.
A group of people wait in a red reception area.
A dog tied to a sign next to a man on a bike.
A white cat and a wooden bench by a building.
A man sitting down with a brown teddy bear on his shoulders.
white plate with a variety of vegtables on a scure
two kids are eating pizza at a small red table
A man is smoking next to outdoor tables.
A garbage truck is going down a well lit street with buildings all around.
A beautiful colorful angel portrait in on the back of a vehicle outside with nobody around.
A line of buses stopped at a crosswalk as someone crosses the street.
A rusted-out farm truck in a mountain field beside trees.
A women holding a wooden board that has a desert on it.
THERE IS A BATHROOM WITH A SINK AND A MIRROR
a baseball bat and a brown glove on some grass
A bench next two a table holding several pamphlets.
Two cats are laying on a bed in a bit of sunlight.
a man leaned over a toilet inside a bathroom
some skies are on a stand outside in the snow
A menu board at a fast food restaurant.
A bench sitting on top of a sandy beach next to the ocean.
A kitchen with wood cabinets, stainless steel oven, stainless steel microwave and a refrigerator and a hole where the cooktop goes.
A goalie is guarding his end at a soccer game.
A clock on a boardwalk near a beach.
There is no image here to provide a caption for.
A black dog running on the sand with waves in the background.
There are several stuffed animals standing near a brick building.
A laptop computer sitting on a table with a glass of beer.
A table topped with lots of different foods and sandwiches.
A group of elephants sitting in the middle of the forest.
The young man is playing a game of Frisbee toss.
Two vases and a potted plant sit atop a worn dresser.
A wire fence divides a background of backyards and houses from a yard with a child kicking a large ball.
Three friends are getting ready to ski on a warm, sunny day.
different colored ribbons a basket and a pair of scissors
An adult and baby giraffe in an enclosure.
A kite blows in the wind above a large sandy beach.
A television monitor mounted on the ceiling of a plane or bus.
A boy stands in the grass with his mitt open.
a man with a cell phone attached to his hat sitting on a bus
A kettle sits on a kitchen stove beneath a shelf storing a blender, canisters and other items.
A plate with a couple of scones and a kettle that may be tea.
a bunch of kites fly through the air
A pizza sitting on top of a box on top of a table.
Park benches are lined up in a room in the grass.
Little girl holding a ball over a red and white fire hydrant
A blond toddler in a pink shirt brushing her teeth.
A swan is floating down the river by the boat.
A woman plays with a Nintendo Wii in her living room.
Contemporary living room setting in urban residential building.
a toilet and a sink that is in the bathroom
A pair of individual using sails to surf.
A man standing in the grass with hid dog.
A man with a pizza playing on the computer
A man holding a ball on a mound of a baseball field.
Man in black vest with orange tie looking at the camera.
A person walking with a kite in the air.
a number of people standing near by parked motorcycles
a couple of people climbing a hill of snow
The outside of a house that has a clock in front of it.
A slice  of cheesecake with a red sauce and berries for topping.
An empty bed with a teddy bear laying on it.
A bear is seen walking in a forest in a blurry photo.
Children sitting down to eat lunch at school.
Several skiers wearing colorful attire ski slowly across a snowy mountain
A few cows graze in a big wide open prarie
A man standing next to a baseball player laying on the ground.
A white dish filled with vegetables on a white table.
a bathroom that has a sink in it
A pizza slice,with tomato on it and cheese
An open air market with a lot of fruits in a bowl
A man in a wet suit riding a wave on a surfboard.
A train is on going down the track while people watch.
A woman smiles as she holds a parasol.
A pole pointing where different things are at.
A dog on a leash is standing in a grassy field.
A large tall tower with a clock on top.
Several people are sitting around a table with food on it.
There is nothing but beer bottles in the fridge.
A young man with ear phones holding something.
A black and white photo of a martin Luther King next to a Lincoln statue.
A dog stretched out on the grass with his tongue hanging out.
A train sits parked on the tracks in front of a billboard.
A young blonde girl holding up 2 cell phones.
a baby giraffe has its head under its mother
The animal has very large horns on its head.
A shelf with hygiene products in a bathroom.
A clock mounted to the side of a brick building.
A woman dressed in black is playing tennis on the court.
A man eating a hot dog at a sporting event.
there is a woman holding a little girl taking a picture
The pay station for a parking lot is in a location that has recently had snow.
this bathroom is big but has a small tub a toilet and a sink
a white and red tow truck some trees and a building
A large group of girls enjoying a pizza party with pizza and soda.
a man that is riding on a bike
Male tennis player rushing hard to hit a ball.
A half eaten piece of pie is all that is left.
A man sitting in front of a table with a box of cupcakes.
the boat is sitting on the shore line away from the water
A couple of airplanes flying through a blue sky.
three large dogs sitting outside near a forested area
The young soccer player is kicking the ball.
A car and a motorcycle parked side by side.
an image of a man on top of a snow mountain
A batter waiting for a pitch with the umpire and catcher behind him.
A zebra stands in snow in front of a wall.
A stuffed monkey on a computer desk with two computers.
A table full of asian styled dishes and soup
A two stories bus is parked on the side of the street.
A woman riding a bike near a bus and other people.
a man performs a trick on a skate board
A group of people on motorcycles at an intersection.
A woman that is standing next to a dog.
A dog is in a yard with a Frisbee in its mouth.
A group on people standing on steps and posing for a picture.
a couple of people that are sitting on the porch
Rockers with crazy hair holding out a racket and smoking a cigarette.
A woman teaching a little girl how to ski.
The car us upside down on the road way.
A child washes grapes in a stainless steel sink.
A man dressed in a suit is eating carrots.
A stuffed elephant standing in a museum window.
Two keyboards and a computer mouse on a table.
A crosswalk signal at an intersection with a car and a bus.
An elephant walking across a dirt road in front of cars.
a small bed with blue comforter and sheets
An asian market has hanging bananas by the roof.
These people are posing in front of the trees.
There is a bathroom and a shower in a bathroom.
A giraffe standing next to some dead brush near a bird.
A pile of locks of hair next to a pair of scissors.
a green plant  is in a glass vase
A man that is holding up a camera.
A man is eatinga beignet covered in powdered sugar.
Two pieces of bruit are set bside a keyboard.
a bathroom that has a toilet and some nasty stuff all over
Shower with a removable shower head and a soap dispenser.
A man in a wetsuit is holding his surfboard on the pier.
A herd of sheep in a grass field.
Mountains with steam coming from them with horses on the lowland.
Two pizzas are on a pile of white plates.
A toilet in a bathroom with large signs on the wall.
The horses are grazing in the grass along with another animal.
This is a woman getting on a motorcycle posing for a picture.
A man prepares to swing while playing a video game.
A young blonde boy leaning on a toilet.
Two glasses of wine, two hot dogs and some tater tots.
a close up of a plate of food with salad on a table
The silver cover of an Apple laptop computer
a woman is holding a baby in her arms
Two people are walking down a snowy path with an umbrella.
An airplane is descending in the air to land.
A very drab looking room with a mattress on the floor.
A bin is piled high with many apples.
A young child is standing in the grass with a frisbee
people are sitting on a bench together outside
There is a burger in between two glazed donuts
a man surfing on an ocean wave headed to a beach
The head of a Giraffe with its mouth on a tree branch.
a little girl is holding a video game controller
A bicycle is chained up and locked to a sign post on the sidewalk.
A pile of Chinese noodles with broccoli mixed in.
A black and white photo of people flying kites
A group of people next to a person on a surfboard.
Closeup of a hand holding a Wii controller.
A glass vase with flowers in it next to a pair of computer speakers.
A store window with the reflection of a parking lot with a stop sign.
Two young men playing a game involving a disc on grass.
A large yellow train on a steel track.
A person is in the distance while a brown dog is in midair and is running after a frisbee.
A giraffe looking down while in a zoo pen.
A group of people are standing on the beach flying a large kite.
A slice of slightly eaten homemade pizza on a plate.
Stacked kites with long streamers being flown in grassy field.
A tray filled with plates and dishes full of food.
a plate with bunch of diffent foods mixed together
The dog and cat is laying in the bed with the man.
Family sitting at the table together enjoying dinner
a little girl sitting on a small kids toilet
A clock is affixed to the wall of a religious institution.
A large clock that is on the top half of a building.
A bathroom has blue guard rails by the toilet.
a man in a blue jacket and helmet on a black horse
A man sitting in a chair sitting inside of a living room.
a snow skier wearing black shorts and a blue jacket
Three park benches at the edge of the water
a couple of baseball player high five
A large dirty airplane is sitting in a dirt field.
A fancy steak sandwich served with fries and dipping sauce.
A beautiful living room view with a vase sitting on the table.
a bathroom with red walls and a tiled floor
A cow standing in the alley near a building.
A girl on skis is grabbing a man's head for support while several people watch.
A couple of giraffe standing on top of a green forest.
Children are in the living room playing a video game
a black computer keyboard on a wooden desk
A very narrow bathroom with a walk in style shower.
A light airplane flying in a cloudy sky
An older man watches a kite fly from across a body of water.
A desktop computer sits on an old and scarred wooden desk.
a couple of men that are standing up
A person that is kneeling in the sand near a bike.
a bunch of people that are skiing around in the snow
A person is sitting in a chair and a bird is on the ground.
An old fire hydrant on the edge of a city street
The painting shows a naked woman using her laptop.
Several people walking across the street in the rain with umbrellas.
A clock tower sitting in the lobby of an airport lobby.
Bar stools at a bar separating a dining area from a kitchen.
A guy sitting at a dining table with some tasty looking food.
A kitchen with white cabinets and a cool tile design.
Pictures of a bathroom taken at different angles.
a person standing under a colorful umbrella and wearing a big hat and sunglasses
A boy on a skateboard in a skate park performing a trick.
Cafe tables with table cloths and orange umbrellas over them.
A dog standing with his head outside a caged area.
The interior of a kitchen with wood floors and large appliances.
A broken black umbrella laying in the street.
A black and white cat beside a wood carving.
A small group of zebras is standing beside a water hole.
A big, fat bird has some crazy hair
A woman watches a dog watching a man eating a sandwich.
A person wearing an orange back pack standing in front of park benches.
A king size bed in a hotel room.
people walking around with a bus and car on the street behind them
a group of people with umbrellas walk on a side walk
A small bedroom with a bed and a desk
a stove on the front lawn near a side walk
Two elephants in a herd playing with each other.
A tour bus stopped in traffic on a busy street
A large pizza sits on a large white plate.
a living room with two big couches and green chairs
a teddy bear sitting on wooden steps leaning on a pole
there is a small pizza and broccoli on a plate
A plane sitting on the tarmac at an airport
Two adorable chubby dogs sleeping next to each other.
A woman in shorts giving a thumbs down signal
A Siamese cat staring at a laptop computer screen.
a big plane flying through the blue sky
a clock tower near many buildings wit ha sky background
A cat lounges on the arm of a sofa near a window.
Three people with a video game remote in their hands.
A vase with a white long stem flower in it.
A parking meter with no time left in front of it.
A four faced clock a top a stone column in  a parking area.
A group of people standing outside of a blue ice cream trucks.
A cluttered desk with books, bag  and electronics
this is a baby and a blue chair
A bathroom sink with travel size soap and shampoo.
The skier with the animal cap is standing on the mountain.
A baby observing a calf eating hay outside.
a little kid holding a toothbrush standing in a doorway for a bedroom
A red parrot eating a piece of fruit from the palm of a hand
A sports announcer talking on a cell phone while on a ball field
A very small bathroom stall with a toilet and several rolls of toilet paper.
Several sheep standing around in the grass.
People standing around talking and doing different things
three people dressed similarly playing frisbee on a tiled floor
A citrus fruit sliced in half on a plate.
Two elephants are walking through the tall grass.
the person is standing next to the animals in the water
A woman looking at a tablet while standing outside a train car.
A cat with a irritated look sitting on a bed.
a man sitting on a green bench in a park
A wooden cutting board next to a window topped with fruit.
A cat is sitting behind the keyboard of a cluttered computer desk.
A red double decker bus is seen in London.
A fire hydrant is painted red, white and blue and sits on a sidewalk in front of a brick wall that shows graffiti.
A pack of elephants are trampling in the sand.
A person kneels as they ride a wave.
A group of elephants walking across a large river.
A little girl tossing a red Frisbee in a driveway.
A group of zebras grazing in their enclosure
A white plate with a hot dog topped with mac and cheese.
A giant sheep with a lot of fur eats outside
A black and white picture shows a tree covered hill.
A couple of elephants washing a baby elephant in a river.
A young man that is wearing a nice suite coat with a skirt and a purse.
A big elephant playing in a puddle of water
Four People riding two elephants across the water.
a person riding a snow board on a snowy surface
A red train traveling down train tracks through a rural countryside.
A toilet, sink, mirror, and tub in a bathroom.
A truck made to look like a train parked on the side of the road.
many different vegetables are sitting on a white counter
a woman walking outside with oranges on a stick
Airliner being moved by tow vehicle near airport terminal.
a river that has a bridge with a train on it
A cat rubbing its head on a laptop.
A plate of toast and other breakfast items.
A light blue sky filled with colorful kites.
A bear reaching up towards a tree on a rocky hillside.
a bunch of guys in front of a table with cake frosting on their faces.
A truck waiting in front of the warehouse.
Two lamps by a window looking out at a forest.
A bathroom with a toilet and a sink.
A baby sitting in the grass watching kites fly in the sky.
Some motorcycles are parked on a brick area
The fire hydrant is painted all completely yellow.
A table with a bowl of food and some mugs.
The layer cake is on the flowered plate along with a fork.
A woman with good posture sits at a wooden desk with an open laptop.
a horse drawn carriage on a city street
A little boy is smiling at the camera in front of a brown chair.
The antique furniture and mirrors are next to the wall.
A woman running through a city while carrying a Frisbee.
A boy in camo shorts stands before an overturned skateboard.
a day of the dead offering with fruit
Some people are hanging out and playing the nintendo Wii.
A car in flood waters in front of a camping area with camping trailers that is flooded.
A very cute curly haired dog with a toy.
A man and woman sitting on a vintage motorcycle.
A street at night time with many different lights.
a person at the zoo feeding a giraffe
Someone holding a sandwich like food object with a few bites taken out of it.
A cross country skier walking in snow during the day.
A blue two layer cake sitting on top of a counter.
Two teddy bears one dressed as a female and one male
A black stuffed animal sitting on top of a toilet in a bathroom stall with blue floor tile.
A woman leaping into the air while holding a tennis racquet.
A carrot is being sliced as well as an onion
Several dogs on a yellow school bus with a stop sign below the window.
A woman sitting down with a large cell phone holder on her pants.
A seated angel figure next to a clock dial.
Two beds sitting next to each other in a bedroom.
a nun rides around on a motor cycle around on the street
A horse standing around in the middle of a farm.
A family of four sitting on an outdoor sofa
Fine food served with sauce on a white plate
A kitchen with a microwave oven next to a stove top oven.
A small-furry dog on a red seat in a living room.
A wet woman with two horses wading through a river
Some elephants that are together in an enclosure.
A couple of people with many bikes on a street.
A couple of men standing on a tennis court holding racquets.
Two halves of a sandwich sit on a white plate on a table.
A peeled orange sitting on a white table next to the peelings.
A man in an orange outfit is directing traffic to drive slowly.
A woman in tennis attire swinging a tennis racket.
An elephant standing eating hay in an enclosure.
A blue and white plate with a chocolate dessert on the plate and powdered sugar on top.
Woman holding red cased cellular phone in room.
The man is showing the mess in the fridge to the ladies.
two tennis players with rackets and balls on a court
Picture of architecture probably a church or university.
a dog on a skateboard in a shirt
A group of giraffes and zebras in an  enclosure
There are boxes which haven't been unpacked but the television is already up on the wall.
A fire hydrant painted in the American patriotic colors
A green, grassy field with grazing animals on it.
A mid sized commercial airline flying in the air
A man taking a bite out of a doughnut.
People are standing in the grass playing with a frisbee.
a man about to throw a green frisbee
A woman with her head out of the photo is standing barefoot in a simple dress holding a suitcase.
A cat is on a table with stuffed animals.
A highway with several cars on a cloudy day.
A streetsign with one side pointing to Maciel and the other pointing towards Wonderstump.
A room that has two people sleeping in a bed together and another bed on the other side of the room and a person at a desk and computer.
A polar bear in a polar bear enclosure at a zoo looking up.
A man sitting on a couch and a man on a chair.
A woman is hanging up post it notes in a kitchen.
Three giraffes under the shade of the trees.
A woman sitting on a yellow surf board on the beach.
A boy is running while holding on to a kite.
A meal is being prepared on the stove in a kitchen
A room of bookshelves with books, suitcase, area rug and tv
a small copper vase with some flowers in it
They are holding a frisbee together while hugging each other.
a little girl that is outside with a umbrella
A toilet in a white bathroom is seen in this image.
A man dressed in a suit and tie posing for a photo.
A man actively plays wii in front of a television screen.
A bear looks around in a rocky enclosure.
A commuter train passing through a small town
A couple of computer monitors on top of a desk.
A four sided clock on a raised pole.
an image of a slice of pizza on a white plate
A low to the ground stop sign on the corner of a suburban street
a kid sitting down eating a slice of pizza
a body of water with buildings near by
there are many people that are sitting on this bus
a person sitting on a toilet while operating a computer
Young woman with long brown hair in very dark grey jump top holding electronic instrument like a remote control.
a woman with glasses is eating a hot dog
Two small teddy bears sit by the vase with flowers
A man pitching a baseball on a baseball field.
A cow with a tag in its ear looking observantly.
a toddler playing the piano with a stuffed animal
A meal is being displayed in a tray with separate compartments.
A toddler is brushing her teeth in a bathroom.
this is a close up picture of two broccolis
A collection of differently colored trucks in a field.
a desk with a computer  a laptop and monitor
a man stands on a beach with a bunch of surf boards
Several bunches of carrots on a cutting board next to a squash on a counter.
Altered photograph or painting of a necktie creatively knotted
A man with a bucket hat riding a hose on a beach.
a little boy batting a ball while his family looks on.
A group of bikers passing through a crosswalk.
A monkey holding a strawberry and a banana.
A boat that is floating in the water.
A MAN IS ON HIS SKATE BOARD ON THE STREET
A couple of men standing near a sailboat.
The cat is sitting on the ground near the bench.
Motor and pontoon boats moored in the water.
The bus is parked next to the curb.
a room filled with white furniture and books on the ground.
He is riding to his destination on a skateboard.
A green bus of some sort moving along a road.
A elephant walking the edge of its raised enclosure at a zoo.
a cow that is laying down on some hay
A photo looking out of the side of a plane at another commercial plane.
A large cut pizza on a wooden surface.
A street sign that says Pee Wee Reese Street.
A baby zebra rubbing up against it's mother while she eats grass.
A bus leads traffic down a city street.
A picture of a person standing by a bicycle.
Several colorful foods are sitting on a large plate.
A close shot of a green bathtub and a toilet.
Two male tennis players posing at center red clay court.
An egg-topped hamburger and arugula salad with broccoli
A fish tank is inside a underwater themed bed room.
A bred and silver plane resting on stands outside.
A young man standing in forest filled with trees.
a home made breakfast that looks super awful
Red and white bus parked next to a glass building.
A large polar bear walking near some rocks.
a woman is jumping up in the air by another girl
Man in blue pants and white short on a stage
Old model Harley Davidson motorcycle and old cars parked.
A bird swimming in wavy water, with a island in the background.
A zebra is trying to stand in the shade.
A woman cutting cake while another woman is holding a plate.
An orderly bathroom with two sinks and a large mirror.
A bath tub sitting in a kitchen next to a brick floor.
A man in an apron arranging a stack of oranges.
A group of people riding an elephant through the jungle.
A person climbing up a snow covered mountain.
a woman sits on a chair with a laptop on her lap
Two guys sitting on a couch conversing while another guy looks at his camera.
A distorted black and white picture with clocks.
A girl holding a racket and touching her head
A man does a skateboard trick up a ramp
Two people sitting at a table with laptops in a bookstore.
A foreign candy sitting next to it's open wrapper.
A stop sign has been tagged to include the hammer time song.
a person walking beside a boat sitting next to a fence
a train going down the track all by itself
a traffic sign two people walking and a van in front of a large building
Several women working with some type of production equipment
Business people having a discussion during a luncheon
A giraffe caged in while grass falling from his mouth.
women riding on the backs of elephants at the circus
Two snow patrol people at the bottom of a snow hill.
A large bear standing in front of a bunch of leaf filled trees.
THERE ARE A BUNCH OF SHEEPS THAT ARE ON THE GRASS
A dessert is sitting on a plate by a teapot.
A snowboard sticking out of snow covered ground.
three giraffes walking outside near a wood gate
A cow rests in a pen with a turkey, chicken, and duck.
A family of people hanging out on a beach.
Several pedestrians crossing an intersection at a bridge.
A little boy that has birds on his arms.
A large jetliner flying through a sky filled with clouds.
Two cats eating out of one food bowl.
A blurry man standing next to another man laying on a bench.
A single file row of dark colored luggage backs.
A man is laying on the couch with a large cat.
A woman with a concerned look talking on a cell phone.
A plate topped with two slices of pizza.
The traffic lights glow green in the night sky.
There is a man sitting on a bench listening to music
A red fire hydrant sitting in the middle of a sidewalk.
a sign in front of an old house in the city
The medium sized zebra is looking into the camera.
A young boy holding a remote control standing in front of a TV.
A teddy bear is on the hand rail of a train door.
A woman sitting in a car smiling while sitting beside a bunch of suitcases.
A small group of cows standing in front of the camera.
A large bunch of broccoli growing with the leaves around it.
Several street signs displaying street names, addresses and driving option.
A metal sink with a cupboard of knives sitting on it.
a fridge is shown with some pictures on it
Two blue and white vases are sitting on a table.
small bathroom with tiles on the floor, sink, toilet and a window
A group of men sitting on a lush green field.
four jet planes are flying across the blue sky
A group of men waiting for a bus at a bus stop.
two people riding down the middle of the road on a moped bike
A woman holding down a dog with a swab in her hand.
There is someone standing in water holding a board.
A man with a bandanna on serving himself food.
A man and woman that are standing near a table.
A woman that is holding a camera taking a picture in the mirror.
Washed clothing is hung out on a clothesline in a cattle enclosure.
A group of people standing near a number of blenders
The men are playing a game of baseball.
A young man is body surfing and paddling in the water.
A young girl sitting in front of a bunch of bananas and grapes.
A double decker bus driving past a tall building.
a couple of people are holding tennis racket on a court
This man is skiing down a snowy slope
There is a surfer riding a wave in the ocean
A bus parked along the side of a busy street.
a man cutting  up carrots in long strips
A man sitting down eating a pizza at a restaurant.
A body of water containing boats, kayaks and people.
A woman is preparing to make dinner at her kitchen counter with the cabinets open
An arrangement of doughnuts grouped in front of a store window.
A fork sits next to a piece of white cake.
A ram sitting on top of a hill in the day.
A man on a surfboard riding a wave.
A black girl removing her denim jacket top.
A male elephant stands beside a shady bush.
a young baseball player starts running to first base
A small white dog lays in front of the fireplace.
There was a lot of organizational effort put into planning this kitchen.
A bull is next to a large group of people outside a train.
A skier is shown kneeling while on a flat patch of snow.
Two horses in grassy field below power lines.
A group of people outdoors next to a large white building.
A slice of cheesecake sitting on top of a white plate.
A blurry image of a gauge on a pipe.
a couple of hot dogs that are on a grill
a newly married couple cutting up a colorful wedding cake.
I sign in a video game warning dog owners to pick up after their dogs.
there is a baby sheep that is laying on the ground
A large bathroom features tiled walls, two mirrors and two sinks.
A bird perched on top of a branch in a tree.
a small boat on a large body of water
An elephant and a handler in an enclosure down below.
A manual or book about ten-speed bicycles
A skateboarder doing a trick at an event.
A stop sign between two traffic cones in the middle of the desert.
A brown dog carrying a frisbee in a grassy area.
A picture of two smart phone display screens.
A United plane flying close to the runway.
A large airliner with a kangaroo on the tail wing.
Passengers waiting for their bags at a luggage carousel.
A rural train station is loading and unloading passengers
A group of soldiers  sitting at a table with a woman.
A group of ties hang off a pole
A bear is sitting on a rock in the sun.
A person surfing on a continuous wave ride in a city.
Lady laying across a bed with a dog.
an image of a small airplane flying in the sky
A woman taking a swing at a tennis ball
A family is posing with their luggage at the airport.
A white bowl filled with rice and broccoli beef.
A group of passengers with a lot of luggage.
A boy sits in a living room using a laptop computer.
Three carrots being cut by a large metal knife.
a cat sitting underneath a vehicle on the cement ground
A man wearing a red striped tie is seen talking
A man is sleeping with the covers pulled up high.
A puppy is learning to retrieve a frisbee.
A double bed with white sheets and floral pillows and blue trimmings.
A lucky bamboo plant in the window of a small bathroom.
Two people in suits posing behind some serving bowls
A bowl of salad is sitting next to a dessert on a plate.
A group of men standing next to each other.
a number of motorcycles parked near each other
A snow covered street with a person walking down it.
a couple of sheep stand in front of a rock
A black and white dog sitting on a bench.
a man eats a sandwich and drinks a cup of coffee
A close up of the side of an orange train.
A man holding a large soup pot in a kitchen.
A dog standing on blocks outside near deck furniture.
A man looks down at his loose necktie with disdain.
Three people cross country skiing in a wooded area.
Two zebra eating hay outside in a zoo.
A airplane that is sitting on a runway.
Batter, catcher and umpire during a baseball game
A big elephant standing beside a small elephant in tall grass field with other animals obscured in back.
An airplane flying through a cloudy blue sky.
a girl in a white jacket and orange sun visor playing tennis
Four pieces of toast with olives, cheese, and other toppings.
A zebra that is close by is grazing on some hay.
The large winged bird is looking for some prey.
The cook is slicing lengthwise  several  bananas on the cutting board.
Train on the tracks at a station with people sitting on a bench.
A cat with a cone on sitting behind a man while he is sleeping.
A fairly normal looking bathroom that's in someone's house.
A man with a knife and chopping board cutting apples
A blond girl carries a tennis ball on top of her racket.
a baby with a pacifier sleeping in bed
a dog wears a baseball hat on his head
there is a very tall giraffe in a zoo
three people closing their eyes standing in a line together
A large number of identical wooden boats float close to each other on the water.
An emo girl laying on top of a bed on her back.
A train trolley with a car in front of it.
A view of kitchen missing everything except the microwave and top cabinets.
Woman walking down a icy walk way next to a stop sign.
The salad is inside of a clear bowl on the table.
People observing a display of a concept motorcycle.
A background of blurred shapes is fronted by bunches of green bananas  of which one's been ripped off.
a couple of cars are parked outside a church
A cat lies up against the arm rest of a couch.
A crowd of people in a metropolitan area at dusk.
A little girl at the picnic table eating a cake.
Plate full of cooked carrots, potatoes, and other vegetables.
A dish which consists of roast beef, broccoli and potatoes.
Laptop computer next to monitor on wooden desk.
a white plate with some food on it
some phones on a wooden table and a laptop
a red and yellow train is going past some red lightstrain signals
The giraffes walk next to each other down the wilderness trail.
Three glass vases with a single yellow flower in each.
the truck has been painted red white and blue
A picture of a scene in a baseball game.
A giraffe reaching for a tree branch on a sandy zoo lot.
there is a owl that is sitting in trees and bushes
These people are riding horses through the mountains
A pizza with veggies and eggs on it.
a cat laying down on top of a cardboard box
many fruits arranged in large containers indoors near a wall
A young boy in glasses playing video games
a slice of orange sitting next to a sliced cake
A green street sign sitting on top of a metal pole.
A pair of gray shoes are sitting on a bed.
A cake that is made to look like a pink castle.
A tan cat wearing an old bowl as a hat.
A bathroom with a toilet, sink, and other bathroom items.
A small cow stands near a market display of soda bottles.
a train on a train track with trees in the background
A white toilet and a dark cherry paneled wall.
A brown and white animal standing next to a marina.
Two young men standing next to two dogs.
a living room that has a coffee table in it
Two children stand beneath the tail of an airliner near many others.
A zebra standing on a lush green field.
A black train parked next to a  red train in a train station.
a clock that is sitting on top of a table
A messy desk with a computer, cups, glasses, bottles, books on the desk and the floor.
A MAN SITTING AT A TABLE WITH NICE DINNER GLASSES
A man on a surfboard kneels down as a wave breaks.
Rice with ground beef and asparagus in a bowl.
Boat sitting by the dock at the river
A commercial stainless steel kitchen with white dishes
a male in a red shirt cooking pizzas in an black oven
A large elephant standing in a grassy field.
View of one of the clocks surrounding this tower top
A person with a red bike jacket is riding a red bike
a bunch of toilet seats in a building that is being renovated.
a small child is holding up a bottle
A drawers of various supplies in different sections.
People walking through a multi level shopping mall.
A giraffe standing behind a wire fence on a grass covered field.
A view of a small room with a bed, and small kitchenette.
A woman holding a plate with a pizza on it
A chef playing salad in bowls in a kitchen.
A variety of donuts in a glass case.
Two girls walking with umbrellas on the sidewalk.
A skier in a panda hat poses for the camera.
a small child with an open umbrella on the ground
A skier on the snow with gear and ear muffs.
Antique military biplane at waters edge at beach.
Fighter jet on a airstrip with low hanging clouds.
A baby lies on blue and green bedding next to a teddy bear.
Old single engine plane on display in open building
a man is holding something next to a motorcycle
a guy dressed in leather sitting on a motorcycle next to a bus
A white sink in the corner of a grey tiled bathroom.
A fenced in area with a giraffe reaching it's neck and head over a fence that separates it from people.
A man flying through the air on top of a skateboard.
A group of people sitting around a table with food.
Man walking up mountain using ski poles with backpack on
a cow sitting on top of a hill eating in the rocks
Two pizzas sitting on pizza pans on a oven.
a child and adult in ski gear walking in the snow.
A woman in bed beneath red linens having a conversation with a man.
an image of two people playing outside with cups
Two motorcycles side by side in a building.
A boy skating on his skate board at a skate ramp.
A man riding a surfboard on top of a river.
A woman talking on a cell phone standing next to a  parking sign.
A woman under a sheet in the bed with her head on a pillow.
A elephant standing close to a fence in front of trees.
Cattle in a fenced area resting and eating next to a lush green field.
a little green bird sitting in a tree next to a house
A black and white cat stands on a bathroom sink.
The young woman smiles shyly while washing dishes.
A bus that is driving in the street.
Three zebras hurry across the road in front of car
A hand with a glove over it above a toilet.
Boy holding an umbrella at the edge of a cliff.
a boy looking over a gate at a cow.
Hazy image of a surfer riding a wave on the ocean.
a bathroom in an outhouse with a wooden window on the side of it
A plate that has food on a table.
A person holding a purple stuffed teddy bear.
a line of buses that are parked in the road
robot dogs playing soccer in front of people
Bathroom vanity show featuring the sinks and a stool.
Yellow fire hydrant on the side of a city road.
An oversized picture of a train has a conductor standing by it.
A green and white bathroom with folded towels
A red double decker bus traveling down a road in the snow.
An old man riding a skateboard down a street.
there is a woman standing by a trains window
a close up of a man taking a bite out of a chocolate glazed sprinkled donut
The desk has multiple computers screens and mouses on it.
Many cows grazing outside on hills in the grass.
A bed with grey sheet and two red pillows.
A grown elephant standing beside its two babies.
a tennis player on a court with a racket
A kitchen is all white with gray counter tops.
A man's legs standing on a skateboard on a road
A man is surfing a wave in the ocean.
A man with long hair is about to hit a tennis ball.
An open kitchen with dark wood cabinets opens to a seating area which is vacant.
Two women on a balcony cooking on the grill.
A look at a sign signaling no skateboarding.
a black kitty laying on a bench licking its paw
A High flying skier is doing a mid air flip.
A man sitting on the sidewalk under an umbrella
A sandwich on a green plate on a kitchen counter
Three cupcakes with blue icing are on the table and the middle one is split in half at the top.
A large truck driving down a road next to a car.
A Christmas display featuring stuffed bears and rabbits.
People looking through the tents at the book festival
a male skateboarder in a black shirt is doing a trick
A group of  men standing around a giant sheet cake.
A kitten toy is on a desk with a computer.
A herd of long horned cows laying on the grass.
a city street with some cars driving down it
A man swings at a pitch during a baseball game.
two dogs laying down in a pillow on a wooden floor
Workers in a restaurant kitchen preparing meals
passenger train in front of a depot on a late afternoon
Two giraffes feeding while standing behind a fence.
a horse in a field of grass
A white and brown dog is covered in a blanket.
A young female wearing black is holding a purse and a cell phone.
A giraffe walking away in a zoo exhibit.
Pen and paper on desktop with computer equipment.
A women reaches out to catch a softball
A large clock sits in the middle of a flower bed on a street.
The surfer sizes up the waves as he holds his pink surfboard
A man working on a laptop looking at the camera.
two parked motorcycles umbrellas shops and people and a tree
A huge double couch in a living room with a TV against the wall.
An area with blankets and food containers laid out with people holding umbrellas sitting on the ground.
A kitchen in a dollhouse with various dolls in it.
Baseball batter gets ready for the pitch during the game.
The cat is laying down in the window resting.
A red fire hydrant pouring water onto a sidewalk.
There is a vase filled with water that has rocks and a plant in it
there are many people walking in the rain with umbrellas
A horse wearing a saddle standing in the sand.
A statue of a baseball player extending his arms to catch a ball.
a piece pizza on a white plate with tomato
Two red trains are on one track as a yellow train rides down another.
The man and woman stand next to each other holding video game remotes.
A group of people socializing at a dinner table in a restaurant.
A man sitting on a bench in front of a bunch of pigeons.
a collection of animal kites flying into the air
A bus headed to Manchester is on a street.
A view of a total gym exercise piece laying against the wall.
A headboard attached to a bed mattress in a room.
An old truck, painted over blue in the desert
Two suitcases that are sitting near each other.
Open door going into a bathroom with black and white tile floor.
A man riding a wave on top of a surfboard.
A woman catching a red Frisbee while standing on a dirt road.
A dog is posted by the window with his reflection in a mirror.
A picture is taped to the bottom of a stop sign.
A lot of food that is on top of a table.
Five unknown objects displayed on a beige counter.
A couple of surfers talking on the beach, with other surfers in the background.
A woman putting a pot into the oven.
Stairs lead down towards a fire between benches in a garden.
A man standing on a very busy sidewalk in a city
An elephant standing next to a green plant with purple flowers.
A girl brushing her hair by a bed in a room.
People on skateboard and with bikes on a ramp in a parking lot.
there are two elephants that are walking on the road
a big pink house with some chairs out front
A MAN IS HOLDING A SURF BOARD WALKING ON THE BEACH
Slice of dessert items served on plate with fork.
A living room with big windows looking at the ocean.
A train traveling down a track during the day.
Several street signs hang on a pole as a brick building stands in the back ground near some trees.
A tall giraffe standing on a  lush green field.
man on blue tennis court preparing to make serve
a man talking to a pretty girl under an umbrella
A pair of zebras standing in pen, in the grass.
A vase containing water as well as a flower in it.
A blue water hydrant on a roadside in a city
An adult elephant standing next to a baby elephant.
a rest room and a bench inside the dugout
A brown and white cat laying on a tan sofa.
A person in a room with a remote.
A picture of a black cat sitting on a young man.
A Bathroom with a toilet, sink, and records on the wall.
A young man eating food on a kitchen counter.
A large refrigerator and freezer sits in the middle of a kitchen.
two kids playing in a park with their kite
A chestnut horse stands in the surf on a beach.
A couple of buses parked across the street from each other.
Two people next to a metal bench stare into a river.
A trash can sitting next to a bench outside with a trash bag next to it .
A group of teddy bear on a shopping trolley
One man with soccer ball touching his head while another stands near.
A narrow bathroom with a thin door is shown.
A collection of items for an advertisement are arranged on a table.
This is a bathroom that is in someones home.
A dog in mid air catching a frisbee on a field.
A woman sitting on top of a brown horse.
A group of cute stuffed animals in a bed.
some people and two laptops on a yellow table
A man is taking a slice of thin pizza
a chair made out of skis with people playing on the grass
A woman walking a path by snow with her dog.
a couple of horse that are pulling a wagon
there is only one boat on the sand at the beach
Maintenance city man inspecting fire hydrant on street.
A roller skier pushes off down a street.
A man leaning on another man both in suits and ties.
A modern kitchen with a glass of wine on the counter.
An old cement wall in a home is decorated with garland.
a ball game being played on the field in front of an audience
Three people at a intersection are waiting for the light to change.
A group of passengers on a public transportation bus.
A cat sits by four matching luggage bags.
A long haired cat is sitting in an open suitcase.
A young boy and man eat food at a cafe
A table topped with oranges and a bowl of salad.
Cows lay down resting in the foreground while a flank of trees highlights the background.
An adult walking beside a child in a field.
A child stands with his bat ready to hit a ball.
A baseball player is going to hit the ball
panda bear sitting between two trees in  forest
The sandwich dominates the plate and comes with soup.
a person reaching up for an open umbrella
An airliner is descending over the water to an airport.
A boy is holding a dog that is wearing a hat.
A woman hitting a tennis ball with a tennis racket.
A white tub sitting next to a window and shower.
a woman sitting in a chair at a dining table in a restaurant
A bald man in a suit on a television.
There is a half eaten piece of pie on the plate.
Three zebras standing near each other in an enclosure.
There is a woman that is sitting down playing wii
A brown leather piece of luggage sitting on a luggage stand.
a Shetland pony  with  tennis shoes on
THERE IS A CAT THAT IS ON THE BACK OF A DOG
a group of people skiing down a snowy slope
An open Swiss Army knife rests on a table.
A variety of kites flying over the beach and ocean.
People at a bus stop getting a a bus.
two cooks in a kitchen sampling their food
A lady looking into the sun standing on a hill wearing skis.
Several cake doughnuts cooking in large fryer full of oil.
a guy sitting on his motor bike under some palm trees
a living room with a bright red couch next to a yellow wall
Two cats find room to stretch out and rest themselves end to end, even on a cluttered desk.
A man in a red snow jacket is standing on skis.
A painted fire hydrant next to an old tv.
A close up of a pole with several street name signs.
A bathroom with a toilet, and sink with the lights on.
A man is presenting someone with a chocolate cake.
A close up of a television remote being pointed at a TV
The young woman is licking the bread of a sandwich.
A cat looking at something on the floor
a room showing a fridge well cleaned and a microwave
Two Zebras eat grass in a dusty area.
A silver sports car is parked beside horse droppings left by a group of horses.
A china cabinet filled with fine blue and pink china.
A bathroom with a sink, toilet, mirror and toilet roll stand.
A man is standing on his skis in the snow.
Several boats out off the shore of a lake.
Man displaying bunches of fruit in arid area.
a male in an orange shirt in a black suitcase
People sit in a hot tub that is surrounded by snow.
A baseball player has just swung his bat.
A large white clock tower sitting in the middle of a city.
A group of four people standing next to each other in the snow.
A herd of sheep are roaming in the pasture.
A dog sitting in a chair next to a table.
A person they sitting down in a chair.
A room with low ceilings and old furniture.
A nice big living room with a big fireplace.
Couple of goose standing at the water's edge while ducks swim in it.
A couple of women sitting at a table next to drinks.
A mass transit train moving across a small bridge.
A piece of cake is sitting on a blue, green and white decorative plate.
A fighter jet flying through the air above the clouds
A pole is holding up street signs in the city.
A photo of two people sitting on a couch, one playing the Wii.
Adult and juvenile cows roaming in a grassy field
A very cute dog laying down in a child's bed.
A picture of a person touching a cupcake.
A person wearing a helmet is holding bunches of bananas.
A woman sitting on a bench that is facing the ocean.
A man standing on top of a beach near a surfboard.
Small Prop plane drives along the runway in the day.
A woman hitting a tennis ball on a professional court.
A broken flip phone sits, in two pieces, on the counter.
A produce stall at a farmers' market displaying baskets of carrots and cauliflower.
A bathroom scene with two bathtubs and a toilet.
A man holding a new sign under a stop sign.
A small bathroom with a stand-up shower.
a cat is sitting on a wooden bench outside
Two guys getting ready to jump of a ramp with their snowboard.
A zebra stands in the dirt in its enclosure.
Several men in suits and military gear standing near a table.
A smiling boy wearing a white shirt and red tie.
A blue single engine airplane in the air above a landing strip.
A group of stuffed animals sitting on a bed
A bathroom with a sink and a tub and a minimal, modern style design.
A goup of people at a wine tasting.
A woman in black shirt resting on a luggage carrying cart.
A young boy riding a skate board on the walkway of a park.
A kitchen with white walls and wooden cabinets.
A group of open umbrellas piled on top of each other.
An assortment of fruit for sale at a market.
A single engine aircraft parked in a grassy field with other planes.
Group of people all showing off their cellphones in a group seating.
A bird soaring through a foggy sky over a snow covered mountain.
a bird flying just above a body of water.
some giraffes standing next to each other in their pen
Several herd animals are on the grass by a mountain.
a close up of a plate of food on a table
Vehicles on the side of the road and a herd of sheep.
A young boy standing on top of a green field holding a baseball bat.
A wet dog running on a beach with a neon green Frisbee in it's mouth.
A man in an empty parking lot trying to pull something
A man is in the picture above a plate of food.
The thin woman is standing between a man and an eating dog.
An apple being held by a hand with a knife tip presses against it.
A bathroom with two white toilets and a large bathtub.
This group of steer are laying in the grass
A zebra standing next to a zebra sitting on the ground.
A man camping with two dogs eating a meal.
Two people are flying a kite on the beach.
A plate with a hot dog and fresh pickles.
A group of animals standing in a grass field.
A woman sitting on a bench looking at her cell phone
Several paraskiers engaged beneath a cloudy winter sky.
Young girl dressed in blue and pink skiing down a hill.
A smiling grey teddy bear with a plaid bow lies on a green carpet.
A tennis player holds his racket in the air after hitting the ball.
Couch and chairs in living area with television.
A cup of coffee next to a laptop of some sort.
A cat cleaning itself on the top of a suitcase
There is a bowl of fruit with apples, pears, and oranges in it.
A person is skiing down a mountain next to a  blue line in the snow.
A young girl standing in front of a plate of food.
A cake is the table along with some fruit.
A group of celebrating fans in a city street.
Someone laying on a wood floor with a dog
A group of giraffes stand together in the field.
A giant clock on the side of of a neon sign.
Three Red Sox baseball players stand smiling in a dugout.
THERE IS A CAT IN THE MIDDLE OF A BUNCH OF KNIVES
A boy biting into a piece of broccoli.
Two garbage collectors standing behind a garbage truck gathering up bags.
People on a safari look at an elephant in the road.
A choice of poached eggs and bacon on a bagel or donuts.
a mirrored door showing the reflection of a couple
there are train tracks that lead in to a train station
Kids out on a sunny day while skate boarding.
A bathroom with a tiled backsplash over a sink and bathtub.
A street sign is pointing towards 8th avenue and the other is pointing towards 22 34 street in the middle of the forest.
a man that is outside with a kite in hand
A red and yellow sign for the life guards and an umbrella on a beach near the ocean.
Two men and two women are hanging out at a skate park.
An orange container filled with office supplies sitting on the ground.
A plate with food on it next to a bowl with salad.
A large airplane flying through a sky above a city.
Boats tied up in  a harbor with cranes in the background.
Playing on a small laptop and a phone at the same time is not recommended
A man wearing a suit has a boutonniere pinned on his chest.
there are two men on a field playing with a frisbee
Four people are skiing down a snowy hill.
A very cute small child touching a fire hydrant.
A table with bins of food that include pizza, fruit and salads.
a cow eating garbage on the side of a road
A man in a short sleeve shirt with a tennis racket
A large multiple layer cake with yellow frosting flowers.
A large jetliner sitting on top of an airport tarmac.
a boat partly submerged in a body of water
An animal that is looking at something on the ground.
A mission style bed is dressed with bright white sheets and a striped folded quilt sitting in between two matching nightstands and lamps.
Two plates have a meal prepared on each of them.
A person on a skateboard does an air trick.
Two side-by-side photos of different living room settings.
A large kitchen generously adorned with shiny metal surfaces.
A man in front of a horse working on its hoof.
The woman stands on the cart behind a man driving it.
A man holds a laptop that has a message about Barack Obama written on its screen.
A motorcycle is parked in a lot by a store.
Various trains at a train station next to people on loading dock.
A guy rail grinding a skateboard on a ramp.
A woman sitting on a bus next to a dog.
a giraffe is eating a piece of food
A booth with salesman trying to track down
THERE IS A WOMAN THAT IS PLAYING TENNIS ON THE COURT
A woman texting on her phone while on her laptop.
A brown donut on a thin piece of white paper.
A bunch of sheep grazing in an open field.
A baseball player is up to bat during a game.
A stand with various tv and game equipment on it.
A closeup of a train at the station for people to board.
A small pizza sitting on a wooden table next to a bread maker.
A baseball player is preparing to swing his bat.
Two kids sitting at a table eating a meal.
a young man is performing a skateboarding trick
A man holding a tennis racquet on a tennis court.
An image of some baseball players in front of some money.
Some food that is on a glass plate.
The owl is looking at the camera in an intense fashion.
Cat sitting on cabinet in front of large screen television.
A brown clock tower with a gold, black and white clock.
A table with cut up vegetables and cheese with it's rind cut off.
A group of people sitting in a chair, working in computers.
Two road bicycles are locked to a pole in front of a man talking on his phone.
A teen boy and teen girl standing on skateboards in front of a stone brick wall.
The man is riding up a hill on a motorcycle.
A person wearing sandels standing in front of a cat.
A toilet filled with Hershey squirts with a blue lid.
A man in grey shirt doing a trick on a skateboard.
Two hipsters sitting down at a table cutting up a chocolate cake.
THREE BASEBALL PLAYERS STANDING ON A BASEBALL FIELD PLAYING A GAME
A person is driving a speedboat quickly through the water.
a teen holding onto a brown teddy bear
A man rides a motorcycle that is decorated with three teddy bears.
A Virgin Mobile train driving in the middle of a city.
A restroom has a toilet and a decorative sun wall plaque.
The man is carting his suitcase around the city.
A toilet with the lid open and a phone on the wall beside it.
A little boy is in a batting cage with his dad, who is serving as catcher.
A kitchen area with a large pot, dove and a wooden cabinet.
a person behind a stand selling fruit with a person near by
This small kitchen has pots, pans and spices on display
a man getting ready to serve tennis ball
A plate of vegetables and meet on a table
A public bathroom that is dimly lit by a window.
A person jumping on a rail on a skateboard.
two white sheep, a black goat and a white goat in a field
a man holding a surfboard on his back
A bench is sitting near a wooded area.
Two guys wearing nice clothes are standing outside.
A door opens to a view of a toilet.
Two girls in cowboy hats riding horses waving.
A giraffes face and neck while he eats leaves from  a tree
some baseball players are playing baseball on a field
A pole stands in the dirt with a biker in the back ground
Two men and two women enjoying an outdoor meal.
Two men walking a dog and watching an airplane about to take off.
An elephant putting its trunk in another elephants ear.
A clock is shown on the top of a tower.
Three laptop computers sitting next to each other on a kitchen counter.
A person is looking down with ski boots on and skis next to them.
A white toilet, sink and shower stand in a bathroom.
A car driving in an intersection, past a furniture shop.
A smiling woman perched on a chaise long under an umbrella
A couple of cars that are parked in the street.
A man playing a game of tennis on a brown tennis court.
A plate with chicken and broccoli on it.
Two young boys laying on a carpeted floor playing on laptops.
The hotdog is next to a bucket of popcorn and a soda.
a plate with a cheese shrimp and scallion pizza
A man lays on a bed wrapped in a white blanket.
Many plates of food with their silverware.
A giraffe leaning down drinking from some water
Two jets high in the sky with white trails.
A small bathroom has toilet, medicine cabinet, and small sink.
An industrial sized blender filling a jar
Car and motorcycle traffic in a large city
A living room filled with living room furniture and decor.
this is a dog running near some water
A classic military motorcycle is parked in front of a crowd.
A computer desk with two desk chairs at it.
A group of people with kids sitting in a living room.
A man on a skateboard standing on asphalt.
A zebra foraging for grass among dead branches.
A refrigerator is filled with a lot of food and beverages.
A small elephant playing with a toy suspended from a wire.
A plate with sliced pizza and a bottle of beer.
Several people standing next to two people in cell phone costumes.
A plate of food with a sandwich and a salad.
a toilet some white brick walls and toilet paper
A boy riding a skateboard on the sidewalk.
A man goes to strike a tennis ball.
two men looking angry at each other.
A person has a sandwich on a plate.
The old vase is on display on the table.
A silver car next to a parking meter.
Three zebras are standing in a filed under the clouds.
A man walking down the sidewalk, and a blue briefcase in front of a post.
A cat laying on a handbag on a bed.
A couple of coaches in a large room
A baseball player swinging a bat on top of a field.
This sewing room space is small but well stocked.
White kitchen cupboards with grown counter top and black stove.
A church stands in a country field, underneath blue sky.
A sumptuous table setting in a royal dining car.
Several bottles of wine on a display table.
A guy with a white shirt and jeans riding a skateboard.
She is checking her messages before finding a good spot to enjoy the concert.
A smiling young woman holds up a bottle.
A man wearing a patterned shirt and tie and glasses.
A woman is dressed as Merida from Brave.
A couple of guys at a picnic of some sort contemplate sweets arranged on a paper plate.
A group of cows mill about on a grassy pasture.
A kitchen with green cabinets and tile back splash
a bunch of food is sitting out on a table
A very thin cow standing near a herd of elephants.
An industrial kitchen with a strainer on the counter.
A man sitting on top of a cement ledge.
Young adults in tennis clothes are playing Wii.
A living room filled with furniture and a fire place.
Two officers are riding horses near a crowd on the sidewalk.
Several people are skiing in the snow by a tree.
a train approaching a station with people waiting to board
A skateboarder is crouching and arms fixed as if to run into something.
A man releasing a baseball at the end of a pitch.
A boy is blowing his candles on his ninth birthday.
A vase full of flowers sits on a counter.
A bathroom with a pink sink and blue tiles.
A man in red shirt kissing a woman's forehead at a table outdoors.
A woman pushing a stroller and looking at her cellphone walking down the street with people walking or riding bicycles behind her.
very types many ripe  fruits in a basket
People are standing on a sidewalk in London.
People pulling their luggage as they walk
skiers riding on a ski lift to the top of a mountain
A chair at a desk in a room.
A hot dog in a paper boat sitting on a person's jeans clad legs.
Skate boarder performing a stunt in a vacant area
A man carrying a white surfboard across a beach.
A group of students posing for a photo.
A open laptop is on the table next to a box.
A man standing on top of a boat on a large body of water.
The garden vegetables are blooming outside and are ready to be picked.
A plate of food sits next to its dessert
Some smiling guys in a very big crowd of people.
A street intersection with street lights in a small town.
A baby sitting in front of a stuffed teddy bear.
Looking down on a very winding twisting road
a bunch of chairs and umbrellas on a beach.
A woman sitting in front of a desktop computer.
A bike parking white tent cover is set up.
Five planes are flying in formation in the sky.
A black and white picture of accessories in a store.
A sandwich and condiments sit on a white plate next to a drink.
a person in fancy clothes rides on a horse
Two people jumping in the air to fight over a frisbee.
a skateboarder skating on a  stone skate ramp.
A stove top topped with three pans filled with food.
The rider and horse canter onto the field to compete.
corner cabinet and sink area of a green kitchen
A group of elephants that are in the grass.
The couple are dressed up and posing for photos.
A snow skier is off the ground in the snow.
A man holds a bat awaiting his turn in the batting cages.
A siamese cat lays on a wooden desk
The snowboarder is grabbing the board while jumping up.
Door leading into a compartment on a train.
Cows grazing on the grass in a green pasture.
A bucket full of toothbrushes rests on a rock outside.
A hand holding a spraying hose to a toilet bowl in a small toilet stall.
A couple of guy sitting at a table with a couple plates of food in front of them.
two elephants are walking together down the street
a large stack of old and antique multicolored suitcases.
A man with a backpack sits on a non-functioning toilet outside.
Two green metal street signs with Spanish words on it.
A girl is sitting and eating a biscuit.
A toilet in a bathroom next to a plaque on a wall.
A group of guys in a field playing soccer together
A picture of a orange cat in a bowl.
Two young kids play soccer against each other.
A group of oranges are sitting in the bowl
A glass sitting on a table next to an oven.
Food prepared on a bun and set in a basket
An old double decker green bus says London Transport.
A mother and daughter are cooking together in a kitchen.
A living room with wooden walls and a tv.
This little league player is catching a ball during a play
A pair of parking meters sitting behind a row of parked vehicles.
A child looking at an elephant that is standing in an enclosure.
a person on a skate board comes off a ramp
A girl with a black eye and pig tails sits in a suitcase.
A group of snowboarders gliding down a snowy mountainside.
A man in a jersey swinging a baseball bat at a ball.
a small bathroom with a toilet and a sink
A plane flies over, painted in right colors.
three people sitting in chairs and a teddy bear
an image of baby sleeping next to a woman
Many cows in a pasture with trees eating grass.
An old woman attempting to play a video game.
An airplane sitting on top of an airport tarmac.
A blue vase with a bird painted on it with flowers in it.
A young man is standing at the bottom of a staircase.
a number of people walking on a side walk near a building
A crowd watches a large giraffe through a wire fence.
Several people on skies in the snow
Two ducks by the water one is spreading its wings.
A passenger train that is pulling into a station.
A man and a child standing on top of a beach.
A bike sitting in front of a beach in the evening.
a couple of baseball players are out on the field
an old steam powered locamotive at a station filled with passengers
A half dozen assorted doughnuts in an open box
People standing outside of a hut with several bunches of bananas and other fruit outside of it.
A blurry screenshot of a green street sign.
The stop sign below the street signs has writing on it.
three surfers are walking on the sand at the beach
a man has a refrigerator on his three wheel bicycle
a bunch of people stand next to some suit cases
a sheep and baby sheep standing in a field
A smiling man is holding a skate board near a street.
A man on a surfboard riding a wave.
A young girl who is looking in the refrigerator.
A baseball bat leaning agains a wall beside a yellow box.
A larger standing horse is standing protectively over a smaller resting horse in some tall grass.
A man catching a white frisbee with his hand.
A cake with a couple of birds and other animals on it.
A tall clock tower and a tree against a blue sky.
a female in a white top is playing tennis
A yellow fire hydrant gushes water onto the street.
Two giraffes are walking next to one another.
Model train locomotive on track in small village display.
A boat with a wooden hull is on a beach.
A girl sticking her hand in a large bowl.
A vase filled with lots of colorful flowers.
An old refrigerator displays its open door and contents.
A picture wedged in between a bunch of bananas
Modern kitchen with counter and cabinet and hardwood floors.
A little boy watching two elephants in an enclosure.
The motorcycle rider is on the road with all his gear.
Two large slices of pepperoni pizza on a table.
a red and white stop sign and a street sign
Two women are at a table with laptops.
A young man holding a tennis racquet on a tennis court.
A man playing with a soccer ball in a field
A Blue dish full of green broccoli heads and asparagus.
A bowl has onions, shredded carrots and other ingredients in it.
A red stop sign with a car parked behind it.
A large church clock tower towering over a city.
tourists riding and petting an elephant at a tourist attraction
three people are sitting on a bench watching a train go by
The men are playing doubles tennis on the court.
A man and a woman carrying a surfboard down the road.
a small bird in a field of green grass
a close up of a vase with art behind a display glass
The wooden bench has spray paint on the back.
A pitcher winding up to throw a ball on a baseball field.
A table topped with a plate with a pizza on it.
A male competitive speed skier coming around a curve.
traffic cones in a bathroom that's under construction
A male getting an object out of a tree.
THERE ARE PEOPLE THAT ARE SITTING AT THE TABLE
A men's tennis couple watching a ball hit the net.
Group of people enjoying food at a market.
A full view of a building that has a huge roof on top.
a male with short hair is looking out of a trains window
A street sign surrounded by orange and red leaves
A woman feeding a white dog a small carrot.
Three individuals flying a geometric kite on the beach.
A 4 way stop sign on the corner of a city street
A girl took a selfie of her taking a selfie on her cell phone.
a wooden piece of art consisting of two birds standing at opposite ends of a log with a cone shaped vase in the center with a group of red berries sticking out of the cone.
It's easy to imagine a dinosaur as an ancestor of the giraffe.
A young man holding a tennis racquet on a court.
White police car passing through a stop sign in front of the building.
a vegetable sandwich with cucumber pickle and tomato
A hand wearing a ring reaching for a pair of scissors.
a man is riding on a ramp on a skateboard
A woman sits on a bus, presumably waiting for a bench.
This bathroom has a wood floor and wood on the wall.
Two guys stand by bottom of stairs playing the Wii
A group of zebras crossing the dirt road .
A chef standing in a kitchen preparing food.
A small bird sits on a corn plant.
Several park benches lined up under a row of trees.
A red Two level bus stopped to pick up passengers
A person standing on a sidewalk with a black umbrella
A table with pots of sliced carrots, green vegetables and baked bread.
A living area with two chairs, stool and a television.
A man is next to a horse in a window.
A white woman and an indian man shaking hands
A child's lunch, of soup, fruit, and veggie, sits on an A B C place-mat.
A very sophisticated bathroom with a white theme
An old man in religious clothes reaches to catch a frisbee.
Some zebras eating together outside in a grassy area.
a double decker bus rides through london happily
Kiwi fruit, banana, apples and an avocado in a dish
A table with two bottles of wine on it.
a plane sitting on a runway with a ladder sitting there for it
a jet on three pillars in front of a building
a kitchen with a stove a sink and some cupboards
A man is cutting small hot dogs and adding toothpicks to them.
this is a man hitting a ball in a game
A flock of ducks are swimming in the water.
A table with a tin of hotdogs and a plate with bun.
a grill that has some pans on it
A light pole and street sign in front of a store front.
A city bus that is sitting on the road.
A man bounces a tennis ball as he prepares to serve during a match.
a lot of people are on a tennis court.
A collection of knives and a pair of scissors in a wooden block.
A cup of coffee sitting next to a sandwich  chips.
A pair of scissors near a stick of butter.
A surfer is riding a wave in this aerial photograph.
A man riding a skateboard down the side of a hand rail.
Train coming down a rusty train track with scrub grass.
A single seagull standing on the coast with waves in the background.
A white plate with a cut in half sandwich on top of it.
A rectangular toilet bowl in  a tiled bathroom
Surfboader riding the crest of a ocean wave
a person riding a horse puling a lot of hay
The hotel bed is designed for the business traveler.
A yellow fire hydrant is standing alone in a parking garage.
A motorcycle and car are parked in a garage.
a white chicken with a black tail and a red head
An adult cow walking along side the river bank
A yak needs long hair to survive in these mountains.
a man sitting on a bench and laying down
Man posing with a tennis racket for a shot.
A passenger train that is pulling into the station.
a person riding skis jumping in the air
A desktop computer sitting on a a desk.
The woman's  on the horse giving presentation with flags
A white beach chair with a red, white and blue striped towel under a yellow umbrella.
A person on a snowboard anticipates a jump.
a vase having a bunch of flowers inside of it
A baseball pitcher throwing a baseball on a field.
Two people and a dog are sitting under a sheet-tent.
A group of people standing at a table with bottles of wine.
The train rides on the track past the station during snowy weather.
Black and white photograph of a modern commuter train
a dog with its frisbe in its mouth walking in a water way
The woman in white outfit swings the racquet at the tennis match.
A close up view of a pizza sitting on a table with a soda in the back.
A child snowboarding down a hill in the snow.
A cat is lying on a table, watching a television.
A cat sits with hisher toy on a blanket.
A man on a skateboard standing on a ramp.
little kids sleeping all over a big bed
A 24 hour recovery truck traveling down the road.
A snowboarder sitting on a ramp in the snow
A kitchen table, refrigerator, garbage can, chandelier and window.
a clock with big numbers at the end of a table
A person is standing in the snow on skis.
A canopy bed in a white and brown room
A towels hanging from a towels rack outside a shower.
An unfurled sailboat in the water under a pink sky.
A blue and yellow meal pole with street lamp lighting
A toilet is made of wood with accents on the back of it.
bacon, lettuce and tomato on toast with slaw and a pickle.
A family of giraffes standing by a puddle.
Five giraffes are standing in tall grass, in their habitat.
A girl playing some video games with her family.
Parking meter and flower in vase displayed in window.
Students are sitting at tables with books and laptops.
A plate displaying vegetables, meat and bread on a table.
an image of a stop sign that is posted in the three way zone
A laptop computer and mouse is on a sofa.
A man with an old-fashioned hat is looking at the camera.
Two gray elephants fighting each other bumping heads.
a group of horses graze on some grass
A bowl full of oranges and leaves on the table.
A man doing a trick on a skateboard over a ledge.
Three people ride an elephant while a man on the ground directs him.
some bananas peaches apricots and and apple
A woman standing next to a motorcycle and some health aid trucks.
Player at bat and umpire holding a ball .
A large plane sitting inside of a hangar.
A row of red stop signs sitting next to a  lush green field.
A view of a mountain range is seen from an airplane.
A woman talking on a cell phone while walking down a street.
A woman sitting on a trunk wearing a polka dot dress with a red belt.
A couple of people at the beach during the day.
A blue street sign in front of a building with many windows.
A woman in a blouse wearing a striped tie.
Someone riding on an elephant as it stretches it's trunk out
a bunch of motor cycles all parked together
Two zebras stand next to each other on a field.
An altar with purple cloth, vase and two candles.
Man poses sideways wearing a plaid shirt and a tie.
A small plane is parked on the tarmac.
A van follows behind a bus on a rural road.
a little kid starts to learn how to ski
a little boy bending down taking a bit of a hotdog
The laundry is hanging in the tilted room.
A young boy riding a skateboard down the side of a ramp.
A couple of cruise ships in port with a large building in the background.
This is a picture of a woman playing tennis.
Two giraffes stare out of their enclosure at a zoo.
A small naked boy holding a tennis racquet on a beach.
A homemade cheese pizza is made and ready for the oven.
There's enough wind to fly a large box kite.
A clock that is siting above a sign.
The man reclines in his seat from the table with doughnut in his mouth.
A white chair with two glass birds on top of it.
The blonde lady answered her cell phone because she was waiting for an important call.
The street light, the electrical box, and the sidewalk are littered with bird poop.
A bunch of people lounging at a beach near an ocean.
Two men hold hands around a dining table.
A large train gains speed on the railroad tracks.
A couple of cows grazing in an open meadow.
Two young boys sitting on a bed with three teddy bears and a sign with the number twenty crossed out.
a truck with two off-road vehicles in its back compartment
A cat is sitting on a motor scooter.
A group of people walking around a train station.
Young boy posing in front of a flying kite in the park
A stop sign has collaborate and listen on it.
A compact kitchen set-up with shelves for storage and a small stove.
A herd of elephants walking across a ground near a river.
There are two cows walking on the sand.
Apples and oranges pile in well lit color photo.
Fresh fruit and vegetables on a kitchen counter
A group of young women getting food from a table.
A white fireplace that has pink candles lit on its mantle.
Two boys with an umbrella and chair on the beach.
A man and boy are sitting on a couch.
A woman and some children near a zebra behind a fence.
A jockey is on a brown horse with a crowd watching.
A skateboarder doing a trick in the air at night.
A topdown view of floor with sheets, shoes and a desk on it.
A herd of cattle grazing in a lush green field.
A woman in a suit and tie standing with her hands in her pockets.
A person taking a picture of a stoplight on the side of the street.
There is an old-fashioned clock tower in front of a building.
Two boys are standing in front of a train with backpacks.
Two officers are riding horses near the ocean.
A cow looking at the camera from inside its fenced in pasture.
two geraffes in a feild next to a tree.
Snowboarder impaled on a tree during dusk with fire.
two people feeding each other cake at a wediing
a dog outside playing with a ball in the grass
A two level bus with a large advertisement on the side
two little kids sleeping on a pink bed
A fleet of small air crafts are flying over sea.
A royal, horse-drawn carriage moves along the road.
two giraffes are laying down in a park like setting.
a woman stands with some luggage by some chairs
This room is caught in a design time warp.
Two blue and black parking meters sitting on a sidewalk.
The man is cutting into a large cake as others sit around the table.
The girl in shorts is attempting to hit the tennis ball.
A group of people are riding horses near a train.
A cluttered countertop with a celebratory pink and white cake and opened containers
A man on a surfboard riding a wave.
A rectangular pizza served on a wooden cutting board
A cat and dog standing buy their human in the kitchen.
A pregnant woman is in bed reading a large book.
A person that is holding a dog and a bowl.
Several pieces of wood lined up near a lot with several axes around.
Three children on a sofa by window eating bread and pasta.
a magnetic knife holder on the wall above a kitchen counter
Three giraffes stand near each other in a field.
A sign at the corner of St. Clair Street and South Main with flowers above.
Many people have come to tour an authentic military aircraft.
A bus moving past a street sign opposite a building
Staring into the camera to take a picture.
Bottom view of an airliner flying directly over head.
There is a man sitting on a wall talking on a phone
a living room with a couch and a tv
A small kid on a field with a bat.
A blue sign posted on an overpass that people walk across
a number of large kites on a beach
A man standing in front of a car with it's hood open and a dog standing in front of the car.
Men laughing and playing tennis on Wii Sports
A big building in front of a tall clock tower.
A small cats eats out of a food bowl while standing in another bowl.
The snowboarder is jumping high above the ramp.
A plastic container of food with rice and vegetables.
A small showcase of an assortment of funny and cute items.
A row of motorcycles filling all the spots in a parking area.
A giraffe and other animals in a field
a black and white image of a man on the phone
A large kitchen has a stainless steel counter.
A woman sitting at a table eating a plate of food.
A double decker bus stopped at an intersection.
a girl that has a racket in her hand
a man standing on a tennis court holding a ball and a tennis racket
A man prepares to hit a ball during a tennis match.
A man standing next to a produce stand with tomatoes and other vegetables.
A man is in a hospital bed has a teddy bear.
A cat laying on many shoes on a brown rug.
A kitchen counter with a candle display on it
A little girl that is standing on a surfboard in the water.
A small orange vase is on a table with a small branch in it.
A group of cyclists are riding across an intersection.
Two people are on a motorcycle driving down the street.
A herd of different colored sheep walking near rocks.
A table with two red vase type items
A number of people flying kites on a clear day.
A bath and sink with a woman in a room.
A parking meter sitting on the side of a road.
A cat drinking out of a sink faucet.
A grey tabby cat stretches out on some clothes
A kitchen with wooden flooring and white wooden cabinets.
Dog sleeping in his bed next to rocking chair
Man on a surfboard under a large wave
A roadwork crew constructing a guard rail along a mountain road
two woman stand in the snow and pose for a picture
Two men having beers in a dimly lit room.
Long woman sitting on a raised log looking at the mountains.
A lone horse standing next to a fence.
A trainer feeding two giraffes from his hands
Old picture of a sumo wrestler playing baseball.
A child holds a game controller for Wii.
a large dog is looking oof to the left
Living area with small desk and leather couches.
A train is pulling into the train station.
A large chair statue with a large horse statue on top of it.
A dog running in the sand near the water.
The man in stripes holds onto the plate of food as he poses for a picture.
Two people standing in the grass playing with a soccer ball.
A small toilet and trashcan across from a dirty sink in a very small, dirty bathroom.
A large white sign above a brick wall with a yellow vehicle to the left and a parking sign to the right.
A black bird is perched on a tree limb.
Cowgirl at rodeo riding house with a Texas flag.
A man with many bags walking on street next to fence.
A skier is being towed over the snow.
A small pizza on a plate that is sitting on a checkered table cloth.
A picture of a large apple and walnut pie on a plate.
A woman petting a horse in an open field.
a couple of men are eating at a table
A few pieces of luggage sitting on top of a wooden floor.
A herd of goats walks by a car and its driver.
A large group of competitive cross country skiers.
A woman smiling while sitting on a bed.
A slender high rise building is fashioned behind a pole clock.
A man dressed as Elvis sitting on top of a bull statue.
The city buses are parked together in the parking lot.
A woman is in water catching a frisbee near a boat
A man takes a bite of his food at an event.
A red airliner is parked on the tarmac at the airport.
Dock area with urban area on cloudy day.
A bus is seen coming up to a bus stop.
A long tunnel with a long table with lots of seats and candles next to wine glasses.
A man and woman are walking in the rain with an umbrella
A woman tennis player serving a tennis ball.
A red double decker bus parked near a red telephone booth
Surfers on surfboards ride in a row on the ocean waves.
A person stands next to a train parked on tracks
A group of boys stand around a museum exhibit.
Candles, flowers, and stuffed bears are set in a corner near a poster.
A mother elephant and baby standing near the water
some people sitting at tables eating pizza and drinks
a room that has a bunch of tables in it
Four bowls of snacks crackers, broccoli and carrots, nuts and dip
A woman riding a carriage pulled by a brown pony in a race.
People stand near a desk with laptops on it.
A small child holds onto a fire hydrant to stand up.
A man holding an umbrella for another man in the rain
A sail boat with a large Colgate Clock in the distance.
Skateboarder with an elongated shadow at an outdoor skate park.
A group of people riding skis on a snow covered summit.
Two people standing on a tennis court with ball in the air.
The men are playing a game of baseball in the field.
Spectators watch the players at a baseball game.
a road that is next to some trees
A small farm animal steps through the short grass of a green field.
A Dairy Queen sign on a major road advertising it's special.
Horses pull carriages on a dusty dirt road.
A young boy in a green outfit holds baseball mitt.
A city bus going down a city street.
close up of fingers holding a slice of pizza
A man with a tennis racket stands on a court.
A picture of a street with parking meters.
THERE IS A BED WITH A SKY BACKGROUND
A woman with her laptop on a bed in a dark room.
A man on a horse going down a track.
A young couple poses with a cake decorated like a keyboard.
Two cats sit on top of a towel on a counter.
A plate that has an apple and sliced kiwi on it.
Grizzly bear grazing in grassy field in daylight
A computer is set up with gaming equipment.
An elephant walking through a brushy field
A brown and black bird standing on a tree branch.
a number of elephants near a body of water
Three stuffed animals next to a radiator and below a rocking chair.
The man is focusing on something in his hand while holding up his bike with his leg.
A tennis player holding a racket on the tennis court.
A stop sign and people on the street in front of a double decker bus.
a kitchen wit ha stove some cupbaords and drawers
Black and white photograph of houses and a clock tower.
a cat licking its lips while holding onto a toy in the shape of an elephant
The young man is jumping on his skateboard.
A batter swings the bat as the crowd watches attentively.
Three people with surfboards standing near the waves.
a group of people that are eating some food
Two men in suits and ties with woman behind one of the men
two college graduates pack up after a long day
two yaks are out in a grassy meadow
A baseball player swings at a pitched ball.
The man is holding a tennis racket in his hand.
A group of women that are in a kitchen.
People at the picnic while an elderly woman shows a pizza.
A statue of a man and woman with luggage in a city.
A pizza sitting on top of a cardboard box on a table.
a large plane is parked on the run way
The building has several umbrellas suspended in mid-air for decorative purposes.
There is a window with a cake and other baked goods showing.
An antique fire truck parked on the side of the road.
four traffic lights over a city street
A woman sitting on a pier near boxes of fruit.
Apples, oranges and bananas all mixed in a bowl.
A sign with a large hand with five dollars written on it.
two elephants giving people rides down a street
A man eating something from a paper bag
A couple of people that are watching a baseball game.
a very tall clock tower sticking out of a building.
A chrome colored microwave oven in a custom cabinet space.
A group of people sitting down at a dinner table.
A lady reaching for a huge wine glass
A man riding a skateboard down a road.
Three skiers in bright outfits start down a slope.
The stove top and oven is separated in this kitchen
Two rams are staring at each other in the woods.
Two people dressed up entertain a little girl.
Two people are riding a sports motorcycle down the street.
A view of a bright hallway and a room with a wood burning stove .
A person holding a toothbrush in his hand.
some people standing around one man is wearing a tie
A picture of someone riding a snowboard doing tricks
A woman is leaning on a car talking on her cellphone.
A plate of food with a bite taken out of the hamburger.
A woman on a motorcycle is next to a man walking a dog along with other people going down a dirt road.
The entire baseball team has gathered on the field for a celebration.
A giraffe that is sticking out its tongue.
A woman in dress leaning against stack of concrete blocks.
A black and white scene with a lady answering a phone
this is a horse and a dog by the water
A white and gray bird perched on a human's hand.
People riding on the backs of lavishly decorated elephants
A large black horse standing on a field filled with green and brown grass.
a van driving down a cracked street
The dirt bike has seen many hill climbs in its history.
A cat and dog napping together on the couch.
A visitors desk with a vase with sunflowers in it.
A skateboarder in mid air after a jump
a hotdog with toppings in a paper tray
A group of people sit in a open living room and kitchen area.
A group of four zebras standing in different positions.
A simple wooden bench is in the woods.
A man in a red uniform and shorts throws a ball while wearing a baseball glove.
Multi colored cat laying on the floor next to door and liquor bottles
The person is holding his cell phone while on his laptop.
A man standing in a kitchen using a blender.
Two men stand by a trunk of a car next to which are both a surf board and some folding chairs.
A variety of shots of a man doing skateboarding tricks
a couch in a living room with three pillows
A marble bathroom with automatic toilet and bidet.
A white and grey airplane sits at a gate at an airport.
Horse grazing in a field in front of a cascade of mountains.
A white dog has curly matted hair in it's eyes.
A group of zebra crossing  a river together
a living room with couches and a table
A woman looking at the camera while holding a cell phone
The person on the motorcycle had a big helmet on.
A young lady reading a paperback book on her bed at night.
light brown cocker spaniel dog howling in street
A cat licking a bowl clean on the counter.
a bathroom with a chipped sink and holes in the walls
This looks like a bunch of burned food on top of burned bread.
A baseball player has just launched a ball.
a living room with a chair near a tv
Person high in the sky after jumping snow ramp with snowboard
People are standing behind the bakery counter.
Two cows in a large green grassy field.
These people are sitting on a street bench.
A row of cars and trucks parallel parked at parking meters.
Group of people riding bicycles on a busy city street.
a man sits next to a monument on a bench
a caddy holding one cell phone and another cell phone in a holder
A soccer team is praying on the field.
two black and white cows on a green hill
a hot pizza topped with cheese and olives to be eaten
a close up of a cat at a table with a plate in front of it
A women who is picking up a large sandwich.
A very large cat sitting in front of a television set.
A store on a street corner called "James Smith  Sons".
The old metal bed with dirty linens is the only furnishing in the abandoned room
A plane flying with a dark, cloudy sky in the background.
A boy is sitting down and eating a donut.
The plane is taking off from the runway.
White and grey cat laying down on a white sheet.
A gray tiger cat sleeping on a bed under a blanket.
A photo-shopped image a cube drawn around a lego in the kitchen.
A young man catching a yellow frisbee on a green lush grass covered field.
a square white appliance with a blue thing on top
three people riding horses on a beach near a body of water
The laptop has an attractive image on the screen, and there are welcoming flowers and munchies
A man playing tennis on the tennis court while his coaches watch.
a street sign on a pole with buildings in the background
Street sign edited to look like a man is holding the white bar
A businessman wearing a suit and close up picture of suit.
Boy sitting at table with food and a cellphone.
A man holding a woman's hand and cutting a wedding cake together.
A beautiful woman sitting on the back of a moving truck while clutching her dog.
A bathroom cubicle showing a toilet, sink and waste can.
A wedding cake with flowers descending down it to the plate.
A bathroom area with three different sized and shaped urials on a mosaic wall.
A plate of cakes with frosting and topped with berries.
A large truck next to some trees outside.
People can be seen boarding a ship through the windshield.
A tower of brick holds clocks and a bell in a courtyard.
People are playing in a field, flying a kite.
A young giraffe stands near some trees in a wooded area.
Young baseball player running in open grassy field.
A few skiers are enjoying the calm snow-trodden mountain tops.
Mexican food is layed out on two trays.
a small boy and a big chocolate donut cake
A brown horse standing next to a metal fence.
two men standing in front of small car
a boat a larger ship a buoy and water
Shot of a nice quaint living room with an ascending staircase on the side.
the skateboarders are taking turns using the ramp.
A giraffe extends its tongue to drink water
A park bench is in the white fluffy snow.
A person is standing with their face near a toilet.
A couple of elephants are dawdling in an enclosure.
A giraffe standing near a tree in a field.
An African American man wearing a bow tie is taking a selfie.
Four men are dressed up with a tie.
People having a drink in a basement bar.
An underground Asian subway train, on the tracks and in transit.
The strawberries are supposed to make this dessert look less fattening.
Men playing Frisbee on the lawn at a get-together.
The man is carrying bunches of bananas.
A young child that is looking at a birthday cake.
A person doing a trick on a skateboard caught in motion.
Two giraffes and two zebras are standing in a grassy field.
vegetables laying in the soil next to a trowel
An adult elephant standing over a very small baby elephant.
A bunch of vegetables that are stacked together.
An old photo of railroad tracks passing through a western town.
Two giraffe standing next to each other.on a lush green field.
A man holding a tennis racket getting ready to hit a tennis ball.
Batter, Catcher and Umpire wait at home plate for ball to be thrown.
a formation of fighter jets flying by in the air
A group of people standing around a nearly empty field.
A young lady that is smiling and holding up a box with a blue tie in it.
There are people watching a game of baseball.
a brown and white dog lying on a bed and brown pillows
A couple of trains parked in front of a tree.
A train is stopped at a train station.
The skateboarder is jumping down the stairs on his skateboard.
A sheep standing with his behind to a fence in the snow.
A round plate that has a white and red pie type dessert on it and a light green pitcher behind it.
Pedestrians on narrow alleyway with archway between buildings.
A blue and yellow fire hydrant sitting in a field.
A soldier is cutting a large decorated cake.
a small girl in a field with a blue yellow and red kite
A yellow fire hydrant in the middle of a plaza.
Four children eating pizza in a booth at a restaurant.
A train traveling through a tree covered wilderness.
a lady holding a game controller and a man giving the rock on sign
An antique gold clock with a man and an eagle.
A woman eats a hotdog while holding another.
A man riding water skis on top of water.
A bundle of six apples are hanging from a tree.
a bathroom with a white toilet next to a tub.
Several elephants standing in a lake near trees.
a grassy medium between a two way street in a city
A macro image of an apple keyboard.
A single person is working in the cluttered kitchen.
A young boy holds a baseball bat above his head.
A donkey draws a carriage carrying two people
Two men reach up for a Frisbee at a park.
Fire hoses are attached to a fire hydrant.
This blue and yellow transit bus provides information about the service it self rather than advertisements.
A group of people standing around a table together.
An apple sliced into four, fork and knife
A very elaborate cake decorated to look like a bear's forest dinner table.
A giraffe standing by a brick building with a ladder.
A couple of women preparing food inside of a kitchen.
A bed containing two small boxes and an electronic item
A brown and white cow standing on top of a lush green field.
A motorcycle is parked next to a blue tent
two people are sitting on two different elephants
A very cute toddler playing with a laptop that is fully open
A black and white cat standing on a table next to a pizza.
A plate with fries and a napkin with eating utensils.
Tourist train with several cars driving on street.
A group of young boys playing a soccer game in progress.
A train passing on bridge over a busy city street.
A bicycle parked on the side of the road beside some doors.
Lego clock and wall setup for a interior Lego house.
A couple of hot cars in a packet on the table
A small plane is taking off from a grassy field.
A look at a hotel room with two beds in it.
A man standing on a tennis court hitting a tennis ball.
A guy with a motorcycle helmet stands behind a motorcycle.
A dark kitchen with many cabinets with a small light on above the stove.
A bus drives down a city street featuring larger brick buildings.
Open textbook near a computer keyboard and mouse on a mouse pad.
A desk with a computer a keyboard and a mouse
A basket with a sandwich, coleslaw, and onion rings is sitting on the table.
A bed sitting in a room under four pictures.
A tray of assorted food including fruits and vegetables.
A box with six divisions, each with its own variety of donut.
Person flying a red kite in a grassy area of a park.
A photo of an outdoor with many things in the scene.
A train is moving or resting on railroad tracks.
a fire hydron that is next to a concrete road
Two streets cross and the signs prove it
A bedroom with a large blanket covered bed in front of a flat screen TV.
A man holding up a hot dog on a stick.
A toy wagon holds many stuffed teddy bears.
two zebras are standing together in the woods
A group of people on motorcycles driving down a street.
Two parking meters sit on the side of the street.
A vehicle decorate like a pink elephant with passengers on its back.
Black and white photograph of a woman in an old kitchen
Animal in shadows of woods surrounded by foliage.
The domed, shiny surface reflects a man falling off a skateboard.
Closeup of a brown bear sitting in a grassy area.
This is an old town from the 1950 's.
A man skateboarding on the grass in his yard.
Two people stand in a field as one of them flies a kite.
A baseball player is sitting on the bench at a baseball game.
A white bus is driving on the road.
Contrails can be seen from a descending jet.
A grey and blue train passing over a city area.
A young girl is smoking in a kitchen.
A  city bus with bikes on the front of it
Four people playing a game system in someones living room.
a group of baseball players that are on a field
a baseball hat cake made with fondant that says happy birthday
A bird flying over a beach with a few people in the background.
a male is looking at a sausage pizza
A IMAGE OF A CAKE TEA POT AND BIRDS
A group of large trains on a steel track.
the people are walking down the street with luggage
A bedroom with a desk in the corner.
A picture of a girl that is posing on the ground.
A large green freight boat is seen at sea
a microwave is open with some food in it
A man cutting a cake on a table with cards on it.
a man looking up as he rides a surfboard
an image of a living room setting with tables
A young person holding a surfboard next to a man.
A chocolate cake with a pile of strawberries on top.
A giraffe laying down and another giraffe standing up next to trees.
A man riding a bike while holding an orange and black umbrella.
a close up of a sandwich on a plate next to rice and beans
a building with clock tower in a town square
a clock is sitting on the outside of the building
a pizza topped with different toppings is brought to a table
A man rides a horse while other people look on.
A warning sign is at the edge of a body of water next to a fire hydrant.
The sign on the side of the road is telling motorcycles to use caution.
Their is a skyview of the city from a small aircraft.
There is a orange tabby cat sitting on a mat
A large passenger jet sitting on top of a runway.
an adult female standing on a beach holding a colorful kite
A clock is shown in a package on a shelf.
A woman smiles on a street while holding an umbrella.
A young boy swings a baseball bat as another boy waits to catch the ball.
A group of people sit in a room while one plays a video game
a toilet and a bidet in a bathroom
Tall, fresh, colorful flowers in a clear vase
three people are skiing down a huge mountain slope
Four fine zebras cruise through grains very alert.
a tall clock tower near other buildings
The large twin engine airliner has a red stripe on the sides.
A city scene has a tall red double building.
One single biker seems to be leading the group down the road.
A man in light blue jacket riding on a skateboard.
Street construction being separated by orange barriers.
A hotel room with tv, desk, bed and arm chair
a close up of a person eating food at a table
A laptop that is sitting on a desk.
a flooded city street with a stop sign coming out of it
A squirrel is eating a piece of food on the ground.
A park bench is next to a colorful fence.
A group of people on motorcycles sitting in the road.
A blue and white bicycle parking in bike track next to building.
An older woman sitting at a table cutting up donuts.
A man wearing a lei is waiting in a parking lot with his luggage.
Man with young boy carrying surfboards at beach.
a person standing up and holding two remotes
There are rambutan, bananas, and papayas in separate crates.
Cement ledge with orange in bowl and red plastic bag below.
A brown and white sink sitting in a bathroom.
A small vase of flowers with petals on the table
a man is standing on top of a surf board at sea
A table topped with steak, potatoes and carrots.
A lavish hotel room with a comfy bed.
A couple of glasses of wine on a table.
The wooden bench is near a busy stream.
An  big airplane flying through the sky
Adults with looking at watercraft on waterway near park.
A kitchen filled with an empty refrigerator and microwave.
A driver's view of an intersection on a sunny day.
a herd of zebras walking through the grassland
A green bus parked in a parking lot next to others buses.
A dog curled up by a pair of boots on the floor.
a man is skateboarding on the edge of a building
Two men play tennis in a fenced courtyard
Snowboarding elderly man on side of mountain posing for picture.
A view of a single bathtub in an otherwise empty bathroom
a man holding a child next to a double decker bus.
A bird is wading in shallow water by a boat.
A dog and some humans in a garage of some sort.
A sink in a kitchen with an overhead light on
one dog laying down and another dog standing over it
A woman on the beach is flying a kite.
A street in an Asian country is littered with signs and advertisements.
A group of people standing on top of a grass covered field.
There are two people standing on the side of a street.
a trolly train on a city street at night
A young and a woman sitting down outside with a laptop between them.
Seven cows are lined up while being milked.
A woman riding a horse in a pasture with great caution.
A large jetliner flying through a cloudy sky.
A young woman drinking from a wine glass
A woman standing alone holding a large, white umbrella.
Several elephants eat grass and plants by the water.
A computer geek's setup of his computer, laptop and various games.
a cat in a luggage bag in a closet
Two zebras, one facing forward, one looking at its mate.
A tall church tower under a blue sky filled with white fluffy clouds.
a person standing on a beach wit ha dog
Old rusted train left out on the train tracks
A teddy bear drinking from a pink cup.
A pizza with an egg in the middle is on a plate.
A country road covered in rain next to a river.
A boy holding a kite while standing on a sidewalk.
A young woman holding a teddy bear in a room.
a person is standing next to a surfboard
A clock on the side of a window in a room.
Three skiers posing for photo in front of sign.
The fire hydrant is in a field near a covered, wooden bridge.
A group of young children sitting next to each other.
The catcher in a baseball game picks up the ball with his glove.
A black dog laying underneath a car in the shade
a brown and yellow bathroom with a toilet tub a mirror and a sink
The modern bathroom has a glass shower door and cream and brown color scheme.
A group of zebras standing beside each other in a grassy area.
Large special saddles are used while riding elephants.
A living room with a brown wicker couch and ottoman.
Three cats sitting on a leopard print bed.
A table topped with a tray full of cookies and a vase filled with flowers.
a street with cars parked on the side
A boy in a red jacket at a bus station.
A shower and sink in a small room.
Numerous water fowl either taking off or landing in the water
A large group of motorcycles stretching into the distance on a highway.
Old lady takes a rest from her walker on a sea side bench
A hotdog sandwich with sauerkraut, cheese, and mustard.
A couple of people with snowboards in the snow.
Three people standing on a mountain taking a picture as they ski.
b baby doll holding a very big samsung phone
A street sign sitting next to a tree.
Four women with two children cross the street in a crosswalk.
A professional kitchen with metal counter tops with good lighting.
A gray truck driving past an ATM machine.
A couple of horses standing near a road.
A person at a table outdoors with a laptop.
A pair of hot dogs with toppings next to a drink.
Sliced apple in a bowl covered in cinnamon.
zebras are walking in a pack on the grass
A man in a tan shirt and glasses in a car
A large train resting inside a railway station.
The hotel room has two large beds, a desk, a flat screen tv and a lot of space.
A kitchen area with a dishwasher, stove and microwave.
there  is a large blue vase that is empty
A clock tower on top of a  building next to the ocean.
Skateboarder doing a high jump down stairs at a competition.
A person riding a snowboard on the snow on a sunny day.
An indoor bathroom with reflective marble counter tops
A bunch of cows enjoying the grass and sunshine.
A man flying a kite over a sandy beach.
There are benches on the landing where one can sit and enjoy the view of the  wooded surroundings
A parking lot filled with parked cars in a shopping center..
a close up of a young person wearing a suit and tie
A large commercial airplane parked on the runway
A man riding a skateboard through the air above a skate park.
Several skiers ride down a steep, snowy slope.
a toothbrush that is on down on the counter
A man is standing under an umbrella in the rain.
some people standing around a bright lit up party bus
an image of two little kids playing baseball
A small red bicycle sits on a hardwood floor.
A white plate topped with meat and broccoli.
A woman returning a shot at a tennis match.
Two surfers are riding two large ocean waves.
A large bus parked on a handicapped parking space.
A person stopped wearing a yellow jacket riding a motorcycle.
Stuffed bear posed "reading"  open computer reference book.
A dog laying in bed all covered up with the blanket .
Old canopied single bed with luxurious linens and curtains
A cat sitting on the floor beside a pair of shoes.
A bench falls into a crack in the asphalt.
A man holding a luggage cart in front of an airport.
A pile of carrots and other vegetables on  a tray.
A woman sitting on the couch with her baby.
Some very tall giraffes in a big green area eating.
A couple who is cutting their wedding cake together.
A black cat sitting in a tub licking the faucet.
A black and white clock is mounted to a building
Computer screen displaying a page of small print.
The woman is sitting on the couch watching TV.
There is a little girl standing next to a very large pizza
A grey and white cat laying in a black wire basket.
A child playing with toys in a backyard pool.
Two skateboarders performing a trick on a ramp.
The man is about ready to cut the cake to share.
A vintage bicycle is parked outside a storefront beside a state of the art apparatus.
A beautiful young woman holding a tennis racquet on a tennis court.
A large bathroom shower has flowers by it.
A refrigerator in a basic kitchen with bottles on counter.
The man is in the air after jumping on a snowboard.
A person stands on skis on a snowy mountain.
A green bowl filled with oranges on top of a blue striped table.
Teddy bears modeling on a runway with other teddy bears watching.
the man is holding on to a firsbee
A bicycle is locked up to a post
A long city bus pulls away from the curb into traffic.
A crowd of people stand in the water on the beach.
a woman in a skirt holds a tennis racket an ball
the elephants are moving people across the river
three people skiing together in a line down a hill
A baseball player is holding a baseball bat.
A little kid that is touching a fridge.
A woman carrying a surfboard into the ocean.
A giraffe standing in the grass near trees.
a yellow fire hydrogen next to some weeds
A black cow eating some grass in a field.
a little league player getting ready to throw a ball
A person is snowboarding down a snowy slope.
A girl plays with a Wii remote in her hand.
A female tennis player getting ready to serve.
A large bus driving down a city street.
A white plate topped with meat, veggies and bread.
A colorful breakfast omlet with toast on a green plate.
Paper umbrellas hangs from the trees for art
A man is turning a pizza with a spatula.
Three sheep are grazing on the city sidewalk.
A small child in a pink dress sitting at a table having cake.
a couple of men in cowboy hats near a sheep
A chocolate cupcake with a smiling giraffe face.
A man in the air on his skateboard doing a trick.
a street sign that reads "do not enter" on a quiet street
a baseball player holding a baseball bat on a field
a street sign showing a one way street
A cake with a knife on the table
A black kitchen sink with potted plants, toaster oven and knife rack behind it.
This black dog is sleeping on a bed with white sheets
A man with a brown sweater is playing a WII game.
A man standing on a  park holding a white frisbee.
a group of teens playing at a skateboard park with one doing a jump
Two zebras and a giraffe are walking in a park.
A large crowd of people and flying kites.
Two beach chairs next to an umbrella on the beach.
A cat peeking over a tub that it is inside of.
A red and silver airplane sitting outside at the airport.
Overhead view of a table with a log and food on it.
Black capped cranes standing in a zoo enclosure
Small child on a skateboard watches another skateboarder.
A tennis player holding a racket looking up at the spectators.
A man holding an umbrella on top of a bridge.
Two plates that have food on a counter.
An old green steam engine is on the tracks.
The reflection of a red truck in a buildings windows
Two well dressed hot dogs are sitting next to fries.
A person on a court with a tennis racket.
A sink sitting under a mirror and near some cupboards.
A person riding a horse and another person petting the horse.
A small engine plane sitting on a runway.
person walking their dog on sidewalk past cars
A bath and a sink in a small room.
A skateboarder doing tricks in the air at a park.
A herd of elephants walking through a grass covered field.
Blurry photograph of a cat jumping up from a chair
A person that is about to throw a frisbee.
some sheep standing in the snow with one looking for food
The man squats down while surfing through a wave.
A tidy hotel room that has two beds and a flat screen
two females dressed in ski attire standing side by side in the snow
A body of water with boats floating on top of it.
The pizza is next to a bowl of salad.
A couple of boats parked at the wharf during the day.
Guy in glasses using scissors to cut something in the room.
A group of vehicles driving down a city street.
A group of athletes are waiting to compete on a playing field.
Cars backed up several blocks in traffic on a city street.
A small girl is sitting inside an open suitcase.
A horse and buggy parked along a sidewalk near a wharf.
A little boy wearing a baseball hat holding a baseball bat.
a person wearing glasses with a cellphone in their hand.
A number of mannequins in a clothing store
A couple is on the beach with a small child.
a white dog that is looking at a frizbee
A very pretty blue city street sign near some trees.
A group of men standing around a UK surfboard.
a cluster of blue flowers inside an orange peel
A person riding skis down a snow covered slope.
Two parking meters on roadside and a road sign
A sandwich with lots of french fries in a foam container next to a cup of dipping sauce.
A dragon boat race with a bunch of people in the boat
a boy in a baseball uniform poses for the camera with his bat
A group of soccer players on a field
Zebras are socializing in a pattern of three by three by one.
a kitchen with a table and chairs and a stove
A person skateboarding in front of two statues of reclining women.
Two women play together in a tennis match.
A toy sits at a desk with a beer.
The fire hydrant is across the street from a large building.
two people playing tennis in front of a crowd
A skier at the bottom of a slope, among coniferous trees.
a giraffe leaning over so it can eat some leaves
A man sitting in a chair holding a glass in his hand.
very well made meal placed in a bowl
Two sandwiches in plastic wrap sitting on a counter.
A teddy bear suspended in mid air as it rains down water on it.
Two hot dogs on a plate with a cup of coffee.
a man goes to hit a tennis ball with a racket
World War II vintage fighter plane parked in a museum.
Bulldog riding a wakeboard on a body of water.
A man flying through the air while riding skis.
College dorm room with stack of newspapers, backpack and suitcase near bed.
A bathtub sits next a window showing a ferris wheel.
A couple relax and watch a wide screen television on the far end of a messy living area.
people standing around the table with 3 laptops on it
A falcon sitting on a back yard BBQ grill lid
A large cow standing in a grass field.
a tennis player is doing an overhead serve
A giraffe stands by a tree in its habitat at a zoo.
A horse stand behind a fence and in front of an old building on a snowy day.
Living room of residence with green couches and large bookshelf area.
A person on a orange motorbike is on a track.
THERE IS A DISPLAY OF DIFFERNT DOUGHNUTS ON THE TABLE
A large clock on a cloudy day in the city.
A table topped with small and large metal bowls filled with veggies.
An airplane is flying high in the cloudy sky
Bowls of chopped carrots, onion and lettuce on a turquoise mat.
Green traffic light shown against a tall sky scraper at night.
Three sheared sheep on grass facing different directions.
A dining room and kitchen area with a glass table and gray chairs.
A train that is sitting on the tracks.
The black and white photo shows a toilet and a bathroom sink.
A round bed with lots of pillows next to a cat bed.
The three zebra are walking down the road.
A plate with two slices of pizza on it with toppings.
Quick Stop Groceries has many things besides groceries
THERE IS A RED STOP SIGN ON THE DOOR
A mom giraffe escorting her newborn around a fenced in area.
A guy is eating a huge slice of cheese pizza.
A bird walking past a white car in a lot
A man feeding a brown spotted giraffe over a fence.
A dark colored beverage in a tall glass and a small bow of food on a table.
A hipster standing between two surfboards while wearing sunglasses..
A toaster oven and dish drying rack sit on the kitchen sink counter.
Boats docked by a couple of city buildings.
The hula girl doll sits on top of the car dash.
Snow piled up on and around a fire hydrant by a fence.
A man and woman set a formal dining table.
two trains on train tracks at a train station
baseball players in motion playing in a stadium
A man is kneeling down in front of five surfboards.
The little boy is petting the giraffe whose head come over the zoo enclosure.
A hamburger sitting on top of a tray on tissue paper.
Cars riding on the street across a train on the tracks
A empty living room that has a table in the center.
a work desk with display with graphs, notebooks, and keyboard
A little boy and a little girl laying on a cat shaped beany bed.
An airplane with four engines is on a runway.
A brown curly haired dog chasing after a red frisbee
A woman is swinging at a tennis ball on a court.
the baseball pitcher getting ready to pitch the ball
A school safety sign lies against a piano.
two people in a living area playing with a dog wearing a cowboy hat.
A boy enjoying some sandwich or donut during the day.
A residential bathroom with sink, toilet and curtained tub
a living room with a couch a window and a lamp
A close up photo of a baby giraffe standing in the hay.
Some hooks that are holding hot pads, a ladle, and a pair of scissors.
A man is leaning out of a train.
A group of people enjoying a meal at a table.
A bathroom with a toilet, a sink, and a bathtub.
people walking on the sand of a beach shoreline beneath flying kites.
a woman talks on a phone in front of a slide out glass
A small framed picture is hanging above the toilet
Two military men riding horse in the water along the shoreline of the beach
A woman walking down the street with an umbrella
Some highly cultural objects on display in this well lit room.
A man plays an organ in an historic photo.
A little girl with a snack laughing on a bench.
A man hitching a ride on an elephant.
A variety of old motorcycles on display in a shop
An open toilet seat next to a urinal.
A close-up of pink and red flowers in a clear vase.
A black and white photo of men shoveling rocks.
A group of people on top of some horses.
Three giraffes in a field with a fence
A traffic sign is displayed on a street.
A man using one hand to hold a skate board while performing a handstand
Two female tennis players on a grass court.
A broken park bench in the middle of a grassy lawn.
Sugar donuts sitting in a white paper bag.
A man sitting on a park bench next to a person laying on it with a dog.
A row of auto-flush urinals lines the wall in this public restroom.
A baseball player holds the bat while the catcher and the umpire stand behind him on a baseball diamond.
A gray and black cell phone resting in a man's left hand.
A lonely sheep standing in a field in front of a rock wall.
A man and a woman riding on a motorcycle are getting ready to hit the road.
Two horses gaze out from among the trees.
A man sitting on a couch holding a small white object.
A bus going down the area next to the ocean.
Guy with shades on taking picture of his hot dog
The skier is skier down the snow covered hill.
A large bathroom with a large bathtub in front of curtained windows.
A bunch of people walking around in a street
a guy riding his skateboard near the edge of a pool
A mascot entertains fans as baseball players leave the field.
Several motor scooters are jammed into a small market street.
A microwave sits above a stove built into the cabinets.
A bus going to Oakland in an empty lot.
People watch a women's softball game from behind a chain link fence.
two girls standing outside a building next to a large toothbrush statue
A young man sitting on a toilet in a white bathroom.
There are two snowboarders in the air completing stunts.
The image shows a book digitally modified onto a tennis racket.
A zebra eating hay scattered on the ground while another zebra lays in the shade.
A man in an orange t-shirt rides a wave on his surfboard.
A woman in a window either taking a picture or video taping something outside.
A skier holds a ski pole in each hand.
A large commercial plane sitting on a tarmac.
a man that is on a tennis court with a racket
A person and some animals that are by some plants.
A baby pulling themselves up to look at a laptop.
Trays of snacks and a bottle of wine.
Four guys are sitting around a table eating and drinking.
A slice of cheese pizza on a plate with parmasean cheese on the crust.
A cupcake covered in lots of white frosting.
a picture that's been sped up to show streaks of headlights and taillights
A young black man sitting on a skateboard on a basketball court.
A train is on rails over the ocean by a pier.
A LARGE TRASH CAN IN THE SHAPE OF A SOUP CAN IS ON A STREET
THERE IS A PLATE WITH SWEET DESSERTS ON THE PLATE
a woman in a pink top holding a cellphone and a few other people
A blue Hospital sign with an arrow pointing towards the Hospital.
A man riding a motorcycle down a curvy mountainous road.
A giraffe standing in  a valley of two small hills
young boy with surf board in hand walking out to the water
an image of a man with a tennis racket in hand
The meals are ready in their individual containers.
three yellow buses line up on the street
A surfer rides a medium sized wave on the beach.
Boy doing skateboard trick in air at a skate park.
A person riding on the back of a horse drawn carriage on a beach.
A large smart phone made the the NOKIA company.
an image on the table with apples and oatmeal
A cat is looking at itself in the mirror on the floor.
A Frisbee team on a field being happy.
A refrigerator with a variation of different magnets and photographs on the doors.
A pond with lilypads and a frisbee floating in it.
military jets being prepared for a  mission
A picture of some people posing for a picture.
two people sitting at a table with laptops
A group of young men standing around playing games on the Nintendo Wii.
an image of a stop sign and yield sign
A red trolley train is going down the tracks.
a small boy is playing with a remote control
The kitchen is has a stainless steal refrigerator.
a vintage photo of a bike parked next to a store
A giraffe about to eat leaves from a tree
A kitchen with a stove and a microwave.
A young woman in a red skirt is waiting on a train platform with her suitcase.
Several horses that are grazing in a field.
A bathroom with a marble bathtub and a large sink.
A parking meter with two cars parked beside it.
Giraffe stretching its neck out to reach green leaves on a pole.
The large red city bus drives on a brick street.
A man riding a skateboard up the side of a ramp.
a person riding a snow board on a snowy surface
A street sign that is on the side of the road.
a woman holding onto a container as she eats a donut
A kitten peeking out from of a pile of white blankets.
Three zebras standing in grass with bushes and trees.
Two young girls in uniforms sitting closely together.
A small boat is in the water and a red bench in on the dock next to it.
some giraffes standing in front of a white building
THERE IS A PASTRY THAT IS SITTING ON A PLATE
A cat drinking coffee from a cup on top of a table.
A plate covered with a meat and vegetable dish
a man performs a flip trick on a skateboard
A sandwich and french fries are on a plate.
A living room with wooden walls and furniture.
four giraffes basking in sunlight of enclosed area
a close up of a pot cooking broccoli
A herd of cows standing in a field grazing
a group of people standing around a table covered with different containers of food
A boy is cutting a string with scissors.
A close up of an "all traffic" sign on the freeway.
A large jetliner sitting on top of an airport tarmac.
A man posing with a mouse and keyboard
an image of kids playing on skateboards in the street
A woman attempts to fly a butterfly kite.
Two boys and their mother playing with a kite.
A man on a couch playing a video game.
a close up of a white toilet and trash can
Two teddy bears with a price tag on ear.
A pan of food on the stove consisting of sliced carrots
this train is leaving the station on rails
A jet airliner sits in front of the runway.
Two men play a game together using the Nintendo Wii gaming system.
A couple of kids in skinny jeans with skateboards.
An open, lit-up, fully stocked refrigerator and freezer.
People are getting off of a large bus onto a commercial airplane.
A photograph of a white range and oven.
A beach with people, beach chairs and umbrellas.
A woman plays Wii while a man holds a martini glass beside her.
A red firetruck is on a street near a brick building.
A group of children with two of them brushing their teeth.
A MUNI bus in San Francisco, parked next to a fountain.
some people and a red and black train engine
A surfer rides a wave in this Michael L. Baird photo.
a tennis player swinging a racket at a ball
A man wearing a medieval style helmet sits atop his motorcycle.
A person is holding a flag in a gathering
Many people near a river gathering around in a circle
A couple of young guys at a skate ramp with their boards.
A pair of kites fly above a statue.
A person balances a large scale full of goods.
A blurry picture of a man with black hair wearing a suit and tie.
A man is walking out of a pizza restaurant with his pizza
A town square with many pedestrians walking about.
a bathroom with a blue dustpan and broom on the floor
A cat sits under an umbrella while indoors.
Two skewers of vegetables and broccoli  on a plate
A large inflatable whale sitting on top of a beach.
a full view of view of a zebra and a head shot of another zebra
A vase with flowers is displayed next to a handmade object.
A woman serves a ball with a clock in the background.
A man and women at a table eating, there is a baby in a stroller behind them.
A man adjusts his tie while getting ready to go.
an apple sticking out ot the side of an apple
a kitchen with a refrigerator and a stove
A young woman is brushing her teeth at the sink.
A man pitching a baseball from a mound on a field.
a cat is on the coach staring at a remote control
a cat sitting on the toilet looking at one on the floor
A bunch of pedestrians walk down the street in the rain
Keepers looking after a family of elephants at the zoo
A red truck has a black dog in the drivers chair.
Three people sitting on bench watching a train go by
a white train is coming down some tracks
Four people on a ski slope preparing to ski.
A male tennis player gets ready to serve.
A road sign by a stone wall and dirt path.
A sea lion on the rocks with an elephant's head photoshopped on it.
Two men sitting at a table with a pizza in front of them.
Sun shining through a window into a bathroom.
A person on a motorcycle riding down a street.
Two dogs sitting at a dinner table enjoy food in bowls.
A sign, on a sidewalk, containing directions to nearby locations.
Several different donuts are placed in a tall bowl
a person on a skate board tries to do a trick
White commuter airplane with blue tail on an airport runway.
A train is parked by the sidewalk in the city.
A man doing tricks on a skateboard with onlookers watching
A person surfing on a wave in the water on a surf board.
A woman in a towel combs her wet hair.
A bench in a park covered in snow.
The vase is decorated with a colorful design.
Lady teach a class and uses her laptop
A woman is playing a game of tennis.
A baseball player swinging a baseball bat at a baseball.
The fireplug is the dominant element with the architecturally interesting building in the background.
A policeman on a horse is standing across the street from a building.
A bathroom has several items next to a sink and on top of a medicine cabinet with a door opened up against a glass-walled shower.
There are various utensils on the counter of a large kitchen.
a plate containing a bag and several pastries
A stainless steel kitchen sink on a black granite countertop.
A paper plate with a very large sandwich with a lot of condiments on it.
a cat rests its head and paw on a pair of womens shoes
A cup is on a desk with a dog figurine.
Baby at her first birthday party feeding dad cake.
A living area with several chairs and a lot of color
Two children's miniature trains with conductor and mother and child.
A horse is shown behind fences in a field.
A white refrigerator freezer sitting inside of a kitchen.
A man in black shirt holding a yellow frisbee by rocks.
A man in blue shirt touching a cake with a utensil.
The woman in a pink shirt looks at the kite in the sky.
A cake with thick icing partially eaten with a knife.
Up close view of a plate with two well cooked hot dogs on it.
Clock and sign on church tower made of bricks
Two lanes of cars waiting at a traffic light.
A herd of cows grazing on a hill by the road.
A plate with food on it and an orange with a fork on the plate.
A small train with writing all over it passes through an intersection.
A yellow truck next to various cars in a warehouse.
A miniature bathroom set is shown for a model
A batter is throwing a bat and getting ready to run.
A hot dog on a  bun covered in lots of pastrami and a pickle.
A group of people by a bunch of bananas.
a boy and a woman in a competition with a motor bike
A baby chews on a toy with a cat pinned under his leg.
The man is preparing to pitch the ball.
A man preparing food on top of a large metal pan.
A bunch of items that are on a table.
A skier goes cross country consulting a sign
A sunset scene with water, elephants and grass.
Two small sheep, one standing and one sitting, in a grassy field
gray and white cat hiding underneath a toilet
a close up of a number of zebras behind a fence
A baseball batter, catcher, and umpire await a pitch at home plate.
A young man carries a black backpack and a blue suitcase.
A cat lies on its back on top of a table with pink roses.
A living room with a couch, a chair and a piano.
An open oven has lots dishes on the racks.
A statue of a jalapeno on a fire hydrant.
A bus is passing through a city intersection.
Two elephants standing near a small pool of water.
A brown cloth covered table filled with stuffed animals.
A railing in front of the beach with surfboards leaning on it.
Two zebras looking for food near a tree.
Two guys are playing some sort of video games.
Many televisions are showing the same sunset picture.
There is a person looking at the contents of a refrigerator.
A woman cutting a mans hair in a barbers chair.
A boy operating a mouse and viewing a laptop.
An apple laptop with pens, headphones, books and various small items.
A skier carves a path as they descend a snowy slope
Children playing in a soccer competition on a grass field.
Small girl eating pizza off a colorful plate on a blue table.
A toilet in a restroom with a wooden toilet seat.
An orange on a counter next to a bottle of alcohol.
Bed and nightstand with blinds closed and doll sitting on pillow.
a kitchen with a brown dining table set and a potted plant on the counter
Living room with half circle window and furniture.
a table with two glasses and a plate with a chocolate dessert and a spoon
A view of a gourmet style banana split.
a couple of zebras are standing in a gassy field
A desk has picutres, cds, cups, and a dog figurine.
A man jumping in the air with a skateboard
A large outdoor clock with two faces and various designs and numbers on the faces.
A little girl and woman standing near a birthday with lighted candles.
a train on a train track near a small river
A man with a bright green tie with his arms around to boys.
a person snows boarding on top of a small hill
A vase with flowers, cup, pitcher and mug sitting on a table
a woman holds out a stuffed bear to a man in a suit
people standing and windsurfing on boards in the water with trees in the background
A group of people flying lots of kites in a large grassy park area.
A colorful bus stops at a bus stop.
A man riding a wave on top of a surfboard.
A giraffe picture on box with some pizza.
A couple of multi-colored lawn chairs sitting on a beach.
A group of baseball players playing a game of baseball.
A man posing with a horse in the shade
two elephants are in a field together eating
We are looking at a delicious plate of banana walnut pancakes.
a public transit bus on a city street
A bartender filling a long row of champagne flutes
Several stacks of disposable cups sit in a kitchen.
Assortment of toothbrushes in ceramic container in corner of counter in bathroom.
A machine that dispenses tickets for some mode of transportation.
A baseball player holding a bat standing next to a base.
A group of sheep surrounded by three dogs.
Two people touch feet while sitting in chairs.
A cat sitting on top of a book shelf filled with books.
A person holding an open mobile phone and a camera.
2 people outside on a snowy area snow boarding
A man sitting on a ledge reading a book.
A cat is standing on the back of a huge dog.
Warning sign displayed in wooded lane on sunny day.
Fresh fruits, vegetables, and other foods are spread out on the table.
A meter with a sign on it stating that the meter remains as a courtesy to cyclists
A city bus moving down a city street on the sidewalk nearby
Two cows standing in a penned pasture near a log.
A person does a trick on a skateboard in black and white.
A zebra standing amongst tall, dry grass during the day
A plate of food that is on a table.
Gloves, cell phones, brushes, ties, and ear buds are placed on the floor.
A fork perched into shredded meat on the bread on the table.
Brick fireplace in a white and brown living room.
A person riding a snowboard on a snow covered slope.
Four pieces of luggage sits on the floor.
A man flying through the air while riding a snowboard.
Black and white photograph looking past traffic lights at an old building
A girl standing in a boat resting her arm on an elephant who is passing by.
A statue of a dinosaur, next to a bunch of flying kites.
A baseball player tries to avoid a tag out play.
A man walking down a road holding a black umbrella.
a close up of a small dog near a pair of shoes
Guy in hoodie peeing in a bathroom toilet
A parking lot with cars and motorcycles at walmart.
A table topped different plates and bowls of foods.
The goose is curious about whats in the bucket.
A very tall tower sticking out of the side of a building.
A person that threw a frisbee in the air.
a man with white and blue on playing tennis
A dog is looking out a large window.
a woman trying to fly a kite with no wind
A bathroom with bathroom supplies is pictured in this image.
The man in black is moving towards a refrigerator.
An arrangement of food is displayed on a table.
a couple of cows are standing in a field
knife cuts into a medium sized pizza on a plate
A man sitting on the hood of a car talking on a cell phone.
a hotel room with a bed, chair and a window
there are many benches that line this park
A commercial airplane being pulled across the runway by a truck.
Two baseball teams of young children playing baseball on a dirt field.
a woman and child checking out a display of food on an outdoor table
A truck in the street near a person on the side of the road
Toilet design outside of the US with accompanying trash can.
A person and a dog with frisbees in a park.
A young boy is flying a kite in a park.
Multiple trains sit on tracks that run through the city.
A wooden carved clock tower with posts holding it up.
some people and the male is holding a baseball bat
A small kitchen with dark wood cabinets and white appliances.
A pick-up truck with a Christmas wreath attached to the grill.
A refrigerator door left open showing the contents inside.
A red fire hydrant between two flower boxes
Several square pizzas are sitting on round plates.
A plate filled with fruit salad and a melted cheese sandwich.
A baseball player prepares to swing at a pitch.
A woman sitting on top of a purple motorcycle.
A coffee cup sits next to an open computer.
Rusted fire hydrant covered with bees in grass near road.
A workspace with a laptop computer and desktop computer.
A baseball player is swinging at a pitched ball.
A plate of colorful vegetables and a cut of meat.
An orange fire hydrant sitting below a tall building.
A chef is cooking food in the kitchen.
The zebras are grazing on the grass in the field.
A person is holding a fork with pancakes on a plate.
A man and boy in dirt field playing a game with frisbee.
A man poses for a picture in a suit and tie.
A cat drinking water from a bathroom sink.
A train pumps out steam while going down a track on a cloudy day.
A pitcher, a catcher, and a man up to bat.
a photo of a kitchen with a fridge, an oven and a sink
a flooded street with a street pole
five giraffes drinking water with a field behind them
A woman holding a tennis ball and racquet on a court.
a person holding onto a banana with brown spots
a sign attached to a metal pole sitting in the grass
A couple of trains that are riding in the rails.
People taking photos of a public speaker with their telephones
A snowboarder performing a stunt on a snowy mountain.
Three women are standing in a kitchen cooking.
A jet flying in the sky surrounded by smoke.
A living room with hard wood floors covered in furniture.
A close up of a pizza with spinach and parmesan topping.
A girl in a green shirt and denim skirt cutting a cake.
This is a staggering picture show of people having a remarkable time.
A close-up of a green apple next to other fruits.
Several cows in a field with a train passing in the background.
A street sweeper driving down a city street
A young giraffe and an old giraffe outside of a building.
Five uniformed players are on a baseball field near a crowd.
A cat in a bow tie laying under a car.
A desk full of desktop and laptop computers.
A group of people on a horse carriage ride going down a street.
A very close up look at a plate with some food on it.
A plate of food that includes beef, broccoli and sauces.
A large bird flying next to a tall building.
A herd of cattle grazing on a lush green field.
A frisbee in mid air with a someone below jumping.
a giraffe outside near a forested area and a lot of trees
an image of several giraffes in a zoo
four planes flying in the sky in a formation
A bathroom with a toilet and a counter next to a door.
Cat laying on the floor near some books
A man in a wetsuit riding on a surfboard
Boxes filled with donuts sitting on top of a table.
A small and large giraffe are by a tall fence.
Young men playing frisbee in a grassy field.
An older black desktop computer running Windows operating system.
A man is catching a white Frisbee on the beach.
Several images of a surfer in various phases of going out for a wave.
A vase of flowers are placed on a long table.
a person in snow gear walking through some deep snow
The sign for Spring St. and 6th Ave. is in front of a brick building.
A couple of zebras are in a brushy field.
a male in a light blue shirt and a white frisbee
some people on a bank flying kites and water
A boat is coming down the water near the shore.
a couple of pelicans sitting on some rocks
A family is playing with the Wii together in the living room.
Three giraffes are standing in the field spread apart.
A couple of women chasing after a frisbee on a field.
A plate of cheese bread next to bread sticks and wine.
A street that has a bunch of cars and trucks.
A pizza that is setting down on a table.
Someone's hand on top of a computer's keyboard.
A man in the progress of getting ready for a wedding.
A black horse standing inside a fenced enclosure.
Hands typing on light colored electronic computer keyboard.
Two beds with a nightstand in between them.
A man wearing a tie holding his suit jacket over his shoulder
TV in a cabinet with other furnishings around it
Man sitting on the side of a van playing the guitar.
some elephants in their pen and in some water
A wooden table topped with plates of food and fruit.
A brown bear is walking in the woods by some bushes and trees.
A kitchen with a refrigerator and some cabinets
a man and woman are sitting at a table with their food
A man in wetsuit surgin on surfboard next to wave.
A white bathroom sink sitting under a mirror.
A grill holds meat and a wide assortment of vegetables.
A group of women eating at a dinner table and conversating.
A palm tree in front of a poster.
In a park, a man in a dress shirt sits on a rock.
The boy is skateboarding  up the ramp during the day.
A laptop computer sitting on a desk in front of a window.
Commode scene, probably commercial establishment, outside of USA.
Girl with cake in hand looking at lit candles on it
A living room with gold walls has a playpen and mounted television.
A large cut pizza on a table with a laptop.
A cat sitting on top of a television looking down.
The toilet has special buttons that help the handicapped.
Several people film and observe children as they use iPads at school.
A woman swinging at a tennis ball on the court
a guy jumping with a skateboard on a sunny day
A mix of beef and broccoli covers rice.
Four powder covered donuts on a blue plate.
Sheep grazing in an open grassy field.
A woman sticking her tongue out and doing the "shocker" hand sign.
three muffins sitting on a chair with a bite out on one of them
A tennis player with racket serving the ball
a couple of kids that are playing on the ground
A tray covered in chocolate donuts on top of a table.
Several cops on motorcycles parked next to a large group of people.
A close up of a clear vase with flowers.
A group of people throw a frisbee in a circle.
A wooden bench sitting on top of a dirt field.
An alarm clock next to two people sleeping and a pillow.
A laptop on a pedestal near a hedge.
A taco salad sits on white paper near a table with a lap top.
Teddy bears with barcode tags in a pile.
A cramped bathroom with a yellow bowl on the back of the tank.
a very large collection of remote controls spread out
A floor with lots of different items and a bag.
The motorcycle is parked on the side of a road near snowy mountains.
A line of hawks wearing hoods on a wooden beam.
a hot dog with onions and cheese next to some french fries
A low angle shot of Big Ben in the daytime.
a woman looking up at a banana tree.
A motorcycle police officer leads a parade on a sunny day.
A counter cluttered with many items, including a tea kettle, a pot, a food scale and more.
The city bus is driving through a street intersection.
Two woman sitting at a table eating food
A wild trail with elephants and jeeps driving down a path.
The boy eats his large breakfast at the table
A market has an array of fruits displayed in boxes.
a pair of scissors and eggs laying on a table
a close up of a person pulling food out of an oven
Two women in front of a television playing a video game.
A green, red and blue bus parked on a street in a foreign country.
He is eating a banana while taking a selfie.
A couple of benches sitting next to each other.
a busy street that  has a lot of cars in it
a person taking a photo in a mirror
A few people are laying on a pull out sofa bed.
Vintage motorcycles sit on a tiled floor way in a shop
A baseball player is getting ready to hit a ball.
A piece of cake sits atop a piece of foil.
A woman prepares to hit a tennis ball on a tennis court.
A snow boarder taking flight while skiing down a slope
A baseball pitcher in motion with the ball right out of his fingers.
A big screen TV and a Wii gaming console on a rooftop.
A large truck and a bus on a road.
Police officer on horse moving through city street.
a couple of horses that are tied up
a clock on a tower next to a building
an old photo of some people in fancy clothes sitting on a boat
A woman riding a aqua blue wave on a surfboard.
a red city bus coming through an intersection
A bowl has a salad with carrots, red cabbage, and broccoli in it.
A man soaked walking out of the water holding a surfboard.
A glass bottle on a red surface with a red backdrop
A bowl filled with mixed cooked green vegetables.
an extremely long hot dog covered with ketchup and mustard sitting on a table
A man with a dog is preparing to board a train with others.
The people in the homemade boat have a bicycle and a big green umbrella.
A donut sitting in front of a laptop with black and orange sprinkles
a close up of a hot dog next to a drink on a table
A man standing outside beside a bunch of fruit.
two apples and a banana laid out to look like a happy face
An asian woman smiling while holding a cell phone.
a young broccoli plant in a garden bed
a train moving on the tracks next to a building on a hill
Various sizes and colors of tagged and bundled luggage.
A bride and groom cut the cake at their reception.
a lady on the mountains in very warm clothing
A guy leaning on the front of a food truck
Odd plant in a vase on a tray with cookies.
A view of home plate and to left field during a baseball game.
An adult elephant walks near two smaller elephants.
The floor of the bathroom is strewn with toilet paper.
Person of a surfboard riding a wave in the ocean.
Two bikes are sitting in the sand on the beach.
A cat perched on a toilet using the bathroom.
Two people in cowboy hats riding bicycles in an RV park
A jockey rides a horse through a course.
a blue and gray bus and a woman and buildings
A person holding a tennis racket and ball getting ready to serve.
a large bus riding in the street outside a building
two little teddy bears with peoples names in tags
a black bird flying above the water of the ocean
A person riding a racing bike on a track with spectators.
A surfboarder falling off his board as a wave hits
A batter, catcher and umpire in a baseball game.
A herd of sheep standing on top of a grass covered field.
A group of kids at a table with a cake.
A teenager doing a skateboard trick in front of a crowd.
A small red bird perched on a branch.
A man swinging a tennis racquet on top of a court.
A woman sits on a bike holding a small gun as a man lies in front of her.
many kites flying in the sky with a street light
A red food truck has a crowd of people by it.
An old model motorcycle parked outside a house.
A bathroom with a glass shower door, toilet, bidet and sink, with a set of shelves
A bed and a mirror in a small room.
some children are in a yard and one has a dog on a leash
Several horses running down the track near a fence.
A person is displaying the hot dog they are eating.
Two bears playfully fight and nip at each other.
A busy city street has many red double decked buses on it.
Skate boarder performing aerial trick on sidewalk with car nearby on roadway.
A passenger jet that is on the runway.
someone is holding in their hands a very old mp3 player
A group of men on a field playing baseball.
A cow in a fenced in grass area.
This truck has two yellow ribbons and says Freedom isn 't Free.
two people playing on the ocean with a frisbee
A man on a court with a tennis racket.
The zebra is drinking water from the pond near the grass.
A white plate with two crab cakes and fries.
A woman's feet who is wearing a pair of red heels.
The painting shows a parrot sitting on a branch over a river.
A small child poses in his baseball uniform
A fire hydrant in a weedy lot next to a street.
A woman is riding a moped on the road.
A stack of folded shirts sits in a darkened room.
A small group of cows are grazing out in the pasture.
Black and white photograph of a train at the station
People are standing in a street car covered in oranges.
A brown bear licking the ear of another brown bear.
A man wearing a beret while using a laptop computer.
A little girl packs her luggage with toys.
a bathroom with a sink and a mirror in it
Woman placing a dog on a white and yellow surfboard.
a small brown and white bird eating off paper plates on a table
A couch and a coffee table is in a living room with a wooden floor.
A teenage male is falling off of a skateboard.
A couple of buses parked in a parking lot.
a desk has a laptop computer and monitor on it
A computer is sitting on a messy desk with flowers.
A large living room has a mini kitchen in the corner.
A building with a clock built into it.
A dessert consists of donuts and custard cream.
A slice of cake and strawberries on white plate.
there is a cow along with baby cows behind a gate
A pizza on a rack and a plate with noodles.
Pair of giraffes foraging in natural outdoor setting.
A glass filled with pens and scissors and pencils.
A black cat laying inside a bathroom sink.
a white van is on the back of a truck
A wooden chair sitting on a sidewalk next to a tree.
A slice of pizza on a white ceramic plate.
Orchids are arranged in a glass bowl with table accents around.
a man that is standing under a tree
A painting of waves upon an ocean with tall grass and gold flowers
Luggage at an airport under a blue net
a very black dog lying on a courch
A girl with pale skin wearing a hoodie holds up a toothbrush.
A close of a fire hydrant painted red white and green
A giraffe and several zebras out on the plains.
A large tower with a clock stands in front of the cloudy sky.
a number of people in an open field with kites flying above
Skier in the air on fresh powder snow.
Elephants are bathing in a river with three men.
two large air planes on a run way
a little boy is holding up a cell phone
A faded yellow and red train passes through the trees.
A street scene with many cars and a bus.
Three surfers stand in front of a wall facing the ocean.
A young man standing on top of a field holding a baseball bat.
A man sitting at a table with a laptop and looking off to the side.
A cute little animal made out of oranges that is on a plate.
There are tombstones in the cemetery next to an old church.
Two stuffed animals sitting beside each other on a chair.
A large truck is shown in a rear view mirror.
a fire hydrant near a tree in a field
A dozen of glazed donuts in a white box.
a zebra grazes on some vegetation next to a fence
A car is driving down a city street.
BOY TAKING A GIANT LEAP ON A SKATEBOARD IN FRONT OF ONLOOKERS
A man and woman on beach with three surfboards.
A man surfing, with a vegetated coast in the background.
A large circular clock near a body of water.
A tall stop sign next to the road near a red fire hydrant.
A young girl is holding the reins to a small horse.
A man on a skateboard with his friend talking to people.
A bear eating a piece of food in rocky area with hay.
There were a flock of sheep walking down the road together.
A wedding cake and cupcakes on a table with knife.
A table has some old fashioned computer type equipment on it.
A desk with a laptop and desktop computer.
A man with a baseball bat that is standing in the dirt.
A hotel room with a neatly made bad and lamps on the bed stands.
A modernly styled hotel room has a bed that appears to float off the floor.
A couple of bannanas and cards for sale
A herd of horses standing in a dirt horse coral.
A Not A Thru Street signed hung up on a tree.
a blue chair is in front of a desk
A woman preparing to serve a ball thrown high in the air.
A toilet and a urinal with male and female signs.
a person wearing pants surfing on a white board
A grassy field with different colored umbrellas on the grass.
A outdoor cafe with many people chatting and eating.
a group of young people getting ready to go ski
A large group of people are on a field flying kites.
a dog sitting in a truck with its head out the window
a cat laying on top or s shelf in front of a window
A bunch of wooden desks sitting inside a classroom
A man and woman stand holding tennis rackets with a young boy.
a large air plane flying in the sky
A kitchen area with dining table, refrigerator and sink.
A wood plate with several yellow rolls on it.
Elephants are drinking water from a small pond.
a close up of a dog laying under a table
A ginger cat lounges comfortably on a bed.
A man standing behind a camera on a grass covered field.
A couple of men kayaking in a flooded park area through a gate.
A woman scratching flakes of fecal matter off of her buttocks.
A number of wine glasses and a cup on a tray
A bicyclist speaks to two police officers on horseback.
A construction worker standing on dirt near a fire hydrant.
A young child in a field of grass holding a baseball bat.
Weightplate with me investable sitting on top of the table.
Three colored toothbrushes standing in a glass holder.
Cuff links on the sleeve of a man wearing a business suit
Two giraffes are standing under a tree back to back.
Fresh fruits are stacked and arranged in colorful rows.
A table with plates, silverware and an electric grill.
The ski jumper is concentrating intensely on his target.
A man is walking while using his cellphone.
A man riding a surfboard in the water
A person on a snowboard rides on the hill.
This is some fine dining courses on nice plates.
A person inside of a house using a computer
Horses and goats are grazing on the open terrain.
Two boys in jackets and hats ride horses together.
A person has fallen off a surfboard near a large wave
The young boy is playing in a baseball game,
Man on skis on a downhill course after a fall.
A rainbow siting below a lot of clouds near a field.
A person holding a controller pets his cat.
A white toilet and broken mirror in a side yard.
A man riding a wave on a surfboard in the ocean.
Zebra grazing on grass in outdoor enclosed area.
A sea plane taxis across the water in a large lake.
A microwave oven sitting on top of  a counter.
Two pillows on a bed next to a window.
Stuffed toy bears on display through window setting.
A white toilet sitting in a restroom with a open lid.
A dog is laying on the ground with a frisbie.
Man with a backpack using a urinal with against a tiled wall.
A trio of images of food including bell peppers, watermelon, milk, and chopped meat
A close of up oranges with people standing around fruit stands.
there is a surfer that can be seen in the water
This is a cut up potato on a cutting board with a knife on top of it.
A baseball player swings his bat after a hit.
A baseball game in progress with the umpire calling a play.
A man standing in a room with something in his hand
An adult goat standing beside its baby goats in a grassy area.
a teddy bear sitting on a wall next to an old stone house.
Several skiers congregate around a slope at a ski resort.
Little boy looking out over a calm body of water
A young child is asleep next to her mother.
a hummingbird eating from a little bird feeder
Quesadilla for breakfast with a friend at a restaurant
A bird is posted on a rock by a lake.
An empty roadway between two rows of buildings.
Two children stand on a porch with toy tennis rackets.
A cake depicts a laptop, mouse, and latte.
A blue bus waiting for passengers at a stop.
A kitchen with multiple counters and various appliances.
A brown bear pup running across a grassy area.
The men are celebrating at a formal dinner with one wearing a paper crown.
A smoothie is pictured next to several fruits and vegetables.
a group of navy jets slying together in a line
The huge airliner is flying next to the clouds.
A blurry image of a man in a room full of pots on tables.
A bunch of people with some wearing headscarves are flying kites and pulling a panda bear balloon.
A kite laying on the ground surrounded by people.
A baseball player in a white uniform holds a bat over his shoulder as he stands near an umpire and a catcher.
A desktop computer sitting on top of a wooden desk.
A young girl looks through the eye holes in a pizza.
A mirrored bathroom with a good hair dryer.
Steak and crab cakes served with grilled peaches.
A professional skateboarder leaps over a bunch of over skateboards while a crowd watches.
A variety of Asian foods sit on a table.
A girl standing under a white and black umbrella.
A man is riding a surfboard in the ocean.
A surfboard stored on a rack at the beach with people in the background.
A man leaning on a pole on a sidewalk in front of a store.
A street sign sits at an intersection near a store.
A red train traveling past a three story building.
a bathroom filled with a sink, toilet and hardwood floors
A few trucks at night with their headlights on
A skateboarder takes a leisurely run down a city sidewalk.
A boy riding a skateboard on a sidewalk in an open courtyard.
A cat playing with a cup that is on the floor
A man is covered with four cats in bed.
a clown, teddy bear and troll doll for sale in a store.
someone rolls a pizza cutter over a small pizza
A wireless computer mouse with a computer in the background.
A child reading a book next to a dog that's lying on the ground.
There is a man sitting on the couch next to a woman but he has three neck ties on.
Three lamb in a pen, some of which have been sheered.
two young people playing in a house one is posing with a stick.
Man sitting at a picnic table near the beach with his lunch.
A cat on a leather chair next to remotes
A camera and tripod is shown with a laptop.
Hundreds of birds soaring through a cloudy sky.
A jumbo jet is just taking off form the runway.
A woman puts something into a stone oven.
A airplane that is flying over a runway.
A child flying a kite on a sunny day.
A counter and refrigerator in a small kitchen.
Surf boaders preparing to head into the ocean.
An airplane is on a runway near a passenger ramp.
A food entree is served on a plate.
There are different types of Italian food in the picture.
A man leaping to hit a tennis ball with a racket.
A beautiful woman inspecting a small brown dog.
A fancy bathroom with clear shower, toilet, and mirror
Two boats floating on top of a river next to a  rock mountain
Red pickup truck carrying a sign it its truck bed.
A set of windows with a red farm house in the view and green grass on the ground.
A woman sitting at a table with a plate.
A soldier dressed in white on top of skis.
A young elephant by a pool of water in a zoo enclosure.
a split picture of two tennis players swinging at the ball
A police officer and police horse directing traffic.
a black keyboard and a power strip and cords
A young woman looks over her shoulder as a sky lift takes her down the mountain.
a person riding  a wave on top of a surfboard.
Four men playing with remote controlled dog toys.
A man teaching his child how to ride a skate board
Baseball player at the plate in the process of swinging at a ball.
A t-shirt has been put onto a stuffed bear
Small cat sitting on top of a table looking at a television.
A cat with a collar sitting on a laptop keyboard.
Two square pizzas sitting a grill with cheese.
A baseball player is starting to run to first base.
Four people who are all wearing snow skis.
Two kids laying down propped up on pillows.
this giraffe is going for a walk in the grass
A man is surfing on a surfboard, catching a big wave.
The purple and pink flowers are in a vase.
Two baseball players walk near another player from the opposing team.
A person performs a jump in the air on a snowboard.
a close up of a plate of pancakes on a table
A giraffe standing next to a wooden pike fence
A woman is posing in front of a giraffe.
A woman sitting on a couch near a dog
A very sleek, clean and dark modern kitchen.
a baseball player is running down a field
A person on a snowboard in the snow.
A plate on a table is filled with carrots and beans.
a toilet on the ground outdoors near a bath tub
The contents of a purse are on a table.
A vodafone sitting on a table next to a Mac laptop.
A mismatched bathroom includes a center shower pan.
A white toilet sitting next to a window and a sink.
Three giraffe standing next to a fence under a lot of trees.
There is no image here to provide a caption for.
A table holding a group of fruits and vegetables in bags and crates.
A tour bus with advertisement on the side of it
People sitting at a table with multiple servings of food.
a woman in a black top on a motorcycle and a male on a bicycle
a red plate that has a piece of chicken with some veggies on it
birds standing on the edge of the ledge by the water
a large clock on the wall above a radiator
many people riding skis on a snowy slope
A hat that is on top of a shelf.
A keyboard and mouse on the ground in a room.
A woman holding a suitcase on a dirt road.
A couple present a birthday cake with three candles.
5 very people posing for the camera over some drinks.
a bunch of motorcycles are parked tightly together
Two monks with umbrella standing on a pavement
A man carrying a surf board out of the ocean.
A nurse administering medicine to a patient in a hospital.
A white plate that has various types of vegetables, meat and food items on it.
A set of cutlery and personal items lined up on a table.
An orange kitten on the green couch by itself.
Skier in red jacket stands on top of a large mountain
placemats are on top of a counter in this kitchen
there is a man sitting in his truck next to a surf board
two trains on a train track at a train station
A elephant standing next to two men near a stand.
The bathroom is clean and ready to be used.
A railroad train letting off a big black smokecloud
Two birds stand beside each other outside a green door.
A person standing in the snow near a snow board.
A single engine plane out front of buildings
close up of a red vase holding sticks
A old picture of workers building the railroad.
A large leafy green salad in a silver bowl.
A pre-made cold sandwich is in a cooler with drinks.
A man is on a saddled horse with reins.
A woman sitting on top of a bench with large breast.
A couple of street signs that got wrecked from a car accident.
A horse with a cover over it being carried along by a woman.
A woman that is sitting outside on a bench in the snow.
Two giraffes standing on a grassy plain with mountains in the background.
Group of folks playing bowling on Wii sports
A person goes down the slope covered with snow.
A PICTURE OF A BASEBALL PLAYER PLAYING BASEBALL
A table topped with vases filled with flowers.
Three sheep are grazing on grass by trees.
A man standing in falling snow at night holding on to a snowboard.
A man running on beach with a surfboard and mountain in the background.
A woman sitting on a white bench with her dog.
A wooden table topped with plates of food and drinks.
A close-up photo of a propeller plane in flight.
A baseball player is getting ready to swing the bat on home base.
An accordian sitting on a toilet in a bathroom.
A child holding a teddy bear while outside.
A train that is driving by in the day.
A baseball player has just hit a ball.
The train has stopped on the railroad tracks.
A pair of shoes that are under a bench.
many horses st a horse stable with people walking by
A very nice looking pizza with assorted toppings.
A woman in a dress carrying an umbrella
The vase is filled with multiple pink flowers.
A photo of a horse race from inside the stands
a number of boats in a large body of water
Three urinals are hanging from a marble tile wall
some kids in a bedroom with a lot of beds in it
A surfer dressed as Abraham Lincoln rides a wave into the beach
A clock sitting next to a large tree near a building.
Young girls with backpacks are standing near stairs that look to go to the subway.
a young lady in her room looking out the window
A couple of people playing a game of tennis.
A frame has six pictures, two with a horse.
A living room area with tile flooring and a man sitting int he middle of the room on a chair with a remote control in his hand, while looking at a television.
Yellow box truck parked on busy street in city.
A man is cutting a sub sandwiches while a lady put a vegetable in the bag.
A dog chasing after a Frisbee with green grass in the background.
People standing around two cakes and plates on a conference room table.
A woman laying in a bed next to a cat.
a sign on a short pole nest to some little trees
A gray and white cat sleeping inside of a luggage bag next to clothes.
a person in the air with a skateboard at a skate park
A person holds an umbrella in their hands.
A woman is making homemade pizza at a table.
A table topped with boxes of cupcakes and a sign.
A woman in a white shirt cutting into a cake in front of a television.
Man surfing on a surf board on water.
THERE ARE A LOT OF VEGETABLES ON TEH STAND
three elephants in a green field and some clouds
Motorcyclists gather at an event with their bikes.
A book and a pillow with a face lie on a blanket.
The group is gathered around the table to eat their meal.
Two breakfast meals on a table at IHOP.
A train on a track pulling into a station.
A dog enjoys chewing on a carrot in the living room.
A black and white themed bathroom with two toilets.
A food tray with french fries and a sandwich.
A young man prepares to hit a ball with a plastic bat.
A train going along a track near apartments.
A large glass table topped with different types of plants.
Two men in the park playing with a frisbee.
Bathroom stalls with trash on floor in commercial business.
A woman holding a tray with a chocolate covered pastry.
A toilet with buttons or a remote control.
Screens and small stuffed animals on a computer desk.
A pantry area next to a large white fridge.
Pair of skiers on snowy slope at sunset.
A garbage truck is emptying a plastic garbage can.
A man is bicycling down a street with a passenger standing on the back.
A group of people are skiing down the mountain.
A game controller in a persons hand over a couch.
Two horses are pulling the covered wagon through the snow.
A man on skis comes down the slope
a person standing over a squat down toilet
A group of four people playing croquet on a lawn.
A small airplane is flying against the blue sky.
A bouquet of flowers sits in a vase on a desk.
A person is in the rear view mirror of a motorcycle.
The man smiles while walking with skis down a grass slope.
A walk in shower in a dilapidated bathroom.
A young girl holding a tennis racket upright
A commuter train pulling out of a suburban station.
A person using a pair of scissors to work on a garment.
A man taking a big bite of a hot dog.
a bus showing domestic animals moving along the street
A couple of cows that are penned up for safe keeping.
A close-up of a black cow in front of a metal fence.
Cheerleaders are riding atop a trash truck turned float.
A bunk bed sits next to an open window.
A very big pretty bird in the water.
a fire hydrant with the word hydrant written above it
A man holding a remote in front of a garage.
two boats are idly floating on a lake.
People looking at a group of giraffes in zoo.
A baseball player crosses home plate as his teammate waits.
Four airplanes are flying high over telephone wires into cloudy skies.
a man bent over a sink while brushing his teeth
Three wine glasses and a glass bowl are on the top of the refrigerator.
The woman is walking with a pick umbrella.
A woman on a transport bike waiting for customers.
Two double decker buses sitting on top of a parking lot.
The Ansett-ANA airplane is parked on the lawn.
A clean bathroom with a toilet and shower.
A person holding up a smart phone to take a picture.
a girl sitting at a table with several plates of food around her.
A little girl trying to feed two giraffes through a netted barrier.
A woman is playing with a Frisbee on the grass.
A passenger train leaving the train station that is now empty..
People walk near the many parked tour buses.
A teddy bear sitting on a table in front of a computer.
a number of people sitting and standing near a building
a man is eating fruit from a bowl
Two cows roam and graze among trees and shrub along a mud path.
A table with three each of three different kinds of pizza.
There is a taxing rolling through a wet street
A train covered in snow on top of train tracks.
two elephants in tall bushes and trees in the background
Construction is being done on a street near businesses.
A man looking in a toilet under a sink.
A person is standing in the middle of fruit.
A chili cheese dog on a plate with a bag of corn chips next to it.
A woman standing on top of a sandy beach flying a kite.
The man and woman are decorating the vases together.
The men are enjoying a meal together by the window.
A street corner with a sign and a person riding by on a bike.
A television in a living room with a doughnut logo on it.
A kitchen has old white cabinets, and rice on the counter.
A yellow and black fire hydrant on sidewalk next to building.
A window in a kitchen with a red shade is shown.
a nice stove that is inside of a kitchen
A bullet train on rail tracks in the open country.
A yellow bus on street next to a building.
A man flying through the air riding a skateboard.
A donut on a plate in a microwave oven.
A display case in a bakery filled with lots of donuts.
A snow boarder performs a jump on a ski slope.
a giraffe standing next to a tree with one with it's leg in the air
A person with a baseball bat on a field.
A white toilet in a black bathroom with a phone on the wall.
Three children dressed in "Sunday school" clothes posting for a picture.
Fans pose with stuffed animals at an ice rink.
A tennis player is lunging forward after hitting the ball.
The toilet is clean and ready to be used.
A rocking chair sitting near a fire place.
Four men stand behind a couch playing a video game.
There is a mirror with a reflection of a train in it.
A her of zebras in the watering hole with a giraffe in the background.
A boy performing a trick with his skate board.
A refrigerator that has a plant on top of it.
This is a broccoli carrot soup with a lot of broth.
A man wearing a black shirt and a purple tie.
A man riding a wave on top of a surfboard as he flies through the air.
Oriental umbrellas at a food court in a mall.
A picture of some food in a plate.
A person with an umbrella stands in front of a bench.
A little baby laying on a fluffy blanket.
A dog with a frisbee in his mouth in the back yard.
A surfboard is decorated and sitting in the sand.
A group of CGI people standing on a hillside flying kites.
An orange and white cat sitting on a wood seat by a bed.
A white plate topped with a pile of food.
Two women prepare various vegetable dishes in a kitchen.
A women riding a bike with an umbrella.
a piece of cake sits on top of a plate
A group of girls sharing a pie each with a fork.
A cat sitting on the back of a motor bike.
A man placing some flowers inside a vase.
Woman riding a horse on an asphalt road.
Two pug dogs dressed in green bow ties and green top hats to celebrate St. Patrick's day.
A young child is standing in a room with toys on the floor.
A white van is covered with graffiti as it's parked near a curb.
A man is riding his 103 labeled bike on the road.
A young boy holding two skis poles on top of a snow covered slope.
Two people on bicycles and a dog crossing by barrier on a street.
Three different trains stopping at a train station.
A plate of vegetables is set next to some sauce.
A motorcycle cop on a city street tries to look cool.
A man and a woman enjoying a meal of sandwiches.
A man exits the huge boat parked by the beach.
The dog is laying down on the grass outside.
Man flying kite in open field near RV park.
Boy skate boarding on cement ramp at night.
a bathroom with a tub, counter, mirror and small mosaic tile
Three people skiing in single file in the snow.
there is a fried crab inside of a small bowl
A group of animals standing next to each other.
Several Southwest airlines planes sitting on the runways.
Two dogs on a beach surrounded by grass.
a couple of kids sit at a table with some cake
Orange puffy dog standing in the light on a tile floor.
A herd of sheep grazing in an open grassy field.
A stir fry consisting of rice, broccoli, and other vegetables.
a man is taking a selfie in the mirror
People are skiing down a snowy hill.
A bus is headed under a pass way on a foggy day.
Five girls in the frame playing soccer, one has the ball.
A colorful train winds through the valley of a mountain.
A statue of a man not far from a large clock.
A young boy in a cluttered rec room playing Wii in his pajamas.
A person riding a skateboard down a handrail
Horses cows and sheep are led down a dirt parking lot.
a bear that is sitting on a very large rock
Two baby horses playing together in a field
A male tennis player about to return a tennis ball.
A kitchen that has a stove, refrigerator, and table in it.
A smiling man with a box of donuts is handing a donut to a girl as two other young children look on.
a person riding a surf board on a body of water
a person is drinking a beer and eating food
A white toilet with a clear toilet seat.
a bath room with a toilet and a towel rack
A group of snowboarders in the snowy conditions
Three fire hydrants in front of a huge building.
Sports team playing baseball on a ball field.
A couple of women holding up smart phones in their hands.
A night scene with a lit street sign, "Fremont St. Experience."
Three cows stand at the top of a grass-covered hill.
The front and back cover of a book.
A busy street with people walking by a train station.
A man in a top hat and a woman with glasses.
a bath room with a toilet and a sink
A pair of scissors and crumpled paper sitting on a table.
A plate with a piece of food next to a pile of cheese broccoli.
A cow and a calf are standing in a pen.
A grown and a baby elephant are in a sandy area
a nice fast green motor cycle in the sun
A ELEPHANT IS IN THE WATER RIGHT NOW
Young professional looking man with a tie and cardigan
A red train with a bike painted on the side.
A man ordering something from a milk truck.
A man riding a skateboard through the air over a ramp.
A red double bus sitting on top of a dry grass field.
A police officer rides a motorcycle with a side car.
A tennis player lunges to hit the ball.
a delicious looking sandwich on a plate with a knife
The cowboy at the rodeo is trying to rope the calf.
a street sign for Peepee Falls street above a stop sign
A dog is sitting on a piece of wood.
A women in a sunhat and sunglasses posing beside a bilingual English-Arabic stop sign.
A girl stands while talking on a cell phone.
Living room with a table, couch, and a lounge chair.
A young boy is in the park holding a kite.
A green and white van full of signs written in spanish
A child laying in crib with teddy bear.
A man sitting in front of his birthday cake smiling.
Some apples and strawberries are on the plate.
a fire hydrant sitting undeneath trees covered with toilet paper
A punch of different shots of a man in the air.
This is an image of a laptop computer
An animal eating from the ground near a beach.
A man standing up holding wii controllers in his hands
A group of people holding while glasses posing for a picture.
A store is on a city street near a traffic light.
There is a military plane that is parked on the tarp
a black and white cat looks out the window
Some type of cheese casserole enclosed in parchment paper in the oven.
A man leans down and picks up a flying disc.
A large kitchen with many brown cabinets and brown flooring
Feet wearing red tennis shoes stands next to a white toilet on a tile floor.
A bed in a bedroom next to a table with a lamp.
A laptop sits on a pad on a desk.
A tennis player bounces a ball before a serve.
A young boy tosses a tennis ball into the air in preparations to hit it.
a couple of people walking on a highway
A man cooking vegetables and sausages on a grill
An assortment of miscellaneous gadgets spread out on a table.
A white dog is lying down under a chair in sand.
An cat sits on the sill of a dilapidated window.
A happy woman engaging interaction with her laptop
A clock tower made of bricks outside when it is not so bright.
Three giraffe standing next to a brown stone building.
The corner of a kitchen showing a dishwasher, sink and household items.
Large assortment of decorated vases on shelf on display.
A surfer rides in on her stomach and a gentle wave
a couple of elephants are standing in a field
a group of weird looking vegetables sitting on a table
There are people with a man holding a Frisbee on the grass.
Pizza with side salad and glass of wine on display on table.
A man is talking a picture of a man on a skateboard.
A black boy playing tennis at a tennis court.
there is a man on a skateboard doing a trick
Brown, black and tan cows grazing on grass in an open field.
A man sitting with two ties on.
A large Italian dish on a wood block
A woman wearing a pair of glasses on top of her head.
a man is holding a tennis racket and a ball
A horse drawn carriage coming down a city street
Men talking to monks sitting down at an airport.
A baseball batter striking a ball at a baseball game.
A person sail boards in a lake with hills in the background.
Two people sit on a city train while checking their personal items.
a long narrow bathroom with a dirty tub and blue and white walls.
A stop sign in front of the water on a bridge.
Two beers sit on a table between bunk beds.
A pan on a table with lots of pizza.
Bowl of pasta with chicken and broccoli with bread and cheese.
A dirty show floor in a very small bathroom.
A yellow school bus negotiates and intersection in a city.
A large room with much seating available.
A young boy riding a pedal boat at an amusement park.
A man talks to a young boy who is wearing skis.
A little boy plays outside with his ball.
A table with a lamp on top of it next to a couch.
Young man in orange jersey swinging a baseball bat.
A man is sitting next to a Christmas teddy bear.
Two zebra standing next to each other on a dry grass field.
A person wearing a tie posing for a photo.
Smoke billows from the back of a yellow and blue fighter jet.
A dog is standing in the middle of a rug wearing a green tie.
A couple of people are walking their horses.
A cat that is laying underneath an umbrella.
three baseball players holding up bats on a baseball diamond
A man on a scooter doing a trick in the air
A vase and glass with decorative paintings on them.
Three friends pose for a picture while dining.
A woman plays a video game in a living room.
A man installs wood cabinets in a kitchen.
A picture of a old water pic machine.
A man lugging a red bag of luggage down a sidewalk.
A man and a woman seated on a motorcycle, leading a line of others, also riding on motorcycles.
A man stretching out yelling while catching a Frisbee.
Asian man in glasses holding two colorful mobile phone cases
a number of people standing holding umbrellas near a building
A couple of lawn chairs sitting under a white umbrella.
A person is holding up a carrot in a kitchen.
A close up of a blue vase with flowers on a table.
A cow eating grass by a house next to the ocean.
a person who's going down a snowy slope.
A kitchen filled with metallic appliances sitting next to a stove.
some sliced up orange peels sitting on a counter and bowl
A wood room with some tools on shelves
A little boy seems fascinated by this silver fire hydrant.
A bathroom with a white toilet next to a sink and tub.
A bicycle chained to a beached boat on a beach.
A man skiing on a slope while people watch.
An old man in the middle of his kitchen.
People walk on the beach, with a hut in the distance.
A crowd gathers outside of an outdoor bar.
A man crossing a busy intersection near train tracks.
The man is fixing his two skies so the shoe will fit.
A crowd of people with umbrellas standing near a train.
A dog in a field looking up while wearing a hat.
A black and white cat relaxing inside a laptop.
A kitchen is shown with an oven and stove.
Two men ride a bicycle contraption with a big load of bananas.
A large motorcycle is on display at a gathering of people
a baseball pitcher ready to throw the ball
A coffee cup sitting on a counter in front of a TV with the show 24 playing.
A bathroom with a shower sink and windows.
People carrying surfboards walking down a sidewalk during the day.
A bus sits parked at the curb on an empty street
this man is riding a wave on a board
A baseball player stands in front of advertising signs.
Fruits, vegetables and a carton of eggs sitting on a table.
Three giraffes standing idly in a dry field
A man in a suit is holding a glass of Champagne.
Streetlights in front of a brick building in some downtown
The working and kitchen area of a dorm room
Man in a green field standing behind a red Frisbee in the grass.
A surfer in a wetsuit catching a breaking wave.
two dogs standing on a checker board printed floor
a close up of a small dog near a car
Two cows stand next to each other inside a corral.
A young child lays in bed with a bunch of different books.
Two birds walk in the surf along the beach.
a black and white picture of a blue fire hydrant.
The small bathroom has a beige toilet in it.
The woman is showing the child how to feed the giraffe.
Two brown cows standing in some tall grass.
A young boy touching a small frog that is sitting on an orange frisbee.
A herd of cattle walking down a country road.
Three motorcyclists riding down the road on a curve
People are waiting on the station platform for the train to stop.
Several zebras standing in grass during the day.
A man riding a motorcycle while talking on a cell phone.
A tennis player has just hit the ball.
The toilet is sitting in the brown colored bathroom.
A print ad for the Pizzeria La Crescia.
A desk with a laptop, monitor, keyboard and mouse.
Cute picture of white cat snuggled near older dog.
a close up of a remote control pointed at a tv
A hot dog on a plate with lettuce.
people skiing down a hill with no poles
Horse-drawn carriage moves along street carrying two passengers
a man that is throwing a frizbee in the woods
A cat is perched on top of a parked car.
A couple of street signs sitting next to tall buildings.
A young woman with an oar paddling on a surf board.
There is a bowl of fresh fruit on the table.
A young boy is sitting in front of the oven.
A wall mounted black oven next to a counter top.
A man trying to manoeuvre through violent waves as he surfs.
A woman sitting on a bench holding a kite of a bat.
A train traveling down train tracks next to a small building.
A boat sailing on a beautiful lake during the day.
Olympic skiers are competing in a cross country event.
a train traveling along tracks near a lush green forest.
A man using a snow board holding a giant fake axe
A tennis player swings his racket to return a ball.
A plate that has a sandwich and french fries on it.
A elephant and a brown elk in a field.
A neatly organized room with a bed and stuffed bear on it.
A pair of hands preparing a sausage dog on grill.
a group of guys on the soccer field playing in front of a crowd
A group of people skiing in a ski race on snow covered ground.
An electric commuter train at a well maintained station
A motorcycle sits on a sidewalk near a city street.
A dell lap top and an apple laptop side by side on a counter
A man driving a two horse wagon team.
A kitchen with a stove top oven next to a kitchen counter.
A man in a shirt and tie motioning with his hand.
a table that has a bunch of stuff on it
A large airplane flying in the blue sky.
Two pieces of flat round bread laying next to each other.
A boy sits on a brick wall while holding his skateboard.
A small room features a microwave and a mirror.
A man wearing a fedora talking on a cell phone.
A man holding a plate of fresh pickles up.
A group of zebras gathered and a wooden shelter to get out of the sun.
A man cleans his surfboard with a cloth
There a man and woman standing on the beach.
A laptop sitting on a desk near a cellphone, mouse, keyboard and monitor.
A blue chrome motorcycle with a dark blue seat.
The tennis player in the green Nike shirt has a pained facial expression.
A restaurant called the library bar and grill
This bathroom has wall paper on most of the wall and wall paper on the bath tub.
a child in a wagon with many green apples
A couple of people that are playing a game.
a traffic light next to a street sign
Two buses, one blue and one red and white, are going to different destinations.
A man climbing up the side of a black pole in a park.
A red and green plate holding a pink cake with frosting.
A person on some skis in the snow.
a image of blue and yellow trains on train tracks
Two people walking along the beach while someone flies a kite in the surf.
A woman and two men inspect cars at a show.
A GROUP OF ZEBRAS CLOSELY GATHERING TOGETHER IN OPEN AREA.
A breakfast sandwich made from biscuits contains egg and sausage.
this is a group of elephants in the water near rocks
Stop sign with street signs at a parkway intersection.
a pair of animals on the side of a rocky hill
A man standing on a snow covered slope holding a board.
a grey cat standing at the sink with its eyes wide open
Four bowls with food in them on a table
A laptop and a mouse sit on a wooden table.
A bed that has padding with a blue picture frame hanging above it.
A young boy about to throw a baseball during a game.
The desk is full with computers and other hardware.
Two men sitting on a couch holding pool sticks, one between his legs.
A couple of people walking out of the ocean with surfboards.
Black car sitting at a red light intersection.
Variety of different deli products sitting in a glass case next to each other.
A police officer on a motorcycle with others following.
The dark green double decker bus travels down an empty road.
There are lots of seagulls flying near a boat.
There is a freshly made pie on top of the stove
A rainy picture of three red double deck buses on a street.
Various black and white street signs with a pigeon on them.
A man and a woman cooking hot dogs on an outdoor grill.
A person and a kid on a couch in a room.
Parents laying on bed in opposite direction of their daughters
A zebra standing on top of a grass covered field
The table has several plates of pizza on it.
Two sandwiches and a bowl of fries sitting on a plate next to a cup.
A skateboarder jumps over a limbo bar during a competition
The view from the airplane shows a mountain range.
A man throwing a baseball at a baseball game
The people are playing a game in the living room.
Child laying down with arms extended in the air.
A small bathroom with a lot of white tile.
Two women in a kitchen preparing a meal
two cats laying in a messy bed near a wall
The woman is posing for a photo near the bikes.
Men in SWAT gear running with guns drawn.
a pair of colorful vases holding white daisies
A small bird perched on top of a tree branch.
An old building with a clock tower in it.
Cupcakes with candy and marsh mellow toppings sit in a white box.
A man performs a trick on a skateboard in front of three other men.
A train on a railroad track adjacent to 5 other railroad tracks.
Man in blue shirt feeding birds from cup.
An elephant with a medium sized bird on his back is eating brush.
Two men are standing in front of flags and shaking hands.
a big white bridge is going across a lake
A person sits at an outdoor bar with a piece of paper.
a stuffed animal dog sits inside of a toilet
A bird is perched in front of a window with bars on it.
A black bear that is sitting in a grassy spot in a garden.
A little girl wearing green shoes riding a skateboard in the street.
Two men pictured next to a light aircraft with another one in the background
A stained-glass window is seen in front of a unique background.
A man is standing in a field and flying a kite.
A table filled with fresh vegetables being prepared to eat.
Catering truck parked tightly between cars on a city street.
Many people stand in front of a large modern building.
ATTEMPTING TRICKS ON BICYCLES AND SKATEBOARDS AT A SKATE PARK
Colorful bird sitting on a branch of a tree.
Man in blindfold and red garb holding glass of wine.
A man is running to try and catch a frisbee.
A sign indicating turns ahead in the night
A deep pizza with cheese sliced into 6 pieces.
Two horses racing with two men on them.
A man takes his dog for a ride on a scooter.
a sandwich and a cup on a table
A bit of broccoli, celery and melon on a table.
a couple of men are eating on a boat
Outside shot of a restroom showing the door partially open.
A street sign leaned over with the words High Gate Avenue on it.
a view from below of a one way sign
Two boys are playing a game of soccer.
A kitchen with an island which includes a dishwasher, a stove, cabinets, a vent and two windows.
a bath room with multiple mirrors and sinks
Street sign saying Tow Zone with a teddy bear hanging from the pole.
An orange subway car with purple and yellow graffiti is passing by two men.
an image of a bear that is walking up a hill
A man in a cap is sitting at a laptop.
Two dogs standing in front of debris in the snow.
A sandy beach covered in lawn chairs and umbrellas.
Baby eating food from a  blue plate and spoon
A boy hitting a baseball with a bat on a field.
a park bench with a blue umbrella among flowers and trees
A cow is standing outside in the grass on a foggy day.
a man that is walking around with a surfboard
A yellow and blue train traveling under a bridge.
A feminine shirtless man holding a bottle of wine in the kitchen with the refrigerator open.
A couple of people in the snow on skis.
A cross country skier is stopping along a path.
Street signs one on a street corner surrounded by trees
a cat siting on a blue bench in front of some trees
a rusted boat resting on the shore
A boy skateboarding along the top of a marble garden shelf.
A bus that is driving down the road.
There are books on all three shelves of this book shelf.
An airplane and airport crews preparing for takeoff.
A pizza cut into four slices with blue stuff on it.
a women that is on a court with a racket
a male in a white shirt with a black suitcase and people
A double decker bus driving on a street.
A guy is performing a trick on a skateboard.
some sheep are standing way off in a field
A red pick up truck with a large blue object in it's back.
A little boy in a plaza holding a kite.
Two men standing on a hill in snow skis.
Close-up of green bananas still on the stalk
a bunch of trains that are sitting on tracks.
A large jetliner flying over a mountain next to a statue of Jesus.
A pasta dish is featured along with a grilled flatbread.
A man is standing in the grass holding a baseball bat.
Person holding a camera in front of an orange display.
A man standing in the doorway of a bus traveling down the road
Many different cars parked on a city street.
A white toilet with the seat up in a room.
A bed that has some books on top of it.
a number of birds flying over a body of water
A red bowl of meat and vegetables on a wood table.
A herd of zebra standing next to each other in water.
A person in a blue coat snowboarding down a mountain.
Group of people all on laptops during a meeting
A plate of homemade cheesy pizza on a table.
gentlemen in suits one wearing a bow tie and one a regular tie
A man riding a skateboard down the side of a graffiti covered ramp.
Behind a metal bar a giraffe is view-able.
A young boy jumping into the air while wearing a catchers mitt.
A couple of people sitting at a table with pizza.
A skateboard rider on top of a handrail by a path in the city
A dog is laying down with some stuffed animals.
A bunch of animals gather together in the snow
A bowl of soup with chopped broccoli on top.
Two women talking and having a drink at a bar.
A cow reaches through its fence to eat hay.
A girl is playing frisbee in a courtyard area.
A subway train stopped to except new passengers
an image of a 2 zebras looking on
We see a very old and beat up coke machine.
A horse tied up to the side of a tree in the snow.
A person that is working on a computer.
a yellow green white and red double decker bus and a building
A yellow street sign sitting on the side of a road.
Two people ride an elephant on the side of a road.
A man is holding a cellular phone against the rail.
a person holding clothes near a bed in a bed room
A man in sunglasses is getting ready to play tennis.
A photo of an omelet and toast with coffee on the side.
A delicious looking pizza with a variety of vegetable toppings stands out on a yellow plate.
A child watches television while a panda bear sits by a purse.
A white washer machine positioned in a bathroom.
A dog walks through a kitchen with cabinets.
The train does not have any cars attached to it.
A cute teddy bear sitting on a table next to a bottled beverage.
A gren and white bus on street next to a building.
A sliced chocolate cake on a white plate.
Hand with scissors cutting computer printout paper.
A boy holding a skateboard with a two women and coffee design on it.
A silver car driving down a rain soaked street with bikes on top of it.
The train engines and cars have seen better days.
Black-and-white photo of two benches on the street.
A bowl of soup including vegetables and rice.
a knife with a black handle broccoli and green beans
A man on skis performs a jump in the snow.
The sky is cloudy behind an illuminated street light.
A bathroom vanity with candle, toothbrush's and holder and photo's of Marilyn Monroe.
Two small ducklings on a field of grass.
A dog sitting outside a large brick building.
A man unloading sheep from the back of a truck onto a pile of mud.
A lady eating a doughnut and drinking coffee.
A red train sitting at an empty station.
A woman helps a little girl take a bite of a large hot dog while they sit on a bench.
a big tower that has a clock on top
A filtered photograph of a person riding a motorcycle.
A dirty bathtub sits in a bathroom with a big window on the side.
A hot dog has a person's head on it.
A crowd of people on a beach flying kites.
A white plate with a piece of brad on top of it.
a grey suitcase next to several other objects outdoors on the pavement.
a large giraffe that is walking by some trees
A couple of baseball players that are on a baseball field.
A woman swinging a tennis racquet on a court.
A jug shaped vase holding yellow flowers on a table.
a ceramic set of two cups and a cake which is probably a sugar bowl
A man sitting on a sofa holding up a laptop with writing on the monitor.
A communal sink in a white and dingy bathroom.
A small child putting peanut butter on some bread.
The young man is going around the cone on his skateboard.
there is a man pointing out to another man in the ocean
A skateboard turned upside down in a street with shoes hovering over it
A street sign pole with many street signs on it.
Two three dimensional images of a woman with an umbrella.
A woman standing in front of a table with lots of salad.
a man that is standing up at home plate
a person waking up and hitting the alarm button on a white clock
Young woman using video game controller in living area.
a flat screen television sitting on a entertainment center
A large elephant walking through a wooded area
A large airplane flying high up in the sky.
A man standing in front of a TV playing a video game.
A man in a a shirt and tie smiling at the camera
Two green street signs sitting under a tree.
A bowl of cherries beside apples, bananas, and eggplants
Slices of pizza sitting on plates next to a glass.
A parked airplane with the terminal gate to the plane.
A cat sits on a desk in front of a computer.
Two small kids on skis on the slopes
A person cutting bananas in half on a cutting board.
A small truck sitting on a road near a gang of bikers.
A man cutting a piece of plastic with scissors.
a baseball field and some players playing baseball
A baseball player poses as if he hits a baseball.
The pizza is ready to be cooked, then eaten.
A fork sitting on a table next to a car shaped cupcake.
The clock has many designs and sculptures carved around it.
A tractor and a truck travel down a road.
A blurry image of some bison laying on the grass.
A number of motorbikes and cars parked in the field
A white bull dog rolling around on it's back next to a cat.
A large long train on a steel track.
golden clock details on large clock tower clock surrounded by brick
A woman playing tennis swings a racket overhead.
a woman sits on a couch with a cat laying on her
a tiny ass bed in a tiny ass room with a tiny ass tv
A cat is sitting on the arm of a chair.
a giraffe in its pen some bushes trees and grass
A vase filled with lots of flowers sitting on top of a table.
a white plate filled up with a lot of glazed donuts
A farm animal on dirt outside of a home.
A person holding a tiny piece of paper.
The pizza sits on the board on the stove.
A sub sandwich on a table at a restaurant
A girl and boy playing on a fire hydrant.
a close up of two people shaking hands over a motor cycle
A white and grey freckled horse next to a brown horse in a valley filled with trees and tall grass.
A woman in a white dress and someone with a striped umbrella seated by a pond.
some cute brown and white cows looking towards the camera
a male skateboarder in a white shirt is doing a trick
An orange keychain is next to a red camera.
a young man doing a jump with his skateboard in a skate park
A giraffe bending over near a big pole.
A woman standing in a twist position with arm extended, and a Frisbee in the air near her, in a grass park with trees, with people playing Frisbee, walking and lounging in the grass on a sunny day.
A  cat sleeping on a blanket on someone's bed
A cluttered living room with figurines on a display case and photographs on the wall.
A girl in a dress standing on a small skateboard.
The man on the couch is playing a video game.
A man is standing next to a motorcycle in a village.
Several cars and people at bikes sitting at a red light.
A study table where two laptops are kept open.
some baseball players a pitcher catcher and an umpire
two ladies and kids playing sports in a green yard
A man in a white shirt and black shorts jumps near a soccer ball.
A plate of bacon, sausage, and other breakfast foods.
a man on skis fly through the air
A long train with a yellow front stopped at station.
Red motor 
scooter parked on the sand with a sunset in background.
A fire hydrant is gushing water on a sidewalk.
A man arranges toppings on the uncooked pizza.
a group of friends sitting on a mountain posing for a picture
Two teddy bears sit on a bed in a bedroom.
an image of a group of giraffes at the zoo
A person sitting down talking on a telephone.
Skateboarder sitting down in the snow in front of another rider.
Two boats docked on top of a gravely beach near the ocean.
Two giraffes are walking through the enclosed area.
a dog lying on the ground next to a red bicycle with a laundry basket attached.
A person leaning on their ski poles with a snow covered background.
A couple of men reaching up towards a blue kite.
a green train is coming down a set of tracks
a laptop sitting on a special rack on top of a desk
Two blue and yellow trains parked next to each other on train tracks.
A person cycles on a motorcycle down a road.
A stove sitting next to a bunch of old box springs.
A man, woman and child petting a goat at a petting zoo.
a bear on a road near a field of green grass
A man kneeling down on a beach next to the ocean.
A man in a tie standing in front a a Budlight truck.
A young cow looking forward while several others drink at a trough in the background
A white sink sitting under a bathroom mirror.
Partially eaten cake doughnut with sugar sprinkle topping.
A grey automobile driving down a city street shaded by several trees.
A man throwing a frisbee towards a man and two children.
A toy plane flies through a cloudless sky.
The baseball player is throwing the ball from the mound.
A person riding a skate board in the air.
Woman in dark, heavy dress cooking in a home kitchen.
A metro train is pulling into the station.
A large gray elephant walking across a road.
a man with his arms out waiting to hit the tennis ball
Two plates filled with plain hotdogs on a table.
a room filled with a stove and surrounded by cabinets
A woman on a bike with a baby seat holding a dog leash.
People in the ocean are playing frisbee and sitting in small watercrafts.
Flying bird silhouetted overhead against cloudy sky background.
Kitchen accessories in a clean, organized  kitchen.
an old stone building with a clock mounted on the side.
A pole with several different street signs on it.
Two students waiting to cross a busy street.
A collection of plush animals with big ears and eyes.
living room angle with fireplace, bookshelf, furniture, and hardwood floors
a black and yellow train sitting next to a fence
A young boy sitting on a stone bench in an arid landscape
a large wooden park bench next to some rails
A store building with stuffed bears in the window.
A man leads a painted elephant carrying tourists down the street.
A woman sitting on top of a wooden chair at a table.
A woman sitting in a vehicle using a cell phone.
A whole pizza sits on a pan on the table.
there are two skiers that are going down the hill
A desktop computer that is sitting on a desk.
a wooden table with so many tools on it
Man posing in front of bicycle with a banana in his hand.
White horse looking over shoulder in enclosure of wood
some zebras are in their pen eating some food
A couple of signs hang off of a building
A commercial airplane on the runway with the jets on.
A doughnut sitting on top of a napkin next to a cut of coffee on top of a doughnut table cloth.
A toy baby in a toy stroller in a toy kitchen.
A homemade pizza, salad and two glasses of wine on a table.
A long train riding on train tracks through an empty field
A horse is looking over a fence with a shield on its face.
Several kids are playing frisbee outside in a yard.
A giraffe is sticking its tongue out at some people
a woman in a red top some glasses and a pizza
A man riding on his bike and talking on the phone.
A TV sitting in front of a picture on a wall.
a teddy bear set on top of a child sleeping
A man is driving with his dog in the back seat.
a man takes a photo of a clean bathroom
A plate that has a half eaten piece of cake.
There is a group of small birds standing on the chairs.
A wooden table topped with different kinds of foods.
there are many men that are playing soccer on the field
A brown, black, and white cat that is wearing a black hat
A person on a field with a baseball bat.
A man riding a skateboard down some steps.
The train is pulling up to the platform.
A red small engine plane in motion on a field.
A man windsurfs with several other people in the background.
A commuter train that is stopped at the station for loading of passengers.
A side mirror with the image of the Eifel Tower reflecting in it.
A tennis player lurches forward after hitting the ball toward the other side.
Two plates of food next to two laptops.
A woman in business attire walking on a sidewalk and talking on a cell phone.
A woman on a bench reading a book
USA 20 dollars totaling 120, held down by a cell phone with Coca cola cans nearby.
Girl smiles for picture in busy Asian plaza.
A small pizza sitting on a sheet of tin foil.
Three people posing for a picture in front of a cell phone case.
A person with their car open stands on the snow.
This is a small and clean but cluttered kitchen.
A homemade pizza with cheese and cucumbers in a pan.
some one skiing on a snow filled hill
a person helping someone prepare food on a buffet
a bathroom with a toilet a curtain and wooden floors
a cat starring at the camera and a television in the background
LOTS OF CABLE CARS, ON LOTS OF TRACKS
red flowers in a jar against a screen
Man with unhappy face in clothing store shirt section.
A large white bush stopped at a bus stop.
A man and woman with blue shirts and bicycles on a sidewalk.
Purple and gold bed and flowers against a red wall
The two teddy bears are posed together to take a photo.
A person giving a stuffed teddy bear a kiss
Two sofas are facing each other in this well decorated living room.
Two people carrying backpacks are cross country skiing.
An elephant in the shade of a tree.
A tower white with yellow trim tower features a large clock.
A large bathroom with a frosted walk in shower
A man with a racket walks on the pavement.
A group of children standing next to each other on snow.
A  MAN IS SKATE BOARDING ON THE SIDE WALK
Two boats that have groups of people in them.
a person with a skate board and a back pack
A baseball player holding a bat next to home plate.
A man on a motorcycle talking to a woman in an SUV on residential street.
A group of people riding skis on a snow covered slope.
a sandwich in a plastic food basket on a table
Several zebras walk through the tall green grass.
frosted donuts in a display case to feast upon
A bathroom with a toilet and a shower with a window.
A triple layer cake with a white hand made out of frosting on top of it.
A white plate topped with a salad next to a glass of OJ.
Three carrots sitting on a plate in front of a knife.
There are several plates with different pastries on the plate
A kitchen with wood cabinets, white refrigerator, white stove and a microwave above the refrigerator.
a man in a suit is holding up a beer
A construction worker smashes away at the roof of a building.
A hockey player on the court with a bunch of stuffed animals.
A woman with some of her fingers in her mouth
Two giraffes are eating their food from a feeder.
a group of people loading up on a big airplane
A sandwich and can of soda on a table.
Man riding a colorful surfboard on green ocean waves.
Golden lab with smile sitting in the bed of a red pickup truck.
There is a small kitchen with black cabinets
The steak and hot dogs are being cooked on the grill.
a yellow and black train traveling along a train track
there is a street sign that has been bent in the middle
An Audi car on an oriental city street
The jumbo jet flies over a building with it's landing gear down.
A woman walking down a street holding an umbrella.
A bike and some small birds on a field.
A wooden bench by the water and some grass.
A bright red umbrella with a view of the ocean and mountains behind it.
A large pile of stuffed animals is outside.
These are user manuals for an Apple mouse and keyboard.
A bathroom features a large mirror and toiletries next to the sink.
A group of chefs working in a kitchen that has a statue of a chef.
An airplane parked on a runway at an airport.
A sign at a railway crossing giving instructions on how not to get hurt.
A beautiful blonde girl holding a Nintendo Wii controller next to a man .
A group of wild animals walking along a gravel road.
A woman is sitting down and talking on a cell phone.
A man standing in a wooded area looking at trees.
Dogs walking down a set of rickety porch steps.
Two plates of food are sitting on a tray with forks.
A very nice looking train by a plat form.
Chairs and a table with a laptop on it sitting outside.
Three zebras stand in tall grass near a wooded area
A boy is flying a kite in a field.
a desk covered with electronics, paperwork and a lamp
A lavish bedroom furniture set of carved wood
A zebra standing on the grass above a bird and rocks.
Motion blur photograph of lights at night time
A picture showing a long line of scooters parked on a city street.
A man displaying a cake pan at a kitchen counter.
Brown and white cat sitting in front of an open refrigerator.
A little child standing next to a yellow fire hydrant.
A fire hydrant on a side walk in front of a building.
A person holds a fork and knife to cut pizza.
A gondola boat ride on the canals of Venice.
A bench on a pier near a ferris wheel in a park.
A woman is riding a show horse at a competition.
a bunch of skiers at a skiing resort on a clear day
A dog rides in a cart pulled by a man on a bike.
A living room filled with lots of furniture.
A passenger bus that is driving down the street.
Two vehicles are parked in a giant warehouse.
Fruit on a plate next to a book.
an orange truck people trees and a street and buildings
there are many canoes that are in the water
A number of street signs on some poles
A long train sits at the station waiting for it's departure.
a dog is playing with a water bottle
A suitcase sitting next to a brick wall.
A smiling woman pours a bucket of water into a toilet.
The large herd of cows are all around the large field.
A room has blue walls and a wooden floor.
Atribe of people ride some elephants out side
Craft tools and a project currently in progress
A boy is riding a skateboard in order to skate off the ramp.
Antique black and white photograph of a horse drawn tram
A young girl opens her mouth while eating cake.
View of a city bus through the side view mirror
This red pot is filled with a variety of vegetables.
A for rent sign hanging outside in front of a building .
A picture of a Wii remote in its packaging.
A room filled with flowers in front of windows.
A group of people in grassy area with kites in the sky.
Two young people are posing for the camera with their surf boards.
A herd of elephant walking across a dirt covered ground.
Orange train on tracks in the country side.
A young woman on a tennis ball about to return a hit
A man is taking a picture of himself in the mirror
A red double-decker bus with a open top level.
A batter up to plate in the middle of the swing.
a man in a tie and a woman in a hat ride horses
A quail looking bird is standing in a tree.
This man is looking downward while his is skiing.
A photograph of something in the image.
A view of an empty kitchen with white and wood lined cabinets.
The paper towel holder in the restroom hangs from rope.
A view of a pizza cut into four slices.
a couple of jets are flying in the sky
a basket with a sandwich and some fries in it.
Two men smiling while riding in a bus.
A dog staring out the window at people standing outside.
A screen of people playing a baseball game
A bedroom with a four post bed decorated in black and white.
A man standing next to a wall sized glass window.
A small person rides a skateboard modified with large tires.
A couple of street signs mounted to the side of a building.
Girl on a couch with her computer on a table
a group of people walking on the street during the day
A person on a surfboard riding a wave.
A dog is under a brown computer desk.
The woman is playing a game of tennis on the court.
a body of water with three boats sitting next to each other
A bathroom vanity sink with a large mirror and hairdryer on the wall.
People are sitting in a large room on couches with a fireplace.
Several men in the kitchen with one cutting a piece of meat.
There are two beds in a room side by side.
there are two blur bullet trains on the tracks
A lamb with several babies is laying in the grass.
A bathtub with a colorful wall decoration is seen here.
Small flowers are placed in a clear empty bottle.
Horses in fenced area with grass and hay and adults nearby.
Group of people standing around a kitchen area with food on it.
Plates of various food items sit on tables.
A baseball player is standing on the playing field.
A very tall clock tower towering over a green tree.
There are plates with food and drinks on the table.
A man and woman in kitchen preparing food by a stove.
two zebras in the field grazing on grass
A man stands in an outdoor market selling a variety of fruit.
A man ries his skateboard around bright green cones.
a cat with its head burried in a shoe
Two children on boogiebody boards in the ocean.
A public bus parked in a bus station at night.
A man skillfully water skiing in wild water.
a couple of guys standing up with some snowboards in hand
A cat is standing under a red car.
A vase with a few large sticks in it next to a sink.
The back of an elephant with tusks overlooking a road.
Man standing on a soccer field holding a frisbee with a dog beside him.
There is a clock displayed on the side of the building.
A group of people sitting at a table eating food.
There is a green plant inside a bottle
A man calmly sitting on a bench with an Indian Head Dress on.
A bi-plane with a wing walker on its wings.
A large body of water with a train traveling over it.
A woman brushing her hair standing in a living room.
a man holding  a tennis racket beside him during the game
some oranges hanging from some branches of an orange tree
A vase filled with purple flowers sitting on a table.
An Apple laptop rests on a custom wooden stand.
A large bed in a bedroom next to a fire place.
A woman flying a kite on a rocky beach near the shore line.
A man has fallen off of his surfboard.
A man catches a wave on his surfboard and holds his arms up to balance.
Two men in horse drawn carriage on city street.
A person sitting on  a snowboard going downhill
A view of a bathroom with a mirror, towels and a tub.
Two men in robes while one has a toothbrush in his mouth.
A biplane in the sky in the middle of a turn.
a clear road across the street from tall building and water
a red baron pizza cooked in a microwave
An all white and steel bathroom with 2 windows
A car stops to pay a parking fee to a woman
A black Chrome laptop sits on a desk.
Some very tall pretty giraffes by some other animals.
A couple green stoplights on an empty street.
A little girl holding a green cup in front of a bowl of food.
A young child with a colorful umbrella walks down a path in a coastal setting.
A man is holding a small Dell laptop.
The sink and mirror in a business bath room
A table filled with plates of food sitting next to each other.
A massive crowd of people standing around the Washington monument.
A man doing a trick on a skateboard.
a bathroom with a toilet, tub and cabinets
A cat taking a nap upon a laptop computer on a desk.
Bicyclists on a city street, most not using the bike lane
Busy city traffic in an older part of town.
A black and white photo of an old steam locomotive on a train track.
A blue and white train in front of a building.
a man sits by himself on a bench in a stone-paved square in front of a large bed of flowers
A large amount of tables and chairs are by a clock.
Young man pointing at computer keyboard with sprouts growing out of it.
Baseball game with batter on base and umpire standing by.
A sign reading "Car parking," is on a fence in front of a herd of cattle.
Small black and white cow in grazing field.
Large black motorcycle with a motor vehicle sitting on the side.
A couple of women are walking down the street
A plate with  sandwich and french fries on it.
A person is walking in the sand near the water.
a large air plane on a run way
A cat looking out a window with green trim.
Person in pink snow gear on a snowboard.
Jumbo jet flying over a group of trees.
Toilet bowl sitting on the side of the road filled with papers.
an image of two men working on their laptops
Many cows travel down the side of a street
The family is playing a game all together.
A large fluffy cat is sitting on a chair next to a computer mouse.
A man sitting and looking at something with his hand to his mouth.
A man kneeling on the ground next to a couch working on his laptop.
The tennis player stands ready for the next play.
Two men hold up glasses of bear above two pizzas.
A passenger plane that belongs to American Airlines taking off.
A man checking on some food that is in a white oven
A wooden table holding two glasses of wine and a plate with pizza.
Lots of bags of luggage sitting on the floor of an airport.
A group of people playing a game of frisbee.
Two people riding on a boat on a large body of water.
Pair of adults and teens having silly fun.
A young man is using his skateboard on the street.
A cat sitting in a plastic water bag on a hard wood floor.
The elephant wades through very deep, calm waters.
A female tennis player in action on the court.
An animal on top of a table while a bear rides a bike.
Four stuffed animals against a plain light colored background
A person walking while brushing their teeth and wearing a red hat.
A jumbo jet travels down the runway towards the camera.
A crowded city sidewalk with lots of people.
A giraffe in an enclosure looking at onlookers.
A man sits and admires the architecture of a large bridge.
A sign prohibiting skating on the sidewalk with black and red writing.
A father taking a mirror photo of him and his daughter brushing their teeth
A small boy is eating a sprinkled doughnut
A boy on a skate board crossing the street
The skateboard competition is geared to even the youngest boarders.
A bunch of food is layed out on a white dish
This is a shot of someone wearing a pair of skis.
a black and white photo of two people cutting cake
a giraffe standing in a pen next to a tree trunk.
A handyman moving a refrigerator back into its place.
A person wearing a Cat in the Hat costume in front of kids.
A horse drawn carriage waiting outside a hotel.
A fruit stand with grapes, oranges, apples and plums.
Player returning volley during match play on tennis court.
Someone who is applying some chocolate on a cake.
There is a street light with two green arrows in different directions
A plate with bagel, fruit and potatoes sitting on a table.
A surfer is riding a wave while another swims to catch the next one.
A giraffe inspects the roots of a fallen tree.
A bathroom that features a vanity cabinet with sink, commode, overhead cabinet and mirror.
a couple of doughnuts are under a display case
People watching a man doing a skateboard trick at a skate park.
A man cuddles with a woman who holds a banana.
A young man siting at a picnic table holding a sandwich while a young girl looks on smiling.
A hot dog sandwich with eyes on top of a plate.
Two giraffe's by large rocks with and ostrich.
Relax in the chairs next to the pool.
A pizza with ham, cheese, olives and oysters on a plate.
a black and red brick building with a white and black clock and sign
Vehicle traffic on a city street in a snow storm.
A NARROW HALLWAY WITH A TOILET IN THE BATHROOM
A stop sign with bike rack next to street corner.
A giraffe attempting to lick a woman's hand over a fence.
A fluffy cat is sitting on a glass table.
a woman swinging a tennis racket on a tennis court.
The little girl hold the pole while the man sits on the fire hydrant.
a person riding a surfboard in a body of water
A woman sitting down holding a cell phone up to her ear
Well hello there cat, are you up to something?
a blue car sits in front of a red bus
People are walking down a busy city street.
A woman is brushing her teeth in the bathroom
a black orange and white cat and a controller
A table with a white plate of food that includes broccoli and chicken.
A plate holds a portion of a broccoli casserole.
The small bathroom has a shower, a toilet, and a sink in it.
Little kid on a metal pedal tractor in a yard with sheep.
A group of people on grassy field with kites in the sky.
Several young children with ties in a school room.
A man in grey shirt sitting at a table with plate of food.
A kitchen with a sink, cabinets, and other accessories in it
assorted decorations laying around on a plastic tarp
Two persons in formal dress posing for a photograph.
A person in a car smiling next to a suitcase.
A young man dressed in a suit and a tie smiling at someone.
Multiple person seating bench on the side of a city street.
A gastric delight of sausage, broccoli and onions.
A young boy riding a skateboard on a street in front of a house.
A train riding on a track near a platform.
The two woman are baking in the kitchen.
A green double deck bus parked next to a railing.
A boy runs in the grass while holding an umbrella.
A young baseball player stands at the plate, in motion to hit an oncoming ball.
THERE IS A BOAT ON THE GRASS IN FRONT OF THE YARD
A kitchen with stone counters and a bar.
This is a shot of a printed recipe for lemon marmalade.
A large group of people flying kites in a field.
A man cooking food over an open flame.
A variety of people sit at tables and watch screens.
A tennis match from above with a large crowd of spectators.
A fire hydrant in the grass with a red top and yellow bottom.
A toilet in bathtub in a home bathroom
A large open field filled with a large group of cows.
A kitchen with an oven and a sink.
there are many small red trains on these tracks
A pair of cats laying on each other on a desk.
a man wearing a hat is on his boat and four birds
A steep incline of snow, a strip of sky and what looks like a large red strut sticking out of the snow make a setting for a helmeted skier in otherwise regular clothes in a twisted posture the points of his or her skis ascending the slope.
Old photograph depicting train in industrial area of city.
A clock tower has a group of people inside it.
A person pets a cat near a weathered building and plant.
A living room with a large picture window, a fire place, a table and a couch and loveseat
The bowls on the table are filled with food.
a plastic toilet in a small bathroom stall
Skateboarder performing aerial trick in large indoor area.
two photos of one woman playing against a doubles team
Three people are standing together in the snow on skis.
A man in a wetsuit surfs a wave.
A man is riding his dirt bike while wearing a helmet
Four cows grazing and resting in the shade of trees.
A long blue classic truck parked in a parking lot.
A large passenger jet flying through a cloudy sky.
A tennis player runs and swings his racket at the ball.
A skier standing on top of a rail on a snowy slope
A stop sign with a one way directional sign marks an intersection.
A group of people are gathered for a photo.
A black cow walking across a stream.
A young man standing on a skateboard riding a wave.
a bench near a river, situated close to a bridge
A woman wearing a dress holding a brown teddy bear.
some zebras gather around a watering hole in a herd
A dog lays on a large turtle pillow panting.
two people sitting in a living room touching each other with their feet
A herd of sheep blocking a parking lot.
Two men standing on a tennis court holding tennis racquets.
A man wearing a shirt and tie next to a barn door.
A couple of brown and white cows standing on top of a hill.
A black-and-white photo of a man walking next to a fire hydrant.
A smiling couple stand next to a bench at the bottom of an escalator.
Small dog sitting on bed in bedroom of home.
This is an unused bathroom with a sink, toilet, and bathtub.
A woman stands near a red double-decker bus and uses her cell phone.
A green, orange and white train in a train station.
A group of men and women standing together.
A man riding in a carriage with horses and an umbrella.
A man holding a baseball bat wearing an old fashioned uniform
The view of a large sized bathroom with a tile floor.
a dog sitting in an open suitcase with a stuffed animal
A man pets a small, baby elephant.
An old vase with artwork of an octopus.
The cows are ready to eat and drink their next meal.
A girl with red dyed hair eating a banana
a delivery truck and a couple of bikes
A laptop displaying a picture of a man.
Two giraffes eat leaves from some small trees.
This metal plate contains lovely slices of tangerines and pomegranate.
A dog whizzes by to run towards a yellow Frisbee.
Young boy with giraffe in enclosed fenced area.
A counter holds a plate with bananas on it.
a yellow red and silver train on its track and some wires
A white table topped with game controllers, remote controls, a keyboard and phones.
A woman eating a pizza and smiling for the camera.
view of a snow capped mountain from in a plane
An old man with a beard and a bowler hat
A sink with cups and towel next to it
Several pictures of Asian style dishes and in the middle a person is eating.
a giraffe standing close to a big rock
A far view of a plane in a cloudy sky.
This is the caboose of a freight train.
a large group of people on a beach doing various activities.
a man can be seen walking past a store front
A cat is playing on a small computer.
a man standing on a tennis court in front of a crowd
There is a chair sitting in an empty lawn.
A bathroom that has a mirror and a bathtub.
A giant clock tower and a clear sky.
A child with a bat standing at home plate waiting for the pitch from a pitching machine with his teammates in the field.
A cabinet that has some stickers on it.
A man and boy are looking at a cellphone.
The restaurant kitchen is closed until lunch tomorrow.
A couple of women reaching out to a tall giraffe.
A black baseball mitt, ball, and baseball bat.
A batter positions the bat in the air.
a woman sitting at a table while using her white laptop
A bedroom, well lit, has a couch, dresser, comforter.
A brown clock tower rises above some trees.
A group of people pose with a horse as a crowd looks on.
A man standing on a sidewalk beside a park.
A large blue street clock attached to a post.
A close up shows a large bunch of broccoli.
Animals next to shoreline in artist's painting at sunset.
The woman on the bench was near the boy on the skateboard.
A man smiles while eating a small piece of food
A teddy bear sitting on ice with a knife stabbed in the belly.
An on microwave unit is heating something in a cup.
This desk has a one computer and two laptops on it.
A metal stove and a counter in a room.
A CHEF IS HOLINDG CARROTS IN HIS HAND SMILING
Two cats sitting in their beds beside a window
a clock hanging fron the ceiling announcing the time as 144
Rain falling on a city street filled with people and cars.
Two men playing competitive frisbee against each other.
Two adult horses grazing in a dry field.
A flamingo with its hear scrunched back near its feathers.
A blond married man in a green T-shirt sits in front of a computer keyboard taking a bite out of a donut.
Small brown dog with colorful braids sitting on a couch.
A cat sitting on top of a wooden table next to a yellow motorcycle.
A picture of a person that is looking into the water.
Two small elephants standing together in the wild.
A little boy that is standing on a skateboard.
there are four small beds all in the same room
A cat sitting in a sink in the bathroom
A dog and a cat standing side by side.
THRE ARE TWO PEPLE THAT ARE SITTING ON THE BENCH
a man is playing tennis on the court
People are gathered around vintage cars at a car show.
A seagull strolls on the beach during sunset.
a group of people are crammed in a small area on motor bikes
This guy is in the country about to fly a kite.
Four remote controls attached to the side of a television.
The pole of the stop sign is covered in vines.
There are some people riding horses on a field
A woman sitting in the sand holding a kite.
A giraffe standing near a tree branch in the grass near a grove of trees.
A laptop with keyboard and mouse separate on a desk
Their is two pieces of bread on a white plate
A living room with a decorated christmas tree in it
Traditional view of crowded city residences with big bridge at the end of the street.
A group of people sitting at a table eating food.
A young boy is herding rams with a stick.
A grinning man stands outside in the snow with a snowboard.
Three people on a baseball field with catchers mask and baseball bat.
A computer desk with a laptop computer on it.
Two giraffes inside a white fence in a yard.
a double parking meter near a tiled wall
A person is walking through the snow on skis.
a woman smiles at two babies who are laughing at each other
A girl happily eating a pizza in a restaurant.
Fresh fruits such as apples, pears, watermelon, and apricots.
A group of men playing a game of frisbee on a field at night.
two snow skiers coming down a snowy hill
People skateboard down the sidewalk and the street.
Two elephants standing next to each other in front of a face.
The teddy bear sports some very unusual colors.
An upside down sign saying "Road Work Ahead"
A teddy bear is sitting down reading a book.
A person in a black snowsuit pulls a kid on a sleigh who is holding a birghtly colored umbrella.
A pair of yellow scissors sticking out of a cow pattern bag.
A couple of sheep grazing on the grass in a pasture.
Two girls walk down the street carrying pink umbrellas
A man is performing on stage at an event.
Several bunches of bananas growing on a tree
there is a cat that has fallen asleep under a car
A red sleeping area in a bedroom scene
ONIONS, TOMATOES AND OLIVES ATOP A PLATE ON A TABLE
A young child dressed like a chef cutting broccoli
A normal bench sitting on a wooden bridge
A small calf next to a large cow in a field.
A man riding a wave on a surfboard in the ocean.
a large pizza that is in a box
Seven stuffed teddy bears lined up against a wall.
Group of cars passing by a long row of apartment buildings.
Several small boats are on the water on a foggy day.
A bathroom that has some blue tile on the wall.
A pizza has meat toppings on a square plate with other food items on a table.
The bathtub and sink of a bathroom with a large mirror.
a girl that is playing some tennis on a court
A chocolate cake with candy on top of it.
A fire hydrant stands in front of the entrance to an apartment.
A zebra in front of a barn and pen.
A bird holds its wings up as it wades in shallow water.
A child in the window of a paper fire truck.
A surfer hitting a trick on top of a wave.
A cow laying on a green field next to it's baby.
A very cute small boy holding up a cell phone.
A holiday wreath with stuffed teddy bears and a penguin.
A young man on a skateboard rides past a cafe.
The items are on the conveyor being ready to be put on the wagon.
A closeup of a waxed surfboard in a surf shop.
A car at a show with people in background.
A car's passenger side mirror reflects the image of a long freight train.
A sandwich in a box with carrot sticks and an apple.
Two men holding bottles with ties on their heads
A man holding an umbrella on a sidewalk.
A person with a tie is holding a baby.
A red fire hydrant sitting on the side of a road.
A man purses his lips while holding up an orange in front of his face.
A giraffe is standing tall in an enclosure with large plants.
A young boy sitting at a counter drinking from a straw.
Two young people playing a game on the Nintendo Wii.
A bowl of cherries on a table in front of different fruits.
A person is on snow skis on a mountain top.
A delicious looking donut or cinnamon roll covered in icing
A young boy contemplating skating on the pipe.
Some people in a large kitchen preparing food.
A large airliner is taking off from the runway
a pizza on a plate on a table
A birthday cake is topped with a dog's head made out of frosting.
There is no toilet paper in this tiny, claustrophobic bathroom.
A baseball catcher standing and ready to throw a baseball as an umpire looks at him.
A BMW motorcycle sitting in a marina with boats.
a horse pulling a little carriage down the road
A person with a black, grey and green striped tie on.
a person sitting on a city street talking on a cell phone
A highway scene with a bus and a car behind a cattle truck.
A plain white bathroom with a sink and toilet.
A couple of people standing in front of a TV.
there are many fruits and vegetables on the table
A messy living room with pictures on the wall
Large pottery and bonsai trees are sitting outside.
Donuts with frosting in foreground, plate underneath them all
A blue motorcycle strapped onto a vehicle trailer.
A man with a hat is sitting by a television.
Skateboarder with leg tattoo riding on a skateboard
A man placing a baseball on a tee for a child.
A large bunch of baby bananas still green and in a basket.
A woman poses for a photo while sitting on a bench by the seaside.
Lady with sunglasses under pink umbrella at outdoor event.
The stop light found at a Hocken avenue intersection.
A man loads bananas high on top of a banana truck.
A bus with its doors open is waiting at a bus stop.
a tall giraffe peering over some trees and shrubbery
Black and white photo of a skateboarder doing a trick.
A pizza that is made of many various ingredients.
A red church in between two plain buildings.
A photo of a woman eating a hot dog.
Small black object sitting on the inside of a toilet bowl.
A very large commuter train is going down the track.
Men at the beach with one holding a surfboard
Two people looking into an empty, lighted wall oven.
A sandwich with broccoli, onions, cucumbers and other food on it.
a bird sitting on a fence against a lake
two plates of food on a table with chairs
a pair of scissors cutting  sheet of plastic cups
A group of people that are sitting at a table talking.
white plates that are covered with assorted donuts
A nice bathroom has a sink on glass.
The horse is tied up to the post outside.
A living room with orange colored walls, and a purple chair.
a view of a flock of sheep grazing in a field.
A man in a black jacket holds a toothbrush in his mouth as he stands near a woman with her eyes closed.
An elaborate metal vase holds a decorative bouquet of flowers.
A woman cross country skiing with her dog.
a blue double decker bus traveling down a street
Birds perched on iron poles in front of a tall building.
A man is asleep in bed with a laptop open on his lap.
Two horse drawn carriages travel down an old looking street.
a male in a red shirt eating and some people and lights
A motorized bicycle covered with greens and beans.
Several planes are parked in an airport field.
Herd of elephants standing in waterway near man in orange shirt.
A professional baseball player in the middle of a swing.
The parking lot by the market is full of cars.
Two plates sit on a table as one plate holds a sandwich and the other holds a cup of soup.
A young man is in action with a frisbee.
A man sitting on a bench surrounded by trees.
A desert and fork on a plate with multi colored polka dots.
A group of men in uniform riding a bunch of horses.
A blue and black motorcycle parked next to a silver truck.
A man skiing in the air above a snow filled mountain top
A cat is sleeping in a window sill.
The basket of lemon is near a rubber duck in the large bathroom.
Carrots, peppers, and zucchini resting on a paper towel.
A crowd of people are under a tent with a giraffe.
A table topped with a banana and other items like scissors.
A snowboarder is at the top of a snowy hill.
A very big building with a bunch of chickens.
there are two very large beds inside of this room
A man riding over a wave with a surfboard in the ocean.
A room with beds and suitcases and other items
a clock hanging down off a brick wall in a row of circular hanging light shades
An old red fire truck with 3 kids sitting in it
A man wrestling with a calf at a rodeo.
there is a small plane that is flying in the sky
A man riding a white and brown horse in the dirt.
two white plates with pizza a pitcher of wine some glasses and silverware
A giraffe running in an open barren desert.
An orange and white food truck is parked inside.
A bird stands near a car in the snow.
A white vase that is holding a pink and white flower.
A person wearing a large hat standing in front of a building.
A white and blue train traveling down train tracks.
Two large gray elephants standing in a dry grass covered field.
A man and a woman standing in front of a train.
A baseball player, catcher and umpire in a baseball field.
Broccoli, carrot, and dome other items on a dish.
A man surfing the waves on his surfboard in the ocean.
A plant coming up from the inside of a square pipe
two people driving motorcycles next to each other
A small white building with a clock tower on it.
A person on a skateboard up on a ledge.
A picture of a blender with some liquid in it.
A group of young boys riding scooters at a skate park.
Two women share a red umbrella walking down the street.
a person holding an open umbrella near a small pool
A man that is holding on to a racquet.
A cat sits on top of a toilet in front of a bathtub.
The giraffe is standing with its head between the tree.
a giraffe is eating food from the branches of trees
A tall brown elephant walking through a lush green forest.
Stuffed toy dog lays next to laptop that the woman is staring at.
A group of people sitting around a wooden table with food.
Three people are standing and throwing a frisbee.
A desk with multiple computer monitors and a laptop.
A man swinging a tennis racket at a tennis ball.
a living room with two white couches, a fireplace and a window with a view
A wooden desk topped with a computer monitor and keyboard.
People are getting off a bus in the evening.
a line of parking meters with buildings and vehicles in the background
A full view of a plate full of delicious food.
A horse standing in a secluded field of a Mountain Valley.
A plane fitted with pontoons moving around in the water.
A large dark sheep stands with two young ones.
Two people sitting on a ski lift over snow and trees, one wearing skis and one wearing a snowboard on feet.
A brown bear walking in an open area.
two pizza pies sitting on top of wooden pizza racks on top of stovetops.
A white kitchen filled with appliances next to a window.
a rum cake vender in a yellow truck
a laptop on a small table with a mouse
a bunch of random stuff sitting together on a tablecloth
A beautiful young lady walking a black and white dog past a hotel.
A table that has two cakes on it.
Series of propeller airplanes lined up at an airport.
a silver gray subway train parked in a subway
A paddle surfer riding a small wave in the ocean.
Pots, pans and a collander displayed on a kitchen cart.
A person doing a trick on a skateboard.
A row of orange trees sitting along side of a dirt road.
There are bicycles parked along a stone sidewalk.
this tennis player stands waiting for her opponents serve
Several people riding their bikes down a sidewalk.
A woman that is sitting down with a book and umbrella.
A woman is sitting with a congratulations sign
A bunch of bananas are hanging from a rack
A person is skiing on a lake while holding a rope attached to a parachute.
The man had to bend down to kiss the horse.
A group of cows walking down the middle of a street.
A small boat floating on a lake at sunset.
A bowl of oranges and bananas is in the center of the table while a plate of toast and eggs is towards the end.
Bright white bathroom sink and shelf with folded towels on a shelf.
A white bathroom area with a plant and yellow bottle.
A cat stretching its paw over a keyboard.
man riding a blue surfboard in the ocean
a group of bikers driving down the street
Man standing in office with glass walls eating a donut.
A MAN IN A RED SHIRT AND JEANS PLAYING A VIDEO GAME.
A man with a skateboard talking to another man.
A cat rubbing its head against a person's shoe.
Someone holds a donut in front of a box of donuts.
a couple of giraffes are sitting in a pin
Two large adult elephants have saddles on their backs.
A laptop, a mouse, and a pen are on the wooden table.
Three surf boarders talk on a dirty beach covered with seaweed.
A view of the back of a bus from inside.
An elephant is carrying people across a forested area.
She is sleeping with her dog on the couch.
A young boy sitting on a rug holding a cell phone.
A man standing next to a woman as they prepare food.
Two people sit on a bench in an grassy area in the midst of some building.
A parking meter is on the curb of a hilly street.
A umbrella stuck into sand at a beach with boats and hills in the background.
Two men are riding on motorcycles through the air.
The bus is stopped at the street corner.
Two pieces of luggage leaned up against a tree.
A cat stretched out next to a persons leg who is sitting in a chair holding a laptop in their lap.
The snow boarder is snow boarding down the mountain.
A man is seen walking out of a building.
A green birdhouse sits on a wooden platform in a garden.
A young elephant holds its trunk up to its mouth.
Crates of different vegetables stacked next to each other
A grey cat wearing a hat is getting petted.
an image of a girl that is playing outside in the field
Parked motorcycles and an old yellow school bus
A couple of people on skis examining a park description sign.
A picture of a stop sign with a small green smiley face sticker.
A vase of flowers, money, and a bottle of wine sitting on a table.
A clock sits above green bushes under a blue sky.
Passengers wait on a platform for the arrival of a train.
A large truck with crane scaffolding on the back.
an athlete holding a tennis tacquet in a stadium
Two uncooked pizzas has different ingredients on each.
A man wearing a blue tie with the ten commandments on it.
Five youths stand together holding tennis rackets on a court.
a wall with a bunch of graffiti on it
A man in gray and black holds up a small cell phone.
The man is walking up the ski slope.
a person with an orange beanie taking a picture of a gray train
a plate full of vegetables sits on top of a table
A woman and children surfing in the ocean.
A few men standing in there military uniforms .
A woman eating a doughnut sits behind a box of doughnuts.
a close up of a person riding on the back of an elephant
A planter box of vegetables in a fenced garden.
A man standing on the sea shore with surf board in his hand.
Purple teddy bear with book in its lap staged to look like its reading to a small orange stuffed bear beside it.
Two giraffe standing next to each other under a cloudy sky.
A woman is walking down the street with a red and white umbrella.
The cat is sitting on a person with a laptop on their knees.
A dual monitor station also hosts a cup of coffee, water, and a thin keyboard with a mouse.
A white fishing boat being followed by birds
Man and woman sitting at table enjoying meal with wait staff seen in background.
Tennis court match with a player on each side of net and people in audience.
The two ball players are setting in the dug out.
Some people sit together for a meal.
The very large, spceous bathroom has carpet and a jacuzzi.
A surfer wears a completely black wetsuit including a head covering.
One bird on top of another on a tree branch.
A man holding a skateboard in front of a group of people.
The intersection of a city street at a red light
A group of people sitting around a table together.
A young boy who is eating some food.
A line of young skiers ski down a gentle slope.
A delicious lookign healthy vege pizza in a box
three giraffes behind a fence with a tree near by
Giraffe and other animals graze in tall grasslands.
A man standing next to a red motorcycle in a parking space.
A man rides a cow through a parking lot.
a close up of a person with a plate of food on a table
A man in blue shirt standing by a brown and black dog.
A large brown teddy bear laying on top of the ground.
a bowl with some noodles inside of it
Two people and a dog that are standing together.
A man in brown shirt jumping with skateboard over gap.
The man is on a ladder painting the walls.
Several sheep standing and grazing in a yard.
A collage of photos shows different foods being prepared.
There are a lot of items laying in the bathroom floor.
An old style white stove with a kettle on it.
Two women are about to cut into a chocolate heart cake together.
A man and little girl sitting on a bench near a parked airplane.
two cake doughnuts with three strawberries and a cup  of coffee
Four fighter jets fly through the sky leaving a trail of smoke.
A man sitting on a motorcycle near several bicycles with a partially visible person standing nearby.
A young girl being pushed on a skateboard by her brother.
One lamb, amongst other lambs, looking directly toward the camera
two sheep sitting on a hill next to a fence
A yellow fronted train is going down the tracks.
This is a bathroom that is painted an ugly mustard color.
there is a woman dressed in a costume holding a bear
A parking meter and a car on a road.
A train car with graffiti on the side of it.
A person with a guitar hung on their body while playing a keyboard.
A paper, laptop, cellphone, mouse and bottle sitting on a table.
A haul of produce including squash, bananas, and mushrooms.
a bath room with a toilet a sink and a bath tub
A small herd of sheep grazing in a grassy field.
A green vase filled with multi colored candy canes.
A group of children playing in the snow.
some soldiers cutting into a decorated sheet cake
A red hammock set up in a wooded park.
A snapshot of a family at a store taking a picture together.
A blue motorcycle parked on the side of a road.
Three men holding baseball bats dressed in full uniform the first man is holding the bat and the man in the middle has his hands crossed and the third man is holding the bat with both his hands cupped together.
A man standing on a dock next to a boat.
A man and woman playing a video game together.
A giraffe standing next to a horse in the grass.
A meat filled sandwich sitting next to a cup of chili.
A couple of sheep in the middle of a grassy field.
A bathroom with blue walls has a window, a sink, a bathub, and a toilet.
A bike and a dog on the sidewalk outside a red building.
A cat laying on a pillow on a couch
People in casual sports uniforms running and jumping around.
A large double decker bus is driving down a street.
A photograph of a kitchen inside a house.
A field with horses on a cloudy day.
A Dilbert doll sits on a table next to drinks and a plate of donuts.
A guy is doing stunts on his motorbike.
A room full of American soldiers eating pizza.
A kitchen, including a table, oven and cabinets.
A vase full of flowers is sitting on a deck.
A traffic light and street sign on the road.
A cake that is shaped to look like a child's toy.
A group of friends sits in their living room while playing video games.
Black and white photograph of man on skateboard carrying a surfboard.
Two small babies sitting in feeding chairs with spoons in their mouths.
A woman that is kneeling under a elephants trunk.
A church lit up at night in a town.
Man with glasses talking on cell phone in car
A dog and a man are herding sheep.
A baseball game with a batter and a catcher.
A family riding on the back of an elephant
The man stands on a stage as his neck tie blows in the wind.
Harvested bananas, still green, sit in a pile.
A cat standing in the fridge with milk and juice.
An electric train pulling into a train station.
women sitting on a bed while man is getting dressed
A pizza is sitting on a pizza stone fully cooked.
A person waiting to perform a stunt on his skateboard on a quiet street.
An orange cat laying on its' side.
a hotdog a hamburger  and some onion rings
an old rust bucket truck with a cracked mirror
Several teenagers are playing soccer in a field.
A very blurry picture of an intersection taken from a moving car
A white and red boat in water with lighthouse in background.
A plane prepares to land on an airport runway.
a meal with meat, rice, and vegetables
Man poses for picture while sitting on the motorcycle
a person riding a surf board with a sail
A miniature blue train engine sits on the tracks in a rural setting.
An Asian gentleman sitting in a blue chair at an open office area.
people skiing down a roped off section learning
a bunch of traffic driving on a city street
Long empty white bus sitting out in the parking lot
The back side of a small charter jet flying through the air.
A balding man with glasses, standing near a bridge.
An open door leading to a small bathroom
a close up of a sandwich on a plate
A homemade pizza with gourmet toppings cools on a plate.
A woman standing in front of a large candle lit cake.
A professional baseball player holding a bat during a game.
Antique black and white photograph of surfers on a California beach
there is a large pizza with toppings on it
Two slices of pizza sitting on a white plate with soda near it.
a large building with a fence in front of it .
A restaurant sign hangs in from of a large oak tree.
The pipe smoker enjoys his nightly  smoky ritual.
A group of snowboarders poses for a picture on top of a mountain.
A man jumping up with is hands raised while playing Wii
A duck is in the air flying over water.
A group of snow skiers waiting  at the top of a mountain.
Skiers on a snowy slope stop for a rest.
Some animals that are sitting in the street.
Two small beds are sitting side by side
An empty side walk with in a city
A man flying through the air while riding a skateboard.
A double decker bus driving while it snows.
a little bathroom with a striped tiled floor
some people a clock tower and a black and white clock
A saddled horse tied to a rope on a beach
The street sign in posted near people walking across a road.
People watching two school buses crash on a dirt field.
a zebra is walking around in the snow
A metal wire fence confining sheep inside a grassy meadow.
colorful head pieces on large elephants for entertainment
Some very big trains one of them blowing smoke.
Photo of a man riding an old styled bicycle near what appears to be the Golden Gate Bridge.
A woman is walking and holding a kite
A siamese cat playing on the bed with a tabby.
A dozen surfboards are lined up on the beach shore.
black furry dog sitting in front of yellow fire plug
A man playing frisbee with a child in the park.
A bus is making a left turn behind a white car.
Man looking at camera taking a bite of food
A road sign advertises luxury while a cow rests on a dirt lawn in front of run down buildings.
A seagull holds a small fish in its beak
Soda with a plate of food, such as, pork, macaroni, and corn.
A desktop and a laptop sitting on a desk.
A flock of birds in motion of a field of grass.
A painting of green apples next to a bunch of bananas.
Two cats lying stretched out on a bed.
Two horses are standing together on the beach.
A food truck that sells soft frozen lemonade that is parked near other cars and kites are flying overhead.
A bouquet of different flowers is in a vase.
A snowboarder catching some air over a bump.
A woman ordering food in a dark restaurant.
These are crab cakes served on lettuce leafs.
A man is flying a kite at a park.
a group of zebras together in the grass
A school bus covered in art and a sign.
A holder with toothbrushes, toothpaste, make-up and earrings.
Two men stand holding skateboards in front of them.
Modern looking living room with white flooring and furnishings
Three red traffic lights suspended above an intersection by a cable.
Unoccupied park benches near very unusual, leafless trees.
a person riding a skate board on a skate park
A photographer holding a camera is looking in a mirror.
A busy street with many people walking down the sidewalk.
This is a man and a dog walking towards the water.
A large display with many watermelons and bananas.
a silver and blue fire hydrant lights and grass
diced meat and tomatoes are mixed with cheese and pasta in a large bowl.
a person wearing a vest, collared shirt and tie in front of bookshelves
a man standing on the street at the bus station
A brown and white cow standing in front of an iron fence.
a foot long hotdog and a regular hotdog and a mug of beer
some people walk down a city sidewalk by stores
A black cell phone resting on the table.
Man standing in a living room holding up a Wii controller.
A drink in a mason jar sitting beside a vase of pink flowers.
A clean bathroom with a white toilet and black bath mat.
A bunch of kids and some grown ups skiing.
A woman holding a tennis racket while people watch from the stands.
A lot of oranges are on a plate, with some having spilled onto a table.
A large building that has a clock on it.
A train with a red and yellow engine on a railroad track.
A man and a woman holding Nintendo Wii controllers.
I love the way the sun is creeping behind those two buidings
A pair of glasses and a cell phone next to a laptop.
a close up of a street sign with a building
A man holding a phone up to take a selfie.
a lady petting a giraffe behind a fence
The dog is laying on a rug in the the living room.
A man wearing a tee shirt eating a sandwhich.
Sheep are laying down together in the snow.
a vintage photo of some cows grazing on some grass
A truck driving down a rural dirt road near a street light.
A horse carrying a carriage getting a drink of water.
A child wearing a hat, tie, and white shirt smiling
A small baby is eating a long banana.
A man gets ready to swing a tennis racket.
Bananas on a table woman using a cell phone on another.
A snow boarder is in mid air on the mountain.
a train sitting next to  a  pedestrian sitting on a bench on a  railway platform.
A crowd of people standing next to a parked truck.
A woman with a cake and bag on the street
A woman seated and another standing with a cake and soda on the table
a book and a tablet on a black desk
A little girl in a green dress watching a herd of sheared sheep.
Three horses are seeking the shade of a large cottonwood tree.
A man in a red shirt in midair catching a flying disc.
A sad, young girl sits on her bed, moping.
A hill that is used for people to ski on.
a kitchen with a stove and a refrigerator
a collection of stuffed animals with some wearing party hats
A view of a room with a couch, television, and a fireplace.
A toddler holds a tennis racket that is bigger than they are.
Farm animals graze in the grass in the sunshine.
A skillet full of broccoli and vegetables cooking.
There is a stuffed bear in an electric chair
An area of a city street section off with police tape.
He should be careful not to get sauce on his notebook.
Blue umbrella in black and white photo of crowd of people
A bathroom in the process of being remolded.
An empty bench sitting under a nice big shade tree.
Computer stand with large monitor in cluttered room.
A scooter is parked on the street in front of a car.
A man wearing a black jacket next to a brick wall.
An old woman is playing with her two dogs
A boy wearing a green shirt and helmet is leaning up against a black fence while standing on a skateboard.
there is a sandwich and a bowl of food on a white plate
A laptop and a tablet on a wooden table
A person is wind sailing in the ocean.
a silver oven and stove and some brown cabinets and bottles
A small stuffed bear with a red hat.
A gray and white cat sprawled out on a sandy surface outside.
an upset adult baseball player throwing a baseball bat on first base
a car and a rear view window on a dirt road.
A bathroom with two small windows and a bathtub covered in a shower curtain.
A baseball player in red shorts prepares to swing at the ball.
Some people and chickens hang out in an undeveloped space.
a person holding a hamster holding a piece of broccoli
A surreal photo of a chair, a clock tower and a table suspended from the side of a building.
There is a person in animal suit holding large toothbrushes.
Two girls enjoy playing a game on the Nintendo Wii.
A person in a shirt and tie is holding a can.
A couple of ladies are playing tennis in this 3D image.
A bag on the floor with various items around it such as sneakers, clipboard, scissors, insect repellent and paper towels.
a guy taking a picture of  some art work on the wall
A beverage cooler and counter area in a small store.
A man wearing a blue shirt maneuvers to volley a tennis ball.
Mandarin oranges tangerine on yellow with blue trim bowl, white counter top.
a few baskets of food that is on top of a table
A small pizza has a curly topping on it.
A wooden caddy is full of scissors and pens.
A flat screen tv on a wooden shelf in front of a green wall.
Two cows standing on a dirt road next to wild green brush.
People spending time on a beach during the summer.
A blue vase holding pink carnations and white daisies.
The street sign is for Curran Street and 10th Street.
A little girl standing next to a boat on a beach.
Young men are playing frisbee in a park.
A clean white stove with a stainless steel pot on it.
A night scene of a traffic light in front of a parking lot.
A black bear laying on top of a field near trees.
A group of tourists are feeding some elephants.
A kitchen with wood cabinetry and a double sink.
The airplane is being serviced so it can make it's next flight.
Young men playing on the beach with a cow in the foreground.
Brown bear standing next to a big log.
a close up of a pizza with broccoli
a small boat on a beach with trees in the background
a bunch of orange cones sitting in the road
A couple of kids are on their laptops
Swans are swimming in the pond at the park
Two snow skiers pose to have their pictures made on their way uphill.
A bathroom with tan tiled floors and a glass shower.
several people are waiting to board a train
A surfer standing on the beach in front of his board
A woman is on skis riding down the snow covered sloped.
An animal that is looking at something in the air.
A man wearing a suit and tie and red hat with a silver buckle.
a couple of kids that are playing some frizbee
A kitchen is completely decorated in white and black.
A baseball player holding a bat in both of his hands.
Crowds of people on a street corner and a bus picks up people.
A woman riding a bike down the street.
A man bites in to a piece of food while outside
A teddy bear sitting in a fake bath tub with a rubber ducky.
A car crashed into the side of bus on a busy city street.
Man with piercing riding a skate board through neighborhood
A man in red jacket snowboarding down a snowy hill.
a lady in a chair touching a vase that is on the floor
A girl sitting in a chair holding a laptop in her hands.
Covered and uncovered produce is sitting on tables at a market.
A black and white image of a shipyard with some boats.
A computer desk with various items around it.
A couple of baseball players standing on top of a field.
a woman with flowers in her hair staring at the horse next to her
A skier skiing on a snowing day with trees in the background.
A cloud rolling over a ski slope with skiers watching.
Some women who are cooking a pizza on a grill.
A little girl cutting up food on a  cutting board.
a brown horse feeding on the grass which is well cleaned
A woman holding a cell phone while she smiles.
An older man wearing a suit and tie.
A couple of animals on a grass field.
A wooden table with a hotdog and a pitcher of beer.
An otherwise ordinary roof and chimney are offset by an ornate tower resting in the middle of the roof that features ornamental work, a walkway, a weather vane, and a clock.
A person in blue ski pants on skis going down a slope
A girl in a hat sitting on a dock near the water
A woman laying on the floor next to a dog and a cat.
An old smiling lady holding out a remote.
Man behind counter in  shop with coke cooler, newspapers, condiments on table.
A skate boarder flying high in the air over steps.
A red stop sign on the street in the snow.
Two adults and one baby elephant walking in the woods
A woman is jumping her horse over a piece of wood.
Multicolored kites flying in the blue sky with a few clouds.
A boat that is on some wooden cylinders on a beach.
A window stands beyond a large tub in a room.
A transit bus riding down a street with trees lined along it.
Three surfers standing in the sand holding surfboards
We see a blurry picture of a person riding a bike through a field with some cows.
A small airplane flying over a field filled with people.
A sign on a street post advises smiling.
this grizzly bear is standing in some shallow water
A woman walking up some steps towards a door.
A fried piece of lobster sitting on top of a table.
A person and a dog playing with frisbees.
a table that is full of many different  teddy bears
Many skiers are walking through the snow with skis and poles.
Someone is showing a text message to the camera.
Slices of pizza in a box next to a DVD movie.
A dumptruck is parked on a street near a hill.
Two young cows standing next to each other.
two brown bears lying together and relaxing on a rock
an elephant extending his trunk out and on to the ground
A close up of a plate of food containing eggs and toast.
A zebra and her baby walk through dry grass.
An older man in shorts with flip flops and an umbrella standing next to a luggage belt.
A male and a female walking together in a military airport.
a cat and a dog near one another
A woman and child are in the kitchen eating food.
A tennis player getting ready to serve the ball.
A bathroom scene with focus on a mirror and a bathtub.
A person in a ball cap sheering a sheep.
a coin-operated parking meter stands beside a brick wall along a parking lot
Men standing around outside on possibly a movie set
A dog laying on a red couch in a room.
A woman with glasses contemplates something as she rubs her chin.
Two persons on the sea shore holding a ski board.
Peddlers in boats on the waterway talking to people on the sidewalk.
an older person on an air plane looking at a display on the back on a seat
People are standing outside near a clock tower.
A display of vintage items including an antique television, Barbie dolls and a lunch box.
Young boy dressed in a large baseball uniform.
a woman holding a mitt during a baseball game
A large television screen in a large room.
A photoshop of President Obama and a celebrity
A cat sitting on a bench in front of a building.
A man sitting on the floor by a window with an electronic device
a train is moving forward letting out a huge puff of black smoke
a man in glasses gazing at the pizza on the table
A sign with plants and shade umbrellas sitting on the side of the road.
An old blue truck is on a grassy area.
Little girl covering her face and sitting in a wooden chair outside of a door.
A humble kitchen has a stove and microwave.
A herd of sheep standing outside of a pen.
a large bunch of flowers outdoors in a field
The cat is on the counter in the bathroom.
White goose with young floating on water in daytime.
A cow grazes from a junk pile, as a bird of prey soars overhead by the side of the road in a desolate setting.
The single bird has a small head and a large body.
Large group of food sitting on top of a table with white dishes.
A toilet and sink are connected to a steel piece.
A girl is standing outside flying a kite.
A plate with meat, broccoli and cheese and a potoato.
Several elephants walking on dirt and grass near body of water.
Two young girls holding hands in front of giraffes
A man rides an elephant across a body of water.
a train that is on a train track
Several countries have their flags displayed with flower memorials at the base of  lighthouse.
A fat hipster wearing a gray hat, a pink shirt, and a black butoniere.
Motor bikes with multiple packages driving on city street.
A group of tourists watch a herd of sheep in a field.
a street sign with a sticker on it to make it look like someone on a cross
Couple walking with an umbrella in the dark.
A bathroom with a toilet and sink below a window.
A white bird with a long black peak standing near the ocean.
A person riding skis on top of a snow covered slope.
brown cabinets in a kitchen with black appliances
A man with a tennis racket and ball is on a tennis court.
A foot long sandwich on a plate on a table.
A tennis player holds his racket with two hands
A living room filled with furniture sitting on a hard wood floor.
a train covered in black dirt sitting in a fancy train station
Children pay adept attention at a party as someone speaks.
a man riding a boogie board in the water
A ram laying down in the hay inside a wood enclosure.
A large number of suitcases cordoned off by rope.
A man eating a slice of pizza without holding the slice in his hands.
a man is in a salon getting his hair dryed
a lamppost during the day with two street sign
This is a game of professional baseball being played,
A motorcyclist walking away from his motorcycle that is parked beside the road.
The horse is approaching a man wearing a camera.
A ski slope with one skier on it doing the snowplow.
A large skylight inside of a building with a high ceiling.
a young man holding onto a bat by a sign
woman takes a picture of herself in a mirror.
A young woman kneeling behind a small stone wall.
A set of bulls lying on the ground next to a boat.
Two sheep stand next to a fence on grass.
A whole sliced pizza and a can in a box.
a bathroom with a toilet and a sign on the lid
A woman balances an umbrella on her finger.
A striped zebra is on short grass by a forest.
an image of a cat on top of a couch
Electric train car, on tracks with car carrier in background.
a man with a green bandana holding onto a kite string
There are different appliances in the middle of a kitchen.
A man bent over in an open grassy field with something in his hand.
THIS IS A PHOTO OF A SMALL HERD OF COWS WALKING DOWN THE ROAD
Two laptops are stacked on top of each other on this desk.
A painting of a blue fish flying through the canvas
A single engine plane painted yellow flying overhead.
a man in the kitchen cutting something on a cutting board
An umbrella standing upright in a room on the floor near a wall.
A woman is pointing and holding a hair dryer.
Two women who are holding papers and wine glasses
a street post with lights while clouds go by
A large tower stands tall in front of a blue sky.
A snowboarder posing for the camera on a snow bank.
A group of rescue workers helping an overturned car
two people standing side by side holding a glass of wine
Two giraffes out in the sun either in a zoo or in the wild
Train with its lights on a train track at night.
A man flies a kite by the water side.
a sanctuary sign and a tall clock tower
Five giraffes in an enclosure on a sunny day.
Fruit baskets and dips on display in a market.
Beach umbrellas made of straw with the ocean in the background.
A couple of girls holding tennis racquets and a ball.
people in a field lfying many kites flying in the sky
A elephant stands at a watering hole with its truck in its mouth.
A shelf full of teddy bears on display.
a train going down the tracks near a large city
A bathroom with wooden door and a suitcase on metal a metal frame chair.
A man in a red suit is on a white surfboard on top of a wave.
A man walking in the sand with a surfboard.
Multiple fire engines in the street in front of building.
The boy is playing video games on the tv.
Two long buses parked on the side of a road.
A giraffe that is standing in a grassy area.
Two women smile with skis on as they sit in a snow bank.
A pair of tiny red scissors getting ready to cut.
A cut in half sandwich on a plate next to a shake.
a white plate on a table  filled with pizza plices
Woman places a piece of chocolate at the top of this treat
a long train is crossing over a river
Young boys on a couch with their stuffed animals and a laptop computer
A man sitting in a chair with his legs crossed.
A lady playing tennis on a court professionally.
People are walking with horses on a trail of dirt and stone.
A skier comes down the snowy slopes quickly.
A male surfer riding a very small wave to shore.
A cat is on the floor with some scissors.
A dog that is sitting on a couch.
An older stove sits in the kitchen next to a bottle of cleaner.
An athletic middle aged male skier courses downhill.
A train is traveling though a very beautiful mountain area.
A stuffed bear is sitting next to some jars
A dining space with a table and four chairs under a window and art on the wall.
a small 3 storey building with a clock on the top
A large empty bathroom with a walk in shower tub.
A small child sitting in a sink brushing his teeth
Double photos of two Rice University tennis players
A meal laid out on a table outside at a restaurant.
A stop sign on a pole in the grass.
A boy getting ready to hit a baseball at a game.
A man riding a skateboard into the air.
large gothic styled church towering over cemetery
A rusty fire hydrant is between two poles.
A plate full of food accompanied by a glass of wine.
An old train is making its way through the city.
A ski slope scene with a skier on skis.
A person and a laptop in a room.
A black and white photo of a motorcycle.
The four images each have different plates of food.
a person riding a skate board on a ledge
A pole with two wooden street signs in front of a bush.
Fresh produce, including oranges and apples, is on display in bins in the sunshine.
Two wine bottles on a table with one wine glass next to the bottles.
There are many zebras out on the plain.
A flock of birds are clinging to a tree.
A piece of paper and some scissors on a table.
A women holding a tennis racquet getting ready to play a game of tennis.
A man carries a surfboard through the city.
a small pizza that is on a white plate
A white sheep standing in a wire pen.
A tennis player prepares to return the ball.
A large commercial air plane on the other side of a body of water.
a group of zebras standing on a dirt and grass field
Some ice cream with a fork on a clear plate.
A kitchen filled with kitchen furniture and accessories.
A man carrying a plate with food on it.
A man standing in a field is throwing a frisbee.
A plane that is on the ground in the air.
A bathroom is reflected in a round mirror.
an image of a skateboarder doing a trick down a ramp
a dark gray horse grazing in the field
This is an arrangement of pebbles and fruit with a butterfly sitting on an orange slice.
A woman in her bra and a dress holding a giant green object.
A zebra stands near a mound of dirt in a wooded area.
a close up of a slice of pizza in a box
A smart phone sitting next to a receipt on a table.
A kite that is stuck in a tree.
Book case with books and computer with keyboard
A woman faces a truck that is loaded with luggage.
The man jumps high to hit the tennis ball.
three brown bears are cooling off in the water
an image of two horses with noses nestled to each other
A suitcase and a stroller full of miscellaneous items abandoned on a city sidewalk.
A building with an ornate clock fastened to it near a flag.
A group of different animals that's standing in the dirt.
A bunch of fruit like banana along side each other.
A young person stands in the kitchen, holding up a box of food, near the island counter.
some people riding some bikes right by some boats
A man on a striped board windsurfing in the ocean.
Man preparing to serve ball on outdoor tennis court.
The giraffe seems calm inside of the fence.
A couple of people carrying surfboards under a pier.
A person sitting at a table eating pizza and drinking wine.
Two men standing in a living room next to each other.
a ca dipping its head into a toilet bowl
A group of surfers ride a wave on their surfboards
A young child sitting on a surfboard at a beach.
A person is riding a snowboard down a snowy hill.
a hotel room with a nice tv and sofa setup
A girl who is wearing a baseball glove.
there is a woman cooking in a very large kitchen
A young boy wearing a blue shirt standing next to a woman.
some people walking on a pier and a skateboarder
Cat sitting near a row of shoes and boots.
A strand of beads on an open laptop computer.
Two military men being honored with an award.
A bird on a beach with the ocean in the background.
A sub sandwich is fully loaded and must be eaten from a container.
The cow is grazing in the tall grass.
Green highway signs pointing in opposite directions next to a building
A brick outdoor structure of the Delacourte Clock.
The perspective of the skateboard picture creates an unusual scene.
A boy and a girl pose for a prom picture.
A wooden table with bowl of soup and cup with beverage in it.
Two giraffes under the trees on a sunny day
A surf board rider falling off his board while a ship sails out a sea.
A plate with a sandwich and french fries with a drink in a glass.
A person's hand holding a bitten into doughnut.
Several cows laying in the grass on a sunny day.
A small boat tied to a dock at a pier.
an old silver and brown double parking meter
A mountain covered in snow with a person on a snowboard.
A woman grimaces in frustration with a video game remote.
a truck sits parked next to a bench
A cat sitting on the home office desk by an open window
a steeple outside of a window with a clock
Person on the tennis court bent over with racket in hand
A large elephant stomps around on the dirt covered ground.
A woman smiles from behind a bar displaying liquor bottles.
A kitchen view of a refrigerator, with TV trays next to it.
A skier standing in the snow next to a yellow and blue train.
a man bouncing a tennis ball on a court before he serves
A man wearing a backpack and holding a suitcase on the road side.
A blue, yellow and brown house with a clock in front of the fence.
a tiger striped cat hiding under a bed
so many people at the beach swimming and resting
part of a sandwich sitting on a table
Three men sitting on a bench holding black luggage.
a dog sitting in the driver seat of a truck
A female in pajamas and hooded sweatshirt playing a video game.
A group of people riding an elephant through a forest.
a plastic cup of almonds some crackers and cheese
Large made up bed in modern bedroom, with small desk.
A woman tying a horse down to a trailer.
A zebra runs across a field with antelope in the background.
A man is standing in the middle of a living room.
A piece of pie sits on a red plate.
teddy bear like candy on a wooden table
A man on a skateboard is riding on the ramp.
A close up picture of a vase in front of 6 other vases.
an airplane flying about many tall buildings and cars
A clean living room with multiple sofas and a flatscreen television.
People standing at a bar, eating appetizers and drinking wine.
A bridge over water that has several trees on one side.
A little girls peers into display of goods in a bakery.
Someone is doing something right now that is fascinating.
A bald headed man on top of a red motorcycle.
The man is working on his cell phone by his desk.
A woman pouring coffee into cups on a counter.
A group of men on a field playing baseball.
A motorcycle sits parked across from a herd of livestock.
a man on a surf board riding on a wave
A cat is licking up food from a blue plate.
A red brick building sits on a corner and has a tower and a clock.
A wrought iron bench sits above the sea shore.
A garden filled with lots of green plants.
A plate full of food that has carrots and some meat on it.
Three beds in a white bedroom with two windows.
a photographer wears a umbrella to get camera dry
A child in a blue coat skiing on a ski slope.
Some french toast sits on a plate next to coffee.
A table with a plate of food, pitcher of orange juice, coffee and sugar packets.
A brown and white dog with long ears holding a yellow frisbee in it's mouth.
a woman is on her cell phone on the sidewalk
Two planes that are flying in the sky.
A table topped with plates and trays of food.
A woman swinging at a incoming tennis ball
Two Clydesdale horses being walked through a park.
Skiers enjoying a day on the slopes in the sun
A group of men are playing a game in a living room.
The living room has a long grey couch and a rug under the coffee table.
a man sits on a park bench surrounded by pidgeons
an image of a cat with a tennis racket by a girl
an old jet fighter with a propellor sitting in a plane graveyard
A kids ski school with one instructor teaching
A woman shops at a market with an assortment of fresh fruits.
A fire hydrant and a little yellow ball person is between three yellow poles.
The sausage is sitting on the side of the plate.
A train decorated with candy canes and other Christmas decorations.
An open laptop computer sitting on top of a wooden desk.
A living room decorated with a modern theme.
Some players in action on the soccer field.
A very small bathroom has a toilet in it.
A male is eating a large piece of food with his mouth wide open.
A couple of people underneath a building with a clock.
A large tower that has a clock on the side of it.
Young boys are playing softball on a dirt field.
Two bears playing in a water hole at a zoo.
The vase has some beautiful flowers in it.
A customized motor cycle with skulls on it
A custom motorcycle on display at a motorcycle show.
A black tour buss parked on side of road
A person jumping up into the air on a skateboard.
Blue passenger train passing through an open forrest.
Black and white photograph of a women's tennis team
A snow covered sign in a city neighborhood.
a cat with some kittens laying on a bed
Two water buffalo's standing together by a fence.
The words Market Street are written on a white sign.
A person leaning on an upright skate board in front of a building.
Two people seated on a couch, one with glasses and holding remotes.
A boat is sailing on the water in foggy conditions.
A surfer waits at the water's edge on a rocky beach.
A desk has a keyboard, monitor, and laptop on it.
a closed up flower laying on a huge leaf
This toilet has a weird plastic piece on it.
The dining table is in the middle of the large kitchen.
a guy in a black suit with a bright tie
The city is next to a beach and many docked sail boats.
A man riding a surfboard on top of a wave in the ocean.
a kitchen sink with several white mugs hanging on the wall.
A city in the night light up with lights
A family poses together during a day out skiing.
A black and silver fire hydrant sitting on a sidewalk in front of a brick building.
A baseball player standing on top of a green field.
A woman in the kitchen with others preparing a meal.
A giraffe is eating in an enclosed space.
A view of a few cocunuts in a basket.
A tall shell gas station sign proclaims it is the Czech stop.
A man standing in a room holding something in his hand.
Woman with umbrella walking in the rain next to man.
three small birds on a sandy beach
A cat laying on top of a couch on a shoe.
a chopping board with some cakes on it
A lush green hillside covered in cows grazing.
Three baseball players stand on a baseball field.
A woman getting ready to light candles on a cake.
there is a dog sitting in a room where there is sun
People holding various phones in a group together
A black tennis player swinging the racket towards the ball.
a girl is turned around on a wood bench
an image of a military man holding his daughter
A man who is standing in front of a crowd talking.
A man and woman holding coffee and talking to a woman in the city while walking their dog
A close up of the push to walk button
A couple of animals lounging on a hill in the open.
A horse that is walking around by themself.
A couple of computer monitors sitting on top of a wooden desk.
A woman is sitting at an outdoor table using a cellphone.
A long haired house cat, sitting in a shallow pot, is roaring.
a man on a horse that is in side of a gate
a polar bear sleeping on a rock ledge
A women riding a scooter on a busy street.
an image of a dog that is catching a frisbee
Woman carrying bags eating a hotdog on a crowded street.
A beach with a lot of kites flying in the air.
A train going down the track with steam on top and a bicyclist riding beside it.
a man in a a hat i standing with a horse
A keyboard, mouse and monitor sit on a desk.
Two large toilet sectionals in the middle of a grey bathroom.
this is a cat in front of a tv
A mirror, road signage and a skyscraper in the city
A bathtub and sink under a window with a lace curtain.
A person is holding a sandwich in one hand
Some baseball players are playing a game.
a boy on a skateboard is about to skate down the ramp
Picture of a person that is reading a book.
A white plate has a brown stripe design in the middle
A sink in a kitchen under a microwave oven.
A couple of black bears snuggling each other.
A traffic light with a building in the background.
a man on a surf board rides a wave
A building with a sign that says Donuts above the door.
Trays of a variety of different donuts for sale.
close up of a pastry with a bite taken out of it
A man is standing and talking on a cell phone.
A bus is traveling down a city street that does not have much traffic.
There is a flip phone in a banana shaped case
A giraffe running around a field at a zoo.
A parking meter reserved for the disabled outside of a boutique
A man holds an oversized frisbee at the park.
A toddler happily takes a bite of a donut.
A beach that has people walking on the sand and in the water.
A train sitting on top of tracks with steam pouring out of it.
First bus on street currently not in service.
A man flying through the air while riding a skateboard.
The girls are checking-out where to put their surfboard in the water.
A black cat rubbing up against a laptop.
a close up of a small bird on a green surface
A car with some surfboards in a field.
A Delta airlines plane with the food services truck docked at the service door and a worker at the door.
A sausage sandwich and greens sit on paper.
A cake is being cut in front of little kids and parents.
Cat standing on papers that are sitting next to a laptop.
A studio apartment with a bed, a table, and a kitchen area.
Woman poses on beach with two umbrellas in front of a floating boat
A red car with various pizzas sticking out of its window.
A church with a steeple and the sky in the background.
A table full of food with a glass of water.
Cows walking on a path between rocky outcrops.
a man standing by a desk  with a toothbrush in his mouth
some cars and a motorcycle driving on a road
A group of people watch as a man stands before them  holding a string that is attached to a kite that flies in the cloudy blue sky.
A gathering of people fly kites in the park
A  woman riding a motorcycle with a man on the back of it.
There is a male surfer riding a wave while the sun goes down
A black and white cityscape shows lots of people, mainly a tall, smiling man in suit and tie, who is paying attention to a woman standing beside a second smiling man in glasses and headset, who is also holding a microphone and notepad.
A man making a face while biting a hot dog with cheese on it.
Bright sunlight shining through a colorful window curtain.
a white horse standing next to a stream, rocks and a green field.
A man with a surfboard walking into the ocean.
A kitchen sink near a couple of windows.
A woman wearing a white shirt and black capris getting ready to fly a multi colored kite.
Two boxes of donut with milk and juice on a dining room table.
A green bus with a bike on the front of it driving.
Four bears standing on a fallen tree outside.
A young boy holding a blue baseball bat on top of a green field.
An adorable little girl holding two ski poles.
A motorcycle parked across from a business next to a highway.
A red and white wings black bird sitting on wood
Man and woman at an outdoor restaurant smiling for camera.
a person riding a two thick wheeled bike on sand
a group of people shopping for fresh fruit and vegetables at a market
A baby elephant following behind a mother elephant
several bottles displayed on counter in well decorated indoor area.
A yellow train is traveling down the railroad tracks.
Fireplace with brick border displaying many photos and decorative flowers.
A view of a bathroom, that is very old looking.
An airplane ready to let passengers get on.
A rendering of an old fashioned water closet.
A old time picture of a woman milking a cow.
A pinto horse walking in a coral with two people.
A dog standing on a chair eating out of a dog bowl.
A cute puppy curiously looks to see whats going on.
A woman gets a fresh glass of wine from a cask using a glass instrument.
A bedroom packed full of home goods and luggage.
Military colors being shown at a baseball game.
A group of alpacas grazing on a dry hillside.
A woman and her son picking out sweets at a bakery.
A young skateboarder wearing safety equipment skateboarding down a sidewalk.
A dinner plate with meat and vegetables on it.
A kitchen large green hanging plant and a door.
A bed with four pillows and the covers turned down.
A large motorcycle is parked next to a brick wall.
cows in a small wood and straw shack
a vintage photo of a woman sitting on a horse with a man in a suit standing
A woman smiling while holding a yellow banana.
The lady is sitting with food in her hand.
Pedestrians cross the street during a winter day.
assorted foods separated in bowls on a white table
a number of people standing in a kitchen area with a counter top
Four pieces of pepperoni pizza on a plate.
A man in a suit standing in front of a window
A woman is walking her dogs  on the city sidewalks through the newly fallen snow.
People swim in a pool on a beach resort.
A toddler in a t-shirt holding open a refrigerator door and looking inside
A street lines if restaurants with signs hanging off of them.
A baseball sitting in a baseball mitt on a blanket
A lone kite is flying above the water and under a blue blanketed sky.
Two road side workers chatting, one is holding a stop sign.
A homemade square pizza fresh from the oven.
A lanky skateboarder poses against a barn-red door.
a dog stands inside of a boat as it stares at a camera
A man surfs on a surfboard over a wave
A foot ball fan is showing off his team spirit
A man in black jacket with dog in snow.
A tray full of breakfast items served on a plane.
A young man in a black shirt and purple tie driving an automobile.
Different style toys placed next to eachother  and a batman costume.
Two men are talking to each other while holding a skateboard.
A pitcher throws a ball while the opposing team watches.
A man in uniform is looking at his phone.
A woman is pouring a bottle of wine into wine glasses.
a man in a blue shirt and a orange tie
A bird sitting on top of large pile of brush.
A man casually throws a frisbee into the air.
A woman in a Sailor Moon costume rides a motorcycle in a street full of people
a laptop on the floor with a cat on the laptop
A plethora of stop signs in the same vicinity of each other.
A pizza with a sign with a cartoon mobster.
There are horses walking beside of the cars.
A cow lies down in a pen and looks at the camera.
Three boys peel vegetables and cook at a counter.
A police officer on a police motorcycle rides past a line of men in uniforms.
A wide photo of two people kite surfing in the water.
Several trays of pastries sit on a table.
Highway road sign announcing exit ahead for vehicle traffic.
A black and white photo of a woman asleep on a park bench surrounded by foilage.
a toddler sitting at the end of a surfboard on the beach
The worker is cleaning the eating area for the customers.
A baseball player swinging a bat while standing next to home plate.
A woman sitting beside a table full of fruits.
A train with smoke coming out going down the tracks
A airplane sitting on the tarmac at an airport.
A gross bathroom has graffiti all over it.
Dog trying to pick up an object with its mouth underneath a bench.
a close up of a white keyboard with a black monitor
Two military men are cutting a large cake.
A woman walking down a street holding an umbrella.
A red stop sign posted next to a tree next to a sidewalk.
a clocktower standing high with lights on
A dog and cat in a master bedroom looking at the camera.
THERE IS A BATH TUB AND A SINK IN IT
Boy in purple shirt holding a tennis rack on tennis court.
A full view of a picture cloth with an animal.
CHEF IN  KITCHEN WEARS FACE MASK WHILE PREPARING FOOD.
An assortment of fruits and vegetables sitting on a counter.
Two people walk down a walking path.
This is an image of several kids playing soccer.
A man carrying a surf board into the water where there are other people.
The pizza in the box is divided into four slices.
A farm picture with an old cabinet and a horse with its head down.
A plate of food including chicken, rice, and beets.
A plate of food with onion and broccoli on it.
Square white plate with a sandwich full of meat and dressing.
A group of people walking down a wet sidewalk.
Man riding white horse in the street while others watch.
A man holding a camera standing in a crowd.
a train on a track near a platform
A man doing tricks on a skateboard outdoors in a city.
A group of people mill about on a lawn of a building.
A green highway sign beneath a beautiful blue sky.
A toilet with a wooden seat is in a small bathroom.
A big road sign listing three different locations
Hotdog sandwiches sitting on ears of corn on a table.
The young child is riding swiftly on a skateboard.
A very close up look at a tasty looking pastry.
Motorcycle police and their bikes with Battenburg markings
There is a seagull flying towards beach umbrellas
A man leans against a wooden box on wheels that contains a teddy bear and a basket.
A large white bus is traveling through the city streets.
A young woman walks in the rain, smiling and holding an umbrella.
A neon green toilet and sink are by a large trash can.
Some people gathered together on the snow covered ground.
A man riding a surfboard on a wave in the ocean.
The crowd of people are gathered in front of the building.
this is an unmade bed with a flowery blanket
A long line of skiers is waiting on a snow covered mountain.
Smiling young girl holding video game controllers while standing
A woman in a blue riding jacket rides a dark brown horse on a riding course.
A brown dog laying on floor under a brown mat.
A couple of people riding waves on top of boards.
A sandwich on a plate with a side of coleslaw on a tray.
A small boat is going down a water channel.
A woman with an umbrella standing by a fountain at the park.
A clock behind a fenced in area in a city setting.
A cat sitting on a couch looking intently at something.
Street signs showing streets with a one letter name
A young man scoffing a huge slice of pizza from two paper plates.
A table cluttered with a bunch of stuff.
A woman wearing sunglasses and a hat is smiling.
A train sitting parked on tracks next to a platform.
A young woman looking at a store display and holding an umbrella.
A clock that has been placed on a window sill.
A giraffe in a grassy fenced in enclosure.
The  colorful lights are illuminating the darkened street.
A stuffed zebra posed and being chased by stuffed wild dogs.
A rear view mirror has the reflection of a truck.
A small pizza sits on a granite counter top next to a napkin.
Two firetrucks with their lights on are stopped on this road.
A woman sits cross legged near a pile of eggs.
Guy on bench looks over while eating pizza
Chickens on a sandy beach with a motor boat in the background.
a couple of zebras are inside of a caged area
A bus stop sign that is on a pole.
A English muffin lays on a plate next to a drink.
An ornate antiqued pole holding a clock with trees in the background
Small white sheep below another sheep eating in an open field.
Several kites are flown along the shoreline on a cloudy day.
two people holding surf boards on a beach
A zebra standing on top of a dry grass field.
A couple of giraffes looking attentively at the camera.
A non passenger train sitting out on the tracks at a curve
Colorful lights reflect off the items inside this bathroom stall
A man standing behind a woman holding a bat.
A man playing with his dog near the water.
Two men playing frisbee in a large field
The man is outside playing Frisbee with his dog.
A large elephant standing in a grassy field.
A man is using his board to surf a wave
A young man wearing goggles with spiky hair dressed up like Robbin.
A toilet sitting in a bathroom under a window.
Sink with electric toothbrush and toothpaste sitting on the top.
Plate of vegetables made from knitted yarn on wooden plate.
A group of young people playing a game of frisbee.
Two young girls sitting a big bench on the beach.
A small dog with long hair sits on a computer desk.
Several brown cows grazing in a field.
a person standing next to a fire hydration that is spraying water
A cat underneath a car on the pavement looking Rome underneath
A very large jetliner sitting on top of a tarmac.
A group of people sitting down at a table together sharing a meal.
This truck has an open deck for the passengers.
Interior of a public toilet stall in a country that squats to defecate
A counter topped with small different shades red tiles
Young boy in front of a large elephants cage.
There is a parking meter with one side covered up.
A large passenger jet flying through a  cloudy blue sky.
A black steam engine train sitting on top of rail road tracks.
A very close up view of a very pretty bird.
A store that has trees on the side of the building.
A large stuffed white teddy bear sitting on a bed.
A baseball player wearing a white and red suit with the number 19 gets ready to hit his bat.
a large bed is in a white room
Several pieces of furniture are in an empty parking area.
A small kitchen with stainless appliances and red cabinet doors.
The baseball between the pitcher and the batter during a game
A vase of flowers on a table near a window
A fighter plane is taxiing down a runway.
The father and daughter are under an umbrella on the beach.
A man walking on the sidewalk next to a suitcase leaning against a lamp.
A cat laying in a bowl on top of a pillow.
A woman is standing in front of a birthday cake.
Cars driving on the street and people walking on the sidewalk in a city.
A plane is sitting on the ground at the airport
A pier stands in the ocean while people wade in the water.
A green fire hydrant sitting in the middle of a sidewalk.
There are apples and oranges on top of a table
A person cut out a bird shape out of a piece of paper.
a big crowd of people that are looking at a zebra
A large bear walking around a zoo enclosure.
a zebra standing next to a car on a bright day
A woman flying a kite on the beach under a grey sky.
A large flock of birds flying in the air.
A soccer player is about to kick a soccer ball
a fridge stove sink and dishwasher and a dinette set in a kitchen
A group of people standing on top of a building near a large clock.
Different types of fruit displayed on a table.
Skateboarders waiting to hear the go ahead word to skate down a ramp.
An open laptop computer sitting on top of a wooden desk.
cars on the road that are nothing but blurry lights
A bedroom scene with a bed and dresser.
A young woman is pulling a casserole out of the oven.
A man in a blue shirt serves a tennis ball
A table holding two trays of cookies and a cake.
A full view of some cows grazing on a field.
A plethora of apples sitting inside a bowl.
a home made pizza sits on a trey
A white toilet missing seat in an old bathroom.
a yellow and black train is on some tracks
A sepia-tone photo of a man and a boy standing near a stove.
A piece of cake sitting on a square plate.
Man standing on side of busy street next to a mall.
a small airplane sitting in the middle of an airstrip in a field
a couple of surfers are walking out of the sea
There is a group of people flying a kite together
Two saucers have a doughnut and cappuccino on them, respectively.
A woman seems to be doing yoga on a surfboard in the water
Man holding dog mouth open to brush teeth in tiled area
3 microwaves cooking something and catching on fire.
A man who is riding a wave on a surfboard.
A clock on the wall inside a mass transit vehicle.
A computer and keyboard are on a computer desk.
Two men stand near another man who is jumping onto a bed.
A group of people standing around a living room
a heard of sheep on a grass field.
A group of people ready their skiing equipment in the snow.
A tennis player throws the ball up to hit it.
The traffic light is in front of the building.
a man waterskiing behind a white boat on a lake
A black and white cat is sitting in a window.
A sandwich and french fries on a paper plate
a man is flipping through a book on a bed
Disc on beach, with dog prints in sand
A dog is standing next to a cat on a suitcase.
A plate with asparagus, broccoli, carrots, cauliflower and a sandwich.
A group of elephants standing together in a field of grass.
A red fire hydrant with a hose sticking out of it.
A airplane that is sitting on a tarmac.
A display case holding various types of donuts in metal racks.
small boy eating food from a white plate
a large green and white clock tower in the middle of a plaza
A skateboard zooms down the railing at the skate park.
a man in a tie holding a cigarette and looking down
A cross country skier on a trail, smiling.
Some people riding a motorcycle near a bunch of motorcycles.
man having fun with a video game system
A man sitting on couch with two little girls.
A group of people stand by a red lighthouse.
The mother smiles as she holds the baby boy.
A man teaching a girl how to play tennis.
A broccoli head with onions and potatoes by a wooden wall.
A woman standing next to a man near a traffic light.
A person on some skis in the snow.
an image of a woman sitting in a dark room
A very long and wide road with some assorted vehicles.
Two hotdogs and a side of french fries in yellow containers.
A stop sign in front of a brick building.
Four men in a lake attempting to stand up on a board together, with their hands raised in the air, and one man in the water.
people are at outdoor seating with umbrellas overhead
A tennis player prepares to hit a forehand on a red clay court.
Small group with a folding table next to a decorative old bus.
The bowl is full of broccoli and some kind of meat.
an image of a place setting with soup and biscuits
A bathroom with a large white tub and his and her sinks.
We see a picture of many many teddy bears.
A group of men sitting on a snow slope while attached to snowboard.
A pitch approaches the batter in a baseball game.
A man in glasses eats a slice of pizza.
this photo is blurred it is of a house
A black cat standing inside of a piece of luggage.
Pack of zebras in a zoo standing together.
A metal pole with three street signs pointing different directions.
A man riding on a wave on top of a surfboard.
THIS IS A PICTURE OF A TOILET AND SINK IN A BATHROOM
Man smiling with hat in kitchen with mess around
A young man in a baseball uniform with his arm pulled back.
a red bench and some buildings and lights
A bird sitting on the branch of a tree near leaves.
A sink with some cups on the counter top.
A man on a horse without a saddle stands on a hill.
Low view of small passenger train moving through the countryside.
A person with an umbrella and some cars on a street.
A man is playing Wii tennis in his living room.
A large jetliner flying through a cloudy blue sky.
A man standing on a beach near luggage.
an empty and clean wood floored home kitchen
A brown and white dog sits on grass next to a Frisbee.
Many kites fly above a crowded beach.
A picture of a room with a table that has a vase and candles on it.
a plant that has a yellow bird on it
Young man exclaiming over an unripe green plantain.
Two teddy bears that are sitting next to each other.
Some ripe bananas are in a brown wicker basket.
A young boy is skateboarding swiftly through a crowded park gazebo.
Four people on skis standing in the snow
A book on finance sitting on a bed.
There is a blue and yellow train stopped at a train stop
Two oranges and a banana laid out to look like a sad face
there is a military truck that is stopped on the street
A crowded harbor filled with small sailboats and other watercraft.
a blue bus is parked by a bench
Three cats on a bar watching television very closely.
Chefs and cooks are preparing meals in a restaurant kichen
A microwave and a cone on asphalt by bushes.
a red and white plane and a blue and white plane
A very tasty looking cheese and vegetable dish
A person with a snowboard next to a man with skis.
A person that is eating some food in her mouth.
A very small kid in the road next to a big yellow bus.
A movie cover with some food on top of a plate.
A silhouette of a woman with a tennis racket.
A teddy bear is sitting alone in a window.
A woman sitting on a brown couch with two children.
Littleboy been playing with a Nintendo Wii and amused
Two children playing a miniature version of tennis on a city street.
A group of sheep walking in a grassy pasture
A small airplane in the sky and another in the water.
A pair of giraffes is stretching up to a limb in perfect harmony.
The bananas were cut to put chocolate inside them for a treat.
Two benches are empty on a sunny day.
Two young males playing a video game together in front of a tv.
A man in green and a red haired woman sharing a laugh.
two guys riding bicycles while carrying their surf boards
A bunch of different types of tools in a play kitchen.
a street sign on a wooden pole near a fire hydrant
Fresh cut flowers in a glass vase on a tablecloth
A banana sitting on top of a table next to a  paper.
A person is parasailing on the water under a cloudy sky
A fire hydrant is placed in a wooded area
A pizza sits on a table and it has cheese, olives and broccoli on it.
Sheep grazing in a lush, green field on a lavish farm estate
Older Americans ride in a simple parade float adorned with red, white and blue decorations.
A person wearing a glove holding a chili dog.
A close-up of the rear end of a propeller plane.
A black park bench sitting near the water
a person walking on a sie walk talking on a phone
Several cows are on a sloping grassy hill.
a blue and white plane flying over a lake.
a bike with a tarp and boxes of items
Two men standing on the street wearing a suit and tie
A shirtless man with a hat and sunglasses holding a frisbee in one hand and in a stance where he is preparing to throw the frisbee.
A tub and shower with a curtain in a bathroom.
Some cars at a traffic light, one with a red sticker on the back
A group of people play a game of frisbee.
A bathroom with a separate area from the sink.
two zebras walking next to each other in a desert area
A man is sitting in a chair watching television with a remote control in his hand.
That looks like some sort of huge satellite.
A person is typing on a lap top and there is a person up on screen.
a giraffe in a field with rocks and grace
The giraffe is standing alone in the field.
A can of soda and a cat with kitten next to a monitor.
a bread with some noodles and minced meat
a bird that is sitting on a log in some water
A man wearing a bow tie walks in the rain with an umbrella.
Large man in leather biker outfit with a small brown dog.
The wooden boat is floating on the river near the bank.
A kitchen with a stove, refrigerator and a microwave.
People flying kites in the snow on a sunny day
A plate with grapes, green vegetables, and noodles on a child's place mat.
a cow walking in a crowded city street
A closeup of a bull cow with horns on its head
A person does a snowboard trick on a rail in the mountain
There is a grilled sandwich on a white plate with sauce
A stove is shown with a mixer next to it.
dilapidated, dirty bathroom with mold and water damage
A yellow bus and blue bus passing on the street
A cheese pizza pie is in the serving dish on the counter.
A soccer player blocks the goal during a nighttime soccer game.
A person is snowboarding down a hill fast.
A man holding a ball as he leaps into the air.
A kitchen that has wooden floors and a bay window.
The "Yoctangee Park" sign has a Native American on it.
a park bench that is on top of some bricks
Side by side view of two oval plates, one with fork, with chicken salad sandwiches and rosy new potatoes, by an open and an unopened bottle of lager, a pepper mill, paper towel roll, basket behind.
A very large elephant in a field standing next to a pond
There is a woman drinking from a fire hydrant and several other people nearby.
you can see a large belt that is used to make donuts
A man holding a tennis racquet on a tennis court.
A man holding a racket playing tennis at the court
A woman sitting on a bench reading a magazine.
A young boy and girl playing on a ride.
A large grizzly bear walking through tall grass.
A pair of scissors next to a writing instrument of some sort.
A small child using skis to ski down the hill.
Surfers walk out through the surf toward large waves.
A person is holding a computer and watching a flat screen t.v.
A bus is stopped in the middle of the road.
Elephants with passengers walking through a calm river.
A little boy that is standing on a skateboard in the street.
Meat, carrots, and a roll sit on a small white plate.
A bathroom with a wall mounted toilet and TP dispenser.
A herd of zebras in a tall grassy field
a close up of a plate of fruit with apples
A zebra standing in water next to grassy area.
A large herd of elephants at the edge of a body of water.
a kitchen with a refrigerator near a window
A small elephant laying on the ground in the mud.
A red train or trolley car is shown at a station.
A man holding a baseball bat on a baseball field.
two people riding skis across a snow covered forest.
a person is touching a small stuffed bear
A couple of horses grazing on a lush green field.
A woman standing by a yellow fire hydrant
A man plays with two young children in the grass.
An elephant stands between two bushes on a dry field.
A shoeless foot standing on top of a skate board
A man stands on skis near a snowy mountain.
A farm house and barns in the background with horses and farm animals in the yard in front of them.
Several views of mean playing with a white disc on grass.
An ornate wrought iron frame holds a sign reading GARAGE.
a man playing tennis going for the return
A child in white shirt laying on bed in wooden crib.
A man and woman pose for a photograph while sitting on a moped.
A close-up picture was taken of a giraffe.
A yellow plate topped with different types of food.
A women is laying on a board surfing a small wave.
The zebras are eating the grass in their habitat.
A man dressed up for a themed party.
A black and white photo of an old train
a women that is eating a very long hot dog
A post with a clock and several birds sitting on it.
An Asian stir fry on a plate with chicken, brown rice, and broccoli.
Two men skateboarding down a road near some cones.
The street sweeper has a safety triangle on the back
A motion blur street scene of people and a bus.
A man riding on top of an elephant.
a car on a road with people standing on the side walk
A stuffed bear and a stuffed bunny sitting beside of one another.
A man taking a swing at  a tennis ball
A photo of a green and red train on a set of tracks.
a vintage photo of some kids playing on a bed
The skier is upside down in the air.
A kitchen counter with an unassembled food processor.
A group of people sitting on a train.
A dog that is standing on top of a fire hydrant.
Three white plates topped with pizza on an orange table.
a black and white photo with a sign next to a building
I can't see what the images are in this one
A planter is full of green plants along side a fence.
a monument with many kites flying near by
A living room with hard wood floors and furniture.
A rusty looking parking meter is on the pavement.
Three horses are walking through the grass wearing blankets.
A man in a suit and tie beside a stack of suitcases.
Two pieces of pepperoni, sausage and ham pizza on a plate.
A bed with a stuffed animal looking out a window
A batter, catcher, and umpire stand on a baseball mound.
A closeup of a shelf displaying a canned beverage and a muffaletta sandwich.
Man standing on a skateboard with a person sitting on it.
A bathroom stall that says did you check your lipstick
A large wooden clock hanging from the side of a large cement pole.
A small blue and silver airplane spewing smoke at an air show.
A bag sits on a white sheeted bed
A white house with a red top next to the ocean.
A large green truck with giant tires on it.
A packed Chinese train is filled with commuters.
There are several bananas that are in the table
A vase of flowers that is on a table.
Two dolls with crazy hair and interesting clothes.
THERE ARE BLACK AND WHITE KEYS ON THE KEY BOARD
A computer desk is very cluttered with various items.
A trellis and arbor with a bench under it
A bunch of zebras grazing near a road where vehicles are driving by.
A group of parked motorcycles at a parking lot
A person can be seen trying to cross country ski as though they are on a slope.
a person is skiing outside in the snow
A market display with the rows of vegetables in baskets.
A man leaning in to see the laptop he is using
A man in black jacket playing a game with a Nintendo Wii controller.
A cat is sitting on a laptop's keyboard
An old man in a suit and tie is staring.
Two pizzas sitting on top of a counter top.
A four way traffic light showing the green light lit up.
A rack with many accessories next to a refrigerator.
Two UPS trucks are parked side by side beside a building.
A skate boarder practicing his tricks on the ramp.
a little bow outside in a yard by a bar, playing frisbee
Guy in shirt and tie walking away from the chairs
A man doing a trick on a motorcycle.
a kite flying in the sky above a body of water
A group of fruit, vegetables and eggs on a kitchen counter.
A man standing next to his wife as she holds their baby.
A modern kitchen is displayed with silver decor.
All white bathroom with shelving unit over commode.
A surfer in a wet suit rides a wave.
A beautiful woman riding skis down a snow covered hill.
A blue sign in front of a bamboo wall.
A bathroom with white toilet and sink visible
five used toothbrushes in a clear glass on top of a sink
Black and white photograph of a man on a motorcycle.
A bunch of young boys playing soccer and having lots of fun.
An adult carries a child and a surfboard through the waves.
A man holding a yellow frisbee in his right hand.
Three boats filled with people floating down a river.
a player squatting down to return the ball
There are a collage of pictures of different foods
A dog standing outside next to a car.
A woman is taking the first bite of a banana.
a steel bridge over the water with a train
A baseball player swinging a bat towards a ball.
Five snowboarders in yellow jackets perform a simultaneous jump.
A barber giving a man a haircut with a blue smock on.
A wooden table with two plates of food and a paper
an oven and a small table in a home kitchen
A person sitting at a table with a cup of coffee.
An open computer next to books on a table
a big zebra that is on a dirt ground
Vehicles and people on a crowded city street.
Two people posing next to a giant statue with a suit case.
Three planes fly high in the sky in unison.
A person on a surfboard riding a wave.
A tray with four different types of food.
A man holding a child with a toothbrush in its mouth.
A LONE GIRAFFE IS GRAZING IN A OPEN FIELD
A variety of vegetables hooked on sticks on a tray next to a remote control
The wheel of a bicycle going down the street
A mailed postcard of people in a boat being rowed
Sausage and cheese on bread on a plate
a person riding skis on a snowy surface
a man standing on a porch holding a bat over his head
Yellow construction trucks parked in line on a dirt road.
A red train traveling out of a dark tunnel
A living room filled with furniture and windows.
A man is lacing his boots while several others are ready to ski.
Mushrooms are used in many variety of dishes
Several oranges hanging on tree branches in a grove.
An antique semi with flames painted on it.
The woman is standing by the elephants outside.
there is a slice of pizza with mac and cheese on it
A dog sits in the side car of a motorcycle.
A bathroom with a wooden frame around the mirror
A person on a skateboard near a building.
Two single beds that are made up with a night stand between them.
A large clock tower on top of a tall white building.
A woman wearing a towel holding a blow dryer.
A solar panel powers a public phone booth.
Picture of a bathroom with three paintings over the toilet.
A photo of a place during bike week.
A person is handling broccoli on a cutting board.
An orange and white cat standing in front of a flat screen TV
A view of a person's legs sitting on a bench alone.
A man stands on a white object while playing Wii.
A large elephant is staring in front of a fence.
A man sitting on the beach behind his surfboard.
The bathroom is clean and ready to use.
broccoli cauliflower and carrots in a white bowl
The Master of Hounds leading the dogs out for a fox hunt.
Crammed and congested city street in oriental area with many people and buildings.
A cat walking through a kitchen by a eating tray.
a person holding a tennis racket on a tennis court.
a shops table filled with apples oranges and other fruits
A man wearing skis poses for a picture in the snow.
There are two beds in the bedroom, along with a desk and a television.
A tight, rectangular kitchen space, with kiwi colored walls and a grey door, shows cabinet and counter spaces of pale wood, holding built in appliances, that borders a white tiled floor.
a single person standing on the side of a snowy mountain
A baby crying with a teddy bear in its arm.
A man and a woman sitting on a motorcycle.
A view of a tree with pink flowers as seen in a mirror.
an image of the back end of a childs car seat
The giraffe is standing inside of the pen.
A dimly lit bedroom that has odd colored walls.
Roadsigns showing stop lights, right and left turns and warning cyclists to dismount.
Two polar bears are sleeping atop some rocks.
Two bowls of food on top metal plates.
There is not much space left for anything else.
A person skateboarding on an outside basketball court.
Carry on bag sitting on bench near metal railing.
A sticker promoting vegetarianism has been placed on a stop sign.
This restaurant provides laptop computers in the booths for each of its patrons.
A yellow bowl filled with soup next to another bowl of soup.
A couple of VW buses parked in front of a small brick house.
a new kitchen cabinet with a sink being installed
Two cakes shaped like trains are on gold foil.
The room is crowded with many things including chairs, a bicycle, and a table with cups on it.
A train going through a tunnel under a building
An elephant is walking through the mud behind a gate.
Several bruised oranges and lemons mixed together.
A public toilet with the lid up in a stall
A framed picture and reed diffuser sit on top of a toilet in a bathroom.
A man in a white sweater placing a turkey in an oven.
All aboard for a ride on the tourist train.
A bathroom wall with three urinals on the walls and images of women peeking out behind trees on the wall.
A tennis player is trying to hit the ball.
This is a downhill skier sticking his pole into the mountain.
A girl is having fun playing a video game of tennis.
Which one would you choose to drive, the beauty or the beast?
A single train at a train stop with many train tracks.
A couple hold their cellphones while taking selfies.
Several zebras walking together in the wild
A street free sign sitting under street lights on a bridge.
A memorial with various plaques and American Flags on it.
a toy animal is wearing a feathery hat
the white vase has drawings of women on it.
Two birds are sitting on some gray cement.
A man on skateboard riding a skate ramp.
Some cats laying on a bed and posing for a picture.
a train on a track near people
A lady in red water clothes skiing on a lake.
A beautiful woman sitting in a bed holding a tooth brush in her mouth.
A wire basket of bananas and apples on a table.
Some people sitting and painting a road divider
A young man is playing frisbee in the park.
A wooden bench under a tree in the field
A girl takes her friends picture while wearing leis.
The zebra is in the field standing all alone.
Several cross country skiers prepare to start down a course.
A woman skiing down a steep hill as snow flies up in the background.
A older TV on a shelf with videos on shelves on either side.
An eagle is standing on top of a pile of rocks.
A woman playing with a dog while another person is skiing.
The white toilet is sitting in the corner of the bathroom.
Three sheep standing together in a grassy field.
A lady is standing by the white truck.
A man with a bald head has a cell phone to his ear.
A chicken and cat walk in a barnyard.
A living room with a large couch and a coffee table.
a tall building with some clocks on it below a cloudy sky
A bunch of broccoli that is near carrots
Bunches of bananas are shown for display at the market.
A sandwich cut in half and a cup of coffee.
A baseball player holding a catchers mitt on top of a field.
A wooden double door refrigerator with one side opened up.
A number of seagulls stand in the shallow water as the tide sweeps over the beach.
a man dressed nicely and sitting next to a female
People holding signs on a one way street.
A giraffe kissing a man with a shaved.
THIS IS A SIDEWALK SHOT OF A PLACE CALLED THE LION
a person sitting at a table eating food from a plate.
The slice of pizza has large chunks of tomatoes on it.
This is a spacious bathroom with an interesting tile pattern.
Dinnerware with fruit painted on them beside a matching vase.
Two girls are smiling and staring in their school uniform.
A woman in a room with multiple cats laying and walking around.
A vase of flowers on a white sheet.
A person taking a piece of dessert from a plate.
A man playing tennis going high for the ball.
Boats in a river with trees alongside in a rural setting.
A motorcycle rider gives a thumbs up to the camera.
A truck is carrying a load of logs.
this is a pair of women sitting on bikes
A plate with a hot dog, chips and a strawberry.
Woman in a field playing with spectators watching
A man and woman that are standing in the sand.
A small cat is walking behind a bike.
A man in black jacket riding skis on a snowy slope.
Two people standing near the ocean with sails in the sky.
a man with a hat skiing on the snow towards a building
A man that is standing on a court with a racquet.
A woman is trying to catch a frisbee.
A stop sign on a residential area has caring under the stop.
a bench near a tree near a light pole
a girl in front of a stop sign
A surfer in a wetsuit in the curl of a wave
Two rectangular boxes with chop sticks have food in them.
a young man on the beach holding a sall
A girl is at a table with two pizzas.
a little boy standing beside a toilet in the bathroom
a man in a wet suit stands on top of a rocky hill
Two ponies are running through a grassy field.
A man riding a skateboard down a cement ramp.
a close up of a cat on the ground looking in a mirror
A statue of a man riding a horse on a tower of rocks.
a guy sitting on a balcony using his laptop
A parking meter is next to white wires.
A group of men playing a game of tennis on a dirt court.
There are many giraffes standing among each other
Player walking away from home plate carrying bat during game.
A woman strikes her tennis racket against a ball.
Guests gather around and converse at a wine tasting.
Several people mounted on horses riding down a trail.
a group of surfboards stuck in the sand near the ocean
A table topped with ripe bananas sitting in piles.
A young boy standing on a street holding a skateboard.
A man wearing a black ski suit preparing to go down a snow covered hill.
a very large pizza with a fork and a knife
Man in wetsuit surfing next to a small wave.
A Chinese lady on a boat wearing a Chinaman hat
A pink cat creature sewn to the side of a pink bag.
A dog who is sitting on a couch.
Many people standing in a field with a red flag and many kites.
A large white cat sitting on a table in front of a TV.
a bathroom with a lot of toilet paper next to the toilet
A few people are in outside in the snow, with their ski gear.
Two military officers cut a cake with two civilians.
this kite is being flown above a city
A man on a horse in the middle of the street.
A man holding a cigarette and talking on a cell phone.
The polar  bear is white and showing his teeth
A black cat laying on a green pillow.
A plate of pastries with fruit and a fork and knife.
A man surfing waves in the ocean on his surf board.
A group of black cows with horns standing in the middle of a street.
A broken surfboard on a beach with trees in the background.
A group of people sitting around a table eating.
Two people are passing a man playing a piano on the street.
Black and white photograph of a fence next to a fire hydrant.
Young child brushing teeth using blue and white toothbrush.
Vintage red truck parked on a parking lot alone.
Two hot dogs with chili, cheese and tomatoes.
a small dog is sleeping on a chair
A man on a surfboard rides a wave.
A baseball player is winding up for the pitch.
A row of many kites in the shape of cows fly along with other kites.
A woman holding a plate of food and a glass.
A red fire hydrant on sidewalk next to a wet street.
a star shaped kite flying high in the sky.
Four pans of food on a stove in a restaurant.
The blue necktie shows a picture of a pocket watch.
A stop sign on a one way street.
Traffic signal on the side of a bridge outside.
A team of two makes their way down the water on a primitive raft.
Two red double decker buses passing in opposite directions.
A skier posing for picture while straddling a tree.
Four people with a group of elephants on a hill.
two cups of coffee next to a white plate of pastry and icecream
A backpack with rollers is sitting unattended in the middle of this forested dirt road.
a fire hydrant sits off a city street
A bowl of soup, a metal spoon, and an orange on a wood surface
People on a slope snowboard and skiing next to trees.
A toilet sitting in a bathroom that is being remodeled.
A cardboard box contains some old vegetables and some trash.
Line of fire trucks driving down a city street.
A plate full of different types of food.
A man is holding scissors to his own head.
A big pretty commercial plane on the runway.
a person sitting on the ground wearing a suit and tie
A little blonde boy wearing a tie and purple shirt
A vase filled with pink flowers on top of a table.
Two trains stopped side by side in a railway station, both with platforms
A bathroom with shower and plenty of toiletries.
Slices of pizza on plates and drinking glasses.
A parking meter reads COPE on one side and four dollar signs on the right.
Two trains on parallel tracks near a station
A men's public washroom with a blue floor.
Four people sitting around a computer station talking.
A man next to a woman with his horse by a house.
A sign on a metal pole on a street.
A man sitting on a couch using a laptop
Pack of elephants in tall green grass as one has its trunk raised.
A home made pizza with cheese is on a shelf.
A keyboard, mouse, and wires on a desk.
Man speaking on phone with large sideburns
A bus is going down a rural highway road.
An Asian meal with noodles, vegetables and soup.
A woman pushing a baby carriage by a building.
A horse drawn carriage stopped near the water.
A donut is laying on a large noodle looking mat.
A piece of cake on a white plate next to whipped cream.
People on the sidewalk near a no left turn sign on a post
this living room is done in colors of black and red
A street sign prohibiting bicycles, skates, and skateboards.
A dog lying on a cement porch in front of a brightly painted building with a motorbike next to it
A beached sailboat in the sand with a chair next to it.
Fire hydrant in non-traditional paint, whitish yellow paint with black polka dots in front of old style firehouse with USA flag.
The reflection of two dogs being walked down the street
A young man riding on top of a skateboard.
A vase sitting on top of a wooden table in a living room.
A woman is looking at pastries in the shop's window.
A pizza with onions peppers and cheese and coke to drink
An elephant standing on the ground near a lake.
An adult on skis is standing near a group of children with skis on.
A bathroom with red and white tiling and a toilet and floor drain.
Two horse next to each other walking down a road .
A man reaches to catch a frisbee in a grass field.
a group of kids standing next to each other in a room
A train is approaching alongside a body of water.
People are walking on a beach alongside giant rock formations and flying a kite.
A bright kitchen with blond wood table and chairs and side server.
Cross country Skiier trekking through heavily forested land with snow.
A boy learning to skateboard in a park
Gourmet pizza cooked and sliced and on a plate
A variety of furniture sits scattered in a storage facility.
A woman wake boarding in a lake having fun.
some different items of food in a glass case
Two tennis players play tennis on the court.
Several horses walking along the beach by the ocean.
Some big baskets filled with tasty looking apples.
A polar bear standing open mouthed on a glacier
There is a blue bike leaning up against the wall.
A picture of a car waiting at a intersection.
A zebra grazing and standing on the grass.
A set of professional knives attached to a mounted magnet.
A woman is sitting on a leather couch smiling at another woman.
An airplane flying over a big harvest moon
Group of giraffes standing behind a caged in area.
Various equine horses and zebras inside stalls under a tent.
a bent white sign with a black pole
The buses are parked on the side of the street
Baked pizza with red tomatoes and green olives.
A cat drinking water out of a water bowl.
There is a cat standing in a toilet.
A very pretty horse in front of a big metal structure.
A man and a baby with toothbrushes in their mouth.
A hot dog or sausage in a bun with bowl containing condiment and bacon on the side.
Man chopping a chicken on a butcher block with a bottle of wine in front.
A bench that has some water drops on it.
A street pole that has a street name sign, a one way street sign and a map sign on it.
A girl is standing in a field and flying a kite.
A surfer posing for a photo with a surfboard
A close up of a zebra's back with its neighbor's mane in the background.
A cat watching water go down a sink drain
A man laying on a bench and a woman next to him touching his face.
A person wearing blue jeans and black tennis shoes riding a skateboard.
A very close up view of some very tasty looking food.
a person is reaching for a piece of pizza in a box
A man in a suit and tie posing for a photo in a large building.
A street sign where St. Stevens St crosses 17 Ave S.
a man in white holding a plate playing
A man is holding his arms out on his surfboard in the middle of the sea.
The chef is putting ingredients on the pizza.
Young baseball players on a field with a pitch being thrown
A modern style kitchen filled with many different items.
A home office features full bookcase, a laptop and a red leather chair.
A man happy about a truckload of bananas.
A varied collection of glass bottle containers on three shelves
the girls are standing in a room with the window behind them
A couple of women shaking hands on top of a tennis court.
A person wearing a wetsuit with a surfboard under one arm.
A baby boy sucking on a pacifier while wearing a diaper.
A man posing for a brochure picture with Akieys translations on it.
A woman is watching a kid and man playing Wii.
A group of guys standing behind tables on a stage before a presentation.
White bowl with assorted fruits being eaten by fork.
We are looking past a speaker at a monitor.
A white stuffed teddy bear sitting on a couch.
A wooden desk has an open lap top on it and a pair of scissors.
A refrigerator and a stove in a kitchen.
A woman is smoking a cigarette and on her phone.
A bowl full of food that is sitting on the table.
A person standing on a sandy beach flying a kite.
Street sign showing the name of the street in English and then in Asian characters below
two bikes sitting on a walkway next to some trees
Several men standing beside of each other in a line.
A giraffe leans its head over the fence of an enclosure.
A personal size pizza with tomatoes, spinach, and garlic.
A school girl in a uniform in front of a window.
a man is parasailing out at the beach
A female showing an open door to a refrigerator.
A bathroom with a sink, tub, shower head and mirror.
A zebra with his head down eating grass.
A man stands on a street corner next to a stop sign.
A retail sign is hanging above a stop sign alert for added effect.
A mounted police officer riding down a city street past parked cars.
A passenger train moving along a railway in the country side.
Two people ride horses with trees in the background.
A white car at intersection of two roads.
a young man staring into the camera sticking his nose in between the handles of a pair of old metal scissors
A giraffe standing next to another giraffe on a lush green field.
A room full  of electronics and musical equipment
A man riding a wave on top of a white surfboard.
A clock tower with sculptures and a bell.
Two people in a park throwing frisbees at the camera.
The two zebras are standing together on the land.
A made up bed in a well decorated room with art pieces on the wall.
Women hiding from the sun on a city bench
People sitting at the umbrella covered tables next to the river
A train going down the rails passes under a pedestrian walkway.
A cat standing by some bushes outside in the woods.
A bathroom with tiled floors and double sinks
Three stop signs in the middle of the street.
Two men are snowboarding and skiing in the snow.
Man with camera holding kite in park setting.
A man is sitting behind a laptop computer.
Two tennis players are walking by the tennis net.
The man with the red and black bookbag is walking toward the building.
An adult man helping a youth on a skateboard.
this is a sign at a gas station
A piece of thin-crust pizza sits on a plate.
Zebras inside of a fenced in field eating grass
A young girl scrunches up her face as she holds a video game remote.
A cactus sits in a pretty green vase.
A city bus stopped at a crosswalk on a street.
A girl sitting at a table drinking from a bottle.
A person with their feet propped up in a chair in front of computer equipment.
A street side shop next to an intersection.
A train sitting at a train station platform.
a big plate of food that is on a table
A man swinging a tennis racquet on a tennis court.
An old lady smiling in a pink kitchen.
Bray pickup truck parked in driveway of residential home.
A white kitchen with stainless steel appliances and granite counter top.
This is a black and white family picture taken in the mid 1900s, of Grandpap, and his progeny
A picture of something and it appears like food.
A flock of ducks floating on top of water.
A woman with an umbrella stands with her belongings on the ground.
A white, yellow and blue airplane on a runway.
A resort with palm trees, bridge, people and bushes.
a surfer on a surf board in the midst of a wave
a person standing in a living room playing nintendo wii
Several people are riding in a horse race.
A red city bus driving through city streets.
Bird walking in the water near the shore edge.
A white and black bedroom with a white bed.
A plate of food, dishes with food, and a pot of flowers sitting on top of a table.
Two pictures of a burger, onion rings and a beer.
Several pans containing a few slices of pizza are displayed on a table.
A man with his arms crossed in a Santa hat and wearing a tie.
The young person is carrying their surfboard into the water.
A refrigerator in a kitchen next to a dining room.
an image of a broken fighter plane on the runway
The employee is carrying gas canisters on a bicycle.
A skateboarder comes off his board on a ramp.
Some kites flying over some buildings in the snow.
there is a very tall tower with a clock on it
A new roll of toilet paper is on the back of a toilet.
a crowd of people are looking off of a balcony
a child on top of a bucket on the front lawn
A man riding on the back of a green motorcycle.
Boats docked in the water with a cloudy sky above.
a close up of a number of different remote controls
A woman sitting in front of a plate of food.
The person is reading music and playing a keyboard.
Commercial passenger jet at gate on airport tarmac.
A man is surfing on a wave while another floats with his board.
Two people are drinking red wine from wine glasses.
A child standing completely upright in front of a refrigerator.
A woman is on a red surfboard in the ocean.
A male and a female sitting together, the female is texting on her phone.
A person guiding a child down a hill on skis.
A white airplane is on a crowded airport runway.
A person is standing on a snowboard near a bridge.
A giraffe is eating grass in an open field.
A number of peach trees on a sunny day.
a man talking to a group of kids as a cow stands in a cage
a stuffed animal sits in front of a book
Several people are playing at a beach with a boat in the distance.
The man is sitting on a low ledge.
a dog sitting in the passenger seat of a big truck
a person taking a photo in a mirror
A couch and furniture in a small room.
Three men who are standing around a campsite.
A sign mounted to a pole that reads " No Stops ".
A cat curled up on a bed next to a stuffed animals.
A jet is sitting on the tarmac with blue sky's around
Broccoli dish in a bowl with a fork inside of it.
there is a card board bus with a cat sitting in it
A man holding a large umbrella with some girls and a woman underneath.
Three people are posing next to a raw pizza.
A single person skiis down part of a mountain
A person holding a half eaten hot dog with toppings.
A small child is in the kitchen with an adult and dog.
This white passenger bus is waiting at a stop
A train on railroad tracks beside a platform.
Two dogs sleeping on a semi made bed
A woman standing next to the ocean flying a colorful kite.
An ornate clock is surrounded by artwork and white arch.
A hitter watching the baseball approach during an at bat
A young child holding a skate board and pointing into the distance.
a small plane flying through a blue sky
A dog sitting on a bed in a sweater with a indifferent look.
Three elephants with seats and umbrellas stopped by a body of water
A street crossing with a street sign for Mulholland and a no-U-turn sign.
Chicken and assorted vegetables are frying in a pan.
A photo of people looking off into the distance holding an umbrella.
A skateboarder launching his skateboard into the air as he rides it.
The little girl ordered a piece of cake at the restaurant.
A dark road with power lines and street lights.
a transportation bus parked in a parking lot
A boy cutting a pizza on a wooden cutting board.
A small bird sitting in the sand at the beach.
A zebra, walking on dry grass, is seen from the rear.
there is a piece of cake on a plate on the table
Clock tower overlooking a red shuttle bus outside.
A young foal nuzzling its mother in the nose.
Two boys perform skateboard stunts on the street
The cat is sitting on top of the remote.
Various home appliances are lined up on the sidewalk.
A child smiles in front of a container of carrots with a stuffed rabbit.
A casserole sitting on a counter with apples and a measuring cup behind it.
A strawberry pound cake with a slice taken out.
a red and white airplane ascending in the sky
A plate of food including rice, broccoli, protein and a sauce in a bowl.
Two girls eating food at a Chinese restaurant
A formally dressed man with a martini poses with two women in evening gowns.
A big pair of scissors is on a wooden box.
A tall vase full of orange flowers sitting on a table.
Group of people crossing a busy city street in the rain.
A male college student playing frisbee in the park.
A big commercial plane flying low by a bridge.
Cars stopped on a road blocked by a herd of sheep.
a woman is holding a toothbrush up to a masked face
there are many birds that are standing by the water
A large cut pizza on a dining table.
A person putting sliced carrots into a dish.
A male emo hipster wearing a furry jacket in front of a laptop computer.
A pile of carrots, radishes, green beans, and broccoli on a cutting board.
two bulls in a field between bushes with a sky background
A couple of birds are walking around the grass
A cat relaxing on a plaid couch on a person's clothes.
A work out ball sits on a chair near a cluttered desk.
A cat walking into a kitchen with a phone and fridge visible.
A man near the ocean catching a frisbee on the beach.
A cow with a tag is staring at a viewer.
This is an image of a man with an umbrella.
A person on a cell phone by a big stone wall.
A man and a elephant that are standing in the dirt.
A crumbling bathroom has a sink and a medicine cabinet.
A female pedestrian stands in the center of a crosswalk as a double-decker bus quickly approaches.
The young batter wearing a helmet prepares to swing.
The sheep are all standing around together in front of a monument.
A crow sits on the roof of a blue car.
a kitty standing in an empty food dish eating from another
Panorama of a field with cows next to a dirt road
A clock repairman working at his table displays his wares on the wall.
A painting of a house showing the bathroom, kitchen and bedroom.
a street sign on a pole with a sky background
A photo taken through a window at houses on a hill.
Three people near a truck in a sunflower field.
a person riding a surf board in a wave tunnel
A panda is eating a frozen treat with fruit in it.
A cat sitting on a window sill outside.
A cake decorated with things from a barbershop
A group of elephants walking down a river with people riding them.
A man is kissing a woman on the cheek.
Vintage tour guides stand next to an early bus.
A man holding up two ripe bananas in front of a house.
A white plate topped with carrots, potatoes and dumplings.
Shoes rest on a carpet next to a drawer with a picture on top.
A plate with waffles, butter and a fork and knife.
Man crouched over a suitcase looking at the items inside
A stop sign on a corner of a road
A woman on a surfboard in the ocean.
An old goat with big horns resting in the shade
A child in a room with a remote in hand.
A new home banner sits beside a small curvy road.
An orange has a frown drawn on it with a knife in it.
a bathroom with a bath tub a sink and a mirror
Many kites in a field launched and launching
A clock that is on the side of a building.
A woman sitting on the ground next to two dogs.
Food sits on top of a refrigerator covered in magnets.
a person standing in a living room playing nintendo wii
A gray cat is wearing a red knitted rabbit hat cozy.
A clock in a busy city at night.
Two patrolman on horseback standing in front of an establishment.
A large crowd watches a professional tennis match.
A large black and white cow standing in a desert field.
An orange cat laying on top of a wooden bench.
A close up of a motorcycle parked on the sidewalk next to a door.
A jumbo jet Fed Ex plane on a runway of an airport.
A brown teddy bear in a forest with trees and shrubbery.
a couple of birds stand on a grass hill
A decal of a skateboarding man is applied to the wall.
A bathroom with a pedestal fan in ti.
an inflatable blue car on the beach with a man walking beside it
Hungry man enjoys lunch at a local restaurant.
A small dog sitting on top of a computer  in a bag
Abstract picture featuring girl on tennis court with racket.
A couple dressed to be married are pretending to talk on cell phones.
Friends playing and taking pictures with a camera phone.
A girl is petting a horse out in a field.
Traffic on a city street with busses, trucks and cars.
A giant giraffe made of building bricks, outside of a building.
A train sits on the rails beside the station.
A toilet has several toilet paper roll dispensers.
A young boy using a lap top by a table
Guy in a helmet on a skateboard in red.
A young man on a skateboard doing tricks on the cement
A couple of people near a truck on the road.
Many bunches of bananas sit atop this grocery store display case.
A man is skateboarding while at the park.
A professional baseball player running around a base on the field
A flat screen and a keyboard and mouse on the desk
A city garbage truck with three men in the front.
A scene of something that is quite attractive.
A man in a shop holding a picture of two men.
A woman tennis player is waving at the fans while she holds her tennis racket.
A road shot has a radio antennae and a small section of windshield, a brown hillside, the vanishing road beside it, bikers, close to the antennae, and far away, and off in the distance, signs, a car,  and a big blue sky.
A heard of cows stands in front of a man with a tractor.
A baseball team talks with coaches on the outside of the field.
A crowd is watching a baseball game being played.
Television and computers on with no one utilizing them.
a baseball player throws a pitch to a batter
A child holds the line of a kite flying in the wind.
A man standing over many doughnuts on display.
A group of young men standing next to each other on a ski slope.
A penguin is running through a pasture as sheep graze.
A small portable set of burners with a tea kettle are on the counter top of a neat, clean efficiency kitchen.
A reflection of a dog in a vehicle's side view mirror.
A car driving by a herd of sheep.
A tennis racket is laying on the floor of a tiled room.
A fighter is jet flying through the clouds
A miniature blue bow of fruits next to a penny.
There are two pieces of cake on a plate and a glass of pumpkin juice.
A boat that is inside of the water.
A Thomas the train engine model cake with writing on the platform.
A FedEx plane moving on the snowy runway.
There is a white cake and some small cookies
view of a bathroom with white toilet and white sink
A workspace inside an office with snowy trees outside the window.
The next hitter in the baseball game saunters to the plate.
A little boy that is holding a bat.
A large mirror reflecting a bus driving down a street.
Silver and green train sitting at a train station.
A clock that is sitting on top of a metal pole.
A horse jumping over a wooden jump at a horse show.
a person racing on a motorcycle on a race track.
A smiling young buxom woman is displaying a sandwich and a glass of beer.
A woman standing in a room with a remote.
Two giraffes that are standing by each other in a field.
a baseball player swings his bat at a ball
A person on skis riding down a race course on a hill.
A large clock fixed to a building as vehicles pass by.
Two giraffes rubbing their heads and necks together.
A person standing in the snow with their hand up to their face.
A small boy skiing down a snowy hill
A passenger bus that has two levels driving down a street.
A view of a city street through the windshield of a vehicle.
A man working on a propeller driven airplane.
there is a dog laying on a couch with many blankets on it
A living room filled with furniture and a TV.
A young child swinging a baseball bat at a baseball.
A man and his boys play Wii Fit in their home
A jockey is on top of his horse number 6.
A row boat is tied up to a dock.
A man on his motorcycle with a teddy bear attached.
two large elephants walk on the green grass
A woman is cooking over an open flame in the cabin
A woman standing  near a kitchen counter talking to someone
Tennis player getting ready to back hand the ball over the net.
a brown and white dog is riding a skateboard
A young boy is riding his skateboard down a hill.
A dog holding a yellow frisbee in it's mouth.
A plate of fruit near some other bottles of liquid
An elegant white vase of colorful flowers rests on the windowsill.
A bathroom with a small sink vanity and a toilet.
A woman in heels pulling a suitcase behind her
Two horses pulling some carts in the street
A flock of birds sitting on top of a set of power lines.
A vintage photo shows students sitting at their desks.
People sitting on the beach and sitting on beach chairs.
A man is at bat in either a baseball or softball game
an old laptop and a dog rest on a bed
a blonde girl is wearing a clip on tie
A cat looking up between two plastic bottles
Two cat lying on a floor playing with each other
A man performing a jump on a skateboard.
White show horses and handlers performing during public event.
A girl eagerly bites into a hot dog bun
A man in an orange shirt pushing a stroller.
someone jumping up to get a frisbee out of a tree
A train is passing through a residential area with houses, trees, cars and pedestrians.
a man sitting on a rock while he watches elephants in the water
A white jet airliner on runway with mountains in background.
A red tray that has some food and an orange drink on it.
A tour bus unloading at a rest stop.
Two shirtless men playing Frisbee in a grassy area
A woman leading a brown horse down a sandy beach.
A man surfing a small wave in the ocean.
An ad for Costa Rica shows a beach scene with surfers.
A toddler is in the bathroom holding his ear with one hand and his other hand is closed together.
A red and white tow truck tows a white car down the street.
A passenger train is passing a cargo ship.
Two gray elephants standing next to each other.
People are riding horses through a parking lot.
A close up of a dog wearing a Christmas themed hat.
Man holding paddle in air on surfboard with patch in corner
A busy looking street area in an asian country.
A tall giraffe standing next to a tree.
People sitting in a chair lift in a purely white landscape
A man in black coat standing under umbrella next to a building.
A young man sitting on the beach with a surf board.
A passenger train parked at a train depot.
A little girl sitting at a table with a piece of cake.
Three people are riding down a street while buses are in the background.
A person standing on skis with a backpack in front of them.
Two young boys are seated with their legs crossed.
Men are lying on a couch with a computer on the table.
A kite is being flown by a man in the distance.
A group of skiers posing for a photo.
Two hot dogs smothered in salsa on hot dog buns.
A hotdog and fries sit on a table.
Many elephants are walking near a muddy watering hole.
A boater smiles as he paddles his canoe.
A rusted up bard sinking into a body of water.
A bathroom with a sink, toilet and picture on the wall.
A pizza is on a plate of tin foil.
Dinnerplate with me vegetables and other condiments.
A young man is sitting in a chair and has mismatched outfit and a name badge.
A man on a skateboard performing a trick.
A brown and white dog and person standing on a wooden floor.
people on the beach playing with a brown cow
A desk area with a window view with mugs, tablets, and books.
A city bridge with a clock on the top of it
Two zebras grazing on grass in a field.
a woman with a blue umbrella standing by some stairs
A person on a surfboard is riding a big wave.
A large clock hanging off the side of a building.
The cat is lying on top of a pair of shoes.
Shrimp, broccoli and carrots are in white dishes.
Canoes and motor boats sit along the water's edge.
A persons legs with a dark colored cat rubbing against their legs and shoes.
A RETRO FOOD CHOPPER IN A CORNER ON THE COUNTER
A picture of a tennis player about to hit a ball.
A yellow and green fire hydrant on the side of a street with peeling paint.
Several elephants are standing in a  desolate field.
A picture of some food on a plate.
Two skate boarders riding down a paved path.
A person on a court with a tennis racket.
A group of people in a park with food.
a person jumping a skateboard into the air
A view out the window of an airport terminal
A polar bear is looking over the grass at something off camera.
A woman serves a tennis ball during a match.
many people in a kitchen area preparing a pizza
Two identical airplanes are flying side by side with people doing tricks on top of them.
A black and white dog sitting in the grass next to a frisbee.
THREE MEN SITTING ON A HUGE GREEN BENCH IN FRONT OF A BIG YELLOW BUILDING.
a young boy holding a tennis racket
A clock tower with ornate designs above a bridge.
a big plane sits parked as a bunch of people watch
A baseball player catching a baseball in a catchers mitt.
A man sleeping in clothing on a bed.
this is a computer and books on a desk
a man is standing surrounded by a lot of luggage
A man jumping up on a blue tennis court with a black tennis racket in his hand.
A dog laying under a brown computer desk.
Antique black truck with a barrel in the bed.
A bunch of stuffed bears and gift boxes in a suitcase.
A horse near another horse in a building.
View of a smartphone sitting on a computer keyboard.
A blue train traveling down tracks next to a building.
A man with a kite on a hill
A couple of bears on a shore near some water.
A wedding cake design with roses and wine glasses.
A group of people standing with some motor bikes.
A baseball player dropping his bat and beginning to run
A skateboarder does a trick in a crowded skatepark covered in graffiti.
Two people in the snow on skis taking pictures.
a line of kites that look like cows next to the road
An arrangement of items from a woman's purse including wallet, cell phone, MP3 player, gloves, hairbrush, eyeglasses case and day planner
A man holds the hand of a child as they look at a row of cows.
a bathroom with fancy sink in the corner
the toilet in this bathroom is in disrepair
The city bus is parked on the side of the road.
A traffic light with a bike signal on a pole.
A dog and a sheep are separated by a wire fence
The entire pizza is in a box atop the dishwasher
a soldier is receiving an award from a man in a suit
A group of giraffes stand next to a building and tree.
a white green and black sign and a bicycle without wheels
A teddy bear with a book is placed in a wooden chair
A bobble head baseball figurine on a desk.
a meal on a table which includes pizza in a box and a bottle of beer along with a beer mug
a small group of zebra and giraffe in a savanah
many trains on tracks near a building
A man standing next to a red motorcycle.
A man smiles while holding his cell phone.
a close up of a school bus parked in a lot
A person holding a rope hovers over the ocean.
A boy and a girl under an umbrella.
A man standing on a lush green field holding a kite.
A zebra standing in dry grass has dark and light stripes
Computer desk with monitors and large monitor displays.
People are gathered around at an outdoor table.
A loving couple who has fallen to sleep together on a couch.
this is a  pizza cut into slices
a man that is walking with something in his hand
A fire hydrant with a painting of a face on it.
a woman in red is riding a horse
A young boy standing ready to hit at the plate in a baseball game.
A group of people seated at a table in a restaurant
A man seated on a park bench with his head down
A very clean, modern living area has a very comfortable couch-bed and a wide screen TV.
A player is in motion as he reaches back to throw the ball.
Two elephants touching each others trunks beside each other.
A man in racing gear and number under a banner.
Two men sitting on a yellow boat in the water.
several elephant type large yard ornaments setting outside.
an image of a plane that is taking off in the center
A man and woman standing in front of some pizza.
a woman in a bathing suit standing near water taking a picture
A picture of a giraffe walking around its enclosure.
showing lemon, red pepper, zucchini, ginger, and yellow squash
A toothbrush holder with tooth brushes inside of it.
A red double deck bus traveling along a city street.
There are baby birds in a birds nest
Two fancy dressed people ride on horses down the street.
A large black bear walking across a lush green field.
A woman holding a teddy bear in a costume while wearing a really tight shirt.
A person snowboarding down a snow covered hill.
Apples, plums, peaches and pears sitting on a metal counter top.
A woman is reading a book as she sits on bench with a sign in front of it.
Woman on a Kitchen counter on the phone with paint.
many people of all ages skiing on the snow clad mountains.
A pizza on a large knotty pine kitchen table
A group of policemen on motorcycles in a city.
A crowd of people standing around each other in front of a shack.
A large white polar bear standing on a icy pool.
A suitcase sitting in a living room of a home.
Boys playing with a colorful kite in a park
A street sign on a pole next to a building.
A cat that is laying down near a shoe.
A woman with long red hair packing herself into a suitcase.
Two people talking and a young lady that is reading a book on a bench.
Many sheep are out in the green grass.
a person swinging a baseball bat at a ball
Man on grassy field getting ready to catch yellow frisbee.
A piece of asparagus quiche and carrot salad on a white plate
Multi colored cat laying on and among shoes and boots.
A shaggy mother pony and her foal in a field.
A person is getting a slice of pizza from a platter.
A close-up photo of a young cauliflower plant.
A comfort bus is driving on the street.
a woman standing by a window while talking on the phone
Skateboarder jumps high off of a ramp into the air.
A parent and child playing with a plastic basebat and ball on the beach
A grey bird perched on a tree branch
A MAN IS ON THE SNOW BOARD IN THE SNOW
Adults gathered in living room playing video games.
Two woman at a table full of wine
A "pet crossing" sign with a peace sign on it is on a pole by a tree near the highway.
A group of zebra standing in the tall grass.
A man in an apron standing at a table full of oranges.
A bed near an open window with a small fan in the windowsill.
A newly married couple kissing next to a food van.
there is a small bowl with a lot of food in it
A boy running and flying a kite in a field.
A cat lying on a pink blanket sleeping.
A large mirror with black framing on the wall of a bathroom above the sink.
A bunch of people walking and doing things down the street
a plate that has some cut up vegetables on it
A bunch of doughnuts that are on a tray.
A man is holding a bunch of banana's
An elephant scratching his ear in the sun.
Two horses pulling an old fashioned style carriage down an urban street
A baseball player holding  bat while standing on a field.
two people standing next to an elephant in fenced enclosure.
A red and white napkin covered with fries, a burger and coleslaw
Several objects displayed on a kitchen table including bread, oranges and plating.
there is a blue left turn sign on this street pole
A man riding on a bicycle down a street while holding a surfboard under one arm.
A woman with an intense look rared back with her tennis racquet.
Young soccer players on field during match play.
A plate of fruit next to a cup of coffee.
A baseball player holding a bat standing next to home plate.
Some young ladies in swimsuits sitting on a dock over water.
A boy riding on the back of a motorcycle near a truck with pineapples in it.
A stop sign at an intersection that has stickers and leaves on it.
A beige and white bathroom with white toilet and honey colored hardwood vanity
A little girl wearing a hat has one foot on a skateboard.
Two desktop computers sitting on top of a desk.
The desktop computer has three different working screens.
a few people that are standing next some motorcycles
A bird on a table eating from plates of food
A desk with a midi controller to make music with.
A boy and a girl on a boat while another boy is standing on land with one foot on the boat.
A slice of rich and decadent cake covered with frosting sits on a plate.
A tugboat sits beside a ferry on placid water with a mountain in the distance.
A man is holding two sandwiches one  in each hand.
The cattle are standing in the dirt path.
Display case full of several kinds of donuts in a shop.
A momma zebra and her baby running through a field.
A man in a black jacket taking a picture of a sink area.
A cat sitting on the edge of a sink in a bathroom.
Teddy bears seemingly hug one another against a dark background
A small bathroom with a sink and vanity.
a big animal that is in some grass
A big sandy beach with some kites flying in the air.
A man laying in bed with a book over his face.
A baby plays with a teddybear while sitting on a green blanket outside.
A small cat has it's front paws inside a toilet.
A bird sitting on a house eave in a backyard.
Some people sitting at a table with open luggage and papers
A man in black jacket flying a kite on a beach.
A urinal in a public restroom near a wooden table.
A man in a very fashionable cleanly decorated bedroom.
A container with a meat sandwich and fork is sitting on the grass.
This seems to be a bear laying on the snow.
A bedroom with a bed, desk and a television.
Far shot of a clock on the side of a building.
A rural street at an intersection with cars in the distance parked on the curb.
A black and white colored cat on top of a wooden bench.
a woman riding a surfboard on  a wave in the ocean.
a bath room with a toilet and a shower
A gray bathroom is lit up to show to sinks.
Four planes fly through the air in a black and white photograph.
A baseball player is in motion with his bat.
A slice of pizza is on a plate on a table.
there is a large bowl of food on top of a table
a train on a track near a platform
A pizza with pasta on top and olives and pepperonis.
A grey teddy bear with a red bow and a card.
People in a square near a small clock tower.
A striped cat laying on a wooden bench
a silver oven some pots pans a knife and cabinets
A young boy skis down a slope with adults standing in the background.
A young boy eating mushrooms near a pizza.
A bear looks ahead from a field of vegetation.
A housewife holds a platter of food in the kitchen.
Two giraffes in a zoo enclosure stand by a wall.
a garbage truck in the city late at night
A group of elephants gathered near some poles.
A man holds a skateboard in his hand.
The official box for the Wii game showing a hand holding a controller.
A man walking across a field holding a baseball bat.
A line of people crowd the sidewalk beside a business.
Two men watch as yellow aircraft flies over a lake.
A plate of food that includes meat, broccoli and potatoes.
a vintage photo of a cake walking on a toilet
A apple that is taped to the back of a laptop.
An old lady is smiling happily sitting on a motorcycle.
a brown teddy bear is sitting on a green bed
A man with sunglasses talking on a cellphone.
A WOMAN IS EATING A SANDWICH OUT ON THE GRASS
A kitchen with hard wood floors and wooden cabinets
Giraffes standing together and other animals in the background.
A fat gray tiger cat laying on top of  bed up against a pillow.
A bunch of luggage bags with tags on the floor
An empty chair is set in front of two computers at a work desk.
A Ferris wheel is visible behind the building's clock tower.
A group of people are putting their sweet treats all towards each other.
a double decked bus parked by a stadium
A group of people comparing cell phones together
The two tables are each covered with food and plates.
Young girl posed with a bunch of cell phones and a "New Years" party hat.
The man in the hat walks along using his cell phone.
A man that is leaning over a tray of doughnuts.
Two Zebras are eating grass together in the wild.
A group of cars driving past a mcdonalds near a bridge.
A woman performing a shot in a tennis match
A Canadian airplane with a big red maple leaf is flying high.
Plane next to a boarding ramp under a cloudy sky.
Sandwich made of two doughnuts sitting on top of a plastic plate.
a girl balancing on a surf board while a man watches behind.
A cross country skier stretches on an open field of snow.
A man walking across a field holding a wand near a dog.
A beach setting with tons of people around the shore.
A train station stands majestically and functionally while passengers wait for their train.
A table with a keyboard and some other items.
A little baby zebra running around in a fenced in area.
Some motorcycles are being displayed in a window.
a dried up stream stands two zebras and there are other animals in the background with trees.
Three people posing with sundaes in glass bowls.
A black dog laying on a tile floor next to wall.
A folder sitting on top of a wooden bench.
An empty double-decker bus rests against the curb, alongside some buildings.
A row of outdoor food tables look very primitive.
A plate with two doughnuts, strawberries, and coffee.
This is someones bathroom sink in their home.
A tall clock with a small tree beside it.
Two women who love and care for their horses.
A man skateboarding in an old abandoned pool.
a person on a train station platform
A long train sitting on a railroad track.
The baby boat is drinking milk from it's mother.
A meal at a restaurant of a salad, a toasted sandwich and a pickle
A group of men playing instrument next to a wooden wall.
A set of coffee mugs sitting together on a small wooden table near a bedside.
a big airplane that is parked on some concrete
A small dog rests in a large dog bed, snuggled on a blanket.
Several zebras in an open area during a not so sunny day.
Women sitting at the table eating meals at the restaurant
A man bends over an open toilet and looks in it.
People at a park, taking walks, sitting on the grass and throwing Frisbees.
A train engine carrying carts down a track past some buildings.
A woman hovering over food on a wooden table.
A cat laying on a TV in the middle of the room.
A man rides a bicycle carrying snow skis.
A mockup of an African elephant stands in a museum
this is a bird sitting in some grass
A woman chops vegetables in a kitchen.
A classroom with a rug on the floor that looks like a computer keyboard
Cupcakes with frosting sit on a foil covered tray.
An old photo of a man with a pipe and a beer.
A woman with bleeding nose and blood stained shirt looks into a cellphone.
A nice shiny suitcase is positioned alongside sneakers for a quick getaway.
A monk is looking at a mobile phone among ancient architecture.
Many sheep grazing next to a busy road.
There is a person sitting on a motorcycle.
A small girl eating a plate of food with a fork
A lush green field with colorful kites flying above it.
White dog sticking his nose out from under red and white striped bed ruffle.
A giraffe resting it's head on a fence at a zoo.
A busy street full of cars and buses with buildings in the background.
A plane flying with a smaller plane above it.
An elephant is walking towards a tree in a park.
A large elephant walking across a field of grass.
A small black and brown dog standing next to a cow.
a red bus is parking in the field.
A bus makes its way through the city street.
Boats in a river on a foggy day.
This unusual animal figurine sits in front of a clock.
A group of men are in discussion around bananas.
Two zebras who are in a field together.
A desktop computer has two keyboards and two mice.
Several people walking around near a white van.
An office with file cabinets, a keyboard and chairs.
An elephant guided by a man in a blue shirt and followed by another elephant.
A pan sitting on top of a stove top under a wooden spoon.
Two women standing on a purple tennis court.
Munching in the grass is a daily habit.
A yellow school bus parked in a parking lot full of snow.
a man holding a cell phone towards the camera
Balls of garbage sitting on top of a toilet.
A group of men cutting a giant sheet cake.
A tram is traveling down a green track
A pizza that is topped with an assortment of items and sliced.
A man wearing a red baseball cap walks along a grass field with a backpack
A big bear is standing next to the bars.
Several people fly kites above a paved outdoor area.
there are many beer signs on the side of buildings
The clown is driving down the grassy area.
A bus that is parked in a lot next to another bus.
A cat sitting on haunches next to a wooden door.
a man doing a jump with a skateboard in the road
A skier leans as she makes a turn down the hill.
people sitting at tables next to a building in the background
A stop sign is posted near a road with a bridge in the background.
Men on horseback going through a crowd of people.
A group of people on line at an intersection.
A couple both holding a knife and cutting their wedding cake together.
A meatball sub served with french fries on the side.
a desk with a keyboard and a monitor on it
A beautiful woman playing a game of tennis.
A person cutting out pictures of clothing items.
A giraffe out amid the trees and grass.
a double decker bus going down a road beside some stands
Two boys playing frisbee on a soccer field.
There is a small yellow bird standing on a fence
A living room filled with furniture beneath a window.
A couple of me playing tennis on a plane flying in the air.
Three men are standing on a baseball field with a crowd watching.
A family are in their skies posing for the camera.
A small red belt clip cell phone case.
A dresser with a clock and a potted plant on it.
A person wearing brightly colored clothes is riding a motorcycle.
A dimly light living room with wooden floors and large windows.
A heard of Zebras moving with another animal group across a field.
A girl is showing off a stadium hot dog
Someone is using a small grill to melt his sandwich.
A bright green frog on a bright green plant.
a close up picture of a large variety of fruit
A plate of food on a wooden serving tray
A snowboarder with ski poles in midair facing the ground.
The person is putting toppings on his food.
a man in an orange and white striped shirt with some scissors and machines
Two women share some chocolate cake and coffee.
two people eating food off of a paper plate
A skateboarder is featured at different positions on a ramp.
People standing next to a bus with a cat face on the front.
A group of walkers are seen while passengers ride in a train.
A bright bedroom with a red bedspread and someone laying on the bed.
A polar bear standing high on a rock.
A man and a woman smiling while holding an electric keyboard.
A river that has many boats floating in it.
The two bears are wondering about the point of the camera.
A woman showing a teddy bear to another woman and child.
A table that has been served soup and fruit.
A view of a kitchen from the doorway.
A car is seen in the reflection of the microwave.
A catcher reaching out to catch a ball while the batter is swinging.
Cars line up to coin meters on at a busy sidewalk
A guy sitting at a table in front of a birthday cake with candles in the cake
A highway sign on a rocky slope along side the road.
this is boats sitting in water near grass
A dog chasing a group of birds outside.
A man and a woman standing beside each other.
A cluttered and dirty kitchen counter top, with food spread around.
A tomato and an apple sitting on a table.
There is a sink and toilet in the bathroom.
A fire hydrant with writing on it on a street corner.
A man in a suit holding a red ukulele
A young child smiles as he holds a tennis racket.
A herd of cattle walking along a sandy beach.
a brown teddy bear and some wooden block toys
a couple of dogs running around a field
A guy riding a bike and carrying a surfboard turns to look behind.
Unattended luggage in a roped section of an airport lobby.
a man holding a sandwich and another on a plate
A person holding up a cell phone taking a picture.
White flowers in a tall brown contrasting vase.
A white horse looking out over a fence.
The sky is dim as the sun changes positions behind a building.
A group of men riding on a horse drawn carriage.
A wood bench under a tree in front of some bushes.
A bus broke down on the side of the highway and all the passengers had to file out onto the side of the road.
A cellphone, piece of fruit and cup are on the table.
A living room area with wood accents on the wall and floor.
someone that is holding a wii remote in their hand
A sign for the Atlantic City Convention Center.
A man is standing behind many different fruits.
a close up of a sandwich on a plate
A tall white clock tower with a black clock on each of it's sides.
Plate of food, including hot dog, ribs, beans, and corn.
The surfer in the wetsuit is coming through a very big wave.
A group of people fly guiding on the sand.
a person riding on an elephants head walking on a dirt road
the woman is giving the solider something to eat
A bench is sitting in front of the water.
A toddler with a pacifier wearing a neck tie
There is a suitcase with items surrounding it.
The baseball player is getting ready to take his turn at bat.
A green sign  that says rockaway beach on a post.
this image is of a boy with a skateboard doing tricks
A boy does an ollie in a skate park on his skateboard.
A bowl filled with soup sitting on top of a white place mat.
A trash can on a corner has a microwave in it.
A picture of a dog sitting in the backseat of a car.
A person standing in a living room with a fire place.
Motorcycle police are on large bikes in a crowd of people.
A picture of a full bathroom with a large tub.
Balloons and banners decorate the open fair grounds.
A woman holding a tennis racquet on a tennis court.
a person walking on a city street with an umbrella
A vespa parked with a cover in a fence
A fire place sitting in a living room under a mirror.
A classroom with a purple chair and a chalkboard.
The interior of a bathroom made of stone and colored glass.
A pizza laying on top of a wooden board.
Man with no shirt holding frisbee in grassy, rocky area
Scissors, a hole punch, and paper laying next to each other.
The luggage boxes are downloaded from the aeroplane.
A stunning skyline sits in the back drop of traffic lights.
all of the parking meters on this street are covered with plastic bags
there are many people laying in the sand at this beach
People are purchasing food from a fruit salesman.
A woman is standing looking down at luggage.
A smiling man at a table has a wine glass.
A man riding a wave on a surfboard near a para sail chute.
A tennis player on sand in the middle of a play.
A person that is in the snow having some fun.
A group of people are sitting by a truck on the ground.
Several cakes are on display in the bakery
a laptop sitting on a table, with a beer and tv in background.
View of a snowy mountain outside the windshield of an aircraft.
This kitchen layout appears choppy and full of "blocks".
A plate with a wide variety of food on it.
a woman in a dress and a tennis racket in hand
A couple of men on horses and people on bicycles in a courtyard area in the nighttime.
people skiing on a snowy ski bank while wearing ski wear.
A smart device sitting inside of a white bunny bat.
A clock that is embedded in the ornate top of a building.
A pair of woman lunge after a tennis ball on a doubles tennis court.
A young person sitting in his seat working on his laptop.
A dog looking out a window of a car.
A living room with two blue couches and entertainment equipment.
A giraffe looks at the back of its enclosure.
A plate with food and a newspaper on a table.
person cutting paper with scissors at a table
an older person standing playing nintendo wii system
A young man in striped shorts rides the waves on a surfboard.
A brightly lit, quaint and clean living room.
A man is surfing on a wave in the ocean.
A group of three men riding snowboards on a snow covered slope.
A person is holding a nintendo wii controller
A small pizza sliced into four pieces garnished with green leaves.
a group of people under umbrellas at a beach
A person is flying a kite high in the air.
A fluffy cat is sitting on the sidewalk.
a toothbrush holder with four toothbrushes in it
An office with a two desks and a filing cabinet.
The person is flying a kite with two strings.
Bananas are hung up to ripen at an outdoor market.
a woman is standing by a sink in a kitchen
A woman sitting at a table holding up a pair of scissors.
A black and white kitten is asleep on a keyboard laptop.
Couple sitting at a table in a restaurant with pieces of cake.
Carrots, celery, nuts, onions, and bay leaves are mixed together in a bowl.
A man standing on top of a snowy mountain
A kitchen area with a stove, refrigerator and sink.
a white cat covering itself with an umbrella
a bird is standing on a green bench
A group of holiday bears are arranged in a group.
A big white bird standing in front of rows of benches.
A little boy holding a baseball bat getting ready to swing
A futuristic bike parked in front of a sail boat.
An elephant is spraying water out of it's trunk.
The yellow train is headed towards the final destination.
Black statue on marble base surrounded by security ropes.
Table and chairs set up at the back of a church.
this is a traing riding through a city
an open suitcase and a closed suitcase on the floor and a cat on the bed
A man about to hit a tennis ball with a racket.
people walking on a path around log cabins
a man on the tennis court with his arms stretched out
There are two zebras in a rocky plain
A man with a small backpack cross country skiing
A tennis player standing on a tennis court looking up.
A woman holding up a large carrot in a backyard.
A person walking in the ocean with a surfboard under their arm.
two slices of pizza sitting on a plate next to a fork
a couple of people on skis ride through the snow
A bunch of bananas hand from a banana tree.
A couple of men playing a game of frisbee.
a hot pocket sandwich  laying on butcher paper
These families are riding on the backs of elephants
A fresh vegetable shop in a vegetable market.
A person on a skateboard on a street.
A man riding a surfboard in the ocean on water.
A skate boarder falling down in a very big ramp.
Livestock, people, and vehicles on asphalt near a building.
The city bus is parked in the parking lot.
A person with a kid on top of a horse.
A person playing tennis on an outdoor court with trees.
A statue stands in a courtyard near a colorful flower bed.
A man riding on the back of a horse.
The cabin of a small boat has two couches
Two decker bus entering leaving Winchester Bus Station.
a big sign saing where to go for parking
A view shows the bedroom and bathroom close together.
Two zebras are facing away from each other.
A child flying a butterfly kite while another child rides a scooter.
A couple of giraffe standing on a lush green field.
A baseball player waiting for his turn in baseball game.
Two people ridding horses on a dirt trail with woods behind them.
A person walking with a small brown pony on a leash.
A blue tent sitting in the middle of a forest.
The child's bedroom has two low beds and storage space for toys and entertainment.
people walking with umbrellas in a rainy  london england
A bedroom with bicycle, computer desk and checkered bedspread.
a man doing a trick on a skaeboard
The celery and carrots are on a cutting board with a knife.
A hand holds a piece of fruit with the peel cut off.
Group of black chairs sitting underneath a blue umbrella.
two benches sitting on the beach by some trees
A group of green traffic lights on a street filled with snow.
A dog running behind three sheep in an open field..
A piece of pizza on a white plate with multiple toppings.
Several airplanes can be seen at the airport but there is also snow on the ground here.
The little girl in pink shirt and beige pants throws the frisbee.
A car driving down a street near stores with bicycles outside of them.
The female tennis player is heading towards her next match.
there is a bench on top of bricks by the water
Lady loses her ski on a snowy hill.
A yellow cat wears a blue plastic sports hat.
Two boys carrying hot dogs and other snacks at an outdoor sporting event.
A beach with an area with umbrellas and an open area without them.
A close up of an apple mouse and the numberpad of the keyboard.
A person looking at their cell phone at another person taking a picture.
A chicken sandwich and sweet potato fries on a plate.
Two teenage boys playing a game of frisbee.
Two people holding umbrellas looking at a statue of a man.
A group of people waiting in line to board a train.
A few snow skiers are going a mountain slope.
A woman sitting on a bench at a park.
A man riding a skateboard while a group of people watch.
a close up of two slices of pizza on a plate
A plate of vegetables arranged with flowers and herbs.
an air plane at an air port run way
A giraffe laying on the ground looking forward.
A living room filled with furniture and a flat screen TV.
A giraffe standing underneath a beautiful rainbow in a cloudy sky..
A coffee cup, food, and a passport sitting near each other.
The meal is prepared and ready to be eaten.
a soldier is carrying a couple of bags
Dinner is served in a tray on the table.
A blue sign that is pointing to the restrooms.
A person on a motorcycle making a sharp turn in the dirt.
a black and white photo with a double decker bus in color
A woman wearing a white t-shirt and visor with pink shorts playing on a tennis court.
Messy apartment in the middle of packing for travel.
A close-up of a laptop on a desk with a book.
The group of friends are enjoying their drinks.
A washroom with many photos hanging all over the wall
A girl is dressed in all red holding a red umbrella.
a bunch of stuff is loaded in the back of a red truck
A giraffe with dark spots lounges in the grass.
A beach with several kites flying just slightly off the ground.
A tall tower with a clock on it at night.
A man sitting at a table using a laptop computer.
A market is shown on the side of the road.
Seven carrots of varying sizes lie on a table
a person riding a bike wit ha dog in a basket
A person on a skateboard being watched by a crowd.
A street sign showing the intersection of Main and B.
A yellow bus that is sitting in the grass.
A bowl of food contains meat and broccoli.
Surfer on knees on surfboard while riding wave.
A herd of sheep grazing on a lush green field.
A group of men standing around each other playing a game of baseball.
a dog sits under neath a chair with a person in it
A girl and a man are playing Frisbee on a lawn.
A man is holding a box of some sorts near a bus and someone wearing a strange outfit.
a little girl playing a game of wii golf
A beautiful young woman laying on her stomach in front of a laptop.
A group of skiers with backpacks carrying their skies up a mountainside.
some people walking across a road with a sign on it
A close up shot of a giraffe against a blurry background.
Clock in middle of a sculpture on top of building
a man standing next to a laptop and bottles of beer
A brick sidewalk of various colored bricks next to a street with cars driving on it.
A plate with broccoli, potatoes and a meat with sauce arranged on it.
This is a picture of a women trying to figure out where her keys are.
A woman gesturing with her hands and sitting at a table with a computer.
A pile of veggies next to a pile of bananas.
A skateboarder doing tricks on a ramp in the sun
A man and girl are standing on a field holding baseball gloves.
Two people hold up tennis racquets over a net.
A pizza is topped with vegetable strips and garnishment.
A cup with three pairs of scissors sitting on a table.
a close up of a cat paw near a book
Two skiers are traversing up a tall mountain.
a man with a hat and a baseball bat swinging at a ball
sandy deserted umbrella lined beach with houses on top the cliff
An historic training sitting on railroad tracks.
A vase filled with flowers on top of a table.
Someone flying a kite in the sand on the beach.
two little bird on a tree touching beaks
A kitchen painted white with an automatic dishwasher and a large window.
Many cartoon modeled objects are in the sand.
A blue, white and red fire hydrant sitting on a sidewalk.
A woman sitting on a beach taking a picture of a number of kites
the table is set with many things to eat.
This girl is happily filling her plate with the healthy and creative food choices served at the buffet in this yard party.
Fresh fruit for sale hang by the side of the street.
Two giraffes are standing side by side in a field.
A man standing behind a white frisbee on a lush green field.
A large number of cattle confined in a small area.
A person riding a brown horse in full dress.
A large airplane sits at a gate at an airport.
A group of children sitting in a red wooden canoe on the seashore.
A skier flips upside in the air performing a high jump in the mountains.
A couple of men racing motorcycles next to each other.
A painting of a luminous glass bottle seems to glow with inner light.
The dog is lying on the white sheet.
A cow sculpture sits on top of the grass.
A group of motorcycles are sitting in front of a building.
A tennis player looks at a tennis ball as she lifts up a tennis racket.
A man holding a square shaped pizza pie.
A line up of motorcycle cops riding motorcycles on a street.
A person holding a skateboard with a dog tucked in their jacket.
A boardwalk with a fence and bench lit by streetlights.
Two yaks are standing in a grassy field.
a barbecue sandwich with onions in a paper tray
A jet waits on the runway of a mountain airport
the horse is bending its head over and grazen
A small yellow plane is leaving the hangar.
A tree-lined city street with car and motorcycle traffic.
He is skateboarding down the wall at the skateboard park.
The gray elephant family is crossing the ditch.
A businessman giving a slide show presentation in a meeting room.
The two young children are sitting at the table together.
A girl in white shirt and blue shorts playing tennis.
An odd looking mechanism sits on a dirt road while beyond it someone rides a bicycle and in the background small flags are flying.
A fuzzy black cat is sitting on a laptop computer.
a plate with some eggs chicken and tomatoes on it
a woman is standing at the beach with a surfboard
Two computer monitors sitting next to each other.
A person sitting down eating a sandwich next to a street.
Serious looking couple with light brown Teddy bear, side sun light.
Two trains sitting side by side on the tracks.
A woman in glasses is taking a bite out of food.
a group of sheep are all outside in the soil together
An empty bathroom with a toilet and sink.
An egg is served on top of a small pizza.
This is a public restroom that is fully tiled.
A group of people standing around a van in the rain.
Two men are in a green train with yellow lettering.
The little boy is brushing his teeth with a toothbrush.
A dog jumping into the air to catch a toy.
a baseball player getting ready to hit the ball with a bat
A pie sitting on top of a stove top oven.
A bunch of very cute signs hanging by a business.
a close up of a bowl of fruit with oranges
The traffic light is visible for all of us to see.
a couple of people on a motorcycle dressed as santa
A person snowboarding down a slope at an angle.
Two women walk near a man skateboarding with a child.
The fork sits next to a piece of chocolate cake.
Some grilled fish is on a white plate with a fork and some carrots.
Cars driving on a road near traffic lights.
Slice of baked dessert item on platter ready for consumption.
A man holds his arms in the air while standing in the snow.
A man standing in a bathroom brushing teeth while wearing monster mask.
A plate of pizza on a restaurant table.
A black and white cat sits on a wooden porch
The man watches his reflection while brushing his teeth in the mirror.
two bento box meals with meat and vegetables
A digital clock shows the current time at 653
A baseball player holding a baseball bat in the game.
an image of a close up of food with meat and veggies
A small kitten figurine on top of a cellphone screen.
Two light brown cows standing inside of gated corral.
A beach filled with lots of people next to the ocean.
A street light that shows, horse crossing on it.
a large truck in a field with trees in the background
A man riding on the back of a white surfboard with two small dogs.
a bench at a train station with seating on the front and back
A woman presenting cupcakes with lit candles to a baby.
A toilet in a stall with the toilet seat up.
There is a man dressed in a purple tie and black suit.
a street full of people walking and one riding a small motorcycle
A large group of people looking at an elephant behind a fence.
an image of a group of people in the woods playing with frisbee
a elephant balances on a stepping stool
There are three people posing with their drinks.
A city street at twilight showing a bus crossing the intersection and people standing on the corner.
Tabby cat sitting on the hood of a blue car.
A young man jumping a metal railing on top of a skateboard.
A person with their hand on the mouse of the computer
A white bathroom with a toilet and a brown and white  tiled floor.
The large tray has a large sandwich, two pickle slices, and a bucket of fries.
A group of people sitting around a living room.
Two birds sit near a plate of partially eaten food.
some boys having some food at a table together
A dish with mean inside of it .
Three people smiling and sitting at an outdoor dining table that has place settings for four plates.
A baby sitting in the middle of a bunch of teddy bears.
A tennis player about to hit the ball.
A train is waiting at the station for passengers.
A para sailer approaching the beach on  a sunny day
A man riding a skateboard is making a jump over a bench.
A city bus stopped in front of a building.
A piece of cake with many colours on a plate
An umpire gets ready to call a player safe or out.
a person with a red umbrella a building and a car
A blue jacket laying on top of a fire hydrant
A young baseball player winds up for a pitch.
Images on the same man song tricks on a skateboard
A pretty little girl standing on a hardwood floor.
People are laying on a sunny beach near the water.
A woman looking at a website on her computer.
A person with skis and gears standing in the snow.
a sky full of kites floating in the wind
A stuffed blue bear with a tag in a room.
A man eats a pizza in a small restaurant chain
a man wearing protective gear is on a skateboard
A woman surfer walking along the beach sand.
A silver oven door is reflecting the wooden floor.
2 girls laughing while one holds a telephone
A long white bath tub near a white toilet bowl
A white plate of food on a table.
There are many doughnuts and pastries arranged on platters
A silver colored refrigeration unit, in a kitchen.
A person that is playing in a tennis game.
A dog chews on a box in a grassy yard.
the girl id licking the spoon of batter
A man throwing a Frisbee in a parkland
A small apple tree sitting next to a  wooden fence.
A book shelf filled with lots of books.
A living area with a futon, chair and a window.
A display case at a store filled with lots of different vegetables.
A cat is sitting on a car hood on a wintery day.
A man playing tennis with two people watching the game
a clean bathroom that has a big mirror
A person that is surfing in the water.
A pile of submarine sandwiches sitting in a stack.
A white table with a bottle of soda and a hotdog.
A plate of chocolate donuts and one has sprinkles on top sitting on a blue platter on a table.
A man in a baseball uniform about to throw a ball.
A child eating a slice of pizza at a table.
A group of motorcycles parked on a dirt parking lot in a mountainous region.
A large kitchen with a metallic refrigerator freezer and a center island.
A man who is in the air with a skateboard.
A young man riding a skateboard on a walkway.
A young person on skis on a ski slope
A banana split with white and dark chocolate
A white dog sits in a basket with wheels on the floor.
A woman poses with a large teddy bear.
A corner of a room with a very big sink near a toilet.
A man and a woman with cell phone in hand behind table of food trays.
A man unpacking a laptop computer in his living room
A tennis player gets ready to hit the ball.
A sign for a restaurant and bar on a building.
A young child brushes their teeth with a blue toothbrush.
A small toy is sitting on a plate of pizza.
A red, yellow and white transit bus traveleing down a street.
A herd of dairy cows in a field behind a fence.
a cat looking out from an open doorway
A baby sitting on a kitchen floor in front of an open refrigerator.
A piece of broccoli partially surrounded by knife blades.
a few drag queens make some cake and eat it
A cat sitting on top of a desk.
Black and white photo of three suitcases stacked on top of each other.
A man sitting at a table with pizza.
A surfer riding on a wave well if it's crash in the ocean
A man and dog are interacting on a bed.
Two men sitting in the snow with their snowboards on while one man is standing.
A large truck on a open city road.
Four remote controls are placed next to a Universal remote still in its package.
A person selecting some bananas from a bunch.
A young woman feeding cattle on a dairy farm.
Three young men eating food while sitting on an indoor bench.
a person on a bicycle wearing a hat in a parking lot
A teddy bear dressed in a pair of underwear sitting on a chair.
Two zebras that are standing together in a field.
Two people sit on a couch by a guitar.
A cat that is standing over a bowl.
A silver train parked in front of a train station.
a waterway and a train going over it on a train bridge
A cake that has dogs around and on top of it.
A beach area that has seagulls on the rocks and sand near the water.
A woman surfer riding a wave crashing behind her.
a man on a snowboard is on a ramp
A cartoon image of a man on a pair of skis.
Batter, catcher and umpire at a baseball game.
A picture of some oranges stacked on top of each other.
A man standing on home base with a baseball bat.
A large teddy bear sits at a yard sale.
A room with holes in the wall and dirt on the bed looking utterly disgusting.
A white sink sitting next to a toilet under a window.
A utility truck parked on a incline covered in graffiti.
A woman sits at a table with an open laptop in front of a screen.
A group of people standing around a elephant.
A man sleeping on a couch holding a ripe banana.
People in a hall, bags and suitcases on the conveyor belt
The group is going skiing  on the snow.
a girl choking up on the bat waiting to hit the ball
A person taking a picture on their cell phone
A person flying through the air while riding a skateboard.
a cat layling on a red blanket and looking relaxed
A personal pizza and beer on a table
Several cows are standing near each other in the grass.
Is that a tiny computer next to the phone?
Two people stand next to a grill with hot dogs talking.
One white sheep standing still on the pasture near a dried up tree.
This is the grill of a large truck.
A man plays with a frisbee in a grassy field.
A train on the tracks up on a bridge.
A large airplane mid flight among the clouds.
The people are walking down the street with their umbrellas up.
a bus that is parked in a parking lot
A group of bikers make their way up the city street, as a line of buses park by the sidewalk on the opposite side of the street.
A small bathroom with a vanity on one side and the shower on the other.
A toy kitchen with a play sink, stove and oven.
A girl wearing protective gear while riding a skateboard.
Two teddy bears in front of two vases of flowers.
A man in a den playing with remote controllers.
A bag of luggage filled with personal items.
Small celebration cake on a table with happy birthday decorations.
woman in a hat feed a giraffe out of hand
Two giraffes, and antelope and some zebra in tall dry grass
A man on top of a car standing next to a group of mountain goats.
A cat is in front of an open refrigerator door.
THIS IS A PICTURE OF A KITCHEN ISLAND WITH SEATING
A woman is paddle boarding down the river.
A young lady playing soccer alone on a soccer field.
A big bunch of ripe yellow bananas on display.
A pair of men playing a game with some remote controllers.
A laptop and some suitcases in a room.
A group of people standing on the beach watching a low flyinf plane go overhead.
Kitteh at rest on somebody's black and white shoes
a couple of people sitting on a couch plays with a wii remote
Plate of food including rice, meat, and vegetables.
A blurry image of an object with signs behind it and motor bikes.
A baseball player in a white uniform holds a bat up while standing near a. catcher and an umpire on home plate.
A group of white sheep walking through a wide grassy field.
A kitchen with many cups on the window sill.
One tall giraffe on top of the dry terrain.
A man riding a red scooter down the street.
A man is standing on base at a baseball game.
An elephant is the focal point in this photo.
A little girl in a store playing with four large white Teddy bears.
Woman in red shirt on a horse in a river.
a man is talking on a phone outside
An older man is examining a table of bananas.
The front of a store with its doors wide open
Two children sitting on a skateboard riding it on down a slope.
Dogs gather to eat food out of a metal bowl.
A bunch of hot dogs sitting next to each other on a table.
A cowboy boot filled with flowers sitting on a bannister.
A white toilet sitting next to a white bath tub.
Two bears in a sunset sitting on a hill.
A young man holding blue handled scissors to his tongue.
A train is shown next to a platform.
Women smiling looking into a mirror while fixing their hair.
a living room with some antiques and a book case
A kitchen with light wooden cabinets and an island in the middle.
A dog is tied to a cart on the side of a motorcycle
Meat with lentils, rice and vegetables sit on a blue plate on a wooden table.
a fire hydrant stands before a partially visible cave
A costumed employee is holding an open umbrella.
A zebra herd standing around in the grass.
An ornate building is viewed by a crowd.
Several Air Canada jetliners parked at an airport.
A man in a suit standing beside his bicycle.
Two small beds are now together to form a single bed.
a close up of a person holding a book near a dog
A bunch of sheet and geese in a field with a bible quote
A woman surfing in the ocean and riding a wave that is crashing behind her.
Birds are in the water and sticking their heads in
An Asian man riding a motor scooter on a street
some boats parked on the side of the river
A toothbrush and a mirror in a bathroom.
Winded dog sitting and eagerly waiting for a frisbee to be thrown.
A piece of luggage sits by train tracks with passengers waiting.
A bunch of cats sitting in a fenced in enclosure
Two plates have what looks like a hot dog and seaweed.
This fruit basket contains orange and green fruit.
A shot of feet riding down a street on the skateboard.
A person that is brushing his teeth in a room.
a little wooden bench sitting in front of some trees
A group of people riding on a bush.
Two Zebra in an empty field with trees and buildings behind the field.
a dirty kitchen ith various appliances in it
There are two people standing outside on a balcony of a very large living room
A cat drinking from a toilet in a bathroom with toothbrushes.
A curly brown dog is laying beside a novel.
A police vehicle carries away a car from the scene of an accident.
A piece of broccoli next to a kitchen knife setting on a painted wooden bench with the paint chipping of it.
A dog doing tricks commanded by a person.
Two planes sitting in a field on a cloudy day.
The airplane is in the air flying over the mountains.
A gray dog has a pink frisbee in it's mouth.
a close up of a woman wearing a shirt and tie
a green and white street sign in a busy intersection in a city
Two horses are sniffing a frosted cake as a lady stands in front of them with a plate.
Two hotdogs with a hand full of fish snacks
A red brick tower with a clock in it.
A computer mouse sitting on top of a laptop keyboard.
The two buddies are cross country skiing through the mountainous region
A shiny kitchen gas stove and oven with a black counter.
a cat almost all the way inside the bowl of a toilet
A desk with two monitors, a keyboard, a mouse, and a binder.
A kitchen with a refrigerator, ovens, a sink, and cabinets.
a black cat is laying next to his colorful toy
A glass vase with a green plant in it sitting in front of a window.
A young woman sits at a computer in an office.
A girl standing under an umbrella reading a book.
A bunch of stuffed teddy bears with flag shirts
A foot next to a snowboard on the ground.
a house very big showing a city clock
A baseball player jumps over another to catch a ball.
A person sitting on the floor  playing computer games by holding remote.
Skis displayed on a sedan mounted ski rack.
A woman eating a hot dog bun covered in sesame seeds.
A bowl of rice, meat, peas, and carrots.
A stop sign, a kosher butcher sign, and a Rite Aid sign
The concert audience is composed of many young Indian men, some taking pictures of the performer.
A giraffe standing with a bird flying in the distance.
a baseball player swings his bat at a ball
A boy and his younger sister looking at a steam engine
A surfer is gliding through a small wave.
A young woman is playing a tennis game.
A horse galloping through the sand on a farm.
A child's hands hover over a small uncooked pizza sitting on a tabletop.
Three people posing for a picture in a parking lot.
The person is sitting while holding the string of a colorful kite.
A large herd of sheep are grazing in the snow.
A man sits on a bench looking at a book in the subway.
A group of men stand playing a video game.
there is a male skier that is riding down a mountain
Small crocheted teddy bear on the side of a quilted blanket.
A bus is in traffic near a sidewalk and eatery.
A white and black passenger bus at a paved intersection.
A group of people hand flowers to a man.
Two people with green shirts caring for some animals.
A variety of fruit - including oranges, apples, pears, and Kiwi fruit - sit in a cardboard box.
Mature man speaking on microphone in front of curtains
Some young soldiers are looking at their pictures.
A passenger bus parked in a parking lot.
a couple of elephants are in a field
A bike standing on a sidewalk next to a road at sunset.
A tabby cat sleeping with its head on a laptop keyboard.
A train yard in a city with a train in the distance
There is a close up photo of an elephants face wearing a garment
Horses stand around a horse trailer grazing and drinking.
A red fire hydrant outside a shopping center.
Two teenage girls performing chores in a kitchen.
Two men overlooking the activities of students on small computers.
Two birds are flying over a sandy beach.
there are many different dishes on this table
A woman smiles as she eats a lunch of Chinese food.
The cat is at the desk near the computer.
A birthday cake is shaped like a sheep.
a topless man laying on the bed
some sheep standing together while surrounded by some tall grass
The are two bananas, the brand of them are dole.
A small bathroom with a yellow towel on the floor and a rack with magazines and various other items.
A picture of an open air zone that looks incredible.
People walking on the train platform pulling luggage bags on wheels
The room in the old house is ready for the new mother and baby, decorated with vintage finds.
A group of kids is skiing in the daylight.
a bunch of urinals are lined up on the walls
A group of men standing next to each other holding a racquet.
A cat that is laying down next to apples.
THERE ARE DIFFERENT SIGNS ON THE STREET
A woman sitting on the ground in an organized room.
A person that is eating some food on a table.
A giraffe with his long neck bent over and his mouth on the ground in an outdoor area.
large plate of french fries in sauce on a white tabletop
a yellow car turning on a somewhat busy road
A group of people on a street next to a food truck.
Two people pose together for a photo of themselves on a ski resort besides the ski lift
The pinnacle of the building is illuminated at night.
Three elephants standing by a man made waterfall.
some table and chairs sitting around a building with a clock on the top of it
A group of boats tied to the rocks near the shore.
An acoustic machine, speakers and remote control are sitting on a table.
Three people pulling suitcases behind them on a wet pavement
Two sheep are standing on some short grass.
A simple computer desk with a desktop monitor keyboard and mouse and a laptop computer.
The two people are in the kitchen cooking.
A dirty train is sitting on the train tracks.
A woman ists on a chair while a child stands under an umbrella with red dots.
A small baby bird on a piece of metal.
a all white bathroom with blue tape on the walls
A blender pitcher on the counter near a sink.
A horse with a white stripe is in the woods.
The dog is in a field on the side of a parking lot.
a close up of a propeller on a plane in the air
An elephant is walking across a dirt road.
People are standing outside of an old airplane.
There is a blue pick up truck broken down on the road
Man standing up playing a video game on a TV.
A bed with a comforter that is slightly pulled down and pillows that have a note on one of the pillows.
A sign that reads 'plaza drive' is being displayed.
A mother and her child giraffe walking in tall grass.
a white motorcycle is parked in a spot
A living area with a couch and a television.
A boat is going down the middle of a channel.
A three-piece bathroom with wood shelves and a round mirror.
A black bear is standing by the rock
Three zebras walking in a dry grass field.
A young person riding a body board on a wave.
A tan building facade with a bench out front.
A little girl standing on top of a tiled floor.
A black bear lying down in the grass next to trees.
A train is moving swiftly through the station tracks.
A red tray of food on a table.
Banana on table with three colored plastic wafers.
A woman holding a dog above a bowl on a counter.
The Christmas presents are left in the kitchen.
A glass full of drink is on the table next to a slice of pizza.
A bus stop and sidewalk near a park.
Woman standing in grassy area near baseball field.
Red wine being poured into five crystal glasses.
A girl dressed in pink sports gear stands on a snowboard at the top of a snowy slope.
Women playing in field with flying disc during competition.
A kitchen area with stoves, coffee maker and cutting board.
A small elephant toy pushed against an orange.
A pizza with mozzarella, tomato, and basil on a table with silverware.
a bunch of cars that are on a street
a woman in white shirt talking on a cellphone.
Three people ride their horses down a beach.
The large room has a lot of furniture in it.
a group of motor bikes parked in front of a store
Dog sitting in the back basket of a bike outside the shop
A lady scratching her head in the bathroom.
A long boat with an ad on it floats down the river
A front end of a boat sitting over a body of water.
Three people are getting off the train with their luggage
A black and white image of a line of umbrellas
A wall dedicated to white cloth with suitcases out front
A small airplane sitting on the tarmac at an airport.
a person riding a surf board on a wave
Two trains on separate tracks travel through a city
A towel with his nose right next to the camera looking towards it
A lot of people are in the conference too
Two teenagers with backpacks are on the street corner.
An animal is eating some food out of a bucket.
A bushel of greens are on the table with various fruits.
A man wearing a suit and a blue tie
A man holding a broom on a surfboard with a dog.
Black and white boat sitting at a pier near a building.
part of a road with assorted food on tables for sale
a girl petting a pony on the back of it's neck
A green train engine moves down the tracks with many cars behind it.
A couple of giraffe standing in front of a cage eating hay.
A young girl stands on her bunk bed holding a paper.
A group of three women sitting at a table sharing a cup of tea.
Two halves of a sandwich that is on a plate.
some people are playing ball in a field
A bearded man poses with his breakfast meal at a cafe
A toilet that is sitting in a bathroom under a window.
There are two children who are holding tennis rackets.
Multiple images overlaid of several women playing frisbee.
A slice of lemon pie with frosting on a white plate.
Group of women on a soccer field with the ball in the air.
a male in a green shirt a bowl some food and a pan
Several giraffes stand near each other in a large grassy area
A ski boarder riding up a big hill doing tricks.
An abstract graffiti on what looks like an old train
A boy holds a baseball glove on his left hand.
A person doing a tail slide on a rail in a skate park.
Three people in an art gallery using their phones.
A teen is seen mid-jump while flipping his skateboard in an indoor skate park.
A fat cat laying on a rug and shoes.
A hotel room features a balcony over looking the water
A man riding skis across a snow covered slope.
The shadow of a skateboarder in the middle of a stunt.
two small children sit next to each other
A plate with a hot dog, chips and a strawberry on it.
A person laying down on a bench outside.
a person in a black shirt a horse water and trees
bathroom with its door open and is very clean
A hot dog sitting on top of a white plate.
Sheep and lamb standing in pasture by stone fence.
An old picture of two women with two small sheep
A person near a large screen with others at a long table.
A man and a woman cutting a cake with a large knife.
A room with a tea pot and two blue and white vases.
The skiers are ready to try the snowy slopes.
A woman looking up at someone taking a picture
A person with something in their mouth while holding a cell phone.
A group of giraffes eating leaves off trees.
an image of a dog eating on his plates
A baby eats some cake with a fork while several people hover over him.
An orange truck driving down a street full of men in the back.
A red train sits next to a passenger platform at a station.
Group of different types of vegetables sitting on a metal railing.
Man dressed in black snowboarding down a mountain.
The little girl pokes her finger into the sheep cage.
a tower with a clock on it in front of a street light
A red light on a yellow contraption in an intersection
Group of people riding on the top of an elephant.
a white and red boat with a bunch the people on it
A group of young men riding skateboards in a skate park.
a dog passing in front of a girl on her cell phone
Lighthouse on a point with sailboats near it
Brown cat sticking its face into a pair of white shoes.
A harbor with various boats and people walking on the pier.
A lone giraffe walking in dry vegetation in front of a tree.
Two luggage cases near a desk and bed
A clock with glow in the dark hands, sits in a dark room.
Man playing game with Nintendo Wii control next to kid carrying a cup.
Kids swimming and surfing in shallow water on a beach.
A small snowman with a person holding a carrot next to it
two chicken patties filled with cheese in the center
A work desk with a computer books and keyboard
A plate that has different types of food on it.
The kitchen has wooden cupboards, plenty of counter space, and a sink adjacent to the oven.
A group of sexy young ladies wearing bikini tops.
two giraffes headed into a building and another one standing by the fence
A girl serves a tennis ball on the court.
A person is holding a purple bear with no eyes against a yellow back ground.
A person is holding onto a cellphone somewhere.
A white coat on a bench on paver stones.
A man in a tie sitting on a wooden log.
A small girl is on the beach near a kite.
A couple of men adjusting their ties in front of red steps.
a person riding a horse next to a baseball field
A yellow and grey train on train tracks.
there is a man with a beard sitting in the grass
The little boy is pulling the suitcase by the handle.
TWO PIECES OF PIZZA BOTH DIFFERENT IN A BOX
A large bear standing on top of a stone ground.
A street scene showing a group of cars stopped at a red light.
There are two brown eggs in a metal bowl
Two men on a boat with a dog on the front
A woman walking a dog by a table of food.
Two people under an umbrella on a wet sidewalk with stars.
A tall building with a massive golden clock on it's face.
A large building with a clock and some trees.
Two men standing in a kitchen preparing food.
There is a red light on a traffic light
two women riding down the snowy hills on sleds
A very old fashion looking red smaller bus.
A group of people are around a birthday cake.
A close up of a cut into piece of food
A large plain with a couple zebras and many antelope.
A baseball player wearing green and white standing next to a baseball player wearing red and gray on a baseball field.
A  young woman sitting near a tree eating food.
A man is throwing a Frisbee into the air.
A beer and a slice of pizza on a table
The large bathroom is reflected in the mirror.
an image of a bear that is in the woods
A naked woman sitting in a large suitcase.
A photo shopped photo is shown with a tiny fire hydrant.
Person holding a toothbrush under a faucet with running water.
A bunch of sheep together in a very narrow area.
A airplane that is sitting on a tarmac.
The surfer is riding the wave on his surf board.
an image of two zebras in the middle of the wilderness
A man in a T-shirt is typing on a laptop.
Airplane at airport loading gate under hazy skies.
Two giraffes are neck to neck in an enclosure.
A man holding a surfboard is standing by the ocean.
A zebra grazing on grass in it's natural habitat.
A lighted fish tank above a toilet in a bathroom
A young man holds his skateboard while in a courtyard that is next to a large rock building.
the traffic signs are easy to read for the street
Big Ben clock tower in London, England on an overcast day.
A wooden table with a purple laptop and orange pen.
A dirty brown teddy bear in a trash can.
Many people flying kites on a cloudy day.
Something delicious and sweet is done baking in the oven.
Two large pieces of broccoli laying on a piece of paper.
Large sized truck with a medium sized black dog in the passenger seat.
A kitchen that has a wooden cabinets with a wine holder.
two people standing in the snow by a sign
A batter is getting ready to swing at a pitch.
A white clock tower at the top of a tiled building.
A person is doing a skateboard trick outdoors.
A remote control sitting on a wooden table.
a group of people stand underneath a tent on a beach
A baseball player throwing a ball on a baseball field.
Two people waiting at an intersection carrying umbrellas
Some people that are walking on a sidewalk while it is raining.
An orange motorcycle is next to a red car.
A laptop and two controllers on a small table in front of a couch.
A living room has a large animal cage in it.
Large amounts of desserts set on different platters.
An assembly line machine has many goods on it as two people stand in the background.
Two people are playing video games in a living room.
A young boy rides his skateboard amongst pedestrians.
Group of white sheep walking in a field of grass together.
A keyboard, mouse, and computer monitor on a desk.
A living room filled with furniture and a flat screen TV.
cherry tomatoes  and various food dishes on a table top
A building with large windows sitting inside of a building.
A couple of women standing with a boy inside of a kitchen.
man crosses skis while jumping in the air
A small sewing kit sitting next to a pair of scissors.
A yellow and blue bus is going down the street.
Two large plates of a variety of food.
Horses communing with each other on a shady street.
A group of people order food from a food truck.
A man cooking a large number of hot dogs on a grill.
Many motorcycles are parked side by side.
Three pieces of cheese bread are on a plate.
A man flying through the air while riding a skateboard.
A bunch of keyboards with mice on top of them.
A woman cutting a birthday cake on a tray.
Two toilets sitting on a sidewalk with a cardboard box.
A black bear is emerging from the grass to cross a paved street.
A laptop with a small screen is chained to a desk.
A man on a piece of equipment resembling a bicycle that has very large wheels.
A kitchen counter has a coffee pot and microwave.
A man is hitting the tennis ball with a racket
A black Sony remote control being held in a hand
A young man standing next to a racecar on a display lot.
A man doing a trick on a skateboard off a rail.
One giraffe from a group of two reaches through a gate toward a group of people standing outside the gate.
a woman and child watching a herd of elephants in a gated area
A group of men in suits sitting on couches talking.
A man at a table with a bowl of food.
a kitchen cupboard with the doors open and plates and bowls on the shelves
A cruise ship docked for letting passengers off to port
Cars move through an intersection below a green stoplight.
A big yellow train travelling by a road.
A black cat is on a laptop computer.
Two travel bags on shelf with a metal rail.
Three Giraffes are standing in a row and they are all different sizes.
A woman sitting at a table eating a giant hamburger.
A bathroom with a white bath tub and a sink.
a big sausage in a roll with cheese and cups of sauce and a person
A white dog on a bed looking in a box.
A photo taking of the inside of the building looking at three balconies and the clock.
Wisps of smoke on a public street at night.
A wooden chair that has a black vase with two flower holders at the top, and two sets of flowers in the vase.
A beach covered in kites next to an umbrella.
Many people sit at a table eating a meal.
A giraffe standing in a small piece of shade.
A dog laying on the side of a car door.
The evening sky  on the lake foretells hope "Red sky at night, sailors delight."
a small boat parked next to a bigger boat in the water
A woman stands with an umbrella next to a building.
A red bike locked up next to a pay meter.
Pick up truck parked by side of road with white building in distance
a group of people that are flying kites
Poultry and broccoli on white pizza, with lemon slice.
A man dressed up in zombie costume is wandering around the street.
A kitchen counter full of freshly picked vegetables.
A bird themed clock sitting inside of a green box.
A giraffe munching on leaves with man standing in front.
a man is sitting at a table on a train
A falcon sitting in a pond of water.
a big colorful bus parked on the side of a road
A airplane flying through the sky with a leaf on it's tail.
An intersectional street sign stands in front of a vast mansion.
A broccoli and cheese quiche with a piece missing.
an image of a boy walking on the beach with surfboard
TWO PEOPLE ARE TRYING TO GET A BICYCLE IN THE BACK OF A VAN
Two red and white cows standing in a pasture.
A man holding a frisbee on a beach with a clouded sky.
A young child is swinging at a ball with a plastic bat.
A man looking at a laptop next to a beer can and speaker.
An Alaska Airlines passenger jet sitting on top of a runway.
People reaching for sandwiches on a plate sitting on a countertop.
A man about to run to first base after just swinging a bat
A group of three boys sitting on top of a couch.
A woman is jumping in the air with a frisbee
A yellow and black bird perched on top of a dead sunflower plant.
dog sitting on dog chair with toy next to its paws
A close up of a car with an advertisement for a movie.
Pastries shaped like bear heads are displayed for sale in Japan.
Someone is touching a white plate that has a sandwich and chips on it.
People at a table with cups and a plate with donuts on it.
A large bathroom with tile flooring and white fixtures
a trunk of a car filled with a lot of luggage
lady in the jacket is sitting on the concrete bench smiling.
A group of cows laying next to some trees.
some rice chicken broccoli and carrots on a black plate
Some yellow school buses parked in a row.
A woman in grey shirt on park bench with cellphone and bicycle.
A couple of foreign language road signs.
two zebra standing in front of some goats
Many people hold umbrellas on the street during a rainy day
A dog catching a frisbee in its teeth in a field
Plates of food are on a ledge overlooking a soccer game.
A person sitting on a beach with some animals.
A desk with two laptops on it and both turned on.
Two people watch TV on a couch with their legs propped up.
four plates of vegetables and fruit sitting individually in each
A little boy standing next to a sheep smiling.
A city street is busy with cars and a clock tower above.
A teddy bear sits on a stair railing.
A black dog in the snow playing with the Frisbee.
Busy traffic in a city intersection at night.
A bathroom complete with a toilet, sink and window
A man in a suit sits alone on a bus.
People walk through a shop with flowers on the table.
A counter top that has a mug on it.
A ship is sailing away from the dock.
Two bears relaxing in a pond side by side.
A woman with sun glasses on a cell phone.
Ski patrol with helicopter at accident on steep ski slope.
This is an incredible picture show of individuals having a fabulous time.
an airplane that is parked out in a grassy field
A man performing a skateboarding trick on a rail.
A toilet sitting next to a sink, towel, vase and mirror in a bathroom.
an image of a tour bus that is parked outside a house
A woman texting on her phone, while sitting in a chair.
Person wearing grey clothing on a motorcycle on a city street.
Two young child skiers are headed down a small slope.
A table with two people and two pizzas on the table, one at each place setting.
an old black and white photo of four people sitting on a bench
some little kids sitting in the grass with a green frisbee
A dog watching another dog on a television at home
The apple computer is sitting on the bed.
A young boy playing with a plastic ball and bat.
a close up of a cat laying on a dresser and watching tv
A girl holding a wii remote looking forward
A chicken burger and french fries laid on a plate.
A group of men, standing while playing video games.
a clean bathroom with some flowers and a window
A herd of sheep are grazing in a field.
Several giraffes are near a fallen tree on the grass.
Small groups of people, including a person walking a dog, are scattered about an outdoor area, encompassing some streets, that is filled with classic cars.
An elderly woman poses for a picture in the park.
It is never too young to teach a child about tooth brushing.
A man with a tie, dress shirt, sweater and headphones.
A man on a bench is looking at a boat in the water.
A skier is performing an advanced trick on a slope
A bunch of street lights in a town hanging from ropes
A woman concentrating on her work at a table in a sunny room
Three men in military suits are sitting on a bench,
A boy and a girl sitting down to eat a pizza.
A carrot sitting on top of a wooden cutting board next to a small green knife.
a man sitting in a lawn chair eating food
An airplane is lit up as it sits on a runway.
View of a subway train through a mirror.
a cup that has some flowers in it
Two bears are romping in the water with one showing teeth.
A train going down a track beside many skyscrapers
A young boy wearing a baseball uniform and holding a baseball bat.
A horse walking down the road, in the daytime.
a bunch of people on skate boards ride on some cement
A person is preparing a meal in a large home kitchen.
a bunch of motorcycles sits parked on a street curb
A man surfing on a wave in the ocean.
A view of a bathroom that is in the process of being remodeled.
Group of people holding orange and blue frisbees.
The backyard of a big house with outdoor seating furniture.
A white cake with decorations of penguins and a Merry Christmas message.
Two people are playing tennis in an outside court.
Two elephants are in a field of grass together.
A boy sleeps with his head on a pillow and an arm around his cat.
a lonely horse tied up in the desert.
A photo of a lake, fire hydrant, and sign.
A pair of zebras cross a dirt road in the plains.
A sub sandwich sitting on a napkin next to a glass of water.
a home made pizza sitting on a table top
A large metallic refrigerator freezer combo in a kitchen.
A man with two children posing on snowy ski slopes.
A couple of men standing between two large elephants.
Two men on a boat in a lake near a house.
A skier and snowboarder going down the snowy hill.
A small personal pizza sits on a small white plate.
A man rides his bike on a deserted street.
a computer room with shelving displaying various electronic devices
A cut in half sandwich sitting on top of a white plate.
A woman talking on a cell phone while wearing a bag.
A cow that is standing in the grass.
A street with a street sign and a stop light
a kid skating very high on the walking steps
A man flying though the air while riding skis.
A group of people riding sailboats on blue water.
A man standing next to  a woman under a kite in a tree.
Two women playing with Wii remotes and a man in short shorts sitting in a chair watching.
a table full of different kinds of pizzas
A mom holding her baby while working on her laptop.
A man on a surfboard riding an ocean wave.
two vehicles are sharing space wide enough for just one
a white steeple near the roof of a neighboring building.
Pair of elephants walking along the shore of pond in desert.
A silver bowl filled with salad on top of a table.
The food on the plate looks really healthy and hearty.
A young man playing tennis on a tennis court.
Two people are sliding down the mountain slope.
Two women a man and a boy all riding horses down a river path.
Man laying on ground with skateboards under hand and feet being nailed by another man
there is a baseball player that has hit the ball
A strange looking shower curtain in an ordinary looking bathroom.
A man standing next to a parked motorcycle.
A small white dog is standing on a desk chair
A black and white dog walking down a  sidewalk.
The people are playing the game in the living room.
The motorcycle racers speed down the curvy track.
A woman laying in bed while clutching a blanket.
A person riding on the back of a brown horse through a dirt field.
A PUBLIC BATHROOM WITH CLEAN FLOORS AND WINDOW
Christmas teddy bear next to a coffee cup of a candycanes
a man swinging a tennis racquet at a tennis ball
A bot watches while a man cuts a blue and yellow cake.
Signs on the corner of an east London street by apartment buildings
A bus drives down the street in a town.
A group of people sit holding glasses and smiling at a table with several bottles.
a blond woman with a spoon and a blender
a man riding a motor bike with a usa flag on the back
Passengers board the transit bus from the station at the loading zone.
A baseball player slides toward a base as another waits to catch a ball.
A group of people sitting on a trail side with a dog looking onward.
Several animals cross the road with a human behind them.
The cat is laying on the pink blanket by a window.
Thirteen children and one adult dressed in baseball attire holding sports equipment.
A dog is lying down on the unmade bed
an image of man riding his bike down the street
A couch that has several blankets on it.
A person slicing something with a dog watching
An adorable little girl sitting on a park bench.
Three red motorcycles with riders in protective gear are on the street.
Two cows stare out while being in the meadow.
A turkey sandwich smothered in cheese on a plate with vegetables.
An elephant walking into watering hole while a mother and child watch.
A bed with a brilliantly colored bedspread and pillows.
some fruit and veggies sitting on a counter
Grape tomatoes, apples, and an onion are on a table.
A cat is wearing a small blue backpack.
A woman is eating food as she sits in a crowd.
The baseball player reaches out to catch a ball.
A hotdog on a colorful plate with ketchup, some ketchup spilt on the table.
A view through a bathroom doorway without a doorway, showing turquoise tile and an unfinished wall section.
A small boat washes up onto the beach.
A fence is put up in a desert climate.
Very large bicycle sitting in the middle of a freshly polished flooring.
Two plastic baskets filled with food sitting on top of a table.
Looking down at skiers holding their skis on the ski slope
A lady dressed warm on a bike in the street.
Chicken sandwich, french fries, herb tomato, pepper salad with sour cream and ketchup condiment.
Three people standing outside a small airplane on wet pavement.
a man jumping over a black box with a skateboard
People crossing the street in a busy, overcast city.
A group of people play a game of tennis.
A filthy bathroom with a grimy tub and toilet and grime covered floor and walls.
A "Greenwave" bus stopped at a bus stop next to brick buildings.
Three skiers jump to the snowy ground in front of a tree line.
a stuffed elephant with a brown stuffed teddy bear leaning on it
A horse stands in front of people on a sidewalk.
a girl sitting on a bench looking at her cell phone
Two men sitting on the street in front of a building.
Many horses are walking near the guard rail down the side of a street.
a number of kites flying in the sky above a field of people
A messy desk with a computer that shows a young child on the screen.
a table top sitting inside of a kitchen
Two giraffes neck up closeup from behind at dusk.
Two dogs are sleeping together on the bed.
A tan clock tower with a black and white clock.
The bagel sandwich has many ingredients inside of it.
A dessert sitting in a plate that has congratulations written in chocolate
A spacious  bedroom with access to a balcony.
A deep red and white airplane sitting in front of a hangar.
Rings radiate from a gray bird in the water.
A machine with multiple clocks on it with wheels.
a model airplane sitting next to a bigger plane
A man vigorously serves the ball during a tennis match
A toilet sitting in a unique bathroom with painted and designed walls.
A person riding skis on top of a snow covered slope.
a kid eating from a blue plate and a spoon
A woman with a suitcase sitting outside at a park.
The aerial view shows a crowd with many umbrellas below.
A man holding a ball in his hand in a room.
Black and white vintage picture of a man in a suit with glasses.
An Apple mouse sits on a desk next to a keyboard.
A tennis player reaching with his tennis racket at the ball.
The breakfast setup includes pancakes with a cherry.
A baseball player is at the plate about to bat.
A black case on the ground with a small tire and jack.
A animal with a very scared look on his face and a red thing on his head.
A white toilet sitting in a bathroom surrounded by tiled walls.
A cat on a table next to a vase of flowers.
A young girl sitting on a bench holding a toothbrush
Giraffes, zebras and ostriches in a large enclosure.
A sheep stands alert with its face to the camera while its offspring, head hidden by the sheep's wool, drinks its mother's milk.
A dog jumps in the air to catch a white Frisbee on a grass field.
a person holding a  cell phone  near a corch
Several male horse riders crossing a river to shore.
A white horse is standing on grass in the country.
Food stands with red umbrellas on a crowded street.
vintage black and white photograph of two baseball players
People out in the ocean on surfboards by a large cliff.
Three mirrors mounted on a tiled wall with lights.
A person standing on the snowboard on top of the snow.
A red phone sitting on a table by a folder.
A man is walking down a main street.
The street sign indicated the names of the two streets.
A woman holding a tennis racket in her hand.
A close shot of a grilled cheese sandwich on a plate.
a tennis player getting ready to swing a racket at a ball
a red and white bus a bicycle and some people
A white toilet sitting under a bathroom window.
A white dish plated with corn, carrots, tomatoes, onions, olives, herbs and oil.
a tan teddy bear a white sheep and two other bears
two girls soccer teams are playing soccer and player from each side fight for the ball.
A box filled with two slices of pizza and sewing equipment.
A young girl standing on a grate with a racket.
A cupcake, piece of cake, and tort with raspberries.
Street sign with plants growing around it on the side of the street.
A girl standing next to a bed standing next to a bed.
An elephant stands in front of a body of water.
Displays of delicious looking dessert in store window.
Two people are walking over tracks with stuffed animals near two other men and a lady standing by a model train.
Closeup of two laptop computers sitting on a desk.
A Singapore Airlines commercial aircraft landing on the runway next to the water.
A young boy jumping in the air on a skateboard
People that are making a pizza from start to finish.
A bus parked in front of a building and beside a fence.
A man on a snowboard in the snow.
A skier on trail hillside posing for picture with hands out
A man sticks his tongue out to have his picture taken.
a man in a suit in front of a white truck
A couple of tennis players on a large, fenced-in outdoor court.
Small slice of pizza sitting on a table next to the bottle of beer.
a table top with some trays of food on it
A major league baseball player in the batting box.
five bagels are sitting on a silver tray
A small sandwich with lettuce and tomato on it.
A yellow wooden bench swing hanging from chains.
Two plates with dessert crepes and a cup of coffee on a red tablecloth.
Some carrots and bananas in a small bowl
Several people enjoy a day at the sandy beach.
A hand holding an apple with the tip of a knife piercing the fruit.
some people trees two blue umbrellas and chairs
A beautiful young lady looking into an empty microwave oven with lust.
A brown horse in a grassy field with trees behind.
A teddy bear with a red bow holds two red, white and blue pom poms
A clock mounted to a wall next to tall buildings.
a white horse at the top of a hill
A toilet with the lid opened placed beside a shelf.
A woman chef outside holding a pan with food in it.
A train has graffiti on it while it sits on the track
A man preparing to swing his bat as another holds a glove.
A blue motorcycle with rusty tailpipe, parked beside a truck.
a train on a track near a platform
A bowl of antipasta with sausage and beans in it
An airport scene where aeroplanes are landed on the ground.
Skiers doing stunts over a hill of snow.
a piece of bread with some sliced cheese and bananas on it
a woman laying on a bed but peeking at someone
A large horse standing next to a baby horse.
Two plates of breakfast foods on a restaurant table.
A piece of wood has a fresh pizza on it.
Several people riding on horses at the beach.
A black and white picture of a man in a suit wearing a tie.
A baby in a high-chair being handed his first birthday cake.
an image of a living room setting with fireplace
this is a piece of broccoli on a table
A large crowd watches as a pitcher throws a ball.
A collection of vegetables inside a grocery a store.
A desk containing a computer monitor, telephone, modem, CD drive, and a cat above a keyboard drawer containing a keyboard and a mouse which is above a tangle of wires and next to a bed.
a table with a white plate and knife with food on it .
A man riding a motorcycle in the middle of the street.
A tennis player in blue returns a volley.
A man with a baseball bat that is standing in the dirt.
Several pieces of luggage and bags near moving trucks.
Two slices of cake and a fork rest on a plate
A woman sitting on chair holding an umbrella.
a male wearing white is playing tennis on a court
A person at a table is eating a small pizza
A little girl is standing in front of a refrigerator.
A faded red fire hydrant on a sidewalk near a building.
a person riding skis on a snowy slope
there is a woman standing outside talking on the phone
Multi colored scissors with a multi colored ribbon.
A sign in front of a railroad explaining how to board the train.
a snowboarder flies through the air with an onlooker taking a picture
A large propeller plane sitting on top of an airport tarmac.
A man holding his arm out, holding a game remote control.
A parking lot full of open blue umbrellas.
A pair of surfers approach the water's edge, where the waves spread thinly over the compacted sand.
A kid in the grass swinging a baseball bat.
a bunch of boats are sitting in a harbor
The little boy is too close to the stove in the kitchen.
This is an image of the inside of a modern kitchen.
A man looking at a large pepperoni pizza.
A woman posing on a skateboard on a sidewalk.
Two plastic model airplanes lie on the ground.
A bathroom with a white toilet and a white sink
A family on the beach points into the water.
A person is riding a surfboard on the water.
Two sheep in a vast field during the day.
a lady standing in front of potted plants.
A guy on a surf board riding a wave.
Computer on desk next to green wall area of living space.
A male taking a picture of himself wearing a cardboard Happy Father's Day tie
BUNK BEDS WITH LADDER TO TOP BED WITH STRIPED SHEETS
Two cats are laying on the keyboard of a computer.
A couple of sheep are on a grassy field.
A crowd of people crossing a cross walk.
A black cat laying by two pairs of slippers on carpet.
An oncoming railroad train traveling down the tracks.
A woman is hugging an orange fire hydrant.
A young surfer surfboarding in the ocean doing tricks
A donkey painted with stripes has a snack while hitched to a decorated wagon in Mexico.
A steer standing next to one that is laying down.
Three baseball players are on the field during a game.
A baseball player swinging while the catcher waits for the ball.
A small baby lying in an open suitcase.
Jet-skis sitting on the sand in front of the water.
Two parked motorcycles in a lot near a large field.
A beach with many umbrellas and chairs with people by them.
iphone playing game while donuts are in background
A giraffe walking through a grassy area near some rocks.
a woman reaching up while jumping to hit a tennis ball
A boy is holding a teddy bear figure.
A woman is on a tennis court in mid serve.
a woman holding a tennis racket by the side of a road.
A blue room with a brown double door and a closet full of clothes with a pink television on a stand.
A giraffe standing on top of a lush green field. near trees.
Motorcycles parked with pedestrians nearby at outdoor event.
People in a field flying a kite with large clouds in the sky.
A blender filled with liquid on a counter.
Two men are working on a train at a station.
A man that is bent over in a boat.
A person riding a ski lift over orange traffic cones.
A man on skis going down a hill away from other skiers
A train moves through a heavily forested area
a male is riding a horse and some cows a street and trees
A woman is tossing an omelet in a frying pan.
A calico cat is standing outside a shoe store looking in.
A gothic building with a magnificent clock tower featuring gothic columns and arches.
Pictured from above are clothing and shoes scattered on a wood floor.
Two giraffes on a hill and one is walking towards the other.
Many people are flying their kites on the beach area.
A plate of food containing a sandwich with a tooth pick, lettuce, tomato fries and cole slaw.
A man is standing by some parked motorcycles.
The line of people are riding horses through the plains.
A close-up image of a black dog in a room.
Many motorbikes are parked on the side of a city street.
An individual is hiking in the snow with some skiing utensils.
A sign for handicapped parking with mountains in the background.
A bedroom with a bed, desk, and tv with paper and pen on the table
A large open kitchen has wooden cabinets and white appliances.
A man and woman sitting closely on a bean bag type chair together, and the man is holding a banana in his hand.
A dog herding the sheep by running towards them.
A white sheep standing on top of a dirt road.
A giraffe stands tall among grass and trees.
An equestrian lady riding on a brown horse.
a shelf holding onto some assorted paperback books
a plate of pizza on a table
a small boat parked on the ground on display
A large clock anchored on top of a building
A passenger jet flies over houses on a coastline.
Cattle walking on dirt path through green mountainous area.
a giraffe  standing beside a building and part of a tree
A young person is jumping his skateboarder off of a lodge.
The cows are looking at the photographer taking the picture.
A girl with a coat and hat on is pulling luggage.
a red and silver train is coming down a hill and snow
A field and a fence sitting in front of a group of buses.
A rider is dressed in red riding gear while sitting on a coordinating red motorcycle.
A YOUNG LADY ON THE COURT PLAYING TENNIS.
An orange and bottle of orange liquid on a table.
a close up of a woman in pigtails a shirt and tie
A tall white and red light house sitting on a green hill.
A european city in nice a sunny bright day
A man holding a cake knife and stretches it out toward a cake as he stands next to a woman in a darkened room.
Many cattle are on the field while people ride them in the background.
Two people skiing on cross-country skis on the snow.
A woman wearing plastic gloves handing out fruit slices from behind a table.
a counter top with a microwave inside of it
A surfer on a surfboard flying over the crest of a wave.
A man holding a baby, eating and sitting at a table with two pizza atop.
A multi colored train comes around the bend on the tracks
A dirty train sits on the railroad track.
The Big Ben clock tower towering over the city of London.
A group of three men standing next to each other.
A group of zebras that are standing in the dirt.
a popular sporting event being being witnessed by spectators
A girl with big glasses is brushing her teeth
A young man in a bathroom dancing while looking at his reflection in the mirror.
A motorcycle has a red and white plastic container on the side.
A sandwich is on a delicately-designed plate with other place settings.
A person wearing skiis and jumping off a snow hill.
A large panda bear laying down in a forest.
A grey cat smelling a cut filled donut on a plate.
A cat sits on a fence under an umbrella with ghost lights
A motorcycle rider riding on the street near a grassy hill.
A small train traveling on the railroad tracks
A couple is cutting a wedding cake together.
A group of people sit in a boat with a bike.
This meal has four pastries, grapes, strawberries, and sauce.
A shirtless male tennis player awaiting the ball.
Two women wearing hats standing near a fence.
A man rides a wave on a surfboard.
a girl that is kicking a soccer ball around
A person is holding a cup with food and a plastic sword in it.
A tennis player returning a tennis ball hit to him.
a little kid that is sitting on a toilet in a bathroom
a city neighborhood with a stop sign on the corner
A woman standing with a bag in a mirror.
An umbrella strapped to a bicycle a rain shower.
A locomotive on the tracks near buildings and wires.
this bed is very large and is under a window
Colorful red bar stools are lined up in a kitchen.
some stools a white refrigerator and wood table and chairs
a one way sign in front of a tree pointing to the right
A person on the street with a skateboard.
A female tennis player in action on a court.
A purse has it's contents laid out on a table.
a man and a woman sitting on a river with an umbrella
Various fruits and vegetables sitting on a table.
An adult riding a bike next to a little boy.
Two people and a dog sit on a sidewalk and watch a commercial bus pass them.
A person is holding a donut with two fingers.
A cat sitting on top of a black refrigerator..
A large tall building with a small bird flying over the top.
A knife point on the surface of an apple.
The little boy is standing in front of the new refrigerator.
A family of zebras and a giraffe in a grass field.
a large group of cattle have been fed fresh hay.
A train traveling down tracks next to a  mountain.
A man with a white beard sits wearing a top hat and a suit.
A vase and two candles sitting on top of a table.
A red truck with a flame paint job.
A brass clock stands in a train station.
a river with frozen sections floating in it
A child holding a dragon kite standing in the grass.
A bird walking on a beach with something it it's mouth
A kite on on the ground on a grassy field
A train is on the tracks that is red and yellow.
A yellow and green bus is going next to grass.
A baseball player sliding in to a base while the baseman tries to tag him out
A train moving along a track during the day.
A snowboarder sliding down a hill in the snow
a close up of some white puffy balls
A mother elephant and her baby walking through the brush.
A group of teens are playing frisbee in a field with a view.
The man is snowboarding down the snow covered hill.
A table with a laptop, a phone and a drinking glass.
A man standing on the sidewalk with his skate board.
A man in a black suit carries a checkered umbrella as he walks on a crosswalk.
A large brown dog holds a Frisbee in his mouth.
A steak covered with seasonings of mushrooms and broccoli.
Night time view of pole with too many signs on it resulting in joke street name.
Airplane flying low over the treeline and field beyond.
Different types of foods and vegetables side by side.
A train prepares to depart from a station.
Birthday cake decorated with a frosting in the shape of a truck.
A sandy beach next to the ocean covered in kites.
there are many people that are riding a elephant
Person doing a trick on a skateboard on side of building as others walk by.
A person sits next to a laptop on the wooden table.
A large hill in a green pasture of grazing cattle.
A MAN DOING A BICYCLE TRICK AMONG OTHER BICYCLISTS
A portrait of a group of tennis players and coaches.
A train on a track going under a trellis.
A close-up of a stop sign in a snowy landscape.
A bus is leading the pack heading towards the hotel.
A pastry with fruit, mug and fork sitting on a counter.
A photo taken from a field looking at a train going by.
A yellow commuter train sitting at a station.
There is a plate of mushroom pizza on a table.
a brown bear walking on the side of the road
a woman is walking and talking on her phone
a bathroom with a toilet, a sink and a mirror
Two men running for a frisbe on a field.
A smiling woman in a formal dress holds an umbrella.
A brown bear walking around in the river.
An espresso machine brewing fresh coffee and a toaster.
A woman riding a bike with a woman on back.
a little dog running up to two bulls next to some bushes
A baseball player swinging a baseball bat
A locomotive engine blowing steam as it comes down a track.
A mime soaked in the blood of the innocent while standing in a park.
A chefs knife and a cutting board with uncut mushrooms and half of an onion.
A woman smiles as she stands in skis on a snowy hill.
A couple of people that are in the snow.
A street sign reads "Jack Kerouac" on a street corner.
A row of bikes sitting next to each other as people ride bikes past them.
A fire hydrant located in a clearing in  the woods.
A prepared pizza is sitting on a table.
A woman in a cowboy hat and Texas flag on a horse.
A skateboarder is mid air doing a trick
The gourmet pizza includes several very special ingredients.
A person in a wet suit riding a wave
A green motorcycle sits parked by a gas station.
a pair of scissors with long shears sitting on a pattern
A pond of water with three giraffe walking in the dirt.
A person sitting on the sidewalk holding an umbrella.
The giraffe is standing alone in the wilderness.
A woman holding a banana in front of her mouth.
A satellite dish is near the produce hanging above a door.
A fake zebra is shown in the lobby of a hotel.
Guacamole sits on a white plate with a garnish of shredded carrots.
Several people walk out of a bus onto the street.
A park bench has four people sitting on it under a large tree.
A group of people standing on top of a sky slope.
A red garbage truck and a man behind it.
a close up of a laptop and a mouse on a small table
A white toilet sitting next to a white sink.
An orange cat trying to look underneath a closed door
Cat sleeping in a high chair in the kitchen.
A male tennis player in action on the court.
A group of five sheep standing in a row
A white refrigerator freezer sitting in a park.
A bird sticking it's beak in the water.
A motorcycle has a paint design in green.
A couple of brown bears sitting and standing next to a brick wall.
a person riding a surf board on a rivier
two people in a kitchen area preparing food
a couple of white couches in a room
All of those bikes look exactly the same.
A giraffe standing in front of trees and an open field.
a big kitchen that has a lot of open space
Many pictures and toys are posted in the office
A dog is running around some cows in a field.
A broken pair of sissors with a half of an orange handle.
A person holding a smart device in their hand.
A jet fighter sitting on top of a field of green grass.
The young giraffe are eating from a branch.
A woman looking at a group of giraffes.
Two planes parked next to a runway on the grass.
a white dining table and two chairs by a window and a cat in the corner
A COUNTER FULL OF DESSERT INGREDIENTS AND BEER.
Two brown horses in a pasture eating grass.
A baseball player is on home plate with his bat.
Three zebras are standing together in the dirt
a man doing a rail slide on a skateboard
A young Giraffe enjoying the sun on the grass.
A street lamp is on a street with a sign and flowers.
a desk that has a computer and a keyboard on it
The lady is holding a baby eating dessert.
A black bear in the background on a grassy slope.
People are standing and sitting near the street.
Kitchen with silver appliances and brown cabinets.
A side mirror of a vehicle showing a street sign.
A large bus on a open city street.
A teddy bear is being tied on a pose with pink ribbon.
An orange and white cat is sitting in a car.
A fenced in area shows two leafy and low-hanging tree branches, casting shadows, and making shade for two horses that are grazing at some patchy grass.
A plate with some meat, bread and salad on it
a person trying to get something out of a plastic case
Row of black suitcases on a wooden floor.
A man holds a candle in one hand and an umbrella in another.
A red bike is parked between others as people walk past
Someone in the air on a snow board
Lots of construction materials at a childrens park
Three giraffe standing next to a man in front of a blue barrel.
two men and two women receiving some kind of reward
An old picture of three student in the library with there teacher.
Broccoli, green beans and various other foods in a tray.
An older gentlemen is wearing a black suit with a white shirt and tie and a red flower in his lapel.
A street corner view from the bottom of a clock tower.
The lunch was in a box and had carrots, berries, grapes and a sandwich.
A picture of a meal of artisan pizza.
Two riders on the backs of horses riding along the beach.
A tour bus making a right turn as people wait.
A bedroom with a window, armoire, chair and table with plant.
Group of four people standing and playing a video game.
Urban area intersection with traffic signals displayed at sunset.
A cat sitting on a bench in front of a building.
a person holding a surf board in a body of water
A snowboarder is at the edge of an outdoor jump.
A person riding a bike on the road near some stores
A man stands with a beer in his hand.
An assortment of veggies sitting on top of a wooden table.
A surfer's surfboard is going straight up on a turbulent wave.
Penguin balloon, an orange, coins and beverage at computer.
A bride and groom walking from the church with umbrellas.
Pedestrians cross at a crosswalk in a crowded city.
A row of parked buses sitting in front of a buiding.
a close up of two slices of pizza on a plate
A finger that is pointing at bread on a plate.
Plastic bento box lunch example with fresh food
a lady on a phone sitting on a couch
A large continental jet sitting on a tarmac at an airport.
A man propped up against a bike looking at a cell phone.
A collection of sailboats docked in a harbor.
A man standing under a street sign looking at paper.
a man is sitting in front of a small cake
A man pouring a drink into a glass while a woman watches across the counter.
A very cute looking girl on a cell phone.
Woman standing on the porch holding a tennis racket.
a close up of a dog near a door way
a couple of signs are hanging on a wall
Three people are handing bunches of bananas to a fourth person.
A young man roasting a chicken in an oven.
A girl raring back at a soccer ball on a field.
A sandwich cut in half sitting on top of a wrapper.
There is a little girl playing with a ball.
a baseball player that is standing at home plate
Four dogs playing with a Frisbee on a lawn.
A Soutwest Airlines jet airplaine taxiing along a runway.
A white bed that is in its room.
a giraffe walks through a bunch of bus
A little girl standing on skis in a snowy area.
A small front is lying down on the leaf.
A beautiful young woman standing on a tennis court.
a bathroom that has a sink and some lights
A woman in black jacket sitting in snow with snowboard.
A person standing looking at a  large statue with clocks built into it.
A stop sign with people walking down the sidewalk.
A young woman riding a skateboard at a skate park.
A bus is stopping to pick up people in the snow.
a group of young people watching a young boy skateboard down a rail over some steps
A group of cow standing in a patch of dirt in a pasture.
A woman that is sitting near a coin meter.
a toilet and a bidet sit in a bathroom next to a garbage can
The herd of sheep is walking near cars on a street.
A cellphone sitting on a table with a cup.
a white bus is driving on the dirt
A man riding a skateboard down a street.
two people standing in the snow mountain with their skis
A couple hugging each other to pose for the camera
A skateboard that is sitting on a beach.
a women that has a large pizza on a table
The horse looks at the camera while the people talk amongst themselves.
a man doing a trick on a skateboard going down a hill
A man holding a box of food while wearing glasses.
Two guitarist playing while people sing in the background
A small car is parked in front of a scooter
A lady flying a kite with a black dog nearby.
a lady o a urban street holding a see through umbrella with two men standing behind her.
a half of a pepperoni pizza on paper
A young boy that is holding a baseball bat.
two people playing basketball at an apartment complex
A street scene looking down at cars and motorcycles parked.
a man standing while attempting a trick with a white frisbee
The zebra is standing alone grazing in the grassy field.
A group of women are walking with cups.
Two odd-looking birds wander around in a field.
The blue bus has arrived and parked on the side.
A small brown ukulele sits on a small wooden table next to a vase.
A man against a concrete wall talking on a mobile phone.
An airplane hooked up to the umbilical walkway at an airport.
A guy letting a bird eat from the palm of his hand.
Three people stand around a small aircraft on a wet runway.
Three zebras graze in a field with grass and trees.
a large hotdog with lots of mustard and a Hawaiian punch soda
a group of boats parked next to a dock in the water
A young boy swinging a baseball bat during a game.
A beautiful young woman in a bikini feeding a baby food.
A airplane that is in the sky near clouds.
A blue and red tour bus standing by a building with a tile roof.
a sign with soem names on the top of it
a disk with a computer sitting by two windows with a view
elephants at the zoo standing in front of a waterfall
A spoon next to a plate with fish, rice, beans and broccoli.
a line of people that has skies on
A couple skiers on a snowy mountain side
The handle bars in the restroom are sturdy.
A split image of two different women holding a object resembling an arrow
A man standing next to a little girl on top of a field.
A group of cats looking out of a window.
A boy swinging a tennis racquet on a court with other kids.
a red white and black sign of a man working
A wooden paneled door opens to a spacious bathroom.
A white-bearded man stands holding a puppy and a stuffed animal.
A pocket sandwich filed with meat, cheese and a pickle.
A cat sitting next to a bowl filled with water and roses.
A white plate topped with a pizza and a knife.
a female in a red dress is on a bed with a laptop
A white plate topped with meat veggies and rice with sauce.
A piece of bread sitting on top of a plate.
A sandy beach covered in lawn chairs with blue umbrella over them.
two people in the air standing on snow boards in the snow
A man is at a table with three plates of food.
a person standing next to a truck with its hood open in a parking lot
A group of seagulls are flying over a wooden dock that is sitting in a lake during the early part of the evening.
a person cutting a pizza with a knife
A brown puppy passed out after drinking a bottle of coke.
A large adorable cat resting on a big soft pillow.
A sheep is laying on its side while another sits against a fence.
there is a pink fridge and a pink stove in the grass
A train on the tracks next to a wooded area.
A room filled with furniture and boxes and clutter.
A man does a handstand on his skateboard.
A man sitting on a kitchen floor has tools spread out beside him and is holding a drill.
A couple of zebra standing in the tall grass.
A stop sign covered in stickers next to tall buildings.
A man and a woman on a touch looking at a smart phone.
A boat drives in a large body of water.
This plate has meat, broccoli, and a potato.
a number of sheep in a field with dogs
A man preparing food in a kitchen on top of a stove.
Several children are playing in a fire hydrant.
A small kitten lies next to a laptop.
Two sheep grazing in a field with buildings in the background.
A very tasty looking dish with some assorted veggies.
A top of a building that has a clock and is flying a flag.
A digital clock on a bus can be seen above people's heads.
this is a man standing in a field
A man on his stomach in a white bed.
Two cows are sitting on an open field during the daytime.
two men stand in the sand in a baseball diamond, while one hands the other a bat
Small boy holding up broken umbrella Ina parking lot.
Three blue pieces of luggage stacked on top of each other.
There is a toilet with the seat up in the bathroom.
A woman riding a gray horse in the middle of a street.
A person stands between two tents set up inside of a cabin.
Cow tethered with chain eating hay in outdoor field.
A young boy standing on a grass covered field under a flying kite.
A man standing on the beach next to a surfboard.
a close up of a pot of flowers with a box of flowers
A person engaging in a water sport with skis on.
A woman with skis is standing on the snow.
a boat is docked in some water next to a house and a bridge
A skier skis down a slope, with blue and red course markers in the background.
A vintage photograph of a man riding a motorcycle.
A public bathroom sink and hand drying area.
A train engine is pulling cars down a stretch of track.
A oddly colored zebra laying down on the dirt
A plate with food on it next a a spoon and some more plates.
Two children and a woman on a play-mat in a living room.
A wedding cake with a bride and groom on top.
A group of elephants marches down the city street in front of a large building.
A woman smiling and talking on a cell phone.
A group of zebra standing on top of a dirt field.
a room that has a bunch of beds in it
Sheep are gathered around a lone tree on the hill
Various lights on the front of a white vehicle.
a car with a mirror view of a dog walking behind it
A man standing in the doorway of an umbrella and parasol shop.
A man with a glove that is in the dirt.
A train driving down the tracks near trees and a building.
Two baskets on a table underneath hanging items.
A pink plate with white polka dots and a slice of chocolate cake and white frosting.
A giraffe walking across a dry grass field.
A train on the tracks at a train station.
RED, WHITE, BLUE AND YELLOW TRAIN COMING DOWN THE TRACKS
Ambulance and fallen over motorcycle from viewpoint of injured.
A bench and trash can are seen in this picture.
A person skateboarding on a street barefoot with one foot up
A living room in a well decorated house.
A horse eating grass next to an old fence and building.
A couple of large white airplanes and trucks.
A empty bench in front of a green bush up against a building.
Two horses pulling an older styled coach passing a home.
A close up of stuffed animal bear face.
A man tying a windsor knot in his tie.
A long table covered and used as a desk
The men are in the bathroom using it together.
A food entree is served on a plate.
Two smiling men are cutting into a cake.
An old advertisement for Maxwell coffee with a family sitting around a table.
A dog preparing to catch a frisbee in its mouth.
A couple of people laying on top of surf boards near the shoreline.
Commuter bus at roadway intersection in urban area at dusk.
a man is riding down a ramp on a skateboard
An man and a young girl on a motorcycle.
a magazine cover showing a man getting ready to kick a soccer ball
A couple of boxes filled with lots of donuts.
a black cat next to a box of fruit and vegetables looking up at the camera
A bird is standing on the shore next to the water.
The dog is laying on top of the couch.
A group of people cutting a cake with a sword.
Man on the back of a surfboard riding on a wave.
an image of two people on the beach
The large bathroom mirror is clean and spotless.
Young boy throwing a ball up and catching it
Surfer and black outfit coming down the front of a wave.
Two striped zebras are on knee high grazing grass.
A little girl putting a blue umbrella over a yellow fire hydrant.
A person in black is skiing down a snowy hill with trees.
A man that is being pulled by a boat on a board.
A group of children in a classroom with windows around.
THERE ARE MEN SIGING WITH ALL OF THEM WEARIGN YELLOW TIES
A picture of a empty street very late at night.
a room with a big chair with some boxes behind it
The teddy bear was posed at the table as if he was drinking.
The person is flying a kite at the beach on an over cast day.
A woman displays a homemade pizza dotted with mozzarella and herbs.
A red traffic sign next to a uphill alley.
A woman standing next to a building holding a phone.
Four men jumping into the air to catch a frisbee
A gray and white bellied bird stands on a branch
A man walking down a sidewalk next to a busy city street.
a group of signs that are next to some trees
A living room with a corner chair and a scatter rug.
Two men in pajamas are holding Nintendo Wii controllers.
A large red fridge is sitting on the red carpet.
A small young child is holding an umbrella in the sun.
Different styled sinks next to each other under mirrors
A woman cutting a cake at a bachelorette party.
A man with a surfboard in the ocean.
A couple of kids laying on top of booths.
This is a road sign for La Brea Ave
A baby giraffe standing with other young giraffes in captivity.
A laptop computer sitting on top of a bed.
A female skier competing in a skiing competition.
A large pizza prepared and ready to go in the oven
A man poses and smiles while holding a doughnut.
A plate of sausages, bread and butter, and potato salad.
a person holding a surf board on a beach near the water
A train going down the train track.
A man sitting at the kitchen counter looking at a picture.
Police car is parked in front of a hydrant
A blanket with various items that include a mouse, computer hard drive and a keyboard.
Nine men pose together near a coach and a dog.
Woman deliver serve in a professional tennis match
A cake is frosted with a surfing teddy graham on the side.
A young man is surfing behind the giant wave.
Several people are standing around a decorated elephant.
A large cat sits on the sofa arm next to a girl using a computer
A red car is parked next to a black truck.
The young girl smiles holding a donut with sprinkles.
A man holding out his white eight bit tie.
A cow makes its way down the street next to city traffic.
The meal consists of beef, brocolli, and other vegetables.
A woman in a red coat can be seen in the background talking on a phone.
There are Indian people riding in a cow drawn carriage
A woman swinging a tennis racquet at a ball.
A person walking a dog on a sidewalk lined with vehicles.
a motorcycle with two people driving by a car
Some white cattle roaming down the street of a town.
a black cat with it's head stuck in a boot
A white and blue bus driving down a road next to trees.
some fog traffic lights street lights and buildings
A bus on the side of the road in traffic.
A giraffe sits in the grass next to horned animals.
People being social outside a large colorful amusement tent.
A skateboarder jumps very high at a skate park.
A child watches an animal on a rock platform in a zoo.
A skateboarder skating off the  top of an outdoor stairway.
It is surprizing that these flying kites don't get tangled together.
there is a lot of old stoves on the ground
two big red double decker buses on the road
A giraffe looks like a statue in the dirt.
A woman and two teenagers are holding on to a stop sign.
Two young men in dress clothes and ties standing in front of an outside door.
Three empty wood benches sitting in a woody area.
A kitchen area with a stove, microwave and counter space.
some buildings and some boats are docked in a harbor
A guy is going up the ramp with a skateboard.
Two men are playing Frisbee in the park.
Two people, most likely a couple, are on the bench.
a black bear walks through the woods in the distance
A bathroom scene with a sink, toilet and shower.
A bathroom with a toilet, sink, mirror and shower stall.
A person with a surfboard walks along a beach.
a couple of people are skiing down a snowy hill
People standing at a table putting toppings on their hotdogs.
The blue white bus sign next to the trees on the campus.
The silver refrigerator is across the kitchen from a black stove.
A stop sign that is right by a road.
A man swinging a tennis racket at a ball on a tennis court.
A street is displayed at night with time lapse photography.
An old school bus painted white with curtained windows parked under a freeway
A small elephant is standing next to the other elephants
You male poses against stone wall with leg up.
The sun sets over the trees beyond some docks.
A full view of a market place full of sheep and items.
A grey and white cat laying behind a laptop.
The contents of a refrigerator filled to over flowing
a black bear pokes its head out of a field of tall grass
A single piece of pizza sitting on a paper plate.
A man in a suit carefully adjusts his tie.
a bunch of boats all lined up on a dock.
A man sitting on a couch holding a Nintendo Wii controller.
A sink that is in front of a mirror.
Three young girls holding ribbons in the snow.
Some kids are talking together outside of a house
A black and white cat laying down resting its head on a cushion.
Two zebra standing next to each other next to a tree.
A large clock suspended over a street sign.
An adult black horse and a young brown horse interacting.
A train sits on the tracks at an empty train yard
Two men playing a video game as other look on.
Toilet next to a sink with it's counter cluttered with bottles of lotion and stuff.
a sandwich sits next to some fries
A horse pulling a carriage wearing a straw hat
A bunch of bananas on a small chair.
A man riding a skateboard being towed by a woman on a bike.
a suitcase with writing on it sitting next to a guitar
A black cat laying down on a laptop.
A young black cat resting on a colorful surface.
A plate with two hot dogs covered in slaw, and french fries
At sunset, a surfboard upside down on the wet sand.
A man holds a toothbrush in his mouth.
A vintage baseball team of ten pose for a photo.
A group of people standing in the dirt near large tents.
Two buckets with a bowl sitting between them.
A recently remodeled kitchen with marble and wooden furnishings.
A bus is passing through a city intersection.
A person making a strange face at a very large pizza pie.
A table with a laptop, phone and other devices sitting on it.
Two people working at a market with oranges and apples.
A group of people are playing soccer on a soccer field.
A small dog chewing on a teddy bear
A man holding a Nintendo Wii game controller.
Modern espresso machine on counter in residential kitchen.
A brown horse is on the grass with two people.
there is a surfer that is walking towards the water
Two zebras standing near a pile of sticks and a wooden fence.
Three women and one man wear various skis on their feet while wearing swimming clothes.
A thirtieth birthday cake with candles on it.
A zombie apocalypse is happening on the street.
A dog is on a beach with people in the background.
a person bends down to put air into a car tire
The people are having a discussion about cell phones on the table.
Antique black and white photograph of a couple on their wedding day
A woman using a laptop computer on top of a wooden table.
Man folding banner while holding stick in unfinished carpet
A herd of sheep crossing the road under a cloudy sky.
some old wooden doors decorated with scissors for handles
A toilet with a wooden seat is open.
There is a large cooking pot and some staples on sitting on the shelf.
A person is holding up a large colorful umbrella
A man on his bike is between the busy traffic, including two buses.
A girl is holding the strings to a kite.
A snowboarder in the middle of a jump, with a mountain in the background.
A yellow fire hydrant sitting in a plant with a green top.
A building with a clock on the front and side of it.
A truck that is in front of a building.
To buses side by side with one being a double Decker bus.
a woman some pizzas drinks and bottles and bowls
A group of zebras are with a group of giraffes.
A baseball player mid swing during a game.
A bowl fo soup sitting on top of a wooden table.
A person reaches for the cabinet as the cat sits in the sink.
A large clock is on the colored wall of this building.
A woman underneath a umbrella on a street.
Some very pretty giraffes standing in some trees.
Small boats sit unused in water by a dock.
A neck tie that is knitted or crocheted from yarn.
A cat leans halfway off of a bed.
A group of people standing outside of a building
Two side by side zebras are near the tall grass.
The people are sitting down together having a meeting.
A baseball pitcher pitches a ball while standing on a baseball field.
A dog sits by and watches his owner.
A bench sitting in front of a brick wall on a patio.
A thomas the tank train traveling down tracks.
A mirror hanging on the wall reflecting a toothbrush.
A passenger train that is pulling into a station.
a room with a brown sofa,computer on a table next to a window and a red book shelf
A jet that is flying in the sky.
A white toiler in a very small bathroom.
A plate with a variety of Indian food on it.
A white dog sitting on a ledge of a window.
A man and a woman sit on a bench overlooking the water.
a male is on his stomach riding a wave on a surfboard
A woman standing on top of a lush green field.
Open packed suitcase with too many extra clothes to fit.
A woman dressed in military uniform speaks to a child.
Sheep are grazing in the fenced in area.
A man sitting at a table eating pizza slices
A cat lies on a laptop and paws the keyboard
A cat is sitting on the floor staring at the TV.
A tennis racket being held by a person and balancing a tennis ball at the top of the racket.
A white train colliding with a black car.
THERE IS A WOMAN WALKING WITH AN UMBREALLA
A man pushing a luggage bard through the middle of an airport.
a girl is getting close to a giraffe
A green bus near a curb in front of a brick building.
A baseball player on the backswing of hitting a pitch
A bedroom with wooden floors in an apartment.
A brown stuffed teddy bear wearing a red bow tie.
A man throwing up in a toilet, with his head in it..
A man doing a jump over a wave on a surfboard
A bowl of apples and tangerines on a table.
A man sitting on a big white horse.
A cat enjoying the warmth of a laptop.
A person making food inside of a factory on a machine.
A child wearing skis stands on snow and smiles at the camera.
a pan that has a big pizza on it
A truck in the middle of the street.
A man showing a women an image on a projector.
A freshly made pizza sits on a cutting board and pizza wheel.
A small child heading down the mountain on a snowboard
A mid sized transport plane sitting on a tarmac at an airport.
A man is standing in the street near a frisbee.
The boys are standing beside a group of motorcycles.
A person holding a dog's leash and looking at books.
a truck on a city street in front of another vehicle
A showroom in a high end furnitureinterior design store.
A man in a sports jacket is sitting in front of a microphone.
Airplane being loaded at a terminal on a cloudy day.
Spectators watching a professional baseball game's action closely
A man standing next to a woman with an open umbrella.
A man in a baseball uniform hitting a ball.
An apple on the ground, and an orange on the ground in a picture beside it.
Man riding a bike on a wet street in an urban setting
The skier in the red coat is doing a flip in the air.
The furry cat is looking at it's own reflection in the mirror.
A slice of pizza sitting on top of a white paper plate.
A woman walking down a street holding an umbrella.
Bridge and groom walking down a path surrounded by a crowd.
A man smiles as he plays a guitar.
A batter and catcher during a baseball game.
Teddy bear in sweater sitting on shelf near plant.
Two horses on sand face each other while one urinates.
a clock that is on the outside of a building
A person riding the waves on surf board.
Old fashioned furniture arranged around a parlor on an oriental rug.
A wooden table that has several types of pastries sitting on it.
A white and black cat standing partially in an open refrigerator.
A man with glasses is wearing three ties while holding a camera.
A black cat resting in an flower pot
A bunch of fruits and vegetables for sale on display
A bulky laptop computer on a desk near a lamp.
The people are waiting for the train to get there.
a brown and black ox and a white and black one and grass
An old bathroom with a sink and toilet.
a close up of a clock on a pole near a building
A blurry dog holds a frisbee in it's mouth.
The two elephants are very close to each other.
A couple of people are riding horses on a beach.
A few items laid out on a towel on a table.
A man looks at a hot dog he is eating.
A herd of zebra grazing on a grass covered hillside.
A beach with people surf boarding in the waves.
people walking pulling their bags and the security looking at them
A young child enjoying a serving of cake and ice cream.
A living room with a computer desk in one corner, a coffee table and television.
A teddy bear with no face made from denim.
Black and white photo of woman on chair holding strap of leopard or cheetah skinned hand bag on ground.
there are many people snowboarding down a hill
Chef at counter with baked goods, baking pans and containers of toppings.
Oranges and lemons sitting together on a white plate.
A group of people sit on a boat on the water.
A woman with short hair looks at a cell phone screen.
a yellow sign of a person carrying a surf board
A close up shot of horse, with it's baby in the back.
A white plate topped with eggs, sausage and a cut in half tomato.
A dog running in a field with people around.
A room filled of shelves topped with lots of items.
three baseball players on a dirt baseball field
a wooden table with the tail of a cat and a plate of cookies
A giraffe standing next to a tall wooden pole.
A woman stands beside a pony wearing a blanket
A beautifully appointed bathroom with classic color and amenities
A green and white bus driving past a building.
a fryer that has a bunch of doughnuts in it
A man uses his laptop on a kitchen counter.
A train that is riding on the tracks near the street.
A child holds at bat at a baseball game while people watch in the background.
A close up photo of a brown bear.
a brown bear standing in the shade in the wood
The people are trying to climb the mountain.
A man breaking slices of pizza on a pan
A yellow school bus reflected in a side mirror.
Four red birds perched on a branch in front of the clock tower.
Man in black blazer pouring wine in glasses.
A young person holding a frisbee while standing on a field.
A bearded man in dark clothing sleeping on a sofa.
A black cow is looking over a grass covered chain link fence.
a female playing tennis on a clay court.
Four boys dressed up one talking while the other's are listening.
A group of young children sitting on top of a bean bag chair.
a man surfing on his surf board  doing a trick
a bathroom with a sink, mirror on a tiled floor with a door open
A man is holding a bunch of green bananas in his yard.
3 dogs sitting in front of a fruit and veggie stand.
A train sits in a train yard with an animal.
The young child is learning how to ski.
A MAN IS ON HIS SKATE BOARD IN THE PARK
Small bathroom with toilet, bath tub and sink.
Two giraffes are standing by a tree and eating.
A little girl riding a horse next to another girl.
a guys tie all up closes its black with strips
there is a cat that is sitting on the kitchen counter
A teen-aged boy standing near a jail replica.
A boy holding spoons over a pan filled with food.
Two buses next to each other in front of a fence.
A businessman showing off a unique red tie.
A cat sleeps in the sunlight beside a computer.
A woman who is holding her little dog.
A woman is crouched next to a suitcase on a city sidewalk, she is surrounded by people standing over her.
A man standing with an umbrella in one hand and a flashlight in the other
A red European passenger train sitting on the rails.
a red truck parked on a bridge with people in the back
A small plane sitting on top of an airport tarmac.
A man folding his towel on the beach while his dog stands in the sand.
Ben clock made as a model with bystanders walking by.
A young man tossing a frisbee in a  forest.
A man that is holding a frisbee in his hand.
A fire hydrant is surrounded by and covered with snow.
Ten people and their dog pose for a picture while skiing.
A blue clock with clear leaves coming out of it.
There is a tower with a clock at the top.
A boy laying on a bed with a black kitten.
a little toy fire engine sitting on the ground outdoors
Two men in military uniforms holding a large key in front of a house.
a group of people excited to eat pizza
A guy on a skate board near some graffiti.
A cutting board with a long pizza and knife on it.
A picture of a fire hydrant on the side of the road.
A child's highchair has a little cat in it.
A cat is laying inside a briefcase in a room.
A group of people in white lab coats leading a group of cows.
a white box with different kinds of donuts
A bunch of stuffed toys inside of a homemade castle
Two white toilets, white towels, and a shower.
there are two zebras standing next to each other
A computer mouse sitting on top of a table.
A large bird sitting on top of a speed limit sign.
Many people walk down the street with umbrellas in hand.
Young boy taking swing with bat outdoors in play field.
A man is on his roof with a large umbrella.
A skateboarder doing a stunt on the edge of a ramp.
A cat sitting on top of a shelf by a computer.
Multiple vehicles parked curbside next to parking meters.
A little boy that is standing on a skateboard.
Two people are looking at a truck while a dog is being walked.
The person ski's downhill on the mountain of snow.
Large number of snow skiers at the bottom of a slope.
a herd of zebra standing next to each other.
Two young men retrieve plastic flying discs in the park.
A large sandwich on some paper by a knife.
A PICTURE OF A BATHROOM WITH A PLAID SHOWER CURTAIN.
A green pan that is on a stove.
A man brushing his teeth in front of a mirror.
a large pizza is sitting on a pan
A zebra stands near a giraffe in the wilderness.
A man flying through the air while riding a snowboard.
Baked pizza displayed on serving dish with beverages on small table.
There is a family out on the ski slopes.
A sign that reads public market center is shown.
Young man looking into the inside of a refrigerator through bottles.
a small girl in a white shirt and another person
a dog is under a man with a laptop
lady wearing work out clothes and glasses with a cat in her lap
A clock on a stone tower is against the blue sky.
Large striped zebra walking down a patch of grass.
A man standing on top of a beach under a cloudy sky.
A couple of elephants standing next to each other.
A messy baby eats the broccoli off of the table.
A man plowing the field with two horses on the country side
a group of females standing in a grassy field playing frisby
an intersection with different poles filled with street lights and a camera
Panda bear climbing tree with paw over limb.
A man with glasses playing with a Nintendo Wii.
A giraffe sitting on a rocky dirt and grass covered ground.
A man on a phone on a ddr pad
An orderly bathroom is seen in this picture.
A man standing with a dog in a field of grass.
The person with the bag is walking down the street.
The elephant family is walking down the road.
A skier in all white standing in the snow.
The grinch riding a motorcycle with a small dog with antlers.
there is a withe toilet and the tub has a blue curtain
A elephant fenced in a large land area .
Identical street signs pointing in the opposite directions of each other.
A man and a young girl on a motorcycle.
a man with a white beard and hat on a cellphone
A person with their feet propped up by a flower vase and couch.
A living room arrangement looking into a kitchen and dining room.
Two surfers-are in the Ocean one stands and look's at his board
A man flying through the air while riding a snow board.
A small kitchen with a stove and refrigerator.
A man swinging a tennis racquet at a tennis ball.
A giraffe looking alert at the camera in a field.
A view of a shower and toilet from above.
Two men standing in a living room holding Wii remotes and nun-chucks.
Emotional person hugging a stuffed bear while sitting in a plain room.
A street sign with two streets and two block numbers.
A herd of sheep crossing a bridge over a river.
A small bedroom picture taken through a fisheye lens
A picture of a person fixing a road sign.
the woman is sitting at a table in a purple chair
A very cute elephant covered in mud in some tall grass.
several people play video games with remote controls
A group of people taking pictures of two pizzas in open boxes on a counter
A white toilet sitting next to a bathroom sink.
A hamburger and fries sitting on wax paper.
A nice hotel has a full living suite
A A bowl and a sandwich on an orange plate on a table.
a tennis player swinging a racket to hit a ball
A group of colorful umbrellas sitting next to each other.
A picture of some trash being wasted in a trash.
A truck driver adjusted the straps on his load.
Two groups of people rowing in boats side by side.
A young man riding a motorcycle having a good time.
A girl walking behind an open fire hydrant spraying water.
a woman is petting an elephant and a fence
The man is holding up his chat pad in his hand
A man looks into the mirror as he styles his hair.
A refrigerator and table and chairs in a garage.
A boy with a racquet swinging at a tennis ball.
A peanut butter bagel is sitting on a white plate with several other food items surrounding it.
A white bed sitting next to two windows.
A giraffe putting it's head in a leafy green tree.
A bird sits in a fruit tree with many leaves
Three teddy bears dressed up for Christmas on display
A maroon vehicle stops at the stop sign.
A woman spooning cookie dough onto a cookie sheet.
The silhouette of a group of people and a horse.
A boy in grey shirt sheering a sheep by wall.
An army jeep with an American Flag sitting at an airport.
A young boy skinning carrots into a sink
A happy stray puppy lies in the street.
A street  sign on a busy sidewalk corner
An oreo cookie and chocolate dessert on a plate.
a display shelf with a few bananas on it
A man in a pink bow tie and a pink shirt is being hugged by a man in a blue shirt.
Two street signs indicating no parking or towing.
A picture of a bunch food sitting on a table.
Several bicycles sit parked nest to each other.
A vintage airplane museum, with people walking underneath displays of WWII-era planes in a hangar.
A group of people on skies with contestant numbers.
two boys are playing a video game and people are watching
The side of the building has a large clock and several windows.
a group of people standing playing nintendo wii
A tabby cat is laying in an open packed suitcase.
A skier putting their feet in the skies.
A chocolate bunt cake is adorned with cashews.
A group of giraffes on a jungle path.
A passenger jet rolling along a runway at an airport.
Several vehicles are stopped at an intersection behind a red light.
A young man performing a skate board trick outside.
a bright day and skiing in the mountains
A woman in shorts and heels waiting on a train platform
Traditional looking around the umbrella girl with old clothing.
Small piece of bread and a donut sitting on a white napkin.
A man sits on a surfboard in shallow ocean water
A clock on a tower in the middle of a brick building.
The women sits in shade working on her laptop.
Dad, son and teddy bear are all smiling and happy.
an image of a baby eating a spoon
there are people sitting at a table using lap tops
a living room with a person playing with a kid
Chocalate covered deserts on a stick on the table.
A small white-and-brown dog curled up on a flower-print pillow.
three people standing at the zoo watching a elephant
A man riding skis down a snow covered slope.
A train crossing the road with cars waiting.
A man is wearing a pink shirt and a tie.
The two airplanes are close on a runway.
A bowl of chicken, lo mein noodles and vegetables.
A crowd of people mill about on the street.
Two people skiing on a snowy mountain with a building in the background.
A man on a surfboard performing a trick.
A young boy flying a kite near a house.
A minimalist room features white appliances and beige walls.
There are two horses walking in a grassy field
Two brown horses pulling a carriage as people sitting on the side of the road watch.
A man shaving his face with another man hiding behind him.
A kid laying down with a stuffed dog on him.
a group of three people talking to each other on the sidewalk with a skateboard
Two giraffes standing next to each other in their natural habitat.
A man flying a kite in an open field under cloudy skies.
A woman is sitting on a canoe going down a river.
A group of people with surfboards enjoying a small river.
a cat with its hair sticking out as it looks at a dog by the window
a polar bear swimming in the water by a wall
A kitten that appears to be focused on a computer mouse.
A group of men standing next to each other.
Three packages of toilet paper sitting on top of a toilet seat.
A motor scooter has multiple rear view mirrors.
A little girl crawling out of a piece of luggage.
Two men with racquets on a tennis court.
a little girl sits on a bench by herself
An industrial kitchen has a double oven with glass doors next to a shelf of dishes and utensils.
Many laptops and their assorted wires atop a wooden bench.
Colorful Adirondack chairs at the end of a pier.
There are four goats and one giraffe standing in a group.
Purple  orchid and colored leaves in a green vase.
a bunch of different colored vases on a table
A giraffe and a baby giraffe standing in an enclosure.
pink double decker bus with two woman pictured on side
The view inside a suit case, and a backpack.
A dog standing on top of a boat in a body of water.
A group of men doing tricks on skateboard next to ramp.
a close up of a buses rear view mirror
There is a bowl of food with bread and a plate of fruit.
Three children sitting at a table with food and drinks.
Stop sign at the intersection of two rather rural roads
horses graze and drink from the water at a lake
a black and white dog is herding some white animals
A large orange striped cat laying next to a computer keyboard.
A brown cardboard box with glazed doughnuts and wax paper.
a bed sitting inside of a bedroom on a wooden floor.
A person holding a pair of scissors in one hand.
A small group of giraffes walk across the savannah.
A bundled up woman skier falling in the snow.
A person riding a horse and wearing armor in front of a crowd.
A table with two drinks and glasses flanked by two chairs.
A display case with various types of pastries.
A couple of cats are sitting next to a dirty door.
The guy with the white shirt and baseball cap is milking the cow.
A striped plane flying up into the sky as the sun shines behind it.
A man's torso wearing a brown patterned tie, pens in pocket and a large checked shirt.
A slice of strawberry cheesecake on a plate with a fork
The buffet features several different types of pizza.
A man is leisurely crossing the street on a skateboard
The cat is wandering around in front of the cardboard boxes.
A view from a house looking outside at the front of a black car.
Freshly cooked food and salad on a paper plate with a fork
a airplane that is flying through the air
The right hand of someone unpacking a Wii remote and sports games
A large clock is displayed on the side of a building.
a close up of a plate of food
Young child playing baseball in a local park league
A plate loaded full with well cooked food
Two women and a man posing for a photo on the dance floor.
Zebras and wildebeest walking in their natural environment
A female Tennis player is holding her racket while the crowd and man look on.
A man is cooking a pan full of various foods.
A white dog in grassy field with red frisbee.
A clock and a picture hung above a big window.
A gooey piece of pizza with peppers, cheese and onions.
A fire place sitting inside of a living room.
a woman walking down the street with a baby carriage
three groups of yellow flowers in vases on table
Two males are watching something on a camcorder.
there is a toilet with dirt on it
A baby laying on its tummy on a bed is looking at a blue elephant.
A laptop next to a wall in a room.
A jet airliner leaves a faint trail of smoke during landing.
a laptop sits in front of a group of people
Jet parked with no one around in the area.
A white polar bear is laying in the snow.
A man is riding a motorcycle across the sandy shore line.
A group of children sit on a bench outside.
A group of people standing near surfboards in the sand.
Small children wearing a cast holding up a Wii controller.
A man is playing tennis on a dirt court.
A man eating food while wearing a gray hat.
a man that is cutting a pizza that is on a stove
A sleeping black cat sitting on a pizza box.
Three doughnut holes sit on a white plate with a doughnut that has been topped with topping and drizzled with sauce.
A store with items on display in it's front windows.
a person holding an apple near a tree
A woman on a cell phone sitting on the ground.
A man handing another man something inside of a room.
A person riding a skateboard while wearing blue shoes.
A bunch of people waiting on a subway train.
A horse drawn wagon driving down a dirt road.
A man standing over a table presenting food.
A city street filled with lots of traffic.
People standing on surfboards on waves in the water.
A close-up of a metal statue of a bird landing on the nest.
An owl among a few leaves, next to a wire fence.
A women in a blue shirt cuddles up with her cat
Two girls looking at a calf in a fence.
A man in a grassy field about to catch a frisbee.
The food is a mixture of pizza, salad, and wine.
a group of people walking on a city street
An old fashion looking clock tower near some bright lights.
A person stands under an umbrella on a sunny day.
The is a line of elephants in the street.
An abstract designed bowl holding a bunch of oranges.
A couple cargo trucks parked outside of a few shops.
A man riding his surfboard through the waves.
The head of the black and white horse has a red decoration.
An elephant standing in water and surrounded by grass.
A group of people standing around a green tent next to a horse,
A clock tower in the middle of a road.
a clock on a wooden pole in the middle of a beach
A little boy brushing his teeth with a tooth brush.
A double decker bus driving down a road.
A woman playing games on a laptop computer.
Closeup of a pastry with white and brown frosted petals.
A bird perched on a wooden peg ready to take flight.
A teenager standing on a ramp while holding a skateboard.
A young child riding on the back of a sheep.
A wooden doll is next to a teddy bear.
Gray and white dog sitting on top of the bed with a black cat.
A person in a wet suit in the water engaging in a water sport.
A guy wearing a black wet suit on a white board, surfing.
A man in a suit waits in a room with a tv.
A black and white picture showing small children in a dormitory setting.
A bed sitting in a room near two lamps and a couple of pictures on the wall.
Three boys hanging out in a living room with the T.V. on in the background
A table full of assorted snacks and plates.
A red double decker bus parked on the side of a road.
A hallway lined with doors and filled with suitcases.
a engine sits parked inside of a ware house
A yellow and blue fire hydrant in front of a building.
plated vegetables on white dish displayed on hard surface.
a van that is parked by some people with umbrellas
A cat lying on an open laptop that is on a bed.
An elephant standing next to a tree outside.
Trio of zebras stands idle on the savanna.
a dark picture of two men on skate boards
two cats, one orange and one gray, sit on  a shelf intended for shoes
A submarine sandwich cut in half on a white plate next to a cup of coffee.
A boy with a blue jacket is smiling on a ski-slope.
A young child with a spoon eating a slice of cake
An assorted group of standing and reclining cell phones.
Two pizzas on a wooden table with a person seated.
A person in their car views a ram in the street.
There is a horse race going on in a carriage cart
Signs displaying foot and seating area hanging inside restaurant
A traffic light with an orange and a red having faces drawn on them.
an image of a flamingo drinking something orange
A group of people holding umbrellas standing behind a sign for a umbrella drive.
A little girl is playing a game on the television.
A cat is in a bathroom standing on an open toilet.
A woman stands by her luggage and carries a large bag.
A woman standing on a surfboard riding a wave.
A healthy meal of fruits and vegetables on a table.
THERE ARE CARS AND A TRUCK THAT IS PARKED IN THE PARKING LOT
A train on the railroad track in an underground subway.
TWO CONTAINERS OF FOOD SITTING ON TOP OF CONCRETE STEP
Two men are sitting on a couch and their ties have been tied together.
A clothes line with clothes hanging from it and cattle in the background
A young girl is taking a nap next to her mother.
A stuffed animal with colorful decorations on it and clothes hanging on a wall.
A small bird perched on the handle of a bicycle.
A giraffe standing next to a tree covered in leaves.
A hand holding a piece of food at a table.
A guy on a snow board in the dark.
A display of historic pots and artifacts on display steps.
A group of women standing under a red and white umbrella.
A woman holds an electronic device in front of the camera.
A piece of cake sits on top of a plate.
A herd of cows make their way across a river.
A woman sitting at a table with a little girl and a man.
A man with a toothbrush in his mouth and uncombed hair takes a picture of himself at his computer desk.
A banana, red pepper, carrot, and green apple
A man standing on a baseball field while wearing a glove.
A clock tower on top of a building with a wind indicator.
A woman sitting on a bench with a dog sitting on the ground by her.
A bunch of people in a building doing different things
A group of people stands around and looks at a phone.
A British Airways airplane taking off into the sky.
A dog with a white hat at the field
A woman trying to take a frisbee from her dog.
A group of people at the beach flying kites
A shot of a field and road taken from outside of a vehicle window.
A train passing through a railway station.Railway platform is seen.
A bathroom has a sink on legs and round lights.
A cat sits in the foreground looking at the camera while a bright yellow motor cycle is in the background.
A kitchen with a large white counter top.
The cat naps on a shelf near the desk.
A person eating food from a white plate next to a glass of wine.
The toaster adorned with a face sits atop the tiled surface.
A cat is standing on a board game
Large and small elephants standing near a watering hole in the grass.
A small restroom that is painted the color blue.
There is a fruit slushie next to a very sloppy chili dog.
A airport runway filled with jetliners next to large tanks.
A kid in a white shirt stands on the grass while another boy stands on a pathway near a hovering white Frisbee.
A high shot of a counter with a microwave and other food.
A fruit market with shops of banana and apple.people buying banana.
Delicious looking pasta with a variety of noodles
Two young men sit on a couch in a sloppy room with a laptop, a phone, and a flat screen tv.
A plate of food with broccoli and beef.
A dog sleeping on a rug next to a stuffed animal.
Boats floating on a  lake near a dock.
Sea birds gather on a broken pier surrounded by algae.
Several bundles of fruit hanging from a plant.
A crowded city street with a row of bicycles
A black bear that is walking on a branch.
A small white bird walking across a lush green field.
A woman sits on the curb talking on her phone.
A high statue with a clock inside on a very nice day.
A man riding a skateboard down a wooden ramp.
An intersection of two streets in front of a home.
A man with a snowboard that is standing up.
The baseball player is running from home plate.
A train is going down the tracks in the dark.
A person riding on a skateboard down a ramp.
A large building with a clock tower on top of it
Three ladies and a man sitting in a room with drinks on the table.Two of them playing video games.
A bed with covers turned down and a messenger bag against a pillow.
An old building sits in the background behind an illuminated signal light.
The man is sitting on the post beside the water.
A bathroom with a large tub next to a toilet and sink.
A baseball player pitching a baseball on a field.
A happy couple taking a selfie while sharing a drink.
three people and one is petting an elephant
BLACK AND WHITE PHOTO OF A WOMAN, TWO CHILDREN,HORSE,COW AND A DOG
Several boys on a field playing with a frisbee.
A crock pot on top of a microwave on top of a refrigerator.
A table topped with two pizza and plates next to glasses.
A man carries a surf board as a dog walks beside him.
A young man doing a jump off a ramp at a skate park.
The man in the suit is cutting the cake.
People are flying their kites in the sky.
Several people are standing in a living room while one examines a remote.
A man riding skis down a snow covered slope.
The Central Railway Station tracks in an old photograph.
A bathroom with a sink and toilet and very small mirror.
A woman in a black helmet jumping a hurdle while riding a horse.
Two zeba standing on a dried grass plain looking off into the distance.
A man and a woman smiling at the camera inside a large building.
A wet floor sign is between a toilet and a urinal.
Two marble vases one containing white flowers, the other green grass.
A woman in black jacket sitting at a park bench in woods.
A banana sitting in a bowl that is on the table.
A dinking room table in the living room right next to the fire place.
A girl with long brown hair with streaks of red lays on a bed and looks at an open laptop computer.
Several elephant statues on display in a mall.
a red bus that is in line with other cars
A clock tower on a roundabout next to a building.
Smoothie ingredients are in a blender including blueberries, strawberries, and bananas.
A man on the beach is playing Frisbee.
a back to the future mcclaren and time machine toy
A bear is standing outdoors in the wilderness.
A toilet and bathtub are in a bathroom.
this is a pink box with food inside of it
A trolley bus is coming down the street near trees.
A woman takes a close up photo with her cat.
A counter with a bunch of bananas and oranges on it.
The boy in the green shirt and green hat is holding a baseball mitt.
Adult riding breaking wave in open ocean on sunny day.
four white and blue street signs on a wooden pole
A giraffe is stepping on a log in a grassy area.
A bathroom with a metal sink and an odd shaped toilet.
A piece of chocolate cake is on a plate with a fork.
a couple of men that have wine bottles in hand
a porcelain toilet that must be used by crouching over it rather than sitting on it
The person is bodyboarding as the waves crash around him.
A dog that is swimming in some water.
Multiple men climbing and hiking through the snowy mountains
A tray that has two forks, a bowl , and food on it.
Two birds perched up on a large tree branch.
A lone skier, dressed all in black, going down a hill.
People standing on the top of a green hill area with kites flying in the blue sky.
A man wearing a tie next to a woman.
A woman sits on a bed in a dark room.
An older boy and a young boy are playing a video game.
Two horses in a rope corral in a courtyard with one being groomed by a woman.
Brown and white cows lined up against a barbed wire fence.
A clock tower with a toy doll display below it.
People riding and pushing tricycle carriages down the street.
A train driving along tracks next to a city street.
A boat is docked alone on the side of a river.
Some cattle next to a brick building and a guy on motorcycle.
Woman sitting at a restaurant holding a wine glass.
A large bear in a tree biting into a branch.
A toilet in a stall with a sink attached to the toilet tank and a console attached to the lid.
Some baby bears are having fun on a sunny day.
A person in red jacket snowboarding down a snowy hill.
A farm with dozens of sheep in an enclosure.
A kitchen that has a hanging rack and a refrigerator.
very clean bathroom with white towels and some bathing soaps
A couple at a cafe each on their respective cell phones
a big giraffe and a small giraffe are in their pen
A large dog in a room with yellow walls.
A woman walks through a busy area holding a purple umbrella
a giraffe eating food from a food dispenser
A tennis player is running on a tennis court.
A subway car stops at a station, its doors open.
A snowboarder standing on a snowy mountain looking out.
Some people standing on a surf board on the beach.
A desert with some fruit on a plate.
An adult and child elephant are eating grass.
a black and white dog standing in front of a glass window
The multi-colored cat is standing on a luggage bag.
a bunch of stuff in a home living room
A group of cows grazing near a passing train
a big swimming pool that has some people in it
A train with closed doors near a platform.
Kids playing baseball while parents watch from benches.
two women with a basket sitting at the bottom of the stairs
A rack of bow ties hanging from clothes pins.
Crabs walk across the sand along the ocean.
Two old style planes flying side by side in the sky.
a truck has pulled off the road to look at an elephant
A group of nine jet planes flies in formation.
A woman carrying a surf board by the ocean.
a pink and white plate with some banana slices on bread and a drink
A man that is on a surfboard in the air.
A glass vase sitting on top of a table.
A close up of raw meat and meat cooking in a deep fryer.
A kitten is standing in a refrigerator shelf.
A skateboarder skating next to a concrete street divider.
a close up of an electric blender on a table
a person in a kitchen preparing food
Two Teddy bears sit next to each other.
A group of people set on the ground talking in a park.
A group of young people gather with surfboards on a tropical beach.
Three giraffes standing around inside of their enclosure.
A black and white street sign with a white building behind it.
a young person laying on a couch with a nintendo wii remote
A sandwich with peppers and ale are setting on a table.
Busses paused to a stop at a bus stop.
A red and blue dump truck traveling along a city street.
A sprinkled doughnut sitting on a white napkin next to the bag it came in.
a large elephant that is standing in grass
A brown and black cat licking a woman's face.
Two hitched horses standing next to each other with pink coverings on their heads.
A tennis player leans into her stroke on the court
A pizza slice on plate, beer in mugs and beer bottle on a kitchen table with place mats.
A donut has a bunch of nuts on top of it.
A train sitting on top of train tracks near a forest.
A couple of giraffe standing on top of a grass covered field.
A half unmade bed in a hotel room
a close up of a person cutting a pizza with scissors
A man in a suit and tie holding a water bottle and people with cameras standing around him.
A man trying to get his dog to herd goats.
A man in wetsuit riding a white surfboard on wave.
A woman stands ready with a tennis racket.
A person and skateboard in air over a ledge by a sidewalk of city road with cars.
A teenager rides a skateboard down the stair railing.
The two giraffes are standing together in the grasslands.
A large passenger jet flying over an airport.
A cat laying on top of a blue dresser near a chair.
a blurry photo of an empty city street
a woman is holding a teddy bear in a room
A man watches a flatscreen TV set above wrapped gifts.
a couple of computers are sitting on a desk
A bathroom mirror that is trimmed in gold and reflecting the room.
The back side of a vehicle packed with bags.
Small horse sitting beside a large brown horse.
a plate of pizza sits on a checkered table cover
The Asian kid is gleefully playing with the cellular telephone.
A woman petting the trunk of a elephant.
a man flying through the air on top of a skateboard.
There is a room with various items in the picture.
This is a baseball player trying to hit a ball
The urinal on the ground has a toilet scrubber next to it.
Four giraffes are behind a fence in the dirt.
A pink kite is flying in the sky at a beach.
An orange cat laying on a black laptop in living room.
A man pitching a baseball on top of a field.
A female runner eating a banana during her run.
A guy in a bandana leaning over a laptop.
A elephant standing in a field with lots of grass.
Three zebras are running in bright green grass.
Two guys are shaking hands while one grips the tennis racket.
A laptop on a wooden chair of some sort.
a street with two stop signs and people walking down the street
A picture of a hotel room having just been cleaned.
A brown dog touches noses with a sheep.
A little boy holding a baseball bat on a field.
A group of people that are in a market.
A zebra walking in a field of dead grass by some trees.
Yellow and black older snowmobile inside room with blue walls.
A man standing up in front of doors with a folder in his hands.
A smiling woman with scissors cutting a sign
A clean white toilet with the lid down in a bathroom.
A woman that has purple socks and a book.
There is a large gray elephant standing next to a tree.
A small white living room with sofa and lounge chair
a brown and white cat has its paws on a laptop
A group of people in a room playing video games
A picture of a bear that is in the grass.
Several people in a group are flying very colorful kites.
there are two statues of zebras at a exhibit
The bowl of greens is near a wooden bowl.
Bunches of bananas are placed on flat newspapers.
A giraffe looking up at a tree behind some large rocks.
Four laptops sitting on a cluttered desk with a phone and a pair of headphones.
A man in a black shirt opens an oven door while looking at the inside of the oven.
A group of airplanes are parked at a runway and a truck is parked next to a plane.
The couple is sharing a piece of cake while being photographed.
a man on a pole with an umbrella
A group of people skiing down a mountain in the snow.
A man carrying a basket filled with fruit and clippers.
A green double decker city bus by the curb.
Black and white photograph of people with umbrella next to cars in snow.
Orange tiger stuffed animal sitting on the bed of a pickup truck.
A plate that has sandwiches and chips on it.
Man sits on his parked motorcycle with body of water and bridge behind him.
A cat lays on top of a blanket and sleeps.
A person on a motor bike on a road.
A pizza with a few toppings is on a plate.
A brown basket filled with bananas and apples.
A blue and white fire hydrant on a street.
Fresh produce is arranged in a grocery store display.
a man is sitting at a table with some food
A wooden crate holding bananas under a roof area.
some people in orange are standing together outside
Two people are flying a kite on a hill.
a young person and an older person holding a kite
a fire hydrant is spraying water onto the street
A green bus parked in front of a tall building.
there is a male tennis player on the court in a game
A bunch of stuffed animals stacked on top of each other.
Beach goers enjoying sunny day on sandy beach at ocean.
Two firetrucks are ready to be deployed to a fire.
A city bus drives down a quiet road.
A large elephant walking in front of a vehicle.
Two giraffe's standing in the shade under a canopy.
Woman on cell phone in city at night.
A girl sits on a bench on grass outside a red door.
A variety of Apple Ipod products on display.
Two giraffe standing on top of a muddy puddle of water.
A panda bear that is holding a stick.
A view of a baseball field from behind home plate
A blue and yellow mass transit bus turning a corner.
A  plate of food on a table with a tall glass
A man standing and holding a tennis racket on court.
Small dog playing with toilet paper in bathroom.
Two men and two women make breakfast plates in a kitchen.
Two people in skis standing together a snowy hill.
People are riding horses through the grassy plain.
A building with three steeples and a clock in the center.
A parking meter on the sidewalk of a busy street.
a square shaped pizza with bacon, an egg and tomatoes on a white paper plate.
A full view of suitcases with some clothes on it.
underside of a plane flying through a cloudy sky
A large group of people on the street.
Two men sit in front of large baskets of fruits and vegetables.
THERE IS A WOMAN THAT IS STANDING ON THE STREET
Two triangular street signs on grass next to brick pathway.
there is a small dog that has fallen asleep with a book
A pizza on a tray with a fork and glasses on the table
A dog laying in the back of a moving truck.
A small bird standing on the ground next to body of water.
A couple of elephants standing on a lush green forest.
A close up of a duck walking on a path.
Toddler enjoying playing with a colorful kite in a grassy field
a statue standing next to a clock and some bells
Three plates with a different dessert on each.
The view of a bathroom showing a toilet with a small waste bin next to it.
A blowup seat in the back of a blow up raft
The plate has a picture of a kitty on it.
Horse drawn carriage with a pair of black horses in front.
The man is pitching the baseball on the field.
A planter that is standing on a stand.
A very large doughnut  sits atop a building as an advertisement.
A sleeping  dog laying on a stone walkway.
a man skating on the road very fast
An empty bus is parked in front of a building.
A black table has a white vase with flowers.
Male tennis player on the middle of the court.
a close up of doughnuts on a plate on a table
A skier performs a trick in the air off a ramp
A fire hydrant sits on the curb in the snow.
A small bathroom has an open skylight in the ceiling.
The small dog wearing a pink scarf stands in the yard near a bowl.
A living room complete with a couch, chair and television.
a couple of benches in front of a body of water.
A zebra standing in a sandy spot surrounded by green ground cover.
A boat travels in one direction of the ocean while a smaller pleasure craft travels in the opposite direction.
A laptop set up on a wooden table.
A chair lift over a long ski run.
A stop sign on a pole in a city
Cardboard boxes stacked up in a living room
A fireman is on top of the truck ladder
A big ocean wave with someone trying to stay on the surfboard.
A man hitting a tennis ball on the tennis court.
A man is in a kayak in a pool with a ball.
People sitting at tables working on laptop computers.
a pizza with pesto sauce sitting on some oven mitts
A group of people riding motorcycles is going down a road.
The large double decker bus is coming around a corner.
A brown horse grazing in a grassy area.
China Airlines plane in air with landing gear out.
a sandwich has a bite taken out of it
A suite case that has a large quantity of glasses in it.
The fluffy cat is sitting on top of a toilet in the bathroom.
A spacious bathroom with two sinks and a claw foot tub.
Two young people are riding a bike together next to the parked vehicles.
The three trains are stopped on the railroad tracks.
A kitchen with brown cabinets has an island.
I am unable to see the image above.
there is a man sitting outside at a table with a large pizza
A train engine with train cars behind it, riding on a  set of tracks with smoke blowing from the engine.
A vase filled with a large yellow and black sunflower and other flowers.
A group of people standing around a room together.
women standing next to a truck on display
A large jet sitting on top of an airport tarmac.
there are many blue and white umbrellas on this beach
A boy is sitting at a table eating pizza.
A cat sitting on top of a television.
A white cow surrounded by many dark cows inside a corral.
Computer screen with the keyboard and printer sitting next to it.
Partially open door leading to a kitchen from a hallway.
an Olympic event going on with many skiers
A box of doughnuts being held open by a hand
A FedEx truck waits at the bottom of a San Francisco hill.
A city bus driving down the street to georgetown
some baseball players are playing baseball on a field
Two women riding on the back of somebody else motorcycles
Residential pantry with food items stocked on shelves.
A very large building, that appears to be a truck.
A person feeding a giraffe while wearing a hat.
A young man riding a boogie board on top of a wave pool
Professional dirt biker with woman on backseat of bike.
Two zebras grazing on flowers in a pasture.
The powdered pastry has filling in the middle of it.
Boy with a football book and his dog outside.
A table topped with plates and bowls of food.
He is hitting the baseball with the bat.
a baseball player is swinging at the pitch
An intersection shows an expanse of empty road and then a car coming out from under a large arch that looks like a giant Chinese letter and stands between two buildings that stand at the forefront of an open walled walkway and retail venues.
A man in his car using his phone
A motorcycle police officer is pulled alongside fellow officers in car.
Two elephants cross a dirt road between two stands of trees.
The man is throwing the baseball during the game.
A skier in a green jacket going down a slope covered in snow.
A large passenger airplane flying against a partly cloudy sky.
A young boy eats a piece of pizza.
A flock of birds flying through the sky.
A giant panda sitting on logs lazily yawning.
a man is making pizza in his brick stove oven
A picture taken from between an individuals knees at the sky.
A part of hands with scissors trimming a plant.
A banana next to a sprig of vanilla and a shot glass.
A group of people in the snow, putting on snowboards.
A chair sitting in the middle of the room, in a black and white photo.
Two riders dressed as knights are on horseback.
A medium-sized brown-colored worm wiggles as a large yellow slimy slug looks on.
A half eaten pizza on a table with dishes.
A beautifully maintained bedroom with rustic charm features natural wood.
Two parking meters that are almost covered in snow.
Some pancakes with icecream and bananas and a coffee
People riding elephants who are wading through a river.
A small group of penguins approaching a pool of water with one already swimming
A group of people sitting around a table eating food.
A stop sign flashes with an exit sign below it.
a bunch of small children holding tennis rackets on a tennis court
A black dog is laying on a white pillow
a guy grinding his skateboard on a wooden post
A bear that is standing in front of a rock.
A man standing in front of a TV holding a Wii game controller.
A man swings his Wii controller back in a living room.
A large clock next to other smaller clocks set to different time zones.
a room showing a cooker and an oven
A man with a helmet holding wires attached to something in the sky.
An elephant with tusks is standing between two fences.
A very big bright colored truck and a van on a narrow road.
The ball player is preparing to pitch the ball.
The unicycle is on the curb in front of a parking meter.
A book in french laying on a bed.
A giraffe standing inside an enclosure with two deer.
A train with multiple cars passing by trees.
Spectators watching men on horses riding in an ANZAC Day parade in Australia
two police riding horses on a london street
A door that is opened with a chair inside.
An empty bathroom with 2 toilets next to each other.
Two black bears sit on the ground beside a structure made of wooden logs while another stands on top of it.
THERE IS A METER THAT IS ON THE STREET ON THE SIDE WALK
The young woman is jumping into the air as birds fly over the ocean behind her.
Three stop lights and one way signs are in the intersection.
A cat lays in the window on a sunny day.
A man on a skate board who is touching the ground.
a bamboo tray holding several bowls of asian food
A man riding on the back of a brown horse down a street.
A group of people with toy swords in a crowd.
Two women in the snow on skis in front of a large building.
A pigeon that is sitting on top of a head stone.
A herd of zebra and horses standing next to each other.
A man holding a baby in front of a plate with cake.
A person clothed head to toe in white paints a room.
A BIG GROUP OF PEOPLE FLYING KITES IN A FIELD
A bathroom with a tub and shower and a sink.
A baby bear standing among some tall grass.
A living room filled with furniture and a flat screen TV.
Bright red umbrella open on the sand of a beach.
a motorcycle parked on a side walk near a brick building
Close up of various trays of croissants and muffins.
A man is taking a selfie with a mountain range in the background.
a zebra in some brown grass and some green plants
A car and motorcycle riding on a pavement road.
A row of pizzas sit on tables underneath lamps.
Some people in an arena with other people watching from the stands.
A living room near an open window has furniture and an area rug on the floor.
A man and woman dressed in wedding attire walking out of a building together.
A bunch of bananas sit next to a cup of coffee.
A woman relaxes on her bed and uses her computer
A man with gray hair is holding a colorful kite.
A polar bear grazing in a vibrant green grass
A purple skateboard sitting at the back of a bus aisle.
A black horse and white horse graze for grass
a guy and a girl getting ready to stand up on their surf boards
A long red table with dishes on it seats many people in a room.
Small toy train engine set with a train station.
View of adult elephant seen through the trees
a couple of cars pass through a city street
A woman in the process of serving a tennis ball.
A woman with a shorn sheep on a grate.
A smiling young woman uses a computer in the kitchen.
a photo of city buildings near beautiful plants
A group of cows laying in a green pasture or grazing.
A giraffe looking ahead in front of a stone wall.
A man wearing a purple tie and work shirt
A radio sitting on a table next to a record player.
A stove top in a storage type of room with several spices on the stove.
Green onions sit on a cutting board along with carrot sticks.
A man setting at a table in a restaurant cutting his food.
A picture of a large cathedral with clock in the center.
Two plush bears are found as a gift along with a Starbucks cup
A row of floor height urinals in a public restroom.
Two skate boarders playing on the street with one individual sitting under a tree.
A child playing with his hand-held game system.
A birthday cake that is decorated with a dolphin and sea horse on it
A white bowl with a few pieces of broccoli.
A train is coming down the track near a hillside.
There is a figurine by the computer keyboard in the office.
A bedroom with a plain neatly made bed with no headboard
A man petting a giraffe whose face it over the fence
An elephant standing on rocks next to a wood bridge.
A large bathroom has a tiny window and a tub and toilet and sink and mirror.
A group of children playing with a ball.
A cooked pizza made with various separated toppings
A man is snowboarding and is mid air over the snow.
The person in the black and white photo is jumping up with a skateboard.
A big clock tower topped with a walk and an American flag, stands tall against a blue sky, far ahead of city skyline, and right above a lot of teal-roofed domiciles.
a blue bus parked at a street corner.
Couple of people out in the ocean on surfboards
The dogs are playing together out in the yard.
An adorable little girl holding her hand over her mouth.
A dog catching a Frisbee in a park, with people in the background.
A woman with short, brown hair is looking into a circular mirror and holding a camera up to her cheek.
A man and a small child fly a butterfly kite in a park.
A snowboarder jumping through the air and performing a trick.
A very beautiful kitchen with very modern updates
people bringing their vegetables to the market by boat
A bowl with steamed broccoli topped with nuts in it.
A wooden table topped with cooking tools next to a sink.
A man is jumping up to catch a Frisbee between his legs.
Two men playing a game with steering wheel controllers.
a giraffe eating some leaves off a tree
A fire hydrant that was busted and is shooting water out.
A woman walking past a table with a plate of food on top of it.
A wooden table topped with lots of camera equipment.
A stemmed bottle is holding a slender flower in a window sill with a view of rain.
A blender filled with food on top of a counter.
A person took a picture of his torso and legs while laying on the top of a bunk bed.
A boy with a helmet stands next to a clock.
A man that is holding a banana in his mouth.
A castle all lit up, reflecting off of the water.
A male maneuvering up a ramp while on his skateboard.
A red pick up truck with a plow blade drives down a snowy suburban road.
Several toilets are place outside on a lawn.
a bird eating out of a pizza box that is on the ground
An airplane flying under the clouds in daytime.
A hand with a gold ring is posed over a wireless keyboard, beside a wired mouse
A couple of people dancing in some sand with no shoes on.
The cat is observing its own visage in the circular make-up mirror.
A group of people standing around a baby elephant in a river.
a guy attempting a trick with his skateboard while others watch
A man and two boys herd 5 sheep into a truck.
A sub sandwich in a box next to two hot dogs.
Men in suits smiling and walking across a green soccer field.
Four tanned men and a girl at an event.
Two brown bears walking through a forest surrounded by trees.
A skiers lies on her back with the skis straight up.
A building at a railroad crossing billows smoke.
Construction loading truck driving in front of a building.
The nose of an airplane sits on the landing strip, boarding passengers.
A bowl filled with pasta, veggies and seasoning.
a small dog is walking next to the fruit stand.
A cup of Starbucks coffee is sitting on the side of a court.
A street scene with cars on the road and people on the sidewalk.
A person taking a photo in a mirror on a mass transit vehicle.
A woman sitting on a bench in a stone alcove.
A picture of a boat and some water.
A small plane is getting ready for a flight
Airport during a snowstorm with planes awaiting boarding.
A dog and cat sitting on a couch
A little girl makes a pizza with a smiley face.
A woman in sunglasses petting the trunk of an elephant.
A woman holds a mirror and tool up to a woman's mouth.
The skier is sitting down in the snow.
A street pole has an enormous number of signs on it.
The carrots in the dish are marinating in beer.
There is a woman holding a wine glass and a man wearing a necklace.
A person holding a Chocolate Lab dog while the dog holds an old teddy bear.
there is a male wake boarder holding on to a rope in the water
three young people holding wine glasses laughing
a cow walking on a city street near people
A young man holding a white frisbee next to poles.
A kitchen mostly empty with lots of cupboard and counter space.
The food is seasoned and ready to be cooked.
Two men in a kitchen are standing by a refrigerator.
Two baseball players are walking on the field.
A man riding skis down a snow covered slope.
Some lemons are in a vase and oranges and grapes are in a plate.
Two men are holding tennis balls and rackets.
A circular mirror reflecting a woman's stomach in turquoise shirt.
A birthday cake with gum drops and a bag of Cheetos cheese bacon snacks on a table.
A blue and silver railroad train placed on the tracks
A man welding the back of an oven.
a close up of a box of open pizza
The Helen J sitting in the ocean not moving.
A yellow fire hydrant on a street corner.
It looks like a human figure hanging in the tree limbs, partially concealed by foliage.
A person sitting on a wooden bench outside.
A guy in a helmet skate boards down the street.
a girl is on a phone standing near a sign
A group of young men standing on a basketball court.
A cutting board with green peppers already cut and some awaiting their cutting.
A man standing on a field holding a catchers mitt.
A woman standing in front of the Eiffel Tower surrounded by photo shopped animals.
A kitchen area with a double sink, a stove, a refrigerator and several other kitchen utensils.
Picture of reflection in a mirror of a kitchen
Two people holding surfboards on the shoreline of the beach
A few surfers ride a good wave in the ocean.
A building lined street with three lanes and light traffic.
A small bathroom with a toilet next to a cabinet.
A woman holding a white teddy bear next to a wood cabinet.
Family room with furniture, fireplace and wood flooring.
A tennis player is in air while extending his arm up to return the ball.
Two little dogs hiding in the pillows of a couch.
A bear dressed in a green outfit sitting outside.
A man teaching a boy how to play baseball
An office break room with table, microwave, sink and lockers.
The boy throws a baseball to another boy who is ready to hit it.
A home kitchen stripped down to be painted.
an assortment of fruit including oranges and bananas
The top of a church showing steeples and windows.
A service truck at an airport terminal with planes reflected in the windows.
A computer screen and keyboard on a desk.
A boat heading upriver to a harbor town.
A group of bicyclists are riding down a path
Baseball players are in action as a crowd watches.
A sheepdog prepares to guide a sheep into a corral.
Two people that are standing on ski's in the snow.
A person that is about to catch a frisbee.
There is a bowl of food and a sandwich on a plate
A fire hydrant sitting on the side of a road.
A crane is stacked high with lots of luggage.
A man with no fashion sense holding several frisbees.
Two cats are laying down together on what seems to be a table cloth.
Rows of handmade grass umbrellas lying on their sides.
a red double decked bus advertising a shop
The woman walks through sand with a black horse.
A herd of cows with yellow tags on their ears in a field of grass.
A plane is flying low during the evening.
Snow covered mountains can be seen past the boats on the water.
People hold six corn dogs with various mustard designs
a couple skiers skiing through cones down a slope
A man prepares to serve a tennis ball.
A cat sitting behind storage containers and a computer.
Two emergency vehicles on a driveway next to a garage.
A sidewalk is next to many different signs.
Some men with snowboards standing on a hill
image does not appear in this particular one
A giraffe standing on top of a dirt field.
Three people check on a number of bicycles in a showroom
Group of mixed fruits sitting inside a metal basket.
A blue and yellow train is parked on the tracks.
A brown and white dog laying next to luggage at an airport.
A grassy field with three zebra grazing from the ground.
A baseball player holds a bat while standing next to home plate.
a tennis player swings his tennis racket
A hotel lobby with a table and flowers in a vase.
Two planes are on a runway beside trucks.
a man in a black hat standing next to and holding the reins of a horse
A boy takes a selfie in a bathroom with Harry Potter decorations.
A very big cute giraffe by a pretty palm tree.
A group of people walking around a shopping center.
A smiling man is playing tennis on a brown court.
a small and dirty zebra inside of a corral
A young boy holding a baseball bat to his face.
A crowded store with several different displays of goods for sale.
A black cat staring into the distance in a room
A young girl on a bench with a kite.
A bike on a pole in front of a brick building.
A man is performing tricks on a bicycle.
A busy city intersection with public transit and pedestrians.
A dark room with a bed and black chair.
A man doing a trick on a skateboard on a ramp.
A man holding the string to a kite in a park.
A man holding a tennis racket with a ball in the air on the tennis court.
A person in the snow with two dogs on leashes.
A brown bear walking with rocks in the background.
A clock above a glove resting on a leopard print ledge.
A cat places its mouth on a computer keyboard.
A couple of people eating a slice of pizza.
A silver fire hydrant with a blue top at a road corner.
A man on a fake horse is in the parade.
The inside of a vehicle driving down a highway with a tv playing an image.
A large group of elephants are in the water.
A colorful plate of avocado, carrot, and cabbage.
A young man doing tricks on his skate board.
An airplane sitting on top of an airport tarmac.
A man sitting on the raised cement border around a tree and looking at his cellphone.
This is a small bathroom with a towel on the floor.
there is a man drinking wine from a glass
A red truck with patriotic bunting drags a parade float.
two black cats are drinking out of a toilet
A herd of goats standing on a public street.
Sandwiches on buns topped with black olives and tomato.
a birthday cake with candles on top of it
Looking up at a tall clock tower in a blue sky
A city bus is leaving the bus station.
A person walking out of the waves with a surfboard.
Two bowls of soup set on a restaurant table.


================================================
FILE: DiT-ToCa/cache_functions/__init__.py
================================================
from .cache_cutfresh import cache_cutfresh
from .fresh_ratio_scheduler import fresh_ratio_scheduler
from .score_evaluate import score_evaluate
from .global_force_fresh import global_force_fresh
from .cache_cutfresh import cache_cutfresh
from .update_cache import update_cache
from .force_init import force_init
from .attention import Attention
from .cache_init import cache_init
from .cal_type import cal_type

================================================
FILE: DiT-ToCa/cache_functions/attention.py
================================================
# Besides, re-arrange the attention module
from torch.jit import Final
from timm.layers import use_fused_attn
import torch
import torch.nn as nn
import torch.nn.functional as F
import os

class Attention(nn.Module):
    """Multi-head self-attention that cooperates with the token cache.

    Compared with a plain attention layer, ``forward`` here additionally:

    * returns the head-averaged attention map next to the output (``None``
      when the fused kernel is used),
    * stores the L2 norm of the value vectors in the cache when
      ``cache_dic['cache_type'] == 'kv-norm'``,
    * adds an analytic FLOPs estimate to ``cache_dic['flops']`` on each call.
    """
    fused_attn: Final[bool]

    def __init__(
            self,
            dim: int,
            num_heads: int = 8,
            qkv_bias: bool = False,
            qk_norm: bool = False,
            attn_drop: float = 0.,
            proj_drop: float = 0.,
            norm_layer: nn.Module = nn.LayerNorm,
    ) -> None:
        """Build the projection, normalisation and dropout sub-modules.

        dim: embedding width; must be a multiple of ``num_heads``.
        num_heads: number of attention heads.
        qkv_bias: whether the fused QKV projection has a bias term.
        qk_norm: if True, ``norm_layer(head_dim)`` is applied to q and k,
            otherwise both norms are ``nn.Identity``.
        attn_drop: dropout probability on the attention weights.
        proj_drop: dropout probability after the output projection.
        norm_layer: constructor used for the q/k norms when ``qk_norm`` is set.
        """
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        per_head = dim // num_heads
        self.num_heads = num_heads
        self.head_dim = per_head
        self.scale = per_head ** -0.5
        self.fused_attn = use_fused_attn()

        # Creation order is kept as qkv -> q_norm -> k_norm -> attn_drop ->
        # proj -> proj_drop so parameter registration stays the same.
        self.qkv = nn.Linear(dim, 3 * dim, bias=qkv_bias)
        if qk_norm:
            self.q_norm = norm_layer(per_head)
            self.k_norm = norm_layer(per_head)
        else:
            self.q_norm = nn.Identity()
            self.k_norm = nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor, cache_dic, current, fresh_indices=None) -> torch.Tensor:
        # 0.4ms extra cost on A800, mainly tensor operations (original author's note)
        """
        Run self-attention over ``x`` and update the cache bookkeeping.

        x: (B, N, C) input tokens.
        cache_dic: dict; this method reads 'cache_type', may write
            cache_dic['cache'][-1][layer]['v_norm'], and increments 'flops'.
        current: dict; only current['layer'] is read here.
        fresh_indices: (B, fresh_ratio*N), the index tensor for the fresh tokens.
            Accepted for interface compatibility; not used inside this method.

        Returns a tuple ``(x, attn_map)`` where x is (B, N, C) and attn_map is
        the softmax attention averaged over heads, (B, N, N), or None when the
        fused kernel computed the attention.
        """
        batch, n_tok, width = x.shape
        heads = self.num_heads
        head_dim = self.head_dim

        # (B, N, 3*C) -> (3, B, heads, N, head_dim), then split into q / k / v.
        packed = self.qkv(x).reshape(batch, n_tok, 3, heads, head_dim)
        q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)

        cache_type = cache_dic['cache_type']
        if cache_type == 'kv-norm':
            # Per-token L2 norm of the value vectors, shape (B, heads, N).
            layer_cache = cache_dic['cache'][-1][current['layer']]
            layer_cache['v_norm'] = torch.norm(v, dim=-1, p=2)

        q = self.q_norm(q)
        k = self.k_norm(k)

        # The explicit path is needed whenever the attention map must be
        # returned ('attention' cache type) or the fused kernel is unavailable.
        explicit = (not self.fused_attn) or cache_type == 'attention'
        if explicit:
            logits = (q * self.scale) @ k.transpose(-2, -1)
            probs = logits.softmax(dim=-1)  # extra cost for attn
            x = self.attn_drop(probs) @ v
            attn_map = probs.mean(dim=1)  # average over heads -> (B, N, N)
        else:
            drop_p = self.attn_drop.p if self.training else 0.
            x = F.scaled_dot_product_attention(q, k, v, dropout_p=drop_p)
            attn_map = None

        # Merge heads back: (B, heads, N, head_dim) -> (B, N, C).
        x = x.transpose(1, 2).reshape(batch, n_tok, width)
        x = self.proj_drop(self.proj(x))

        # Analytic FLOPs estimate; counted identically for both paths.
        pair_matmul = batch * heads * n_tok * n_tok * head_dim * 2
        flops_terms = (
            batch * n_tok * width * 3 * width * 2,  # QKV projection
            batch * heads * n_tok * head_dim,       # Scale q
            pair_matmul,                            # Q @ K
            batch * heads * n_tok * n_tok * 5,      # Softmax
            pair_matmul,                            # Attn @ V
            batch * n_tok * width * width * 2,      # Projection
        )
        cache_dic['flops'] += sum(flops_terms)

        return x, attn_map  # x: (B, N, C), attn_map: (B, N, N) or None


================================================
FILE: DiT-ToCa/cache_functions/cache_cutfresh.py
================================================
from .fresh_ratio_scheduler import fresh_ratio_scheduler
from .score_evaluate import score_evaluate
from .token_merge import token_merge
import torch
def cache_cutfresh(cache_dic, tokens, current):
    '''
    Pick the tokens to recompute ("fresh" tokens) and update the per-token staleness counter.

    cache_dic: dict, the cache dictionary; the counter at
               cache_dic['cache_index'][-1][layer][module] is modified in place.
    tokens: torch.Tensor, the tokens to select from; gathered along dim 1.
    current: dict, provides 'layer' and 'module' (and whatever the schedulers read).

    Returns (fresh_indices, fresh_tokens): the selected token indices, highest score
    first, and the corresponding rows of `tokens`.
    Raises ValueError for a module other than 'mlp' / 'attn'; note the counter has
    already been updated by then.
    '''
    layer, module = current['layer'], current['module']

    # Scheduled share of tokens to recompute, limited to [0, 1].
    ratio = torch.clamp(torch.tensor(fresh_ratio_scheduler(cache_dic, current)), 0.0, 1.0)

    # Score every token, then apply the Uniform Spatial Distribution bonus
    # (s4 in the paper) with bonus ratio 0.6 on a 2x2 grid.
    ranking_score = local_selection_with_bonus(score_evaluate(cache_dic, tokens, current), 0.6, 2)

    # Keep the best-scoring `ratio` share of the tokens: (B, ratio * N).
    num_fresh = int(ratio * ranking_score.shape[1])
    fresh_indices = ranking_score.argsort(dim=-1, descending=True)[:, :num_fresh]

    # Cache Frequency Score (s3 in the paper): every token ages by one,
    # then the tokens about to be recomputed are reset to zero.
    counters = cache_dic['cache_index'][-1][layer]
    counters[module] += 1
    counters[module].scatter_(
        dim=1,
        index=fresh_indices,
        src=torch.zeros_like(fresh_indices, dtype=torch.int, device=fresh_indices.device),
    )

    if module not in ('mlp', 'attn'):
        raise ValueError("Unrecognized module?", module)

    # Broadcast the indices over the channel axis and cut the fresh tokens out.
    gather_index = fresh_indices.unsqueeze(-1).expand(-1, -1, tokens.shape[-1])
    return fresh_indices, torch.gather(input=tokens, dim=1, index=gather_index)
    
def local_selection_with_bonus(score, bonus_ratio, grid_size=2):
    '''
    Uniform Spatial Distribution (s4 in the paper).

    The (B, N) score is treated as a square token map split into
    grid_size x grid_size cells. In each cell the highest-scoring token gets
    an extra `bonus_ratio` times its own score; all other tokens are unchanged.

    Returns a new (B, N) tensor in the original token order.
    '''
    batch, n_tokens = score.shape
    side = int(n_tokens ** 0.5)
    cell = grid_size * grid_size

    assert n_tokens % cell == 0, "The number of tokens must be divisible by the block size."

    cells_per_side = side // grid_size

    # (B, N) -> (B, num_cells, cell): split rows and columns into cells, then
    # bring the tokens of each cell next to each other.
    per_cell = (
        score.view(batch, cells_per_side, grid_size, cells_per_side, grid_size)
        .permute(0, 1, 3, 2, 4)
        .contiguous()
        .view(batch, -1, cell)
    )

    # Winner of every cell: value and position, both (B, num_cells, 1).
    best_val, best_pos = per_cell.max(dim=-1, keepdim=True)

    # One-hot mask marking the winner inside each cell.
    winner_mask = torch.zeros_like(per_cell)
    winner_mask.scatter_(-1, best_pos, 1)

    # Only the winners receive the bonus.
    per_cell = per_cell + (winner_mask * best_val * bonus_ratio)

    # Undo the cell grouping to recover the original token order.
    return (
        per_cell.view(batch, cells_per_side, cells_per_side, grid_size, grid_size)
        .permute(0, 1, 3, 2, 4)
        .contiguous()
        .view(batch, n_tokens)
    )

================================================
FILE: DiT-ToCa/cache_functions/cache_init.py
================================================
def cache_init(model_kwargs, num_steps, num_layers=28):
    '''
    Initialization for cache.

    model_kwargs: dict, must provide 'cache_type', 'ratio_scheduler', 'fresh_ratio',
                  'fresh_threshold', 'force_fresh', 'soft_fresh_weight' and 'test_FLOPs'.
    num_steps: int, number of sampling steps; an empty per-layer slot is created for
               every step index in range(num_steps) in addition to the shared slot -1.
    num_layers: int, number of per-layer slots to pre-create. Defaults to 28, the
                value that used to be hard-coded.

    Returns (cache_dic, current):
        cache_dic: dict with the empty cache containers, the copied configuration,
                   and the 'flops' / 'counter' accumulators set to 0.0.
        current: dict holding only 'num_steps'.
    Raises KeyError if model_kwargs lacks one of the required keys.
    '''
    def per_layer():
        # A fresh {layer: {}} mapping on every call so that no slot is shared.
        return {layer: {} for layer in range(num_layers)}

    # Slot -1 is the shared one; the integer keys are the per-step slots.
    cache = {-1: per_layer()}
    cache_index = {-1: per_layer(), 'layer_index': {}}
    for step in range(num_steps):
        cache[step] = per_layer()
    cache[-1]['noise_steps'] = {}

    cache_dic = {
        'attn_map': {-1: per_layer()},
        'cache_type': model_kwargs['cache_type'],
        'cache_index': cache_index,
        'cache': cache,
        'fresh_ratio_schedule': model_kwargs['ratio_scheduler'],
        'fresh_ratio': model_kwargs['fresh_ratio'],
        'fresh_threshold': model_kwargs['fresh_threshold'],
        'force_fresh': model_kwargs['force_fresh'],
        'soft_fresh_weight': model_kwargs['soft_fresh_weight'],
        'flops': 0.0,
        'test_FLOPs': model_kwargs['test_FLOPs'],
        'counter': 0.0,
    }

    current = {'num_steps': num_steps}
    return cache_dic, current
    

================================================
FILE: DiT-ToCa/cache_functions/cal_type.py
================================================
def cal_type(cache_dic, current):
    '''
    Determine calculation type for this step and store it in current['type'].

    cache_dic: dict, 'cal_threshold' is read on every step except the first one.
    current: dict with 'step' and 'num_steps'; 'type' is written.

    The step with index num_steps - 1 is treated as the first step (presumably
    sampling counts step indices downwards - confirm against the sampler) and is
    always computed in full, without consulting any threshold. After that, every
    step whose index is a multiple of cache_dic['cal_threshold'] is 'full' and
    all remaining steps are 'ToCa'.
    '''
    step = current['step']

    # First step: nothing is cached yet, so a full computation is mandatory.
    if step == current['num_steps'] - 1:
        current['type'] = 'full'
        return

    # 'cal_threshold' may be a Python int or a 0-dim tensor; both support % and ==.
    if step % cache_dic['cal_threshold'] == 0:
        current['type'] = 'full'
    else:
        # Odd and even offsets inside a cycle used to be separate branches (a hook
        # for mixing 'ToCa' and 'FORA' steps); both already resolved to 'ToCa'.
        current['type'] = 'ToCa'


================================================
FILE: DiT-ToCa/cache_functions/force_init.py
================================================
import torch
from .force_scheduler import force_scheduler
def force_init(cache_dic, current, tokens):
    '''
    Prepare the staleness counters for a Force Activation step.

    The counter of the current layer / module is replaced by zeros shaped like
    the first two dimensions of `tokens`. At layer 0 the module's 'layer_index'
    counter is reset as well and the force-activation schedule is refreshed.
    '''
    layer, module = current['layer'], current['module']
    counter_shape = (tokens.shape[0], tokens.shape[1])

    def zero_counter():
        # A new tensor on every call: the two counters must not share storage.
        return torch.zeros(*counter_shape, dtype=torch.int, device=tokens.device)

    # reset the cache index to 0
    cache_dic['cache_index'][-1][layer][module] = zero_counter()

    if layer == 0:
        cache_dic['cache_index']['layer_index'][module] = zero_counter()
        # Recompute cache_dic['cal_threshold'] once per forced step, at the first layer.
        force_scheduler(cache_dic, current)

================================================
FILE: DiT-ToCa/cache_functions/force_scheduler.py
================================================
import torch
def force_scheduler(cache_dic, current):
    '''
    Force Activation Cycle Scheduler.

    Writes the length of the current force-activation cycle to
    cache_dic['cal_threshold']: cache_dic['fresh_threshold'] divided by a factor
    that varies linearly with the step index, rounded (a tensor), or the
    constant 2 inside the cache-sensitive window described below.
    '''
    step, total_steps = current['step'], current['num_steps']

    # fresh_ratio == 0 means FORA (no step dependence); anything else is ToCa.
    token_wise = cache_dic['fresh_ratio'] != 0
    linear_step_weight = 0.4 if token_wise else 0.0

    step_factor = torch.tensor(1 + linear_step_weight - 2 * linear_step_weight * step / total_steps)
    threshold = torch.round(cache_dic['fresh_threshold'] / step_factor)

    # We find that in these 20% steps, the model is extremely sensitive for cache,
    # i.e. worse temporal redundancy, so the cycle is shortened to 2 (ToCa only).
    sensitive_window = range(int(total_steps * 0.2), int(total_steps * 0.4))
    if step in sensitive_window and token_wise:
        threshold = 2

    cache_dic['cal_threshold'] = threshold


================================================
FILE: DiT-ToCa/cache_functions/fresh_ratio_scheduler.py
================================================
import torch
def fresh_ratio_scheduler(cache_dic, current):
    '''
    Return the fresh ratio for the current step.

    cache_dic: dict, provides 'fresh_ratio' (base ratio), 'fresh_ratio_schedule'
               (schedule name) and 'fresh_threshold'.
    current: dict, provides 'step' and 'num_steps'; the layer-aware schedules also
             read 'layer' and 'module'.

    Raises ValueError for an unknown schedule name.
    '''
    base_ratio = cache_dic['fresh_ratio']
    schedule = cache_dic['fresh_ratio_schedule']
    step = current['step']
    num_steps = current['num_steps']
    threshold = cache_dic['fresh_threshold']
    weight = 0.9

    def step_layer_module(step_weight, layer_weight):
        # Shared form of the combined schedules: base ratio scaled by a step-wise,
        # a layer-wise and a module-wise factor.
        step_factor = 1 + step_weight - 2 * step_weight * step / num_steps

        # Layer index is normalised by 27, i.e. the last index of a 28-layer model.
        layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27

        # Module-wise scheduling: compute more in the mlp module, less in attn.
        # With these constants the attn factor is 1 - 0.4 * 2.5, i.e. no attn
        # tokens are recomputed and the computation is shifted to the mlp module.
        module_weight = 2.5
        # Estimated from the time and flops of the mlp and attn modules; may
        # change in different situations.
        module_time_weight = 0.6
        if current['module'] == 'attn':
            module_factor = 1 - (1 - module_time_weight) * module_weight
        else:
            module_factor = 1 + module_time_weight * module_weight

        return base_ratio * layer_factor * step_factor * module_factor

    if schedule == 'constant':
        return base_ratio

    if schedule == 'linear':
        return base_ratio * (1 + weight - 2 * weight * step / num_steps)

    if schedule == 'exp':
        return base_ratio * (weight ** (step / num_steps))

    if schedule == 'linear-mode':
        # Linear schedule plus a small term following the position inside a cycle.
        mode = (step % threshold) / threshold - 0.5
        mode_weight = 0.1
        return base_ratio * (1 + weight - 2 * weight * step / num_steps + mode_weight * mode)

    if schedule == 'layerwise':
        return base_ratio * (1 + weight - 2 * weight * current['layer'] / 27)

    if schedule == 'linear-layerwise':
        return step_layer_module(step_weight=0.4, layer_weight=0.8)

    ###### Recommended Configurations ######

    if schedule == 'ToCa-ddim50':
        # Proposed scheduling method in ToCa. The step weight makes little
        # difference; the layer weight matters.
        return step_layer_module(step_weight=2.0, layer_weight=-0.2)

    if schedule == 'ToCa-ddpm250':
        # Proposed scheduling method in ToCa, configuration for DDPM-250.
        return step_layer_module(step_weight=0.4, layer_weight=0.8)

    raise ValueError("unrecognized fresh ratio schedule", schedule)


================================================
FILE: DiT-ToCa/cache_functions/global_force_fresh.py
================================================
from .force_scheduler import force_scheduler
def global_force_fresh(cache_dic, current):
    '''
    Tell whether every token must be recomputed at the current step.

    The step with index num_steps - 1 (the first one to run) always qualifies.
    With the 'global' strategy, so does every step whose index is a multiple of
    the current cycle length cache_dic['cal_threshold'].

    Raises ValueError for an unknown cache_dic['force_fresh'] strategy.
    '''
    step = current['step']
    is_first_step = step == (current['num_steps'] - 1)
    strategy = cache_dic['force_fresh']

    # 'cal_threshold' is only looked up after the first step; on the first step
    # 'fresh_threshold' is read instead.
    cycle = cache_dic['fresh_threshold'] if is_first_step else cache_dic['cal_threshold']

    if strategy == 'global':
        # global force fresh means force activate all tokens in this step.
        return is_first_step or (step % cycle == 0)

    if strategy in ('local', 'none'):
        # 'local': refreshing locally gives much worse results, because the cache
        # and the computed tokens become misaligned. Only the first step is forced.
        return is_first_step

    raise ValueError("unrecognized force fresh strategy", strategy)

================================================
FILE: DiT-ToCa/cache_functions/score_evaluate.py
================================================
import torch
import torch.nn as nn
from .scores import attn_score, similarity_score, norm_score, kv_norm_score
def score_evaluate(cache_dic, tokens, current) -> torch.Tensor:
    '''
    Return the score tensor (B, N) for the given tokens. Mainly include s1, (s2,) s3 mentioned in the paper.

    cache_dic: dict, 'cache_type' selects the scoring method; with
               force_fresh == 'global' the staleness counter in 'cache_index',
               'fresh_threshold' and 'soft_fresh_weight' are used as well.
    tokens: torch.Tensor, only its first two dimensions and its device are used
            here (the 'similarity' and 'norm' helpers receive the tensor itself).
    current: dict, provides 'layer' and 'module'.

    Raises ValueError for an unknown cache type.
    '''
    cache_type = cache_dic['cache_type']

    if cache_type == 'random':
        # select tokens randomly, but keep the same selection for the two halves
        # of the batch (cfg and no cfg).
        half = torch.rand(int(tokens.shape[0] * 0.5), tokens.shape[1], device=tokens.device)
        score = torch.cat([half, half], dim=0).to(tokens.device)

    elif cache_type == 'straight':
        # Constant score: no token is preferred over another.
        score = torch.ones(tokens.shape[0], tokens.shape[1]).to(tokens.device)

    elif cache_type == 'attention':
        # Recommended selection method in the paper.
        # cache_dic['attn_map'][-1][layer] is (B, N, N) with the last dimension softmaxed.
        # DiT has no cross-attention, so only the self-attention score s1 is applied.
        score = attn_score(cache_dic, current)

    elif cache_type == 'kv-norm':
        score = kv_norm_score(cache_dic, current)

    elif cache_type == 'similarity':
        # Natural choice, but it costs too much time in DiT-like models.
        score = similarity_score(cache_dic, current, tokens)

    elif cache_type == 'norm':
        # An exploration that is not used in the final version: the token norm
        # turned out not to indicate token importance.
        score = norm_score(cache_dic, current, tokens)

    elif cache_type == 'compress':
        # Untested combination: half random score, half max-normalised attention score.
        score1 = torch.rand(int(tokens.shape[0] * 0.5), tokens.shape[1])
        score1 = torch.cat([score1, score1], dim=0).to(tokens.device)
        score2 = cache_dic['attn_map'][-1][current['layer']].sum(dim=1)  # (B, N)
        score2 = score2 / score2.max(dim=1, keepdim=True)[0]
        score = 0.5 * score1 + 0.5 * score2

    else:
        # Previously an unknown type fell through and failed later with an
        # UnboundLocalError on `score`.
        raise ValueError("unrecognized cache type", cache_type)

    # NOTE: a 'local' force-fresh strategy (forcing tokens whose counter exceeded
    # twice the threshold) was explored and abandoned; only 'global' changes the score.
    if cache_dic['force_fresh'] == 'global':
        # s3 in the paper: tokens that have been cached for longer get a higher score.
        staleness = cache_dic['cache_index'][-1][current['layer']][current['module']].float() / (cache_dic['fresh_threshold'])

        # A layer-wise variant of s3 was tried as well and found unnecessary.
        score = score + cache_dic['soft_fresh_weight'] * staleness

    return score.to(tokens.device)

================================================
FILE: DiT-ToCa/cache_functions/scores.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

def attn_score(cache_dic, current):
    '''
    Attention Score s1 (s2 would need cross-attention, which DiT does not have).

    Sums the stored attention map of the current layer over dim 1 and
    L2-normalises the result along dim 1.
    '''
    layer_attn_map = cache_dic['attn_map'][-1][current['layer']]
    summed = layer_attn_map.sum(dim=1)
    return F.normalize(summed, dim=1, p=2)

def similarity_score(cache_dic, current, tokens):
    '''
    Score tokens by how much they differ from their cached counterpart:
    one minus the cosine similarity over the last dimension, L2-normalised
    along the last remaining dimension.
    '''
    cached = cache_dic['cache'][-1][current['layer']][current['module']]
    dissimilarity = 1 - F.cosine_similarity(tokens, cached, dim=-1)
    return F.normalize(dissimilarity, dim=-1, p=2)

def norm_score(cache_dic, current, tokens):
    '''
    Score tokens by their L2 norm over the last dimension, L2-normalised along
    the last remaining dimension. `cache_dic` and `current` are not used.
    '''
    token_norms = tokens.norm(dim=-1, p=2)
    return F.normalize(token_norms, dim=-1, p=2)

def kv_norm_score(cache_dic, current):
    '''
    Score from the stored norm of V: (1 - v_norm) summed over dim -2, then
    L2-normalised along dim 1 (the default of F.normalize).

    v_norm is presumably (B, num_heads, N), making the result (B, N) - confirm
    against the attention module that stores it.
    '''
    # A 'k_norm' entry was considered as well but is not used.
    stored_v_norm = cache_dic['cache'][-1][current['layer']]['v_norm']
    inverted = 1 - stored_v_norm
    return F.normalize(inverted.sum(dim=-2), p=2)

================================================
FILE: DiT-ToCa/cache_functions/token_merge.py
================================================
import torch
def token_merge(cache_dic, tokens, current, fresh_indices, stale_indices):
    '''
    An abandoned branch exploring whether token merging helps. The answer is no,
    at least not for a training-free strategy.

    Every stale token is matched with its most similar fresh token. The stale
    tokens whose best cosine similarity exceeds 0.995 are recorded (trimmed to the
    smallest such count across the batch) in cache_dic['merged_stale_sequence'],
    and their matched fresh tokens in cache_dic['merged_stale_fresh_indices'].
    Nothing is returned.
    '''
    if not (current['layer'] % 1 == 0):
        return

    channels = tokens.shape[-1]

    def pick(index):
        # Gather the rows of `tokens` addressed by a (B, K) index tensor.
        return torch.gather(input=tokens, dim=1, index=index.unsqueeze(-1).expand(-1, -1, channels))

    fresh_tokens = pick(fresh_indices)
    stale_tokens = pick(stale_indices)

    method = 'similarity'
    if method == 'distance':
        # Smaller L1 distance means a better match.
        descending = False
        pairwise = torch.cdist(stale_tokens, fresh_tokens, p=1)
        match_quality, match_position = torch.min(pairwise, dim=2)
    elif method == 'similarity':
        # Larger cosine similarity means a better match.
        descending = True
        fresh_tokens = torch.nn.functional.normalize(fresh_tokens, p=2, dim=-1)
        stale_tokens = torch.nn.functional.normalize(stale_tokens, p=2, dim=-1)
        pairwise = stale_tokens @ fresh_tokens.transpose(1, 2)
        match_quality, match_position = torch.max(pairwise, dim=2)

    # Number of merges kept per sample: the smallest per-sample count above 0.995.
    num_merged = int((match_quality > 0.995).sum(dim=1).min())

    # Best-matching stale tokens first, as positions inside stale_indices.
    best_first = torch.sort(match_quality, dim=1, descending=descending)[1][:, :num_merged]

    # Translate the positions back to token indices on both sides.
    matched_fresh_position = match_position.gather(1, best_first)
    cache_dic['merged_stale_fresh_indices'] = fresh_indices.gather(1, matched_fresh_position)
    cache_dic['merged_stale_sequence'] = stale_indices.gather(1, best_first)


================================================
FILE: DiT-ToCa/cache_functions/update_cache.py
================================================
import torch
def update_cache(fresh_indices, fresh_tokens, cache_dic, current, fresh_attn_map=None):
    '''
    Update the cache with the fresh tokens.

    fresh_indices: index tensor (B, K) of the recomputed tokens.
    fresh_tokens: tensor (B, K, C) with the recomputed values.
    cache_dic: dict; cache_dic['cache'][-1][layer][module] is overwritten in place
               at the fresh positions (and cache_dic['attn_map'][-1][layer] too
               when module == 'attn').
    current: dict, provides 'layer' and 'module'.
    fresh_attn_map: attention rows of the fresh tokens; required for 'attn'.

    Raises ValueError for an unknown module, or when module == 'attn' and no
    fresh_attn_map is given; in both cases the cache is left untouched.
    '''
    layer = current['layer']
    module = current['module']

    if module == 'attn':
        # this branch is not used in the final version, but it works if you
        # explore the partial fresh strategy of attention.
        if fresh_attn_map is None:
            raise ValueError("fresh_attn_map is required when module is 'attn'")
        # Indices are sorted ascending before scattering, so the rows of
        # fresh_tokens / fresh_attn_map are presumably in ascending token
        # order here - confirm against the caller.
        indices = fresh_indices.sort(dim=1, descending=False)[0]

        cache_dic['attn_map'][-1][layer].scatter_(
            dim=1,
            index=indices.unsqueeze(-1).expand(-1, -1, fresh_attn_map.shape[-1]),
            src=fresh_attn_map,
        )
    elif module == 'mlp':
        indices = fresh_indices
    else:
        # Previously an unknown module failed later with an UnboundLocalError.
        raise ValueError("Unrecognized module?", module)

    # Update the cached tokens at the fresh positions.
    cache_dic['cache'][-1][layer][module].scatter_(
        dim=1,
        index=indices.unsqueeze(-1).expand(-1, -1, fresh_tokens.shape[-1]),
        src=fresh_tokens,
    )
            
    
================================================
FILE: DiT-ToCa/diffusion/__init__.py
================================================
# Modified from OpenAI's diffusion repos
#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py

from . import gaussian_diffusion as gd
from .respace import SpacedDiffusion, space_timesteps


def create_diffusion(
    timestep_respacing,
    noise_schedule="linear", 
    use_kl=False,
    sigma_small=False,
    predict_xstart=False,
    learn_sigma=True,
    rescale_learned_sigmas=False,
    diffusion_steps=1000
):
    """
    Build a SpacedDiffusion from a handful of high-level switches.

    :param timestep_respacing: respacing spec handed to space_timesteps(); None or
                               "" keeps all `diffusion_steps` steps.
    :param noise_schedule: name of the beta schedule.
    :param use_kl: use the RESCALED_KL loss (takes precedence over
                   rescale_learned_sigmas).
    :param sigma_small: with a fixed variance, pick FIXED_SMALL instead of
                        FIXED_LARGE.
    :param predict_xstart: the model predicts x_0 instead of epsilon.
    :param learn_sigma: use the LEARNED_RANGE variance type.
    :param rescale_learned_sigmas: use the RESCALED_MSE loss.
    :param diffusion_steps: number of steps of the underlying schedule.
    """
    betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)

    # Loss selection: KL wins over rescaled MSE, plain MSE is the fallback.
    if use_kl:
        loss_type = gd.LossType.RESCALED_KL
    elif rescale_learned_sigmas:
        loss_type = gd.LossType.RESCALED_MSE
    else:
        loss_type = gd.LossType.MSE

    if timestep_respacing is None or timestep_respacing == "":
        timestep_respacing = [diffusion_steps]

    # What the network output is interpreted as.
    if predict_xstart:
        mean_type = gd.ModelMeanType.START_X
    else:
        mean_type = gd.ModelMeanType.EPSILON

    # Learned variance overrides the fixed small / large choice.
    if learn_sigma:
        var_type = gd.ModelVarType.LEARNED_RANGE
    elif sigma_small:
        var_type = gd.ModelVarType.FIXED_SMALL
    else:
        var_type = gd.ModelVarType.FIXED_LARGE

    return SpacedDiffusion(
        use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
        betas=betas,
        model_mean_type=mean_type,
        model_var_type=var_type,
        loss_type=loss_type,
    )


================================================
FILE: DiT-ToCa/diffusion/diffusion_utils.py
================================================
# Modified from OpenAI's diffusion repos
#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py

import torch as th
import numpy as np


def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    KL divergence KL(N(mean1, exp(logvar1)) || N(mean2, exp(logvar2))).

    Arguments broadcast against each other, so tensors can be mixed with plain
    scalars as long as at least one argument is a Tensor.
    """
    # The first Tensor argument fixes the dtype / device for scalar log-variances.
    reference = next(
        (arg for arg in (mean1, logvar1, mean2, logvar2) if isinstance(arg, th.Tensor)),
        None,
    )
    assert reference is not None, "at least one argument must be a Tensor"

    def as_tensor(value):
        # th.exp() needs Tensor input; broadcasting alone does not convert scalars.
        return value if isinstance(value, th.Tensor) else th.tensor(value).to(reference)

    logvar1 = as_tensor(logvar1)
    logvar2 = as_tensor(logvar2)

    squared_gap = (mean1 - mean2) ** 2
    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + th.exp(logvar1 - logvar2)
        + squared_gap * th.exp(-logvar2)
    )


def approx_standard_normal_cdf(x):
    """
    Fast tanh-based approximation of the standard normal cumulative
    distribution function, evaluated element-wise on the tensor x.
    """
    coeff = np.sqrt(2.0 / np.pi)
    cubic = x + 0.044715 * th.pow(x, 3)
    return 0.5 * (1.0 + th.tanh(coeff * cubic))


def continuous_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Standard-normal log-density of x after standardising it with the given
    Gaussian parameters.
    :param x: the targets
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    # Standardise: subtract the mean, divide by the standard deviation.
    standardized = (x - means) * th.exp(-log_scales)
    unit_normal = th.distributions.Normal(th.zeros_like(x), th.ones_like(x))
    return unit_normal.log_prob(standardized)


def discretized_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Log-likelihood of a Gaussian discretised onto image bins.
    :param x: the target images. It is assumed that this was uint8 values,
              rescaled to the range [-1, 1].
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    assert x.shape == means.shape == log_scales.shape

    # Half the width of one bin once 256 levels are mapped onto [-1, 1].
    half_bin = 1.0 / 255.0
    residual = x - means
    inv_std = th.exp(-log_scales)

    # Approximate CDF at the upper and lower edge of the bin around x.
    upper_cdf = approx_standard_normal_cdf(inv_std * (residual + half_bin))
    lower_cdf = approx_standard_normal_cdf(inv_std * (residual - half_bin))

    # Clamp before the log so that empty bins do not produce -inf.
    log_upper = th.log(upper_cdf.clamp(min=1e-12))
    log_above_lower = th.log((1.0 - lower_cdf).clamp(min=1e-12))
    log_bin_mass = th.log((upper_cdf - lower_cdf).clamp(min=1e-12))

    # Edge bins absorb the tails: below -0.999 everything up to the upper edge,
    # above 0.999 everything beyond the lower edge.
    log_probs = th.where(
        x < -0.999,
        log_upper,
        th.where(x > 0.999, log_above_lower, log_bin_mass),
    )
    assert log_probs.shape == x.shape
    return log_probs


================================================
FILE: DiT-ToCa/diffusion/gaussian_diffusion.py
================================================
# Modified from OpenAI's diffusion repos
#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py


import math

import numpy as np
import torch as th
import enum

from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl

from cache_functions import cache_init

def mean_flat(tensor):
    """
    Average over every dimension except the first (batch) one.
    """
    non_batch_dims = list(range(1, len(tensor.shape)))
    return tensor.mean(dim=non_batch_dims)


class ModelMeanType(enum.Enum):
    """
    The quantity the model output is interpreted as.
    """

    # the model predicts x_{t-1}
    PREVIOUS_X = enum.auto()
    # the model predicts x_0
    START_X = enum.auto()
    # the model predicts epsilon
    EPSILON = enum.auto()


class ModelVarType(enum.Enum):
    """
    Source of the variance used for the model output.

    LEARNED_RANGE lets the model predict a value between FIXED_SMALL and
    FIXED_LARGE, which makes its job easier.
    """

    LEARNED = enum.auto()
    FIXED_SMALL = enum.auto()
    FIXED_LARGE = enum.auto()
    LEARNED_RANGE = enum.auto()


class LossType(enum.Enum):
    """
    Training objectives supported by the diffusion utilities.
    """

    # use raw MSE loss (and KL when learning variances)
    MSE = enum.auto()
    # use raw MSE loss (with RESCALED_KL when learning variances)
    RESCALED_MSE = enum.auto()
    # use the variational lower-bound
    KL = enum.auto()
    # like KL, but rescale to estimate the full VLB
    RESCALED_KL = enum.auto()

    def is_vb(self):
        """Return True for the two variational-bound (KL) objectives."""
        return self in (LossType.KL, LossType.RESCALED_KL)


def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
    betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
    warmup_time = int(num_diffusion_timesteps * warmup_frac)
    betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
    return betas


def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
    """
    Deprecated API for creating beta schedules; get_named_beta_schedule() offers
    the newer library of schedules.

    Supported names: "quad", "linear", "warmup10", "warmup50", "const", "jsd".
    Any other name raises NotImplementedError.
    """
    steps = num_diffusion_timesteps

    if beta_schedule == "quad":
        # Linear in sqrt(beta), then squared.
        roots = np.linspace(beta_start ** 0.5, beta_end ** 0.5, steps, dtype=np.float64)
        betas = roots ** 2
    elif beta_schedule == "linear":
        betas = np.linspace(beta_start, beta_end, steps, dtype=np.float64)
    elif beta_schedule == "warmup10":
        betas = _warmup_beta(beta_start, beta_end, steps, 0.1)
    elif beta_schedule == "warmup50":
        betas = _warmup_beta(beta_start, beta_end, steps, 0.5)
    elif beta_schedule == "const":
        betas = beta_end * np.ones(steps, dtype=np.float64)
    elif beta_schedule == "jsd":
        # 1/T, 1/(T-1), 1/(T-2), ..., 1
        denominators = np.linspace(steps, 1, steps, dtype=np.float64)
        betas = 1.0 / denominators
    else:
        raise NotImplementedError(beta_schedule)

    assert betas.shape == (steps,)
    return betas


def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Look up a pre-defined beta schedule by name.

    The schedules in this library remain similar in the limit of
    num_diffusion_timesteps. New schedules may be added, but committed ones should
    not be removed or changed, to maintain backwards compatibility.

    Supported names: "linear" and "squaredcos_cap_v2"; any other name raises
    NotImplementedError.
    """
    if schedule_name == "squaredcos_cap_v2":

        def cosine_alpha_bar(t):
            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

        return betas_for_alpha_bar(num_diffusion_timesteps, cosine_alpha_bar)

    if schedule_name != "linear":
        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")

    # Linear schedule from Ho et al, extended to work for any number of
    # diffusion steps: the endpoints are scaled by 1000 / num_diffusion_timesteps.
    scale = 1000 / num_diffusion_timesteps
    return get_beta_schedule(
        "linear",
        beta_start=scale * 0.0001,
        beta_end=scale * 0.02,
        num_diffusion_timesteps=num_diffusion_timesteps,
    )


def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Discretize a continuous alpha_t_bar function into a beta schedule.
    alpha_bar describes the cumulative product of (1-beta) over t in [0,1];
    each beta is recovered from the ratio of alpha_bar at the two ends of one
    of num_diffusion_timesteps equal-width intervals.
    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a callable taking t from 0 to 1 and returning the
                      cumulative product of (1-beta) up to that part of the
                      diffusion process.
    :param max_beta: upper cap applied to every beta; use values lower than 1
                     to prevent singularities.
    :return: a 1-D numpy array with num_diffusion_timesteps entries.
    """
    steps = num_diffusion_timesteps
    return np.array(
        [
            min(1 - alpha_bar((idx + 1) / steps) / alpha_bar(idx / steps), max_beta)
            for idx in range(steps)
        ]
    )


class GaussianDiffusion:
    """
    Utilities for training and sampling diffusion models.
    Original ported from this codebase:
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
    :param betas: a 1-D numpy array of betas for each diffusion timestep,
                  starting at T and going to 1.
    """

    def __init__(
        self,
        *,
        betas,
        model_mean_type,
        model_var_type,
        loss_type
    ):

        self.model_mean_type = model_mean_type
        self.model_var_type = model_var_type
        self.loss_type = loss_type


        # Use float64 for accuracy.
        betas = np.array(betas, dtype=np.float64)
        self.betas = betas
        assert len(betas.shape) == 1, "betas must be 1-D"
        assert (betas > 0).all() and (betas <= 1).all()

        self.num_timesteps = int(betas.shape[0])

        alphas = 1.0 - betas
        self.alphas_cumprod = np.cumprod(alphas, axis=0)
        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        self.posterior_variance = (
            betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
        self.posterior_log_variance_clipped = np.log(
            np.append(self.posterior_variance[1], self.posterior_variance[1:])
        ) if len(self.posterior_variance) > 1 else np.array([])

        self.posterior_mean_coef1 = (
            betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        self.posterior_mean_coef2 = (
            (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
        )

    def q_mean_variance(self, x_start, t):
        """
        Describe the forward distribution q(x_t | x_0).
        :param x_start: the [N x C x ...] tensor of noiseless inputs.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
        """
        target_shape = x_start.shape
        mean_scale = _extract_into_tensor(self.sqrt_alphas_cumprod, t, target_shape)
        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, target_shape)
        log_variance = _extract_into_tensor(
            self.log_one_minus_alphas_cumprod, t, target_shape
        )
        return mean_scale * x_start, variance, log_variance

    def q_sample(self, x_start, t, noise=None):
        """
        Diffuse the data for a given number of diffusion steps.
        In other words, sample from q(x_t | x_0).
        :param x_start: the initial data batch.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :param noise: if specified, the split-out normal noise.
        :return: A noisy version of x_start.
        """
        if noise is None:
            noise = th.randn_like(x_start)
        assert noise.shape == x_start.shape
        return (
            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
            + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
        )

    def q_posterior_mean_variance(self, x_start, x_t, t):
        """
        Compute the mean and variance of the diffusion posterior:
            q(x_{t-1} | x_t, x_0)
        """
        assert x_start.shape == x_t.shape
        posterior_mean = (
            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
        )
        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
        posterior_log_variance_clipped = _extract_into_tensor(
            self.posterior_log_variance_clipped, t, x_t.shape
        )
        assert (
            posterior_mean.shape[0]
            == posterior_variance.shape[0]
            == posterior_log_variance_clipped.shape[0]
            == x_start.shape[0]
        )
        return posterior_mean, posterior_variance, posterior_log_variance_clipped

    def p_mean_variance(self, model, x, t, current=None, cache_dic=None, clip_denoised=True, denoised_fn=None, model_kwargs=None):
        #def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None): 
        """
        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
        the initial x, x_0.
        :param model: the model, which takes a signal and a batch of timesteps
                      as input.
        :param x: the [N x C x ...] tensor at time t.
        :param t: a 1-D Tensor of timesteps.
        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample. Applies before
            clip_denoised.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict with the following keys:
                 - 'mean': the model mean output.
                 - 'variance': the model variance output.
                 - 'log_variance': the log of 'variance'.
                 - 'pred_xstart': the prediction for x_0.
        """
        if model_kwargs is None:
            model_kwargs = {}

        B, C = x.shape[:2]
        assert t.shape == (B,)

        model_output = model(x, t, current=current, cache_dic=cache_dic, **model_kwargs)
        if isinstance(model_output, tuple):
            model_output, extra = model_output
        else:
            extra = None

        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
            assert model_output.shape == (B, C * 2, *x.shape[2:])
            model_output, model_var_values = th.split(model_output, C, dim=1)
            min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
            max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
            # The model_var_values is [-1, 1] for [min_var, max_var].
            frac = (model_var_values + 1) / 2
            model_log_variance = frac * max_log + (1 - frac) * min_log
            model_variance = th.exp(model_log_variance)
        else:
            model_variance, model_log_variance = {
                # for fixedlarge, we set the initial (log-)variance like so
                # to get a better decoder log likelihood.
                ModelVarType.FIXED_LARGE: (
                    np.append(self.posterior_variance[1], self.betas[1:]),
                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
                ),
                ModelVarType.FIXED_SMALL: (
                    self.posterior_variance,
                    self.posterior_log_variance_clipped,
                ),
            }[self.model_var_type]
            model_variance = _extract_into_tensor(model_variance, t, x.shape)
            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)

        def process_xstart(x):
            if denoised_fn is not None:
                x = denoised_fn(x)
            if clip_denoised:
                return x.clamp(-1, 1)
            return x

        if self.model_mean_type == ModelMeanType.START_X:
            pred_xstart = process_xstart(model_output)
        else:
            pred_xstart = process_xstart(
                self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
            )
        model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)

        assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
        return {
            "mean": model_mean,
            "variance": model_variance,
            "log_variance": model_log_variance,
            "pred_xstart": pred_xstart,
            "extra": extra,
        }

    def _predict_xstart_from_eps(self, x_t, t, eps):
        assert x_t.shape == eps.shape
        return (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
            - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
        )

    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
        """
        Recover the noise estimate implied by x_t and an x_0 prediction.
        This is the inverse of _predict_xstart_from_eps().
        :param x_t: the tensor at timestep t.
        :param t: a tensor of timestep indices.
        :param pred_xstart: the x_0 prediction.
        :return: the noise estimate.
        """
        numerator = (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
            - pred_xstart
        )
        denominator = _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
        return numerator / denominator

    def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute the mean for the previous step, given a function cond_fn that
        computes the gradient of a conditional log probability with respect to
        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
        condition on y.
        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
        """
        gradient = cond_fn(x, t, **model_kwargs)
        new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
        return new_mean

    def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute what the p_mean_variance output would have been, should the
        model's score function be conditioned by cond_fn.
        See condition_mean() for details on cond_fn.
        Unlike condition_mean(), this instead uses the conditioning strategy
        from Song et al (2020).
        :param cond_fn: callable invoked as cond_fn(x, t, **model_kwargs).
        :param p_mean_var: a dict providing at least 'pred_xstart'; it is
            shallow-copied, not modified.
        :param x: the tensor at time t.
        :param t: a tensor of timesteps.
        :param model_kwargs: if not None, a dict of extra keyword arguments
            forwarded to cond_fn.
        :return: a copy of p_mean_var with 'pred_xstart' and 'mean' replaced
            by their conditioned values.
        """
        # The default is None, and unpacking None with ** raises TypeError,
        # so normalise to an empty dict first.
        if model_kwargs is None:
            model_kwargs = {}

        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)

        # Shift the implied epsilon by the scaled conditioning gradient.
        eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
        eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)

        out = p_mean_var.copy()
        out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
        out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
        return out

    def p_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        current=None,
        cache_dic=None,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
    ):
        """
        Sample x_{t-1} from the model at the given timestep.
        :param model: the model to sample from.
        :param x: the current tensor at x_{t-1}.
        :param t: the value of t, starting at 0 for the first diffusion step.
        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict containing the following keys:
                 - 'sample': a random sample from the model.
                 - 'pred_xstart': a prediction of x_0.
        """
        out = self.p_mean_variance(
            model,
            x,
            t,
            current=current,
            cache_dic=cache_dic,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        noise = th.randn_like(x)
        nonzero_mask = (
            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
        )  # no noise when t == 0
        if cond_fn is not None:
            out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
        sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
        return {"sample": sample, "pred_xstart": out["pred_xstart"]}

    def p_sample_loop(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
    ):
        """
        Generate samples from the model.
        :param model: the model module.
        :param shape: the shape of the samples, (N, C, H, W).
        :param noise: if specified, the noise from the encoder to sample.
                      Should be of the same shape as `shape`.
        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param device: if specified, the device to create the samples on.
                       If not specified, use a model parameter's device.
        :param progress: if True, show a tqdm progress bar.
        :return: a non-differentiable batch of samples.
        """
        final = None
        for sample in self.p_sample_loop_progressive(
            model,
            shape,
            noise=noise,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            cond_fn=cond_fn,
            model_kwargs=model_kwargs,
            device=device,
            progress=progress,
        ):
            final = sample
        return final["sample"]

    def p_sample_loop_progressive(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
    ):
        """
        Generate samples from the model and yield intermediate samples from
        each timestep of diffusion.
        Arguments are the same as p_sample_loop().
        Returns a generator over dicts, where each dict is the return value of
        p_sample().
        """
        if device is None:
            device = next(model.parameters()).device
        assert isinstance(shape, (tuple, list))
        if noise is not None:
            img = noise
        else:
            img = th.randn(*shape, device=device)
        indices = list(range(self.num_timesteps))[::-1]

        if progress:
            # Lazy import so that we don't depend on tqdm.
            from tqdm.auto import tqdm

            indices = tqdm(indices)

        # Initialization for ToCa     
        cache_dic, current = cache_init(model_kwargs=model_kwargs, num_steps=self.num_timesteps)

        for i in indices:
            t = th.tensor([i] * shape[0], device=device)
            with th.no_grad():
                current['step'] = i
                out = self.p_sample(
                    model,
                    img,
                    t,
                    current=current,
                    cache_dic=cache_dic,
                    clip_denoised=clip_denoised,
                    denoised_fn=denoised_fn,
                    cond_fn=cond_fn,
                    model_kwargs=model_kwargs,
                )
                yield out
                img = out["sample"]
        
        if cache_dic['test_FLOPs'] == True:
            print(cache_dic['flops'] * 1e-12, "TFLOPs")

    def ddim_sample(
        self,
        model,
        x,
        t,
        current = None,
        cache_dic = None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        eta=0.0,
    ):
        """
        Sample x_{t-1} from the model using DDIM.
        Same usage as p_sample().
        """
        out = self.p_mean_variance(
            model,
            x,
            t,
            current=current,
            cache_dic=cache_dic,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        if cond_fn is not None:
            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)

        # Usually our model outputs epsilon, but we re-derive it
        # in case we used x_start or x_prev prediction.
        eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])

        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
        sigma = (
            eta
            * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
            * th.sqrt(1 - alpha_bar / alpha_bar_prev)
        )
        # Equation 12.
        noise = th.randn_like(x)
        mean_pred = (
            out["pred_xstart"] * th.sqrt(alpha_bar_prev)
            + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
        )
        nonzero_mask = (
            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
        )  # no noise when t == 0
        sample = mean_pred + nonzero_mask * sigma * noise
        return {"sample": sample, "pred_xstart": out["pred_xstart"]}

    def ddim_reverse_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        eta=0.0,
    ):
        """
        Sample x_{t+1} from the model using DDIM reverse ODE.
        """
        assert eta == 0.0, "Reverse ODE only for deterministic path"
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        if cond_fn is not None:
            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
        # Usually our model outputs epsilon, but we re-derive it
        # in case we used x_start or x_prev prediction.
        eps = (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
            - out["pred_xstart"]
        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
        alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)

        # Equation 12. reversed
        mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps

        return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}

    def ddim_sample_loop(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        eta=0.0,
    ):
        """
        Generate samples from the model using DDIM.
        Same usage as p_sample_loop().
        """
        final = None
        for sample in self.ddim_sample_loop_progressive(
            model,
            shape,
            noise=noise,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            cond_fn=cond_fn,
            model_kwargs=model_kwargs,
            device=device,
            progress=progress,
            eta=eta,
        ):
            final = sample
        return final["sample"]

    def ddim_sample_loop_progressive(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        eta=0.0,
    ):
        """
        Use DDIM to sample from the model and yield intermediate samples from
        each timestep of DDIM.
        Same usage as p_sample_loop_progressive().
        """
        if device is None:
            device = next(model.parameters()).device
        assert isinstance(shape, (tuple, list))
        if noise is not None:
            img = noise
        else:
            img = th.randn(*shape, device=device)
        indices = list(range(self.num_timesteps))[::-1]

        if progress:
            # Lazy import so that we don't depend on tqdm.
            from tqdm.auto import tqdm

            indices = tqdm(indices)

        # Initialization for ToCa     
        cache_dic, current = cache_init(model_kwargs=model_kwargs, num_steps=self.num_timesteps)

        for i in indices:
            t = th.tensor([i] * shape[0], device=device)
            with th.no_grad():
                current['step'] = i
                out = self.ddim_sample(
                    model,
                    img,
                    t,
                    current=current,
                    cache_dic=cache_dic,
                    clip_denoised=clip_denoised,
                    denoised_fn=denoised_fn,
                    cond_fn=cond_fn,
                    model_kwargs=model_kwargs,
                    eta=eta,
                )
                yield out
                img = out["sample"]
        if cache_dic['test_FLOPs'] == True:
            print(cache_dic['flops'] * 1e-12, "TFLOPs")

    def _vb_terms_bpd(
            self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
    ):
        """
        Get a term for the variational lower-bound.
        The resulting units are bits (rather than nats, as one might expect).
        This allows for comparison to other papers.
        :return: a dict with the following keys:
                 - 'output': a shape [N] tensor of NLLs or KLs.
                 - 'pred_xstart': the x_0 predictions.
        """
        true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
            x_start=x_start, x_t=x_t, t=t
        )
        out = self.p_mean_variance(
            model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
        )
        kl = normal_kl(
            true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
        )
        kl = mean_flat(kl) / np.log(2.0)

        decoder_nll = -discretized_gaussian_log_likelihood(
            x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
        )
        assert decoder_nll.shape == x_start.shape
        decoder_nll = mean_flat(decoder_nll) / np.log(2.0)

        # At the first timestep return the decoder NLL,
        # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
        output = th.where((t == 0), decoder_nll, kl)
        return {"output": output, "pred_xstart": out["pred_xstart"]}

    def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
        """
        Compute training losses for a single timestep.
        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param t: a batch of timestep indices.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param noise: if specified, the specific Gaussian noise to try to remove.
        :return: a dict with the key "loss" containing a tensor of shape [N].
                 Some mean or variance settings may also have other keys
                 ("mse" and, with a learned variance, "vb").
        :raises NotImplementedError: if self.loss_type is not a KL or MSE
                 variant.
        """
        if model_kwargs is None:
            model_kwargs = {}
        if noise is None:
            noise = th.randn_like(x_start)
        x_t = self.q_sample(x_start, t, noise=noise)

        terms = {}

        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
            terms["loss"] = self._vb_terms_bpd(
                model=model,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
                model_kwargs=model_kwargs,
            )["output"]
            if self.loss_type == LossType.RESCALED_KL:
                terms["loss"] *= self.num_timesteps
        elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
            model_output = model(x_t, t, **model_kwargs)

            if self.model_var_type in [
                ModelVarType.LEARNED,
                ModelVarType.LEARNED_RANGE,
            ]:
                B, C = x_t.shape[:2]
                assert model_output.shape == (B, C * 2, *x_t.shape[2:])
                model_output, model_var_values = th.split(model_output, C, dim=1)
                # Learn the variance using the variational bound, but don't let
                # it affect our mean prediction.
                frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
                terms["vb"] = self._vb_terms_bpd(
                    # p_mean_variance() invokes the model with `current=` and
                    # `cache_dic=` keyword arguments, so this stand-in must
                    # accept (and ignore) keywords; without **kwargs the call
                    # raises TypeError.
                    model=lambda *args, r=frozen_out, **kwargs: r,
                    x_start=x_start,
                    x_t=x_t,
                    t=t,
                    clip_denoised=False,
                )["output"]
                if self.loss_type == LossType.RESCALED_MSE:
                    # Divide by 1000 for equivalence with initial implementation.
                    # Without a factor of 1/1000, the VB term hurts the MSE term.
                    terms["vb"] *= self.num_timesteps / 1000.0

            # Regression target selected by what the model's mean output
            # represents.
            target = {
                ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
                    x_start=x_start, x_t=x_t, t=t
                )[0],
                ModelMeanType.START_X: x_start,
                ModelMeanType.EPSILON: noise,
            }[self.model_mean_type]
            assert model_output.shape == target.shape == x_start.shape
            terms["mse"] = mean_flat((target - model_output) ** 2)
            if "vb" in terms:
                terms["loss"] = terms["mse"] + terms["vb"]
            else:
                terms["loss"] = terms["mse"]
        else:
            raise NotImplementedError(self.loss_type)

        return terms

    def _prior_bpd(self, x_start):
        """
        Evaluate the prior KL term of the variational lower-bound, measured
        in bits-per-dim.
        This term can't be optimized, as it only depends on the encoder.
        :param x_start: the [N x C x ...] tensor of inputs.
        :return: a batch of [N] KL values (in bits), one per batch element.
        """
        # Evaluate q(x_t | x_0) at the last timestep index for every sample.
        last_step = self.num_timesteps - 1
        t = th.tensor([last_step] * x_start.shape[0], device=x_start.device)
        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
        # KL against a standard normal (zero mean, zero log-variance).
        kl_prior = normal_kl(
            mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
        )
        # Dividing by ln(2) converts nats to bits.
        return mean_flat(kl_prior) / np.log(2.0)

    def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
        """
        Evaluate the entire variational lower-bound, measured in bits-per-dim,
        as well as other related quantities.
        Timesteps are visited from the last index down to 0, and the columns
        of the [N x T] outputs follow that visiting order.
        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param clip_denoised: if True, clip denoised samples.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict containing the following keys:
                 - total_bpd: the total variational lower-bound, per batch element.
                 - prior_bpd: the prior term in the lower-bound.
                 - vb: an [N x T] tensor of terms in the lower-bound.
                 - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
                 - mse: an [N x T] tensor of epsilon MSEs for each timestep.
        """
        device = x_start.device
        batch_size = x_start.shape[0]

        vb_terms, xstart_errors, eps_errors = [], [], []
        for step in reversed(range(self.num_timesteps)):
            t_batch = th.tensor([step] * batch_size, device=device)
            noise = th.randn_like(x_start)
            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
            # Calculate VLB term at the current timestep
            with th.no_grad():
                out = self._vb_terms_bpd(
                    model,
                    x_start=x_start,
                    x_t=x_t,
                    t=t_batch,
                    clip_denoised=clip_denoised,
                    model_kwargs=model_kwargs,
                )
            pred_xstart = out["pred_xstart"]
            vb_terms.append(out["output"])
            xstart_errors.append(mean_flat((pred_xstart - x_start) ** 2))
            eps = self._predict_eps_from_xstart(x_t, t_batch, pred_xstart)
            eps_errors.append(mean_flat((eps - noise) ** 2))

        vb = th.stack(vb_terms, dim=1)
        xstart_mse = th.stack(xstart_errors, dim=1)
        mse = th.stack(eps_errors, dim=1)

        prior_bpd = self._prior_bpd(x_start)
        return {
            "total_bpd": vb.sum(dim=1) + prior_bpd,
            "prior_bpd": prior_bpd,
            "vb": vb,
            "xstart_mse": xstart_mse,
            "mse": mse,
        }


def _extract_into_tensor(arr, timesteps, broadcast_shape):
    """
    Extract values from a 1-D numpy array for a batch of indices.
    :param arr: the 1-D numpy array.
    :param timesteps: a tensor of indices into the array to extract.
    :param broadcast_shape: a larger shape of K dimensions with the batch
                            dimension equal to the length of timesteps.
    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
    """
    res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
    while len(res.shape) < len(broadcast_shape):
        res = res[..., None]
    return res + th.zeros(broadcast_shape, device=timesteps.device)


================================================
FILE: DiT-ToCa/diffusion/respace.py
================================================
# Modified from OpenAI's diffusion repos
#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py

import numpy as np
import torch as th

from .gaussian_diffusion import GaussianDiffusion


def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.
    For example, if there's 300 timesteps and the section counts are [10,15,20]
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.
    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.
    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    :raises ValueError: if no integer stride yields exactly N steps for
                        "ddimN", or if a section holds fewer original steps
                        than were requested from it.
    """
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim") :])
            # Search for an integer stride that yields exactly the requested
            # number of steps.
            for stride in range(1, num_timesteps):
                strided = range(0, num_timesteps, stride)
                if len(strided) == desired_count:
                    return set(strided)
            # Report the step count that was asked for, not the length of the
            # original process.
            raise ValueError(
                f"cannot create exactly {desired_count} steps with an integer stride"
            )
        section_counts = [int(x) for x in section_counts.split(",")]
    size_per = num_timesteps // len(section_counts)
    # The first `extra` sections absorb the remainder, one step each.
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(
                f"cannot divide section of {size} steps into {section_count}"
            )
        if section_count <= 1:
            frac_stride = 1
        else:
            # Fractional stride so the first and last step of the section are
            # both selected.
            frac_stride = (size - 1) / (section_count - 1)
        cur_idx = 0.0
        taken_steps = []
        for _ in range(section_count):
            taken_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        all_steps += taken_steps
        start_idx += size
    return set(all_steps)


class SpacedDiffusion(GaussianDiffusion):
    """
    A diffusion process that runs on a retained subset of the timesteps of a
    base diffusion process.
    :param use_timesteps: a collection (sequence or set) of timesteps from the
                          original diffusion process to retain.
    :param kwargs: the kwargs to create the base diffusion process.
    """

    def __init__(self, use_timesteps, **kwargs):
        self.use_timesteps = set(use_timesteps)
        self.timestep_map = []
        self.original_num_steps = len(kwargs["betas"])

        # Build the full process once to read its cumulative alphas, then
        # derive betas for the shortened schedule from the retained steps.
        full_process = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
        respaced_betas = []
        prev_cumprod = 1.0
        for step, cumprod in enumerate(full_process.alphas_cumprod):
            if step not in self.use_timesteps:
                continue
            respaced_betas.append(1 - cumprod / prev_cumprod)
            prev_cumprod = cumprod
            # Position in timestep_map is the new index; value is the old one.
            self.timestep_map.append(step)
        kwargs["betas"] = np.array(respaced_betas)
        super().__init__(**kwargs)

    def p_mean_variance(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        wrapped = self._wrap_model(model)
        return super().p_mean_variance(wrapped, *args, **kwargs)

    def training_losses(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        wrapped = self._wrap_model(model)
        return super().training_losses(wrapped, *args, **kwargs)

    def condition_mean(self, cond_fn, *args, **kwargs):
        wrapped = self._wrap_model(cond_fn)
        return super().condition_mean(wrapped, *args, **kwargs)

    def condition_score(self, cond_fn, *args, **kwargs):
        wrapped = self._wrap_model(cond_fn)
        return super().condition_score(wrapped, *args, **kwargs)

    def _wrap_model(self, model):
        # Wrap at most once so timesteps are not remapped twice.
        if not isinstance(model, _WrappedModel):
            model = _WrappedModel(
                model, self.timestep_map, self.original_num_steps
            )
        return model

    def _scale_timesteps(self, t):
        # Timestep conversion is left to the wrapped model.
        return t


class _WrappedModel:
    """
    Callable wrapper that translates timestep indices through timestep_map
    before forwarding the call to the wrapped model.
    """

    def __init__(self, model, timestep_map, original_num_steps):
        self.model, self.timestep_map = model, timestep_map
        self.original_num_steps = original_num_steps

    def __call__(self, x, ts, **kwargs):
        # Build the lookup on the same device/dtype as the incoming indices.
        lookup = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
        # No further rescaling of the mapped timesteps is applied here.
        return self.model(x, lookup[ts], **kwargs)


================================================
FILE: DiT-ToCa/diffusion/timestep_sampler.py
================================================
# Modified from OpenAI's diffusion repos
#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py

from abc import ABC, abstractmethod

import numpy as np
import torch as th
import torch.distributed as dist


def create_named_schedule_sampler(name, diffusion):
    """
    Build one of the pre-defined ScheduleSampler implementations by name.
    :param name: the sampler name, "uniform" or "loss-second-moment".
    :param diffusion: the diffusion object to sample for.
    :return: the sampler instance for the given diffusion object.
    :raises NotImplementedError: if the name is not recognised.
    """
    if name == "loss-second-moment":
        return LossSecondMomentResampler(diffusion)
    if name == "uniform":
        return UniformSampler(diffusion)
    raise NotImplementedError(f"unknown schedule sampler: {name}")


class ScheduleSampler(ABC):
    """
    A distribution over timesteps in the diffusion process, intended to reduce
    variance of the objective.
    The default sample() performs unbiased importance sampling: each drawn
    timestep comes with a weight that leaves the objective's mean unchanged.
    Subclasses may override sample() to reweight the resampled terms
    differently, which changes the objective itself.
    """

    @abstractmethod
    def weights(self):
        """
        Return a numpy array with one weight per diffusion step.
        The weights must be positive but do not have to sum to one.
        """

    def sample(self, batch_size, device):
        """
        Draw a batch of timesteps by importance sampling.
        :param batch_size: the number of timesteps to draw.
        :param device: the torch device the results are moved to.
        :return: a tuple (timesteps, weights):
                 - timesteps: a long tensor of timestep indices.
                 - weights: a float tensor of factors to scale the losses by.
        """
        raw = self.weights()
        probs = raw / np.sum(raw)
        n_steps = len(probs)
        drawn = np.random.choice(n_steps, size=(batch_size,), p=probs)
        # Inverse-probability weights, normalised by the number of steps.
        importance = 1 / (n_steps * probs[drawn])
        timesteps = th.from_numpy(drawn).long().to(device)
        scales = th.from_numpy(importance).float().to(device)
        return timesteps, scales


class UniformSampler(ScheduleSampler):
    """
    A sampler that assigns the same weight to every diffusion step.
    """

    def __init__(self, diffusion):
        step_count = diffusion.num_timesteps
        self.diffusion = diffusion
        # One constant weight per step, built once at construction time.
        self._weights = np.ones([step_count])

    def weights(self):
        """Return the constant per-step weight array."""
        return self._weights


class LossAwareSampler(ScheduleSampler):
    """
    Base class for samplers whose weights are updated from observed losses.
    """

    def update_with_local_losses(self, local_ts, local_losses):
        """
        Update the reweighting using losses from a model.
        Every rank calls this with its own batch of timesteps and the matching
        losses; the batches are gathered across ranks so that each rank ends
        up applying the same update.
        :param local_ts: an integer Tensor of timesteps.
        :param local_losses: a 1D Tensor of losses.
        """
        device = local_ts.device
        world_size = dist.get_world_size()

        # First exchange how many entries each rank contributes.
        size_slots = [
            th.tensor([0], dtype=th.int32, device=device)
            for _ in range(world_size)
        ]
        local_size = th.tensor([len(local_ts)], dtype=th.int32, device=device)
        dist.all_gather(size_slots, local_size)

        # Receive buffers are sized to the largest batch among the ranks.
        per_rank_sizes = [slot.item() for slot in size_slots]
        widest = max(per_rank_sizes)

        ts_slots = [th.zeros(widest).to(local_ts) for _ in per_rank_sizes]
        loss_slots = [th.zeros(widest).to(local_losses) for _ in per_rank_sizes]
        dist.all_gather(ts_slots, local_ts)
        dist.all_gather(loss_slots, local_losses)

        # Keep only the first `count` entries of each rank's buffer.
        gathered_ts = []
        for slot, count in zip(ts_slots, per_rank_sizes):
            gathered_ts.extend(value.item() for value in slot[:count])
        gathered_losses = []
        for slot, count in zip(loss_slots, per_rank_sizes):
            gathered_losses.extend(value.item() for value in slot[:count])
        self.update_with_all_losses(gathered_ts, gathered_losses)

    @abstractmethod
    def update_with_all_losses(self, ts, losses):
        """
        Update the reweighting using losses from a model.
        Sub-classes implement this to fold the given losses into their
        reweighting state. No synchronization between workers happens here:
        update_with_local_losses calls it on every rank with identical
        arguments, so the implementation should be deterministic to keep the
        state the same across workers.
        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """


class LossSecondMomentResampler(LossAwareSampler):
    """
    A sampler that weights each timestep by the root mean square of its most
    recent losses, mixed with a small uniform component.
    :param diffusion: the diffusion object; its num_timesteps sizes the
                      per-timestep state.
    :param history_per_term: how many recent losses are kept per timestep.
    :param uniform_prob: the share of the weight mass spread evenly over all
                         timesteps.
    """

    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
        self.diffusion = diffusion
        self.history_per_term = history_per_term
        self.uniform_prob = uniform_prob
        self._loss_history = np.zeros(
            [diffusion.num_timesteps, history_per_term], dtype=np.float64
        )
        # The `np.int` alias was removed in NumPy 1.24 and raises
        # AttributeError there; use an explicit integer dtype instead.
        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int64)

    def weights(self):
        """
        Return one weight per timestep.
        Until every timestep has a full loss history the weights are all ones;
        afterwards they are the normalised RMS of each history, blended with
        uniform_prob of uniform mass.
        """
        if not self._warmed_up():
            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
        weights /= np.sum(weights)
        weights *= 1 - self.uniform_prob
        weights += self.uniform_prob / len(weights)
        return weights

    def update_with_all_losses(self, ts, losses):
        """
        Record one loss per given timestep in the rolling history.
        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """
        for t, loss in zip(ts, losses):
            if self._loss_counts[t] == self.history_per_term:
                # Shift out the oldest loss term.
                self._loss_history[t, :-1] = self._loss_history[t, 1:]
                self._loss_history[t, -1] = loss
            else:
                # History not full yet: fill the next free slot.
                self._loss_history[t, self._loss_counts[t]] = loss
                self._loss_counts[t] += 1

    def _warmed_up(self):
        # True once every timestep has collected history_per_term losses.
        return (self._loss_counts == self.history_per_term).all()


================================================
FILE: DiT-ToCa/download.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Functions for downloading pre-trained DiT models
"""
from torchvision.datasets.utils import download_url
import torch
import os


# Checkpoint file names that find_model() resolves through download_model();
# any other name is treated by find_model() as a local checkpoint path.
pretrained_models = {'DiT-XL-2-512x512.pt', 'DiT-XL-2-256x256.pt'}


def find_model(model_name):
    """
    Return the checkpoint for a DiT model.
    Names listed in pretrained_models are fetched via download_model(); any
    other value is loaded as a local checkpoint file, and its "ema" entry is
    returned when present (supports checkpoints from train.py).
    """
    if model_name in pretrained_models:
        # One of our pre-trained DiT checkpoints: find or download it.
        return download_model(model_name)

    # Otherwise model_name is the path of a custom DiT checkpoint.
    assert os.path.isfile(model_name), f'Could not find DiT checkpoint at {model_name}'
    state = torch.load(model_name, map_location=lambda storage, loc: storage, weights_only=True)
    return state["ema"] if "ema" in state else state


def download_model(model_name):
    """
    Downloads a pre-trained DiT model from the web (unless it is already
    present under pretrained_models/) and loads it.
    :param model_name: a checkpoint file name listed in pretrained_models.
    :return: the object stored in the checkpoint file, loaded onto CPU storage.
    """
    assert model_name in pretrained_models
    local_path = f'pretrained_models/{model_name}'
    if not os.path.isfile(local_path):
        os.makedirs('pretrained_models', exist_ok=True)
        web_path = f'https://dl.fbaipublicfiles.com/DiT/models/{model_name}'
        download_url(web_path, 'pretrained_models')
    # The file comes from the network, so restrict unpickling to tensors and
    # plain containers, consistent with how find_model() loads local files.
    model = torch.load(local_path, map_location=lambda storage, loc: storage, weights_only=True)
    return model


if __name__ == "__main__":
    # Script entry point: fetch every checkpoint listed in pretrained_models.
    for checkpoint_name in pretrained_models:
        download_model(checkpoint_name)
    print('Done.')


================================================
FILE: DiT-ToCa/environment-dit.yml
================================================
name: base
channels:
  - pytorch
  - nvidia
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - aiohttp=3.9.5=py312h5eee18b_0
  - aiosignal=1.2.0=pyhd3eb1b0_0
  - anaconda-anon-usage=0.4.4=py312hfc0e8ea_100
  - archspec=0.2.3=pyhd3eb1b0_0
  - arrow-cpp=16.1.0=hc1eb8f0_0
  - aws-c-auth=0.6.19=h5eee18b_0
  - aws-c-cal=0.5.20=hdbd6064_0
  - aws-c-common=0.8.5=h5eee18b_0
  - aws-c-compression=0.2.16=h5eee18b_0
  - aws-c-event-stream=0.2.15=h6a678d5_0
  - aws-c-http=0.6.25=h5eee18b_0
  - aws-c-io=0.13.10=h5eee18b_0
  - aws-c-mqtt=0.7.13=h5eee18b_0
  - aws-c-s3=0.1.51=hdbd6064_0
  - aws-c-sdkutils=0.1.6=h5eee18b_0
  - aws-checksums=0.1.13=h5eee18b_0
  - aws-crt-cpp=0.18.16=h6a678d5_0
  - aws-sdk-cpp=1.10.55=h721c034_0
  - blas=1.0=mkl
  - boltons=23.0.0=py312h06a4308_0
  - boost-cpp=1.82.0=hdb19cb5_2
  - bottleneck=1.3.7=py312ha883a20_0
  - brotli-python=1.0.9=py312h6a678d5_8
  - bzip2=1.0.8=h5eee18b_6
  - c-ares=1.19.1=h5eee18b_0
  - ca-certificates=2024.7.2=h06a4308_0
  - certifi=2024.7.4=py312h06a4308_0
  - cffi=1.16.0=py312h5eee18b_1
  - charset-normalizer=2.0.4=pyhd3eb1b0_0
  - conda=24.7.1=py312h06a4308_0
  - conda-content-trust=0.2.0=py312h06a4308_1
  - conda-libmamba-solver=24.1.0=pyhd3eb1b0_0
  - conda-package-handling=2.2.0=py312h06a4308_1
  - conda-package-streaming=0.9.0=py312h06a4308_0
  - cryptography=42.0.5=py312hdda0065_1
  - cuda-cudart=12.1.105=0
  - cuda-cupti=12.1.105=0
  - cuda-libraries=12.1.0=0
  - cuda-nvrtc=12.1.105=0
  - cuda-nvtx=12.1.105=0
  - cuda-opencl=12.6.37=0
  - cuda-runtime=12.1.0=0
  - cuda-version=12.6=3
  - datasets=2.19.1=py312h06a4308_0
  - diffusers=0.18.2=py312he106c6f_0
  - diffusers-base=0.18.2=py312he106c6f_0
  - diffusers-torch=0.18.2=py312he106c6f_0
  - dill=0.3.8=py312h06a4308_0
  - distro=1.9.0=py312h06a4308_0
  - expat=2.6.2=h6a678d5_0
  - ffmpeg=4.3=hf484d3e_0
  - fmt=9.1.0=hdb19cb5_1
  - freetype=2.12.1=h4a9f257_0
  - frozendict=2.4.2=py312h06a4308_0
  - frozenlist=1.4.0=py312h5eee18b_0
  - gflags=2.2.2=h6a678d5_1
  - glog=0.5.0=h6a678d5_1
  - gmp=6.2.1=h295c915_3
  - gnutls=3.6.15=he1e5248_0
  - huggingface_accelerate=0.21.0=py312h06a4308_0
  - huggingface_hub=0.23.1=py312h06a4308_0
  - icu=73.1=h6a678d5_0
  - idna=3.7=py312h06a4308_0
  - importlib-metadata=7.0.1=py312h06a4308_0
  - intel-openmp=2023.1.0=hdb19cb5_46306
  - jinja2=3.1.4=py312h06a4308_0
  - jpeg=9e=h5eee18b_3
  - jsonpatch=1.33=py312h06a4308_1
  - jsonpointer=2.1=pyhd3eb1b0_0
  - krb5=1.20.1=h143b758_1
  - lame=3.100=h7b6447c_0
  - lcms2=2.12=h3be6417_0
  - ld_impl_linux-64=2.38=h1181459_1
  - lerc=3.0=h295c915_0
  - libabseil=20240116.2=cxx17_h6a678d5_0
  - libarchive=3.6.2=h6ac8c49_3
  - libboost=1.82.0=h109eef0_2
  - libbrotlicommon=1.0.9=h5eee18b_8
  - libbrotlidec=1.0.9=h5eee18b_8
  - libbrotlienc=1.0.9=h5eee18b_8
  - libcublas=12.1.0.26=0
  - libcufft=11.0.2.4=0
  - libcufile=1.11.0.15=0
  - libcurand=10.3.7.37=0
  - libcurl=8.7.1=h251f7ec_0
  - libcusolver=11.4.4.55=0
  - libcusparse=12.0.2.55=0
  - libdeflate=1.17=h5eee18b_1
  - libedit=3.1.20230828=h5eee18b_0
  - libev=4.33=h7f8727e_1
  - libevent=2.1.12=hdbd6064_1
  - libffi=3.4.4=h6a678d5_1
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libgrpc=1.62.2=h2d74bed_0
  - libiconv=1.16=h5eee18b_3
  - libidn2=2.3.4=h5eee18b_0
  - libjpeg-turbo=2.0.0=h9bf148f_0
  - libmamba=1.5.8=hfe524e5_2
  - libmambapy=1.5.8=py312h2dafd23_2
  - libnghttp2=1.57.0=h2d74bed_0
  - libnpp=12.0.2.50=0
  - libnvjitlink=12.1.105=0
  - libnvjpeg=12.1.1.14=0
  - libpng=1.6.39=h5eee18b_0
  - libprotobuf=4.25.3=he621ea3_0
  - libsolv=0.7.24=he621ea3_1
  - libssh2=1.11.0=h251f7ec_0
  - libstdcxx-ng=11.2.0=h1234567_1
  - libtasn1=4.19.0=h5eee18b_0
  - libthrift=0.15.0=h1795dd8_2
  - libtiff=4.5.1=h6a678d5_0
  - libunistring=0.9.10=h27cfd23_0
  - libuuid=1.41.5=h5eee18b_0
  - libwebp-base=1.3.2=h5eee18b_0
  - libxml2=2.10.4=hfdd30dd_2
  - llvm-openmp=14.0.6=h9e868ea_0
  - lz4-c=1.9.4=h6a678d5_1
  - menuinst=2.0.2=py312h06a4308_1
  - mkl=2023.1.0=h213fc3f_46344
  - mkl-service=2.4.0=py312h5eee18b_1
  - mkl_fft=1.3.8=py312h5eee18b_0
  - mkl_random=1.2.4=py312hdb19cb5_0
  - mpmath=1.3.0=py312h06a4308_0
  - multidict=6.0.4=py312h5eee18b_0
  - multiprocess=0.70.15=py312h06a4308_0
  - ncurses=6.4=h6a678d5_0
  - nettle=3.7.3=hbbd107a_1
  - networkx=3.3=py312h06a4308_0
  - numexpr=2.8.7=py312hf827012_0
  - numpy=1.26.4=py312hc5e2394_0
  - numpy-base=1.26.4=py312h0da6c21_0
  - openh264=2.1.1=h4ff587b_0
  - openjpeg=2.5.2=he7f1fd0_0
  - openssl=3.0.14=h5eee18b_0
  - orc=2.0.1=h2d29ad5_0
  - packaging=23.2=py312h06a4308_0
  - pandas=2.2.2=py312h526ad5a_0
  - pcre2=10.42=hebb0a14_1
  - pip=24.0=py312h06a4308_0
  - platformdirs=3.10.0=py312h06a4308_0
  - pluggy=1.0.0=py312h06a4308_1
  - pyarrow=16.1.0=py312h526ad5a_0
  - pybind11-abi=5=hd3eb1b0_0
  - pycosat=0.6.6=py312h5eee18b_1
  - pycparser=2.21=pyhd3eb1b0_0
  - pysocks=1.7.1=py312h06a4308_0
  - python=3.12.3=h996f2a0_1
  - python-dateutil=2.9.0post0=py312h06a4308_2
  - python-tzdata=2023.3=pyhd3eb1b0_0
  - python-xxhash=2.0.2=py312h5eee18b_1
  - pytorch=2.4.0=py3.12_cuda12.1_cudnn9.1.0_0
  - pytorch-cuda=12.1=ha16c6d3_5
  - pytorch-mutex=1.0=cuda
  - pytz=2024.1=py312h06a4308_0
  - pyyaml=6.0.1=py312h5eee18b_0
  - re2=2022.04.01=h295c915_0
  - readline=8.2=h5eee18b_0
  - regex=2024.7.24=py312h5eee18b_0
  - reproc=14.2.4=h6a678d5_2
  - reproc-cpp=14.2.4=h6a678d5_2
  - requests=2.31.0=py312h06a4308_1
  - ruamel.yaml=0.17.21=py312h5eee18b_0
  - s2n=1.3.27=hdbd6064_0
  - safetensors=0.4.2=py312hb7cc22b_1
  - setuptools=69.5.1=py312h06a4308_0
  - six=1.16.0=pyhd3eb1b0_1
  - snappy=1.2.1=h6a678d5_0
  - sqlite=3.45.3=h5eee18b_0
  - tbb=2021.8.0=hdb19cb5_0
  - tk=8.6.14=h39e8969_0
  - tokenizers=0.19.1=py312ha11519a_0
  - torchaudio=2.4.0=py312_cu121
  - torchtriton=3.0.0=py312
  - tqdm=4.66.2=py312he106c6f_0
  - transformers=4.41.2=py312h06a4308_0
  - truststore=0.8.0=py312h06a4308_0
  - typing_extensions=4.11.0=py312h06a4308_0
  - tzdata=2024a=h04d1e81_0
  - urllib3=2.1.0=py312h06a4308_1
  - utf8proc=2.6.1=h5eee18b_1
  - wheel=0.43.0=py312h06a4308_0
  - xxhash=0.8.0=h7f8727e_3
  - xz=5.4.6=h5eee18b_1
  - yaml=0.2.5=h7b6447c_0
  - yaml-cpp=0.8.0=h6a678d5_1
  - yarl=1.9.3=py312h5eee18b_0
  - zipp=3.17.0=py312h06a4308_0
  - zlib=1.2.13=h5eee18b_1
  - zstandard=0.22.0=py312h2c38b39_0
  - zstd=1.5.5=hc292b87_2
  - pip:
      - absl-py==2.1.0
      - anyio==4.4.0
      - argon2-cffi==23.1.0
      - argon2-cffi-bindings==21.2.0
      - arrow==1.3.0
      - asttokens==2.4.1
      - async-lru==2.0.4
      - attrs==23.2.0
      - babel==2.15.0
      - beautifulsoup4==4.12.3
      - bleach==6.1.0
      - brokenaxes==0.6.2
      - comm==0.2.2
      - contourpy==1.2.1
      - cycler==0.12.1
      - debugpy==1.8.1
      - decorator==5.1.1
      - defusedxml==0.7.1
      - executing==2.0.1
      - fastjsonschema==2.19.1
      - filelock==3.14.0
      - fonttools==4.53.0
      - fqdn==1.5.1
      - fsspec==2024.5.0
      - grpcio==1.64.0
      - h11==0.14.0
      - httpcore==1.0.5
      - httpx==0.27.0
      - ipykernel==6.29.4
      - ipython==8.25.0
      - ipywidgets==8.1.3
      - isoduration==20.11.0
      - jedi==0.19.1
      - json5==0.9.25
      - jsonschema==4.22.0
      - jsonschema-specifications==2023.12.1
      - jupyter-client==8.6.2
      - jupyter-core==5.7.2
      - jupyter-events==0.10.0
      - jupyter-lsp==2.2.5
      - jupyter-server==2.14.1
      - jupyter-server-terminals==0.5.3
      - jupyterlab==4.2.1
      - jupyterlab-language-pack-zh-cn==4.2.post1
      - jupyterlab-pygments==0.3.0
      - jupyterlab-server==2.27.2
      - jupyterlab-widgets==3.0.11
      - kiwisolver==1.4.5
      - markdown==3.6
      - markupsafe==2.1.5
      - matplotlib==3.9.0
      - matplotlib-inline==0.1.7
      - mistune==3.0.2
      - nbclient==0.10.0
      - nbconvert==7.16.4
      - nbformat==5.10.4
      - nest-asyncio==1.6.0
      - notebook-shim==0.2.4
      - nvidia-cublas-cu12==12.1.3.1
      - nvidia-cuda-cupti-cu12==12.1.105
      - nvidia-cuda-nvrtc-cu12==12.1.105
      - nvidia-cuda-runtime-cu12==12.1.105
      - nvidia-cudnn-cu12==9.1.0.70
      - nvidia-cufft-cu12==11.0.2.54
      - nvidia-curand-cu12==10.3.2.106
      - nvidia-cusolver-cu12==11.4.5.107
      - nvidia-cusparse-cu12==12.1.0.106
      - nvidia-nccl-cu12==2.20.5
      - nvidia-nvjitlink-cu12==12.5.40
      - nvidia-nvtx-cu12==12.1.105
      - overrides==7.7.0
      - pandocfilters==1.5.1
      - parso==0.8.4
      - pexpect==4.9.0
      - pillow==10.3.0
      - prometheus-client==0.20.0
      - prompt-toolkit==3.0.45
      - protobuf==5.27.0
      - psutil==5.9.8
      - ptyprocess==0.7.0
      - pure-eval==0.2.2
      - pygments==2.18.0
      - pyparsing==3.1.2
      - python-json-logger==2.0.7
      - pytorch-fid==0.3.0
      - pyzmq==26.0.3
      - referencing==0.35.1
      - rfc3339-validator==0.1.4
      - rfc3986-validator==0.1.1
      - rpds-py==0.18.1
      - scipy==1.14.1
      - send2trash==1.8.3
      - sniffio==1.3.1
      - soupsieve==2.5
      - stack-data==0.6.3
      - supervisor==4.2.5
      - sympy==1.12.1
      - tensorboard==2.16.2
      - tensorboard-data-server==0.7.2
      - terminado==0.18.1
      - timm==1.0.8
      - tinycss2==1.3.0
      - torch==2.4.0
      - torchvision==0.19.0
      - tornado==6.4
      - traitlets==5.14.3
      - triton==3.0.0
      - types-python-dateutil==2.9.0.20240316
      - typing-extensions==4.12.1
      - uri-template==1.3.0
      - wcwidth==0.2.13
      - webcolors==1.13
      - webencodings==0.5.1
      - websocket-client==1.8.0
      - werkzeug==3.0.3
      - widgetsnbextension==4.0.11
prefix: /root/miniconda3


================================================
FILE: DiT-ToCa/models.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------

import torch
import torch.nn as nn
import numpy as np
import math
#from timm.models.vision_transformer import PatchEmbed, Attention, Mlp
from timm.models.vision_transformer import PatchEmbed, Mlp
#import os.path as osp
from cache_functions import global_force_fresh, cache_cutfresh, update_cache, force_init, Attention, cal_type


def modulate(x, shift, scale):
    """
    Apply a scale-and-shift to x, inserting an axis at position 1 into shift
    and scale so they broadcast over x's second dimension.
    """
    scale_b = scale.unsqueeze(1)
    shift_b = shift.unsqueeze(1)
    return x * (1 + scale_b) + shift_b


#################################################################################
#               Embedding Layers for Timesteps and Class Labels                 #
#################################################################################

class TimestepEmbedder(nn.Module):
    """
    Maps scalar timesteps to vectors: a sinusoidal frequency embedding
    followed by a two-layer MLP with a SiLU in between.
    """
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        proj_in = nn.Linear(frequency_embedding_size, hidden_size, bias=True)
        proj_out = nn.Linear(hidden_size, hidden_size, bias=True)
        self.frequency_embedding_size = frequency_embedding_size
        self.mlp = nn.Sequential(proj_in, nn.SiLU(), proj_out)

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                          These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings, laid out as all
                 cosines, then all sines, then one zero column if dim is odd.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        positions = torch.arange(start=0, end=half, dtype=torch.float32)
        freqs = torch.exp(-math.log(max_period) * positions / half).to(device=t.device)
        angles = t[:, None].float() * freqs[None]
        parts = [torch.cos(angles), torch.sin(angles)]
        if dim % 2:
            # Pad odd output sizes with a single zero column.
            parts.append(torch.zeros_like(angles[:, :1]))
        return torch.cat(parts, dim=-1)

    def forward(self, t):
        frequency_features = self.timestep_embedding(t, self.frequency_embedding_size)
        return self.mlp(frequency_features)


class LabelEmbedder(nn.Module):
    """
    Turns class labels into embedding vectors and implements the label
    dropout used for classifier-free guidance.
    """
    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        # With dropout enabled the table gets one extra row, at index
        # num_classes, which stands in for a dropped label.
        table_rows = num_classes + (dropout_prob > 0)
        self.embedding_table = nn.Embedding(table_rows, hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """
        Replace selected labels with the index num_classes.
        Entries are chosen at random with probability dropout_prob, unless
        force_drop_ids is given, in which case entries equal to 1 are dropped.
        """
        if force_drop_ids is not None:
            mask = force_drop_ids == 1
        else:
            mask = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
        return torch.where(mask, self.num_classes, labels)

    def forward(self, labels, train, force_drop_ids=None):
        dropout_active = train and self.dropout_prob > 0
        if dropout_active or force_drop_ids is not None:
            labels = self.token_drop(labels, force_drop_ids)
        return self.embedding_table(labels)


#################################################################################
#                                 Core DiT Model                                #
#################################################################################

class DiTBlock(nn.Module):
    """
    A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.

    forward() dispatches on current['type'] and, depending on that value,
    either computes attention and MLP in full, recomputes the MLP for a subset
    of tokens, or reuses the outputs stored in cache_dic.
    """
    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
        """
        :param hidden_size: the feature width used by the norms, attention,
                            MLP and modulation layers.
        :param num_heads: the number of attention heads passed to Attention.
        :param mlp_ratio: the MLP hidden width as a multiple of hidden_size.
        :param block_kwargs: extra keyword arguments forwarded to Attention.
        """
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
        # Produces the six modulation terms (shift/scale/gate for the
        # attention path and for the MLP path) that forward() splits apart.
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )

    def forward(self, x, c, current, cache_dic):
        """
        Run the block in the mode selected by current['type'].

        :param x: a 3-D tensor; its shape is unpacked as (B, N, C).
        :param c: the conditioning input fed to adaLN_modulation
                  (presumably (B, C) -- TODO confirm against the caller).
        :param current: a dict describing the running step; 'layer' and 'type'
                        are read here and 'module' is written.
        :param cache_dic: a dict holding the cache; reads 'cache' and the
                          optional 'test_FLOPs' flag, writes 'cache',
                          'attn_map' and accumulates into 'flops'.
        :return: the updated x.
        """
        B, N, C = x.shape

        layer = current['layer']

        # FLOPs calculation initialization
        flops = 0
        test_FLOPs = cache_dic.get('test_FLOPs', False)  # check if test_FLOPs is enabled
        
        # determine current working status
        # (presumably sets current['type'], which is read right below -- verify in cal_type)
        cal_type(cache_dic, current)

        if current['type'] == 'full':  # Force Activation: Compute all tokens and save them in cache

            # AdaLN Modulation
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)

            # LayerNorm FLOPs (for both norm1 and norm2)
            if test_FLOPs:
                flops += 2 * B * N * C

            # AdaLN FLOPs (SiLU and Linear)
            if test_FLOPs:
                flops += B * C  # SiLU FLOPs
                flops += B * C * 6 * C  # Linear FLOPs in adaLN_modulation

            # Attention on all tokens; the output and the attention map are
            # stored in the cache for this layer.
            # NOTE(review): no attention FLOPs are added in this method --
            # confirm whether Attention accounts for them itself.
            current['module'] = 'attn'
            attn_output, attn_map = self.attn(modulate(self.norm1(x), shift_msa, scale_msa), cache_dic=cache_dic, current=current)
            cache_dic['cache'][-1][layer]['attn'] = attn_output
            cache_dic['attn_map'][-1][layer] = attn_map
            force_init(cache_dic, current, x)
            x = x + gate_msa.unsqueeze(1) * attn_output

            # MLP on all tokens; the output is stored in the cache for this layer.
            current['module'] = 'mlp'
            mlp_output = self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
            cache_dic['cache'][-1][layer]['mlp'] = mlp_output
            force_init(cache_dic, current, x)
            x = x + gate_mlp.unsqueeze(1) * mlp_output

            # MLP FLOPs
            if test_FLOPs:
                mlp_hidden_dim = int(C * 4)  # Assuming mlp_ratio = 4
                flops += B * N * C * mlp_hidden_dim * 2 # First projection
                flops += B * N * mlp_hidden_dim * C * 2# Second projection
                flops += B * N * mlp_hidden_dim * 6 # GELU activation

        elif current['type'] == 'ToCa':  # Partial Computation: Compute only fresh tokens and save them in cache, no attention token computation in the final version
            
            # AdaLN Modulation
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
            
            # LayerNorm FLOPs (for both norm1 and norm2)
            if test_FLOPs:
                flops += 2 * B * N * C

            # AdaLN FLOPs (SiLU and Linear)
            if test_FLOPs:
                flops += B * C  # SiLU FLOPs
                flops += B * C * 6 * C  # Linear FLOPs in adaLN_modulation

            # Attention is not recomputed: the cached output is reused as-is.
            current['module'] = 'attn'
            x = x + gate_msa.unsqueeze(1) * cache_dic['cache'][-1][layer]['attn']

            # Only the tokens returned by cache_cutfresh go through the MLP;
            # update_cache receives the results, and the cached MLP output is
            # then added to x.
            current['module'] = 'mlp'
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x, current)
            fresh_tokens = self.mlp(modulate(self.norm2(fresh_tokens), shift_mlp, scale_mlp))
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current)
            
            x = x + gate_mlp.unsqueeze(1) * cache_dic['cache'][-1][layer]['mlp']
            

            # MLP FLOPs for the ToCa branch (counted on the fresh tokens only)
            if test_FLOPs:
                B_fresh, N_fresh, C_fresh = fresh_tokens.shape
                mlp_hidden_dim = int(C_fresh * 4)  # Assuming mlp_ratio = 4
                flops += B_fresh * N_fresh * C_fresh * mlp_hidden_dim * 2 # First projection
                flops += B_fresh * N_fresh * mlp_hidden_dim * C_fresh * 2 # Second projection
                flops += B_fresh * N_fresh * mlp_hidden_dim * 6 # GELU activation

        elif current['type'] == 'FORA':
            # Both attention and MLP outputs are taken from the cache; only the
            # modulation gates are recomputed.
            
            # AdaLN Modulation
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
            
            # AdaLN FLOPs (SiLU and Linear)
            if test_FLOPs:
                flops += B * C  # SiLU FLOPs
                flops += B * C * 6 * C  # Linear FLOPs in adaLN_modulation

            current['module'] = 'attn'
            x = x + gate_msa.unsqueeze(1) * cache_dic['cache'][-1][layer]['attn']

            current['module'] = 'mlp'
            x = x + gate_mlp.unsqueeze(1) * cache_dic['cache'][-1][layer]['mlp']
        
        else:
            # Any other type: the block does no computation. At layer 27 the
            # output is replaced by the tensor cached under 'noise'.
            current['module'] = 'skipped'
            if current['layer'] == 27:
                x = cache_dic['cache'][-1]['noise']

        cache_dic['flops'] += flops

        # NOTE(review): 27 is hard-coded; it is presumably the index of the
        # last block for the default DiT depth of 28 -- confirm before using
        # this block with a different depth.
        if current['layer'] == 27:
            cache_dic['cache'][-1]['noise'] = x

        return x


class FinalLayer(nn.Module):
    """
    Output head of DiT: a non-affine LayerNorm, modulated by a shift/scale pair
    predicted from the conditioning vector, followed by a linear projection to
    patch_size * patch_size * out_channels values per token.
    """
    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        values_per_token = patch_size * patch_size * out_channels
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, values_per_token, bias=True)
        # SiLU + Linear producing 2 * hidden_size values, split into shift and scale in forward().
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True),
        )

    def forward(self, x, c):
        """
        x: token features to project.
        c: conditioning vector fed to adaLN_modulation; its output is chunked in two along dim 1.
        """
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
        normed = modulate(self.norm_final(x), shift, scale)
        return self.linear(normed)


class DiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    Every block is called with the shared `current` and `cache_dic` dictionaries,
    so the per-layer caching state travels alongside the activations.
    """
    def __init__(
        self,
        input_size=32,
        patch_size=2,
        in_channels=4,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        num_classes=1000,
        learn_sigma=True,
    ):
        super().__init__()
        self.learn_sigma = learn_sigma
        self.in_channels = in_channels
        # Twice as many output channels when learn_sigma is set.
        if learn_sigma:
            self.out_channels = in_channels * 2
        else:
            self.out_channels = in_channels
        self.patch_size = patch_size
        self.num_heads = num_heads

        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        # Frozen positional table; initialize_weights() fills it with a sin-cos embedding.
        token_count = self.x_embedder.num_patches
        self.pos_embed = nn.Parameter(torch.zeros(1, token_count, hidden_size), requires_grad=False)

        layers = []
        for _ in range(depth):
            layers.append(DiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio))
        self.blocks = nn.ModuleList(layers)
        self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
        self.initialize_weights()

    def initialize_weights(self):
        """Apply the DiT initialization scheme to every sub-module, in a fixed order."""
        # Xavier-uniform weights and zero biases for every nn.Linear:
        def _init_linear(layer):
            if not isinstance(layer, nn.Linear):
                return
            torch.nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)
        self.apply(_init_linear)

        # Fill the frozen pos_embed with the 2D sin-cos table:
        grid_side = int(self.x_embedder.num_patches ** 0.5)
        table = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], grid_side)
        self.pos_embed.data.copy_(torch.from_numpy(table).float().unsqueeze(0))

        # Treat the patch projection like nn.Linear (instead of nn.Conv2d):
        patch_proj = self.x_embedder.proj
        proj_weight = patch_proj.weight.data
        nn.init.xavier_uniform_(proj_weight.view([proj_weight.shape[0], -1]))
        nn.init.constant_(patch_proj.bias, 0)

        # Label embedding table:
        nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)

        # Timestep embedding MLP (entries 0 and 2 of the Sequential):
        for mlp_index in (0, 2):
            nn.init.normal_(self.t_embedder.mlp[mlp_index].weight, std=0.02)

        # Zero the last adaLN modulation layer of every DiT block:
        for block in self.blocks:
            modulation_out = block.adaLN_modulation[-1]
            nn.init.constant_(modulation_out.weight, 0)
            nn.init.constant_(modulation_out.bias, 0)

        # Zero the output head:
        head = self.final_layer
        head_modulation = head.adaLN_modulation[-1]
        for tensor in (head_modulation.weight, head_modulation.bias, head.linear.weight, head.linear.bias):
            nn.init.constant_(tensor, 0)

    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C)
        imgs: (N, C, H, W), with H = W = sqrt(T) * patch_size
        """
        channels = self.out_channels
        patch = self.x_embedder.patch_size[0]
        batch, tokens = x.shape[0], x.shape[1]
        side = int(tokens ** 0.5)
        assert side * side == tokens

        grid = x.reshape(shape=(batch, side, side, patch, patch, channels))
        grid = torch.einsum('nhwpqc->nchpwq', grid)
        return grid.reshape(shape=(batch, channels, side * patch, side * patch))

    def forward(self, x, t, current, cache_dic, y):
        """
        Forward pass of DiT.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        t: (N,) tensor of diffusion timesteps
        current: dict describing the running step; its 'layer' entry is overwritten before each block
        cache_dic: cache dictionary handed unchanged to every block
        y: (N,) tensor of class labels
        """
        tokens = self.x_embedder(x) + self.pos_embed   # (N, T, D), where T = H * W / patch_size ** 2
        time_emb = self.t_embedder(t)                   # (N, D)
        label_emb = self.y_embedder(y, self.training)   # (N, D)
        cond = time_emb + label_emb                     # (N, D)

        for depth_idx, layer in enumerate(self.blocks):
            current['layer'] = depth_idx
            tokens = layer(tokens, cond, current, cache_dic)   # (N, T, D)

        patches = self.final_layer(tokens, cond)        # (N, T, patch_size ** 2 * out_channels)
        return self.unpatchify(patches)                 # (N, out_channels, H, W)

    def forward_with_cfg(self, x, t, current, cache_dic, y, cfg_scale, **kwargs):
        """
        Forward pass of DiT, but also batches the unconditional forward pass for classifier-free guidance.
        Extra keyword arguments are accepted and ignored.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        first_half = x[: len(x) // 2]
        doubled = torch.cat([first_half, first_half], dim=0)
        model_out = self.forward(doubled, t, current, cache_dic, y)
        # For exact reproducibility reasons, guidance is applied to only the first three channels.
        # The standard approach applies it to all of them, i.e. it splits at self.in_channels
        # instead of 3.
        eps = model_out[:, :3]
        rest = model_out[:, 3:]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        guided = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        guided_pair = torch.cat([guided, guided], dim=0)
        return torch.cat([guided_pair, rest], dim=1)
    

#################################################################################
#                   Sine/Cosine Positional Embedding Functions                  #
#################################################################################
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py

def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
    """
    Build a 2D sine/cosine positional embedding for a square grid.

    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim], or
               [extra_tokens+grid_size*grid_size, embed_dim] with zero rows prepended
               when cls_token is set and extra_tokens > 0
    """
    axis = np.arange(grid_size, dtype=np.float32)
    # Height and width share the same coordinates; meshgrid receives the width axis first.
    mesh = np.stack(np.meshgrid(axis, axis), axis=0)
    mesh = mesh.reshape([2, 1, grid_size, grid_size])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)
    if not (cls_token and extra_tokens > 0):
        return pos_embed
    zero_rows = np.zeros([extra_tokens, embed_dim])
    return np.concatenate([zero_rows, pos_embed], axis=0)


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """
    Encode the two coordinate planes grid[0] and grid[1] with embed_dim // 2
    dimensions each and concatenate them feature-wise.

    return: (H*W, embed_dim) array
    """
    assert embed_dim % 2 == 0

    per_axis_dim = embed_dim // 2
    # One (H*W, D/2) embedding per coordinate plane, grid[0] first.
    halves = [get_1d_sincos_pos_embed_from_grid(per_axis_dim, grid[plane]) for plane in (0, 1)]
    return np.concatenate(halves, axis=1)  # (H*W, D)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    Sine/cosine embedding of a set of scalar positions.

    embed_dim: output dimension for each position (must be even)
    pos: array of positions to be encoded; flattened to size (M,)
    out: (M, D), the sines in the first D/2 columns and the cosines in the last D/2
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2
    exponents = np.arange(half_dim, dtype=np.float64)
    exponents /= embed_dim / 2.
    freqs = 1. / 10000**exponents  # (D/2,)

    flat_pos = pos.reshape(-1)  # (M,)
    angles = np.einsum('m,d->md', flat_pos, freqs)  # (M, D/2), outer product

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)


#################################################################################
#                                   DiT Configs                                  #
#################################################################################

def DiT_XL_2(**kwargs):
    """Build DiT-XL/2: depth 28, hidden size 1152, 16 heads, patch size 2."""
    return DiT(
        patch_size=2,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        **kwargs,
    )

def DiT_XL_4(**kwargs):
    """Build DiT-XL/4: depth 28, hidden size 1152, 16 heads, patch size 4."""
    return DiT(
        patch_size=4,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        **kwargs,
    )

def DiT_XL_8(**kwargs):
    """Build DiT-XL/8: depth 28, hidden size 1152, 16 heads, patch size 8."""
    return DiT(
        patch_size=8,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        **kwargs,
    )

def DiT_L_2(**kwargs):
    """Build DiT-L/2: depth 24, hidden size 1024, 16 heads, patch size 2."""
    return DiT(
        patch_size=2,
        hidden_size=1024,
        depth=24,
        num_heads=16,
        **kwargs,
    )

def DiT_L_4(**kwargs):
    """Build DiT-L/4: depth 24, hidden size 1024, 16 heads, patch size 4."""
    return DiT(
        patch_size=4,
        hidden_size=1024,
        depth=24,
        num_heads=16,
        **kwargs,
    )

def DiT_L_8(**kwargs):
    """Build DiT-L/8: depth 24, hidden size 1024, 16 heads, patch size 8."""
    return DiT(
        patch_size=8,
        hidden_size=1024,
        depth=24,
        num_heads=16,
        **kwargs,
    )

def DiT_B_2(**kwargs):
    """Build DiT-B/2: depth 12, hidden size 768, 12 heads, patch size 2."""
    return DiT(
        patch_size=2,
        hidden_size=768,
        depth=12,
        num_heads=12,
        **kwargs,
    )

def DiT_B_4(**kwargs):
    """Build DiT-B/4: depth 12, hidden size 768, 12 heads, patch size 4."""
    return DiT(
        patch_size=4,
        hidden_size=768,
        depth=12,
        num_heads=12,
        **kwargs,
    )

def DiT_B_8(**kwargs):
    """Build DiT-B/8: depth 12, hidden size 768, 12 heads, patch size 8."""
    return DiT(
        patch_size=8,
        hidden_size=768,
        depth=12,
        num_heads=12,
        **kwargs,
    )

def DiT_S_2(**kwargs):
    """Build DiT-S/2: depth 12, hidden size 384, 6 heads, patch size 2."""
    return DiT(
        patch_size=2,
        hidden_size=384,
        depth=12,
        num_heads=6,
        **kwargs,
    )

def DiT_S_4(**kwargs):
    """Build DiT-S/4: depth 12, hidden size 384, 6 heads, patch size 4."""
    return DiT(
        patch_size=4,
        hidden_size=384,
        depth=12,
        num_heads=6,
        **kwargs,
    )

def DiT_S_8(**kwargs):
    """Build DiT-S/8: depth 12, hidden size 384, 6 heads, patch size 8."""
    return DiT(
        patch_size=8,
        hidden_size=384,
        depth=12,
        num_heads=6,
        **kwargs,
    )


# Registry mapping each "DiT-<size>/<patch>" name to the factory that builds it.
# Insertion order matters: scripts expose list(DiT_models.keys()) as CLI choices.
DiT_models = {
    'DiT-XL/2': DiT_XL_2,
    'DiT-XL/4': DiT_XL_4,
    'DiT-XL/8': DiT_XL_8,
    'DiT-L/2': DiT_L_2,
    'DiT-L/4': DiT_L_4,
    'DiT-L/8': DiT_L_8,
    'DiT-B/2': DiT_B_2,
    'DiT-B/4': DiT_B_4,
    'DiT-B/8': DiT_B_8,
    'DiT-S/2': DiT_S_2,
    'DiT-S/4': DiT_S_4,
    'DiT-S/8': DiT_S_8,
}


================================================
FILE: DiT-ToCa/sample.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Sample new images from a pre-trained DiT.
"""
import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
from torchvision.utils import save_image
from diffusion import create_diffusion
from diffusers.models import AutoencoderKL
from download import find_model
from models import DiT_models
import argparse


def main(args):
    """
    Sample images from a DiT model with the ToCa caching options and save them to "sample.png".

    args: parsed command-line namespace (model/VAE/checkpoint selection, guidance scale,
          number of sampling steps, seed and the cache-related options forwarded to the model).

    Runs on CUDA when available and on CPU otherwise; the elapsed sampling time is printed.
    """
    import time  # local import: only needed for the CPU timing fallback below

    # Setup PyTorch:
    torch.manual_seed(args.seed)
    torch.set_grad_enabled(False)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if args.ckpt is None:
        assert args.model == "DiT-XL/2", "Only DiT-XL/2 models are available for auto-download."
        assert args.image_size in [256, 512]
        assert args.num_classes == 1000

    # Load model:
    latent_size = args.image_size // 8
    model = DiT_models[args.model](
        input_size=latent_size,
        num_classes=args.num_classes
    ).to(device)
    # Load a custom DiT checkpoint if one was given, otherwise the pre-trained DiT-XL/2 from the default path:
    ckpt_path = args.ckpt or f"/root/autodl-tmp/pretrained_models/DiT/DiT-XL-2-{args.image_size}x{args.image_size}.pt"
    state_dict = find_model(ckpt_path)
    model.load_state_dict(state_dict)
    model.eval()  # important!
    diffusion = create_diffusion(str(args.num_sampling_steps))
    vae = AutoencoderKL.from_pretrained(f"/root/autodl-tmp/pretrained_models/stabilityai/sd-vae-ft-{args.vae}").to(device)

    # Labels to condition the model with (feel free to change):
    class_labels = [985]

    # Create sampling noise, one latent per entry of class_labels:
    n = len(class_labels)
    z = torch.randn(n, 4, latent_size, latent_size, device=device)
    y = torch.tensor(class_labels, device=device)

    # Setup classifier-free guidance:
    z = torch.cat([z, z], 0)
    # The null (unconditional) label is taken to be index num_classes, which equals the
    # previously hard-coded 1000 for the default setting.
    # NOTE(review): assumes LabelEmbedder reserves index num_classes for the null class - confirm.
    y_null = torch.tensor([args.num_classes] * n, device=device)
    y = torch.cat([y, y_null], 0)
    model_kwargs = dict(y=y, cfg_scale=args.cfg_scale)

    model_kwargs['cache_type']        = args.cache_type
    model_kwargs['fresh_ratio']       = args.fresh_ratio
    model_kwargs['force_fresh']       = args.force_fresh
    model_kwargs['fresh_threshold']   = args.fresh_threshold
    model_kwargs['ratio_scheduler']   = args.ratio_scheduler
    model_kwargs['soft_fresh_weight'] = args.soft_fresh_weight
    model_kwargs['test_FLOPs']        = args.test_FLOPs

    # CUDA events are only usable on a CUDA device; fall back to a wall clock on CPU so
    # that the CPU path selected above does not crash while timing.
    use_cuda_timing = device == "cuda"
    if use_cuda_timing:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
    else:
        start_time = time.perf_counter()

    if args.ddim_sample:
        samples = diffusion.ddim_sample_loop(
            model.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device
        )
    else:
        samples = diffusion.p_sample_loop(
            model.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device
        )

    if use_cuda_timing:
        end.record()
        torch.cuda.synchronize()
        elapsed_seconds = start.elapsed_time(end)*0.001  # elapsed_time() reports milliseconds
    else:
        elapsed_seconds = time.perf_counter() - start_time
    print(f"Total Sampling took {elapsed_seconds} seconds")

    samples, _ = samples.chunk(2, dim=0)  # Remove null class samples
    samples = vae.decode(samples / 0.18215).sample

    # Save and display images:
    save_image(samples, "sample.png", nrow=4, normalize=True, value_range=(-1, 1))


# Command-line entry point: parse the sampling and ToCa cache options, then run main().
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, choices=list(DiT_models.keys()), default="DiT-XL/2")
    parser.add_argument("--vae", type=str, choices=["ema", "mse"], default="mse")
    parser.add_argument("--image-size", type=int, choices=[256, 512], default=256)
    parser.add_argument("--num-classes", type=int, default=1000)
    parser.add_argument("--cfg-scale", type=float, default=1.5)
    parser.add_argument("--num-sampling-steps", type=int, default=250)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--ckpt", type=str, default=None,
                        help="Optional path to a DiT checkpoint (default: auto-download a pre-trained DiT-XL/2 model).")
    parser.add_argument("--ddim-sample", action="store_true", default=False)
    parser.add_argument("--cache-type", type=str, choices=['random', 'attention','similarity','norm', 'compress','kv-norm'], default='attention') # only attention is supported currently
    parser.add_argument("--fresh-ratio", type=float, default=0.07)
    # 'ToCa' is the proposed scheduler in the final version of the paper. It is listed in
    # `choices` because argparse does not validate defaults against `choices`: without it the
    # default value could not be passed explicitly on the command line.
    parser.add_argument("--ratio-scheduler", type=str, default='ToCa', choices=['linear', 'cosine', 'exp', 'constant','linear-mode','layerwise','ToCa-ddpm250', 'ToCa-ddim50', 'ToCa'])
    parser.add_argument("--force-fresh", type=str, choices=['global', 'local'], default='global',
                        help="Force fresh strategy. global: fresh all tokens. local: fresh tokens acheiving fresh step threshold.") # only global is supported currently, local causes bad results
    parser.add_argument("--fresh-threshold", type=int, default=4) # N in the paper
    parser.add_argument("--soft-fresh-weight", type=float, default=0.25, # lambda_3 in the paper
                        help="soft weight for updating the stale tokens by adding extra scores.")
    parser.add_argument("--test-FLOPs", action="store_true", default=False)
    #parser.add_argument("--merge-weight", type=float, default=0.0) # never used in the paper, just for exploration

    args = parser.parse_args()
    main(args)


================================================
FILE: DiT-ToCa/sample_ddp.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Samples a large number of images from a pre-trained DiT model using DDP.
Subsequently saves a .npz file that can be used to compute FID and other
evaluation metrics via the ADM repo: https://github.com/openai/guided-diffusion/tree/main/evaluations

For a simple single-GPU/CPU sampling script, see sample.py.
"""
import torch
import torch.distributed as dist
from models import DiT_models
from download import find_model
from diffusion import create_diffusion
from diffusers.models import AutoencoderKL
from tqdm import tqdm
import os
from PIL import Image
import numpy as np
import math
import argparse


def create_npz_from_sample_folder(sample_dir, num=50_000):
    """
    Builds a single .npz file from a folder of .png samples.

    sample_dir: folder containing the images named 000000.png ... {num-1:06d}.png
    num: number of images to pack
    return: path of the written archive, "{sample_dir}.npz" (array stored under key "arr_0")
    """
    samples = []
    for i in tqdm(range(num), desc="Building .npz file from samples"):
        # Open inside a context manager so every file handle is released deterministically
        # instead of depending on garbage collection while looping over many files.
        with Image.open(f"{sample_dir}/{i:06d}.png") as sample_pil:
            # astype() copies the pixel data, so the array stays valid after the file is closed.
            sample_np = np.asarray(sample_pil).astype(np.uint8)
        samples.append(sample_np)
    samples = np.stack(samples)
    # Expect a (num, H, W, 3) stack, i.e. three-channel images of identical size.
    assert samples.shape == (num, samples.shape[1], samples.shape[2], 3)
    npz_path = f"{sample_dir}.npz"
    np.savez(npz_path, arr_0=samples)
    print(f"Saved .npz file to {npz_path} [shape={samples.shape}].")
    return npz_path

def main(args):
    """
    Run sampling.

    Every DDP rank draws random class labels, samples batches of images with the ToCa
    cache options, and writes them as numbered .png files into a shared folder; rank 0
    finally packs the first args.num_fid_samples images into a .npz archive.

    args: parsed command-line namespace (see the argument parser at the bottom of this script).
    """

    torch.backends.cuda.matmul.allow_tf32 = args.tf32  # True: fast but may lead to some small numerical differences
    assert torch.cuda.is_available(), "Sampling with DDP requires at least one GPU. sample.py supports CPU-only usage"
    torch.set_grad_enabled(False)

    # Setup DDP:
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    device = rank % torch.cuda.device_count()
    # Each rank gets its own seed so that the ranks draw different noise and labels.
    seed = args.global_seed * dist.get_world_size() + rank
    torch.manual_seed(seed)
    torch.cuda.set_device(device)
    print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")

    if args.ckpt is None:
        assert args.model == "DiT-XL/2", "Only DiT-XL/2 models are available for auto-download."
        assert args.image_size in [256, 512]
        assert args.num_classes == 1000

    # Load model:
    latent_size = args.image_size // 8
    model = DiT_models[args.model](
        input_size=latent_size,
        num_classes=args.num_classes
    ).to(device)
    # Load a custom DiT checkpoint if one was given, otherwise the pre-trained DiT-XL/2 from the default path:
    ckpt_path = args.ckpt or f"/root/autodl-tmp/pretrained_models/DiT/DiT-XL-2-{args.image_size}x{args.image_size}.pt"
    state_dict = find_model(ckpt_path)
    model.load_state_dict(state_dict)
    model.eval()  # important!
    diffusion = create_diffusion(str(args.num_sampling_steps))
    vae = AutoencoderKL.from_pretrained(f"/root/autodl-tmp/pretrained_models/stabilityai/sd-vae-ft-{args.vae}").to(device)
    assert args.cfg_scale >= 1.0, "In almost all cases, cfg_scale be >= 1.0"
    using_cfg = args.cfg_scale > 1.0

    # Create folder to save samples (the name encodes every option that affects the output):
    model_string_name = args.model.replace("/", "-")
    ckpt_string_name = os.path.basename(args.ckpt).replace(".pt", "") if args.ckpt else "pretrained"
    folder_name = f"ToCa-{model_string_name}-{ckpt_string_name}-size-{args.image_size}-vae-{args.vae}-" \
                  f"cfg-{args.cfg_scale}-seed-{args.global_seed}-step-{args.num_sampling_steps}-num-{args.num_fid_samples}"\
                  f"-{args.cache_type}-{args.fresh_ratio}-{args.ratio_scheduler}-{args.force_fresh}-{args.fresh_threshold}"\
                  f"-softweight-{args.soft_fresh_weight}"
    sample_folder_dir = f"{args.sample_dir}/{folder_name}"
    if rank == 0:
        os.makedirs(sample_folder_dir, exist_ok=True)
        print(f"Saving .png samples at {sample_folder_dir}")
    dist.barrier()

    # Figure out how many samples we need to generate on each GPU and how many iterations we need to run:
    n = args.per_proc_batch_size
    global_batch_size = n * dist.get_world_size()
    # To make things evenly-divisible, we'll sample a bit more than we need and then discard the extra samples:
    total_samples = int(math.ceil(args.num_fid_samples / global_batch_size) * global_batch_size)
    if rank == 0:
        print(f"Total number of images that will be sampled: {total_samples}")
    assert total_samples % dist.get_world_size() == 0, "total_samples must be divisible by world_size"
    samples_needed_this_gpu = int(total_samples // dist.get_world_size())
    assert samples_needed_this_gpu % n == 0, "samples_needed_this_gpu must be divisible by the per-GPU batch size"
    iterations = int(samples_needed_this_gpu // n)
    pbar = range(iterations)
    pbar = tqdm(pbar) if rank == 0 else pbar  # progress bar on rank 0 only
    total = 0

    for _ in pbar:
        # Sample inputs:
        z = torch.randn(n, model.in_channels, latent_size, latent_size, device=device)
        y = torch.randint(0, args.num_classes, (n,), device=device)

        # Setup classifier-free guidance:
        if using_cfg:
            z = torch.cat([z, z], 0)
            # The null (unconditional) label is taken to be index num_classes, which equals
            # the previously hard-coded 1000 for the default setting.
            # NOTE(review): assumes LabelEmbedder reserves index num_classes for the null class - confirm.
            y_null = torch.tensor([args.num_classes] * n, device=device)
            y = torch.cat([y, y_null], 0)
            model_kwargs = dict(y=y, cfg_scale=args.cfg_scale)
            sample_fn = model.forward_with_cfg
        else:
            model_kwargs = dict(y=y)
            sample_fn = model.forward

        model_kwargs['cache_type']        = args.cache_type
        model_kwargs['fresh_ratio']       = args.fresh_ratio
        model_kwargs['force_fresh']       = args.force_fresh
        model_kwargs['fresh_threshold']   = args.fresh_threshold
        model_kwargs['ratio_scheduler']   = args.ratio_scheduler
        model_kwargs['soft_fresh_weight'] = args.soft_fresh_weight
        model_kwargs['test_FLOPs']        = args.test_FLOPs

        # Sample images:
        if args.ddim_sample:
            samples = diffusion.ddim_sample_loop(
                sample_fn, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=False, device=device
            )
        else:
            samples = diffusion.p_sample_loop(
                sample_fn, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=False, device=device,
            )

        if using_cfg:
            samples, _ = samples.chunk(2, dim=0)  # Remove null class samples

        samples = vae.decode(samples / 0.18215).sample
        # Map the decoded images to uint8 pixels in (N, H, W, C) layout on the CPU.
        samples = torch.clamp(127.5 * samples + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()

        # Save samples to disk as individual .png files; the indices of the ranks interleave,
        # offset by the number of images written in earlier iterations.
        for i, sample in enumerate(samples):
            index = i * dist.get_world_size() + rank + total
            Image.fromarray(sample).save(f"{sample_folder_dir}/{index:06d}.png")
        total += global_batch_size

    # Make sure all processes have finished saving their samples before attempting to convert to .npz
    dist.barrier()
    if rank == 0:
        create_npz_from_sample_folder(sample_folder_dir, args.num_fid_samples)
        print("Done.")
    dist.barrier()
    dist.destroy_process_group()


# Command-line entry point: parse the DDP sampling and ToCa cache options, then run main().
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, choices=list(DiT_models.keys()), default="DiT-XL/2")
    parser.add_argument("--vae",  type=str, choices=["ema", "mse"], default="ema")
    parser.add_argument("--sample-dir", type=str, default="/root/autodl-tmp/samples") # Change this to your desired sample directory
    parser.add_argument("--per-proc-batch-size", type=int, default=32)
    parser.add_argument("--num-fid-samples", type=int, default=50_000)
    parser.add_argument("--image-size", type=int, choices=[256, 512], default=256)
    parser.add_argument("--num-classes", type=int, default=1000)
    parser.add_argument("--cfg-scale",  type=float, default=1.5)
    parser.add_argument("--num-sampling-steps", type=int, default=250)
    parser.add_argument("--global-seed", type=int, default=0)
    parser.add_argument("--tf32", action=argparse.BooleanOptionalAction, default=True,
                        help="By default, use TF32 matmuls. This massively accelerates sampling on Ampere GPUs.")
    parser.add_argument("--ckpt", type=str, default=None,
                        help="Optional path to a DiT checkpoint (default: auto-download a pre-trained DiT-XL/2 model).")
    parser.add_argument("--ddim-sample", action="store_true", default=False)
    parser.add_argument("--fresh-ratio", type=float, default=0.07)
    # NOTE(review): the default here is 'random' although the comment says only 'attention' is
    # supported; the default is left unchanged because it is part of the output folder name.
    parser.add_argument("--cache-type", type=str, choices=['random', 'attention','similarity','norm', 'compress','kv-norm'], default='random') # only attention supported currently
    # 'ToCa' is the proposed scheduler in the final version of the paper. It is listed in
    # `choices` because argparse does not validate defaults against `choices`: without it the
    # default value could not be passed explicitly on the command line.
    parser.add_argument("--ratio-scheduler", type=str, default='ToCa', choices=['linear', 'cosine', 'exp', 'constant','linear-mode','layerwise','ToCa-ddpm250', 'ToCa-ddim50', 'ToCa'])
    parser.add_argument("--force-fresh", type=str, choices=['global', 'local'], default='global', # only global is supported currently, local causes bad results
                        help="Force fresh strategy. global: fresh all tokens. local: fresh tokens acheiving fresh step threshold.")
    parser.add_argument("--fresh-threshold", type=int, default=4) # N in the paper
    parser.add_argument("--soft-fresh-weight", type=float, default=0.25, # lambda_3 in the paper
                        help="soft weight for updating the stale tokens by adding extra scores.")
    parser.add_argument("--test-FLOPs", action="store_true", default=False)
    #parser.add_argument("--merge-weight", type=float, default=0.0) # never used in the paper, just for exploration

    args = parser.parse_args()
    main(args)

================================================
FILE: DiT-ToCa/train.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
A minimal training script for DiT using PyTorch DDP.
"""
import torch
# the first flag below was False when we tested this script but True makes A100 training a lot faster:
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torchvision.datasets import ImageFolder
from torchvision import transforms
import numpy as np
from collections import OrderedDict
from PIL import Image
from copy import deepcopy
from glob import glob
from time import time
import argparse
import logging
import os

from models import DiT_models
from diffusion import create_diffusion
from diffusers.models import AutoencoderKL


#################################################################################
#                             Training Helper Functions                         #
#################################################################################

@torch.no_grad()
def update_ema(ema_model, model, decay=0.9999):
    """
    Move every EMA parameter towards the matching live parameter, in place:
    ema <- decay * ema + (1 - decay) * param.

    Parameters are matched by their name in named_parameters().
    """
    ema_lookup = OrderedDict(ema_model.named_parameters())
    step = 1 - decay

    for key, live_param in model.named_parameters():
        # TODO: Consider applying only to params that require_grad to avoid small numerical changes of pos_embed
        ema_lookup[key].mul_(decay).add_(live_param.data, alpha=step)


def requires_grad(model, flag=True):
    """
    Switch gradient tracking on (flag=True) or off (flag=False) for every parameter of `model`.
    """
    for param in model.parameters():
        param.requires_grad = flag


def cleanup():
    """
    End DDP training by destroying the torch.distributed process group.
    """
    dist.destroy_process_group()


def create_logger(logging_dir):
    """
    Return this module's logger.

    On rank 0, logging is configured at INFO level to write to stdout and to
    "{logging_dir}/log.txt". On every other rank the logger only receives a
    NullHandler, and logging_dir is not used.
    """
    on_main_rank = dist.get_rank() == 0
    logger = logging.getLogger(__name__)

    if not on_main_rank:  # dummy logger (does nothing)
        logger.addHandler(logging.NullHandler())
        return logger

    # real logger
    log_file = f"{logging_dir}/log.txt"
    logging.basicConfig(
        level=logging.INFO,
        format='[\033[34m%(asctime)s\033[0m] %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[logging.StreamHandler(), logging.FileHandler(log_file)],
    )
    return logger


def center_crop_arr(pil_image, image_size):
    """
    Center cropping implementation from ADM.
    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126

    Resizes the image so that its shorter side equals image_size, then returns the
    central image_size x image_size crop as a new PIL image.
    """
    # Halve with a box filter for as long as the shorter side is at least twice the target.
    while min(*pil_image.size) >= 2 * image_size:
        halved = tuple(side // 2 for side in pil_image.size)
        pil_image = pil_image.resize(halved, resample=Image.BOX)

    # Final bicubic resize that brings the shorter side to image_size.
    ratio = image_size / min(*pil_image.size)
    target = tuple(round(side * ratio) for side in pil_image.size)
    pil_image = pil_image.resize(target, resample=Image.BICUBIC)

    pixels = np.array(pil_image)
    top = (pixels.shape[0] - image_size) // 2
    left = (pixels.shape[1] - image_size) // 2
    return Image.fromarray(pixels[top: top + image_size, left: left + image_size])


#################################################################################
#                                  Training Loop                                #
#################################################################################

def main(args):
    """
    Trains a new DiT model.

    Initializes an NCCL process group, builds the DiT model together with an
    EMA copy and a pretrained VAE, and runs the training loop over an
    ``ImageFolder`` dataset. The loss is logged every ``args.log_every`` steps
    (averaged over all processes) and rank 0 writes a checkpoint every
    ``args.ckpt_every`` steps.

    Args:
        args: parsed command-line namespace. Fields read here:
            ``global_batch_size``, ``global_seed``, ``results_dir``, ``model``,
            ``image_size``, ``num_classes``, ``vae``, ``data_path``,
            ``num_workers``, ``epochs``, ``log_every`` and ``ckpt_every``.

    Raises:
        AssertionError: if CUDA is unavailable, if the global batch size is
            not divisible by the world size, or if the image size is not
            divisible by 8.
    """
    assert torch.cuda.is_available(), "Training currently requires at least one GPU."

    # Setup DDP:
    dist.init_process_group("nccl")
    assert args.global_batch_size % dist.get_world_size() == 0, f"Batch size must be divisible by world size."
    rank = dist.get_rank()
    # Local CUDA index for this process; differs from `rank` once the job spans several nodes.
    device = rank % torch.cuda.device_count()
    # Distinct seed per process so each rank draws different noise/timesteps.
    seed = args.global_seed * dist.get_world_size() + rank
    torch.manual_seed(seed)
    torch.cuda.set_device(device)
    print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")

    # Setup an experiment folder:
    if rank == 0:
        os.makedirs(args.results_dir, exist_ok=True)  # Make results folder (holds all experiment subfolders)
        experiment_index = len(glob(f"{args.results_dir}/*"))
        model_string_name = args.model.replace("/", "-")  # e.g., DiT-XL/2 --> DiT-XL-2 (for naming folders)
        experiment_dir = f"{args.results_dir}/{experiment_index:03d}-{model_string_name}"  # Create an experiment folder
        checkpoint_dir = f"{experiment_dir}/checkpoints"  # Stores saved model checkpoints
        os.makedirs(checkpoint_dir, exist_ok=True)
        logger = create_logger(experiment_dir)
        logger.info(f"Experiment directory created at {experiment_dir}")
    else:
        # Non-zero ranks never write to disk, so no logging directory is needed.
        logger = create_logger(None)

    # Create model:
    assert args.image_size % 8 == 0, "Image size must be divisible by 8 (for the VAE encoder)."
    latent_size = args.image_size // 8
    model = DiT_models[args.model](
        input_size=latent_size,
        num_classes=args.num_classes
    )
    # Note that parameter initialization is done within the DiT constructor
    ema = deepcopy(model).to(device)  # Create an EMA of the model for use after training
    requires_grad(ema, False)
    # device_ids must name the local CUDA device the module lives on, not the
    # global rank: the two only coincide for single-node jobs.
    model = DDP(model.to(device), device_ids=[device])
    diffusion = create_diffusion(timestep_respacing="")  # default: 1000 steps, linear noise schedule
    vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device)
    logger.info(f"DiT Parameters: {sum(p.numel() for p in model.parameters()):,}")

    # Setup optimizer (we used default Adam betas=(0.9, 0.999) and a constant learning rate of 1e-4 in our paper):
    opt = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0)

    # Setup data:
    transform = transforms.Compose([
        transforms.Lambda(lambda pil_image: center_crop_arr(pil_image, args.image_size)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
    ])
    dataset = ImageFolder(args.data_path, transform=transform)
    sampler = DistributedSampler(
        dataset,
        num_replicas=dist.get_world_size(),
        rank=rank,
        shuffle=True,
        seed=args.global_seed
    )
    # Shuffling is delegated to the sampler, hence shuffle=False on the loader.
    loader = DataLoader(
        dataset,
        batch_size=int(args.global_batch_size // dist.get_world_size()),
        shuffle=False,
        sampler=sampler,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True
    )
    logger.info(f"Dataset contains {len(dataset):,} images ({args.data_path})")

    # Prepare models for training:
    update_ema(ema, model.module, decay=0)  # Ensure EMA is initialized with synced weights
    model.train()  # important! This enables embedding dropout for classifier-free guidance
    ema.eval()  # EMA model should always be in eval mode

    # Variables for monitoring/logging purposes:
    train_steps = 0
    log_steps = 0
    running_loss = 0
    start_time = time()

    logger.info(f"Training for {args.epochs} epochs...")
    for epoch in range(args.epochs):
        sampler.set_epoch(epoch)
        logger.info(f"Beginning epoch {epoch}...")
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            with torch.no_grad():
                # Map input images to latent space + normalize latents:
                x = vae.encode(x).latent_dist.sample().mul_(0.18215)
            # One random diffusion timestep per sample in the batch.
            t = torch.randint(0, diffusion.num_timesteps, (x.shape[0],), device=device)
            model_kwargs = dict(y=y)
            loss_dict = diffusion.training_losses(model, x, t, model_kwargs)
            loss = loss_dict["loss"].mean()
            opt.zero_grad()
            loss.backward()
            opt.step()
            update_ema(ema, model.module)

            # Log loss values:
            running_loss += loss.item()
            log_steps += 1
            train_steps += 1
            if train_steps % args.log_every == 0:
                # Measure training speed:
                torch.cuda.synchronize()
                end_time = time()
                steps_per_sec = log_steps / (end_time - start_time)
                # Reduce loss history over all processes:
                avg_loss = torch.tensor(running_loss / log_steps, device=device)
                dist.all_reduce(avg_loss, op=dist.ReduceOp.SUM)
                avg_loss = avg_loss.item() / dist.get_world_size()
                logger.info(f"(step={train_steps:07d}) Train Loss: {avg_loss:.4f}, Train Steps/Sec: {steps_per_sec:.2f}")
                # Reset monitoring variables:
                running_loss = 0
                log_steps = 0
                start_time = time()

            # Save DiT checkpoint:
            if train_steps % args.ckpt_every == 0 and train_steps > 0:
                if rank == 0:
                    checkpoint = {
                        "model": model.module.state_dict(),
                        "ema": ema.state_dict(),
                        "opt": opt.state_dict(),
                        "args": args
                    }
                    checkpoint_path = f"{checkpoint_dir}/{train_steps:07d}.pt"
                    torch.save(checkpoint, checkpoint_path)
                    logger.info(f"Saved checkpoint to {checkpoint_path}")
                # Every rank waits here until rank 0 has finished writing.
                dist.barrier()

    model.eval()  # important! This disables randomized embedding dropout
    # do any sampling/FID calculation/etc. with ema (or model) in eval mode ...

    logger.info("Done!")
    cleanup()


if __name__ == "__main__":
    # The defaults below train DiT-XL/2 with the hyperparameters used in the paper (except training iters).
    parser = argparse.ArgumentParser()
    # (flag, add_argument keyword options), registered in this order.
    option_specs = [
        ("--data-path", dict(type=str, required=True)),
        ("--results-dir", dict(type=str, default="results")),
        ("--model", dict(type=str, choices=list(DiT_models.keys()), default="DiT-XL/2")),
        ("--image-size", dict(type=int, choices=[256, 512], default=256)),
        ("--num-classes", dict(type=int, default=1000)),
        ("--epochs", dict(type=int, default=1400)),
        ("--global-batch-size", dict(type=int, default=256)),
        ("--global-seed", dict(type=int, default=0)),
        ("--vae", dict(type=str, choices=["ema", "mse"], default="ema")),  # Choice doesn't affect training
        ("--num-workers", dict(type=int, default=4)),
        ("--log-every", dict(type=int, default=100)),
        ("--ckpt-every", dict(type=int, default=50_000)),
    ]
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    main(args)


================================================
FILE: DrawBench200.txt
================================================
A red colored car.
A black colored car.
A pink colored car.
A black colored dog.
A red colored dog.
A blue colored dog.
A green colored banana.
A red colored banana.
A black colored banana.
A white colored sandwich.
A black colored sandwich.
An orange colored sandwich.
A pink colored giraffe.
A yellow colored giraffe.
A brown colored giraffe.
A red car and a white sheep.
A blue bird and a brown bear.
A green apple and a black backpack.
A green cup and a blue cell phone.
A yellow book and a red vase.
A white car and a red sheep.
A brown bird and a blue bear.
A black apple and a green backpack.
A blue cup and a green cell phone.
A red book and a yellow vase.
A horse riding an astronaut.
A pizza cooking an oven.
A bird scaring a scarecrow.
A blue coloured pizza.
Hovering cow abducting aliens.
A panda making latte art.
A shark in the desert.
An elephant under the sea.
Rainbow coloured penguin.
A fish eating a pelican.
One car on the street.
Two cars on the street.
Three cars on the street.
Four cars on the street.
Five cars on the street.
One dog on the street.
Two dogs on the street.
Three dogs on the street.
Four dogs on the street.
Five dogs on the street.
One cat and one dog sitting on the grass.
One cat and two dogs sitting on the grass.
One cat and three dogs sitting on the grass.
Two cats and one dog sitting on the grass.
Two cats and two dogs sitting on the grass.
Two cats and three dogs sitting on the grass.
Three cats and one dog sitting on the grass.
Three cats and two dogs sitting on the grass.
Three cats and three dogs sitting on the grass.
A triangular purple flower pot. A purple flower pot in the shape of a triangle.
A triangular orange picture frame. An orange picture frame in the shape of a triangle.
A triangular pink stop sign. A pink stop sign in the shape of a triangle.
A cube made of denim. A cube with the texture of denim.
A sphere made of kitchen tile. A sphere with the texture of kitchen tile.
A cube made of brick. A cube with the texture of brick.
A collection of nail is sitting on a table.
A single clock is sitting on a table.
A couple of glasses are sitting on a table.
An illustration of a large red elephant sitting on a small blue mouse.
An illustration of a small green elephant standing behind a large red mouse.
A small blue book sitting on a large red book.
"A stack of 3 plates. A blue plate is on the top, sitting on a blue plate. The blue plate is in the middle, sitting on a green plate. The green plate is on the bottom."
"A stack of 3 cubes. A red cube is on the top, sitting on a red cube. The red cube is in the middle, sitting on a green cube. The green cube is on the bottom."
"A stack of 3 books. A green book is on the top, sitting on a red book. The red book is in the middle, sitting on a blue book. The blue book is on the bottom."
"An emoji of a baby panda wearing a red hat, green gloves, red shirt, and green pants."
"An emoji of a baby panda wearing a red hat, blue gloves, green shirt, and blue pants."
A fisheye lens view of a turtle sitting in a forest.
A side view of an owl sitting in a field.
A cross-section view of a brain.
"A vehicle composed of two wheels held in a frame one behind the other, propelled by pedals and steered with handlebars attached to the front wheel."
"A large motor vehicle carrying passengers by road, typically one serving the public on a fixed route and for a fare."
"A small vessel propelled on water by oars, sails, or an engine."
A connection point by which firefighters can tap into a water supply.
"A machine next to a parking space in a street, into which the driver puts money so as to be authorized to park the vehicle for a particular length of time."
"A device consisting of a circular canopy of cloth on a folding metal frame supported by a central rod, used as protection against rain or sometimes sun."
"A separate seat for one person, typically with a back and four legs."
An appliance or compartment which is artificially kept cool and used to store food and drink.
A mechanical or electrical device for measuring time.
"An instrument used for cutting cloth, paper, and other thin material, consisting of two blades laid one on top of the other and fastened in the middle so as to allow them to be opened and closed by a thumb and finger inserted through rings on the end of their handles."
"A large plant-eating domesticated mammal with solid hoofs and a flowing mane and tail, used for riding, racing, and to carry and pull loads."
A long curved fruit which grows in clusters and has soft pulpy flesh and yellow skin when ripe.
"A small domesticated carnivorous mammal with soft fur, a short snout, and retractable claws. It is widely kept as a pet or for catching mice, and many breeds have been developed."
"A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, nonretractable claws, and a barking, howling, or whining voice."
"An organ of soft nervous tissue contained in the skull of vertebrates, functioning as the coordinating center of sensation and intellectual and nervous activity."
"An American multinational technology company that focuses on artificial intelligence, search engine, online advertising, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics."
"A large keyboard musical instrument with a wooden case enclosing a soundboard and metal strings, which are struck by hammers when the keys are depressed. The strings' vibration is stopped by dampers when the keys are released and can be regulated for length and volume by two or three pedals."
"A type of digital currency in which a record of transactions is maintained and new units of currency are generated by the computational solution of mathematical problems, and which operates independently of a central bank."
"A large thick-skinned semiaquatic African mammal, with massive jaws and large tusks."
A machine resembling a human being and able to replicate certain human movements and functions automatically.
Paying for a quarter-sized pizza with a pizza-sized quarter.
An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with no umbrellas.
"A grocery store refrigerator has pint cartons of milk on the top shelf, quart cartons on the middle shelf, and gallon plastic jugs on the bottom shelf."
"In late afternoon in January in New England, a man stands in the shadow of a maple tree."
An elephant is behind a tree. You can see the trunk on one side and the back legs on the other.
A tomato has been put on top of a pumpkin on a kitchen stool. There is a fork sticking into the pumpkin. The scene is viewed from above.
A pear cut into seven pieces arranged in a ring.
"A donkey and an octopus are playing a game. The donkey is holding a rope on one end, the octopus is holding onto the other. The donkey holds the rope in its mouth. A cat is jumping over the rope."
"Supreme Court Justices play a baseball game with the FBI. The FBI is at bat, the justices are on the field."
Abraham Lincoln touches his toes while George Washington does chin-ups. Lincoln is barefoot. Washington is wearing boots.
Tcennis rpacket.
Bzaseball galove.
Rbefraigerator.
Dininrg tablez.
Pafrking metr.
"A smafml vessef epropoeilled on watvewr by ors, sauls, or han engie."
"A sjmall domesticated carnivorious mammnal with sof fuh,y a sthort sout, and retracwtablbe flaws. It iw widexly kept as a pet or for catchitng mic, ad many breeds zhlyde beefn develvoked."
"An instqrumemnt used for cutting cloth, paper, axdz othr thdin mteroial, consamistng of two blades lad one on tvopb of the other and fhastned in tle mixdqdjle so as to bllow them txo be pened and closed by thumb and fitngesr inserted tgrough rings on kthe end oc thei vatndlzes."
"A domesticated carnivvorous mzammal that typicbally hfaas a lons sfnout, an acxujte sense off osmell, noneetractaaln crlaws, anid xbarkring,y howlingu, or whining rvoiche."
"A ldarge keybord msical instroument lwith a woden case enmclosig a qsouvnkboajrd and mfgtal strivgf, which are strucrk b hammrs when the nels are depresdsmed.f lhe strsingsj' vibration ie stopped by damperds when the keys re released and can bce regulavewdd for lengh and vnolume y two or three pedalvs."
A train on top of a surfboard.
A wine glass on top of a dog.
A bicycle on top of a boat.
An umbrella on top of a spoon.
A laptop on top of a teddy bear.
A giraffe underneath a microwave.
A donut underneath a toilet.
A hair drier underneath a sheep.
A tennis racket underneath a traffic light.
A zebra underneath a broccoli.
A banana on the left of an apple.
A couch on the left of a chair.
A car on the left of a bus.
A cat on the left of a dog.
A carrot on the left of a broccoli.
A pizza on the right of a suitcase.
A cat on the right of a tennis racket.
A stop sign on the right of a refrigerator.
A sheep to the right of a wine glass.
A zebra to the right of a fire hydrant.
Acersecomicke.
Jentacular.
Matutinal.
Peristeronic.
Artophagous.
Backlotter.
Octothorpe.
A church with stained glass windows depicting a hamburger and french fries.
"Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna."
"A baby fennec sneezing onto a strawberry, detailed, macro, studio light, droplets, backlit ears."
A photo of a confused grizzly bear in calculus class.
An ancient Egyptian painting depicting an argument over whose turn it is to take out the trash.
"A fluffy baby sloth with a knitted hat trying to figure out a laptop, close up, highly detailed, studio lighting, screen reflecting in its eyes."
"A tiger in a lab coat with a 1980s Miami vibe, turning a well oiled science content machine, digital art."
A 1960s yearbook photo with animals dressed as humans.
Lego Arnold Schwarzenegger.
A yellow and black bus cruising through the rainforest.
A medieval painting of the wifi not working.
"An IT-guy trying to fix hardware of a PC tower is being tangled by the PC cables like Laokoon. Marble, copy after Hellenistic original from ca. 200 BC. Found in the Baths of Trajan, 1506."
"35mm macro shot a kitten licking a baby duck, studio lighting."
McDonalds Church.
Photo of an athlete cat explaining it's latest scandal at a press conference to journalists.
Greek statue of a man tripping over a cat.
"An old photograph of a 1920s airship shaped like a pig, floating over a wheat field."
Photo of a cat singing in a barbershop quartet.
"A painting by Grant Wood of an astronaut couple, american gothic style."
An oil painting portrait of the regal Burger King posing with a Whopper.
"A keyboard made of water, the water is made of light, the light is turned off."
Painting of Mona Lisa but the view is from behind of Mona Lisa.
Hyper-realistic photo of an abandoned industrial site during a storm.
A screenshot of an iOS app for ordering different types of milk.
"A real life photography of super mario, 8k Ultra HD."
Colouring page of large cats climbing the eifel tower in a cyberpunk future.
Photo of a mega Lego space station inside a kid's bedroom.
A spider with a moustache bidding an equally gentlemanly grasshopper a good day during his walk to work.
A photocopy of a photograph of a painting of a sculpture of a giraffe.
"A bridge connecting Europe and North America on the Atlantic Ocean, bird's eye view."
"A maglev train going vertically downward in high speed, New York Times photojournalism."
A magnifying glass over a page of a 1950s batman comic.
"A car playing soccer, digital art."
Darth Vader playing with raccoon in Mars during sunset.
A 1960s poster warning against climate change.
Illustration of a mouse using a mushroom as an umbrella.
A realistic photo of a Pomeranian dressed up like a 1980s professional wrestler with neon green and neon orange face paint and bright green wrestling tights with bright orange boots.
A pyramid made of falafel with a partial solar eclipse in the background.
A storefront with 'Hello World' written on it.
A storefront with 'Diffusion' written on it.
A storefront with 'Text to Image' written on it.
A storefront with 'NeurIPS' written on it.
A storefront with 'Deep Learning' written on it.
A storefront with 'Google Brain Toronto' written on it.
A storefront with 'Google Research Pizza Cafe' written on it.
A sign that says 'Hello World'.
A sign that says 'Diffusion'.
A sign that says 'Text to Image'.
A sign that says 'NeurIPS'.
A sign that says 'Deep Learning'.
A sign that says 'Google Brain Toronto'.
A sign that says 'Google Research Pizza Cafe'.
New York Skyline with 'Hello World' written with fireworks on the sky.
New York Skyline with 'Diffusion' written with fireworks on the sky.
New York Skyline with 'Text to Image' written with fireworks on the sky.
New York Skyline with 'NeurIPS' written with fireworks on the sky.
New York Skyline with 'Deep Learning' written with fireworks on the sky.
New York Skyline with 'Google Brain Toronto' written with fireworks on the sky.
New York Skyline with 'Google Research Pizza Cafe' written with fireworks on the sky.


================================================
FILE: LICENSE
================================================
                    GNU GENERAL PUBLIC LICENSE
                       Version 3, 29 June 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU General Public License is a free, copyleft license for
software and other kinds of works.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works.  By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.  We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors.  You can apply it to
your programs, too.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights.  Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.

  For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received.  You must make sure that they, too, receive
or can get the source code.  And you must show them these terms so they
know their rights.

  Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.

  For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software.  For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.

  Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so.  This is fundamentally incompatible with the aim of
protecting users' freedom to change the software.  The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable.  Therefore, we
have designed this version of the GPL to prohibit the practice for those
products.  If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.

  Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary.  To prevent this, the GPL assures that
patents cannot be used to render the program non-free.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License.  Each licensee is addressed as "you".  "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy.  The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy.  Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies.  Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License.  If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it.  "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form.  A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities.  However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work.  For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met.  This License explicitly affirms your unlimited
permission to run the unmodified Program.  The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work.  This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force.  You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright.  Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below.  Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit.  Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling.  In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage.  For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product.  A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source.  The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information.  But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed.  Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law.  If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it.  (Additional permissions may be written to require their own
removal in certain cases when you modify the work.)  You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or

    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.

  All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10.  If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term.  If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.

  If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.

  Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.

  8. Termination.

  You may not propagate or modify a covered work except as expressly
provided under this License.  Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).

  However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.

  Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.

  Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License.  If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.

  9. Acceptance Not Required for Having Copies.

  You are not required to accept this License in order to receive or
run a copy of the Program.  Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance.  However,
nothing other than this License grants you permission to propagate or
modify any covered work.  These actions infringe copyright if you do
not accept this License.  Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.

  10. Automatic Licensing of Downstream Recipients.

  Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License.  You are not responsible
for enforcing compliance by third parties with this License.

  An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations.  If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.

  You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License.  For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.

  11. Patents.

  A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based.  The
work thus licensed is called the contributor's "contributor version".

  A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version.  For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.

  Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.

  In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement).  To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.

  If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients.  "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.

  If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.

  A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License.  You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.

  Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.

  12. No Surrender of Others' Freedom.

  If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License.  If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all.  For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.

  13. Use with the GNU Affero General Public License.

  Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work.  The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.

  14. Revised Versions of this License.

  The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time.  Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

  Each version is given a distinguishing version number.  If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation.  If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.

  If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.

  Later license versions may give you additional or different
permissions.  However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.

  15. Disclaimer of Warranty.

  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

  16. Limitation of Liability.

  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.

  17. Interpretation of Sections 15 and 16.

  If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.

                     END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

  If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:

    <program>  Copyright (C) <year>  <name of author>
    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.

The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License.  Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".

  You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<https://www.gnu.org/licenses/>.

  The GNU General Public License does not permit incorporating your program
into proprietary programs.  If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library.  If this is what you want to do, use the GNU Lesser General
Public License instead of this License.  But first, please read
<https://www.gnu.org/licenses/why-not-lgpl.html>.


================================================
FILE: Open-Sora/Dockerfile
================================================
# Container image for Open-Sora, built on the hpcaitech PyTorch/CUDA base image.
FROM hpcaitech/pytorch-cuda:2.1.0-12.1.0

# OCI image metadata.
# LABEL pairs must be written key="value" with no whitespace around "=":
# with spaces, Docker parses the instruction in its legacy "LABEL key value"
# form and the stored value starts with a literal "= ".
LABEL org.opencontainers.image.source="https://github.com/hpcaitech/Open-Sora"
# The OCI annotation spec expects an SPDX license expression here.
LABEL org.opencontainers.image.licenses="Apache-2.0"
# Matches the FROM line above; the "library/" namespace is only for Docker
# Official Images, which this base image is not.
LABEL org.opencontainers.image.base.name="docker.io/hpcaitech/pytorch-cuda:2.1.0-12.1.0"

# Set the working directory
WORKDIR /workspace/Open-Sora

# install system library dependencies; the apt package index is removed in the
# same layer so it does not end up in the final image
RUN apt-get update \
    && apt-get install -y ffmpeg libsm6 libxext6 \
    && rm -rf /var/lib/apt/lists/*

# install flash attention
RUN pip install flash-attn --no-build-isolation

# install apex
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git

# install xformers
RUN pip install xformers --index-url https://download.pytorch.org/whl/cu121

# Copy the current directory contents into the container at /workspace/Open-Sora.
# This happens after the third-party installs above (none of which read the
# project sources) so that editing project files does not invalidate the
# cached flash-attn / apex / xformers layers.
COPY . .

# install this project
RUN pip install -v .


================================================
FILE: Open-Sora/LICENSE
================================================
Copyright 2024 HPC-AI Technology Inc. All rights reserved.
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2024 HPC-AI Technology Inc.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

   =========================================================================
   This project is inspired by the listed projects and is subject to the following licenses:

   1. Latte (https://github.com/Vchitect/Latte/blob/main/LICENSE)

   Copyright 2024 Latte

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

   2. PixArt-alpha (https://github.com/PixArt-alpha/PixArt-alpha/blob/master/LICENSE)

   Copyright (C) 2024 PixArt-alpha/PixArt-alpha

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU Affero General Public License as published
   by the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Affero General Public License for more details.

   You should have received a copy of the GNU Affero General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.

   3. dpm-solver (https://github.com/LuChengTHU/dpm-solver/blob/main/LICENSE)

   MIT License

   Copyright (c) 2022 Cheng Lu

   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
   in the Software without restriction, including without limitation the rights
   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   copies of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be included in all
   copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.

   4. DiT (https://github.com/facebookresearch/DiT/blob/main/LICENSE.txt)

   Attribution-NonCommercial 4.0 International

   =======================================================================

   Creative Commons Corporation ("Creative Commons") is not a law firm and
   does not provide legal services or legal advice. Distribution of
   Creative Commons public licenses does not create a lawyer-client or
   other relationship. Creative Commons makes its licenses and related
   information available on an "as-is" basis. Creative Commons gives no
   warranties regarding its licenses, any material licensed under their
   terms and conditions, or any related information. Creative Commons
   disclaims all liability for damages resulting from their use to the
   fullest extent possible.

   Using Creative Commons Public Licenses

   Creative Commons public licenses provide a standard set of terms and
   conditions that creators and other rights holders may use to share
   original works of authorship and other material subject to copyright
   and certain other rights specified in the public license below. The
   following considerations are for informational purposes only, are not
   exhaustive, and do not form part of our licenses.

      Considerations for licensors: Our public licenses are
      intended for use by those authorized to give the public
      permission to use material in ways otherwise restricted by
      copyright and certain other rights. Our licenses are
      irrevocable. Licensors should read and understand the terms
      and conditions of the license they choose before applying it.
      Licensors should also secure all rights necessary before
      applying our licenses so that the public can reuse the
      material as expected. Licensors should clearly mark any
      material not subject to the license. This includes other CC-
      licensed material, or material used under an exception or
      limitation to copyright. More considerations for licensors:
      wiki.creativecommons.org/Considerations_for_licensors

      Considerations for the public: By using one of our public
      licenses, a licensor grants the public permission to use the
      licensed material under specified terms and conditions. If
      the licensor's permission is not necessary for any reason--for
      example, because of any applicable exception or limitation to
      copyright--then that use is not regulated by the license. Our
      licenses grant only permissions under copyright and certain
      other rights that a licensor has authority to grant. Use of
      the licensed material may still be restricted for other
      reasons, including because others have copyright or other
      rights in the material. A licensor may make special requests,
      such as asking that all changes be marked or described.
      Although not required by our licenses, you are encouraged to
      respect those requests where reasonable. More considerations
      for the public:
      wiki.creativecommons.org/Considerations_for_licensees

   =======================================================================

   Creative Commons Attribution-NonCommercial 4.0 International Public
   License

   By exercising the Licensed Rights (defined below), You accept and agree
   to be bound by the terms and conditions of this Creative Commons
   Attribution-NonCommercial 4.0 International Public License ("Public
   License"). To the extent this Public License may be interpreted as a
   contract, You are granted the Licensed Rights in consideration of Your
   acceptance of these terms and conditions, and the Licensor grants You
   such rights in consideration of benefits the Licensor receives from
   making the Licensed Material available under these terms and
   conditions.

   Section 1 -- Definitions.

   a. Adapted Material means material subject to Copyright and Similar
      Rights that is derived from or based upon the Licensed Material
      and in which the Licensed Material is translated, altered,
      arranged, transformed, or otherwise modified in a manner requiring
      permission under the Copyright and Similar Rights held by the
      Licensor. For purposes of this Public License, where the Licensed
      Material is a musical work, performance, or sound recording,
      Adapted Material is always produced where the Licensed Material is
      synched in timed relation with a moving image.

   b. Adapter's License means the license You apply to Your Copyright
      and Similar Rights in Your contributions to Adapted Material in
      accordance with the terms and conditions of this Public License.

   c. Copyright and Similar Rights means copyright and/or similar rights
      closely related to copyright including, without limitation,
      performance, broadcast, sound recording, and Sui Generis Database
      Rights, without regard to how the rights are labeled or
      categorized. For purposes of this Public License, the rights
      specified in Section 2(b)(1)-(2) are not Copyright and Similar
      Rights.
   d. Effective Technological Measures means those measures that, in the
      absence of proper authority, may not be circumvented under laws
      fulfilling obligations under Article 11 of the WIPO Copyright
      Treaty adopted on December 20, 1996, and/or similar international
      agreements.

   e. Exceptions and Limitations means fair use, fair dealing, and/or
      any other exception or limitation to Copyright and Similar Rights
      that applies to Your use of the Licensed Material.

   f. Licensed Material means the artistic or literary work, database,
      or other material to which the Licensor applied this Public
      License.

   g. Licensed Rights means the rights granted to You subject to the
      terms and conditions of this Public License, which are limited to
      all Copyright and Similar Rights that apply to Your use of the
      Licensed Material and that the Licensor has authority to license.

   h. Licensor means the individual(s) or entity(ies) granting rights
      under this Public License.

   i. NonCommercial means not primarily intended for or directed towards
      commercial advantage or monetary compensation. For purposes of
      this Public License, the exchange of the Licensed Material for
      other material subject to Copyright and Similar Rights by digital
      file-sharing or similar means is NonCommercial provided there is
      no payment of monetary compensation in connection with the
      exchange.

   j. Share means to provide material to the public by any means or
      process that requires permission under the Licensed Rights, such
      as reproduction, public display, public performance, distribution,
      dissemination, communication, or importation, and to make material
      available to the public including in ways that members of the
      public may access the material from a place and at a time
      individually chosen by them.

   k. Sui Generis Database Rights means rights other than copyright
      resulting from Directive 96/9/EC of the European Parliament and of
      the Council of 11 March 1996 on the legal protection of databases,
      as amended and/or succeeded, as well as other essentially
      equivalent rights anywhere in the world.

   l. You means the individual or entity exercising the Licensed Rights
      under this Public License. Your has a corresponding meaning.

   Section 2 -- Scope.

   a. License grant.

         1. Subject to the terms and conditions of this Public License,
            the Licensor hereby grants You a worldwide, royalty-free,
            non-sublicensable, non-exclusive, irrevocable license to
            exercise the Licensed Rights in the Licensed Material to:

               a. reproduce and Share the Licensed Material, in whole or
                  in part, for NonCommercial purposes only; and

               b. produce, reproduce, and Share Adapted Material for
                  NonCommercial purposes only.

         2. Exceptions and Limitations. For the avoidance of doubt, where
            Exceptions and Limitations apply to Your use, this Public
            License does not apply, and You do not need to comply with
            its terms and conditions.

         3. Term. The term of this Public License is specified in Section
            6(a).

         4. Media and formats; technical modifications allowed. The
            Licensor authorizes You to exercise the Licensed Rights in
            all media and formats whether now known or hereafter created,
            and to make technical modifications necessary to do so. The
            Licensor waives and/or agrees not to assert any right or
            authority to forbid You from making technical modifications
            necessary to exercise the Licensed Rights, including
            technical modifications necessary to circumvent Effective
            Technological Measures. For purposes of this Public License,
            simply making modifications authorized by this Section 2(a)
            (4) never produces Adapted Material.

         5. Downstream recipients.

               a. Offer from the Licensor -- Licensed Material. Every
                  recipient of the Licensed Material automatically
                  receives an offer from the Licensor to exercise the
                  Licensed Rights under the terms and conditions of this
                  Public License.

               b. No downstream restrictions. You may not offer or impose
                  any additional or different terms or conditions on, or
                  apply any Effective Technological Measures to, the
                  Licensed Material if doing so restricts exercise of the
                  Licensed Rights by any recipient of the Licensed
                  Material.

         6. No endorsement. Nothing in this Public License constitutes or
            may be construed as permission to assert or imply that You
            are, or that Your use of the Licensed Material is, connected
            with, or sponsored, endorsed, or granted official status by,
            the Licensor or others designated to receive attribution as
            provided in Section 3(a)(1)(A)(i).

   b. Other rights.

         1. Moral rights, such as the right of integrity, are not
            licensed under this Public License, nor are publicity,
            privacy, and/or other similar personality rights; however, to
            the extent possible, the Licensor waives and/or agrees not to
            assert any such rights held by the Licensor to the limited
            extent necessary to allow You to exercise the Licensed
            Rights, but not otherwise.

         2. Patent and trademark rights are not licensed under this
            Public License.

         3. To the extent possible, the Licensor waives any right to
            collect royalties from You for the exercise of the Licensed
            Rights, whether directly or through a collecting society
            under any voluntary or waivable statutory or compulsory
            licensing scheme. In all other cases the Licensor expressly
            reserves any right to collect such royalties, including when
            the Licensed Material is used other than for NonCommercial
            purposes.

   Section 3 -- License Conditions.

   Your exercise of the Licensed Rights is expressly made subject to the
   following conditions.

   a. Attribution.

         1. If You Share the Licensed Material (including in modified
            form), You must:

               a. retain the following if it is supplied by the Licensor
                  with the Licensed Material:

                  i. identification of the creator(s) of the Licensed
                     Material and any others designated to receive
                     attribution, in any reasonable manner requested by
                     the Licensor (including by pseudonym if
                     designated);

                  ii. a copyright notice;

                  iii. a notice that refers to this Public License;

                  iv. a notice that refers to the disclaimer of
                     warranties;

                  v. a URI or hyperlink to the Licensed Material to the
                     extent reasonably practicable;

               b. indicate if You modified the Licensed Material and
                  retain an indication of any previous modifications; and

               c. indicate the Licensed Material is licensed under this
                  Public License, and include the text of, or the URI or
                  hyperlink to, this Public License.

         2. You may satisfy the conditions in Section 3(a)(1) in any
            reasonable manner based on the medium, means, and context in
            which You Share the Licensed Material. For example, it may be
            reasonable to satisfy the conditions by providing a URI or
            hyperlink to a resource that includes the required
            information.

         3. If requested by the Licensor, You must remove any of the
            information required by Section 3(a)(1)(A) to the extent
            reasonably practicable.

         4. If You Share Adapted Material You produce, the Adapter's
            License You apply must not prevent recipients of the Adapted
            Material from complying with this Public License.

   Section 4 -- Sui Generis Database Rights.

   Where the Licensed Rights include Sui Generis Database Rights that
   apply to Your use of the Licensed Material:

   a. for the avoidance of doubt, Section 2(a)(1) grants You the right
      to extract, reuse, reproduce, and Share all or a substantial
      portion of the contents of the database for NonCommercial purposes
      only;

   b. if You include all or a substantial portion of the database
      contents in a database in which You have Sui Generis Database
      Rights, then the database in which You have Sui Generis Database
      Rights (but not its individual contents) is Adapted Material; and

   c. You must comply with the conditions in Section 3(a) if You Share
      all or a substantial portion of the contents of the database.

   For the avoidance of doubt, this Section 4 supplements and does not
   replace Your obligations under this Public License where the Licensed
   Rights include other Copyright and Similar Rights.

   Section 5 -- Disclaimer of Warranties and Limitation of Liability.

   a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
      EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
      AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
      ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
      IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
      WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
      PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
      ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
      KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
      ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.

   b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
      TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
      NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
      INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
      COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
      USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
      ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
      DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
      IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.

   c. The disclaimer of warranties and limitation of liability provided
      above shall be interpreted in a manner that, to the extent
      possible, most closely approximates an absolute disclaimer and
      waiver of all liability.

   Section 6 -- Term and Termination.

   a. This Public License applies for the term of the Copyright and
      Similar Rights licensed here. However, if You fail to comply with
      this Public License, then Your rights under this Public License
      terminate automatically.

   b. Where Your right to use the Licensed Material has terminated under
      Section 6(a), it reinstates:

         1. automatically as of the date the violation is cured, provided
            it is cured within 30 days of Your discovery of the
            violation; or

         2. upon express reinstatement by the Licensor.

      For the avoidance of doubt, this Section 6(b) does not affect any
      right the Licensor may have to seek remedies for Your violations
      of this Public License.

   c. For the avoidance of doubt, the Licensor may also offer the
      Licensed Material under separate terms or conditions or stop
      distributing the Licensed Material at any time; however, doing so
      will not terminate this Public License.

   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
      License.

   Section 7 -- Other Terms and Conditions.

   a. The Licensor shall not be bound by any additional or different
      terms or conditions communicated by You unless expressly agreed.

   b. Any arrangements, understandings, or agreements regarding the
      Licensed Material not stated herein are separate from and
      independent of the terms and conditions of this Public License.

   Section 8 -- Interpretation.

   a. For the avoidance of doubt, this Public License does not, and
      shall not be interpreted to, reduce, limit, restrict, or impose
      conditions on any use of the Licensed Material that could lawfully
      be made without permission under this Public License.

   b. To the extent possible, if any provision of this Public License is
      deemed unenforceable, it shall be automatically reformed to the
      minimum extent necessary to make it enforceable. If the provision
      cannot be reformed, it shall be severed from this Public License
      without affecting the enforceability of the remaining terms and
      conditions.

   c. No term or condition of this Public License will be waived and no
      failure to comply consented to unless expressly agreed to by the
      Licensor.

   d. Nothing in this Public License constitutes or may be interpreted
      as a limitation upon, or waiver of, any privileges and immunities
      that apply to the Licensor or You, including from the legal
      processes of any jurisdiction or authority.

   =======================================================================

   Creative Commons is not a party to its public
   licenses. Notwithstanding, Creative Commons may elect to apply one of
   its public licenses to material it publishes and in those instances
   will be considered the “Licensor.” The text of the Creative Commons
   public licenses is dedicated to the public domain under the CC0 Public
   Domain Dedication. Except for the limited purpose of indicating that
   material is shared under a Creative Commons public license or as
   otherwise permitted by the Creative Commons policies published at
   creativecommons.org/policies, Creative Commons does not authorize the
   use of the trademark "Creative Commons" or any other trademark or logo
   of Creative Commons without its prior written consent including,
   without limitation, in connection with any unauthorized modifications
   to any of its public licenses or any other arrangements,
   understandings, or agreements concerning use of licensed material. For
   the avoidance of doubt, this paragraph does not form part of the
   public licenses.

   Creative Commons may be contacted at creativecommons.org.

   5. OpenDiT (https://github.com/NUS-HPC-AI-Lab/OpenDiT/blob/master/LICENSE)

   Copyright OpenDiT

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: Open-Sora/README.md
================================================
<p align="center">
    <img src="./assets/readme/icon.png" width="250"/>
</p>
<div align="center">
    <a href="https://github.com/hpcaitech/Open-Sora/stargazers"><img src="https://img.shields.io/github/stars/hpcaitech/Open-Sora?style=social"></a>
    <a href="https://hpcaitech.github.io/Open-Sora/"><img src="https://img.shields.io/badge/Gallery-View-orange?logo=&amp"></a>
    <a href="https://discord.gg/kZakZzrSUT"><img src="https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&amp"></a>
    <a href="https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-247ipg9fk-KRRYmUl~u2ll2637WRURVA"><img src="https://img.shields.io/badge/Slack-ColossalAI-blueviolet?logo=slack&amp"></a>
    <a href="https://twitter.com/yangyou1991/status/1769411544083996787?s=61&t=jT0Dsx2d-MS5vS9rNM5e5g"><img src="https://img.shields.io/badge/Twitter-Discuss-blue?logo=twitter&amp"></a>
    <a href="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png"><img src="https://img.shields.io/badge/微信-小助手加群-green?logo=wechat&amp"></a>
    <a href="https://hpc-ai.com/blog/open-sora-v1.0"><img src="https://img.shields.io/badge/Open_Sora-Blog-blue"></a>
    <a href="https://huggingface.co/spaces/hpcai-tech/open-sora"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Gradio%20Demo-blue"></a>
</div>

## Open-Sora: Democratizing Efficient Video Production for All

We design and implement **Open-Sora**, an initiative dedicated to **efficiently** producing high-quality video. We hope to make the model,
tools and all details accessible to all. By embracing **open-source** principles,
Open-Sora not only democratizes access to advanced video generation techniques, but also offers a
streamlined and user-friendly platform that simplifies the complexities of video generation.
With Open-Sora, our goal is to foster innovation, creativity, and inclusivity within the field of content creation.

[[中文文档](/docs/zh_CN/README.md)] [[潞晨云](https://cloud.luchentech.com/)|[OpenSora镜像](https://cloud.luchentech.com/doc/docs/image/open-sora/)|[视频教程](https://www.bilibili.com/video/BV1ow4m1e7PX/?vd_source=c6b752764cd36ff0e535a768e35d98d2)]

## 📰 News

- **[2024.06.17]** 🔥 We released **Open-Sora 1.2**, which includes **3D-VAE**, **rectified flow**, and **score condition**. The video quality is greatly improved. [[checkpoints]](#open-sora-12-model-weights) [[report]](/docs/report_03.md) [[blog]](https://hpc-ai.com/blog/open-sora-from-hpc-ai-tech-team-continues-open-source-generate-any-16-second-720p-hd-video-with-one-click-model-weights-ready-to-use)
- **[2024.04.25]** 🤗 We released the [Gradio demo for Open-Sora](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face Spaces.
- **[2024.04.25]** We released **Open-Sora 1.1**, which supports **2s~15s, 144p to 720p, any aspect ratio** text-to-image, **text-to-video, image-to-video, video-to-video, infinite time** generation. In addition, a full video processing pipeline is released. [[checkpoints]](#open-sora-11-model-weights) [[report]](/docs/report_02.md)
- **[2024.03.18]** We released **Open-Sora 1.0**, a fully open-source project for video generation.
  Open-Sora 1.0 supports a full pipeline of video data preprocessing, training with
  <a href="https://github.com/hpcaitech/ColossalAI"><img src="assets/readme/colossal_ai.png" width="8%" ></a>
  acceleration,
  inference, and more. Our model can produce 2s 512x512 videos with only 3 days of training. [[checkpoints]](#open-sora-10-model-weights)
  [[blog]](https://hpc-ai.com/blog/open-sora-v1.0) [[report]](/docs/report_01.md)
- **[2024.03.04]** Open-Sora provides training with 46% cost reduction.
  [[blog]](https://hpc-ai.com/blog/open-sora)

## 🎥 Latest Demo

🔥 You can experience Open-Sora on our [🤗 Gradio application on Hugging Face](https://huggingface.co/spaces/hpcai-tech/open-sora). More samples and corresponding prompts are available in our [Gallery](https://hpcaitech.github.io/Open-Sora/).

| **4s 720×1280**                                                                                                                                      | **4s 720×1280**                                                                                                                                      | **4s 720×1280**                                                                                                                                      |
| ---------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="assets/demo/v1.2/sample_0013.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/7895aab6-ed23-488c-8486-091480c26327) | [<img src="assets/demo/v1.2/sample_1718.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/20f07c7b-182b-4562-bbee-f1df74c86c9a) | [<img src="assets/demo/v1.2/sample_0087.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/3d897e0d-dc21-453a-b911-b3bda838acc2) |
| [<img src="assets/demo/v1.2/sample_0052.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/644bf938-96ce-44aa-b797-b3c0b513d64c) | [<img src="assets/demo/v1.2/sample_1719.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/272d88ac-4b4a-484d-a665-8d07431671d0) | [<img src="assets/demo/v1.2/sample_0002.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/ebbac621-c34e-4bb4-9543-1c34f8989764) |
| [<img src="assets/demo/v1.2/sample_0011.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/a1e3a1a3-4abd-45f5-8df2-6cced69da4ca) | [<img src="assets/demo/v1.2/sample_0004.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/d6ce9c13-28e1-4dff-9644-cc01f5f11926) | [<img src="assets/demo/v1.2/sample_0061.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/561978f8-f1b0-4f4d-ae7b-45bec9001b4a) |

<details>
<summary>OpenSora 1.1 Demo</summary>

| **2s 240×426**                                                                                                                                              | **2s 240×426**                                                                                                                                             |
| ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="assets/demo/sample_16x240x426_9.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [<img src="assets/demo/sora_16x240x426_26.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) |
| [<img src="assets/demo/sora_16x240x426_27.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/f7ce4aaa-528f-40a8-be7a-72e61eaacbbd)  | [<img src="assets/demo/sora_16x240x426_40.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/5d58d71e-1fda-4d90-9ad3-5f2f7b75c6a9) |

| **2s 426×240**                                                                                                                                             | **4s 480×854**                                                                                                                                              |
| ---------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="assets/demo/sora_16x426x240_24.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [<img src="assets/demo/sample_32x480x854_9.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) |

| **16s 320×320**                                                                                                                                        | **16s 224×448**                                                                                                                                        | **2s 426×240**                                                                                                                                            |
| ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="assets/demo/sample_16s_320x320.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/3cab536e-9b43-4b33-8da8-a0f9cf842ff2) | [<img src="assets/demo/sample_16s_224x448.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/9fb0b9e0-c6f4-4935-b29e-4cac10b373c4) | [<img src="assets/demo/sora_16x426x240_3.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) |

</details>

<details>
<summary>OpenSora 1.0 Demo</summary>

| **2s 512×512**                                                                                                                                                                 | **2s 512×512**                                                                                                                                                              | **2s 512×512**                                                                                                                                    |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="assets/readme/sample_0.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/de1963d3-b43b-4e68-a670-bb821ebb6f80)                                 | [<img src="assets/readme/sample_1.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/13f8338f-3d42-4b71-8142-d234fbd746cc)                              | [<img src="assets/readme/sample_2.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/fa6a65a6-e32a-4d64-9a9e-eabb0ebb8c16)    |
| A serene night scene in a forested area. [...] The video is a time-lapse, capturing the transition from day to night, with the lake and forest serving as a constant backdrop. | A soaring drone footage captures the majestic beauty of a coastal cliff, [...] The water gently laps at the rock base and the greenery that clings to the top of the cliff. | The majestic beauty of a waterfall cascading down a cliff into a serene lake. [...] The camera angle provides a bird's eye view of the waterfall. |
| [<img src="assets/readme/sample_3.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/64232f84-1b36-4750-a6c0-3e610fa9aa94)                                 | [<img src="assets/readme/sample_4.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/983a1965-a374-41a7-a76b-c07941a6c1e9)                              | [<img src="assets/readme/sample_5.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/ec10c879-9767-4c31-865f-2e8d6cf11e65)    |
| A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. [...]                                                           | The vibrant beauty of a sunflower field. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. [...]                                            | A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell [...]                   |

Videos are downsampled to `.gif` for display. Click for original videos. Prompts are trimmed for display,
see [here](/assets/texts/t2v_samples.txt) for full prompts.

</details>

## 🔆 New Features/Updates

- 📍 **Open-Sora 1.2** released. Model weights are available [here](#model-weights). See our **[report 1.2](/docs/report_03.md)** for more details.
- ✅ Support rectified flow scheduling.
- ✅ Support more conditioning including fps, aesthetic score, motion strength and camera motion.
- ✅ Trained our 3D-VAE for temporal dimension compression.
- 📍 **Open-Sora 1.1** released. Model weights are available [here](#model-weights). It is trained on **0s~15s, 144p to 720p, various aspect ratios** videos. See our **[report 1.1](/docs/report_02.md)** for more discussions.
- 🔧 **Data processing pipeline v1.1** is released. An automatic [processing pipeline](#data-processing) from raw videos to (text, video clip) pairs is provided, including scene cutting $\rightarrow$ filtering(aesthetic, optical flow, OCR, etc.) $\rightarrow$ captioning $\rightarrow$ managing. With this tool, you can easily build your video dataset.

<details>
<summary>View more</summary>

- ✅ Improved ST-DiT architecture includes rope positional encoding, qk norm, longer text length, etc.
- ✅ Support training with any resolution, aspect ratio, and duration (including images).
- ✅ Support image and video conditioning and video editing, and thus support animating images, connecting videos, etc.
- 📍 **Open-Sora 1.0** released. Model weights are available [here](#model-weights). With only 400K video clips and 200 H800
  days (compared with 152M samples in Stable Video Diffusion), we are able to generate 2s 512×512 videos. See our **[report 1.0](docs/report_01.md)** for more discussions.
- ✅ Three-stage training from an image diffusion model to a video diffusion model. We provide the weights for each
  stage.
- ✅ Support training acceleration including accelerated transformer, faster T5 and VAE, and sequence parallelism.
  Open-Sora improves training speed by **55%** when training on 64x512x512 videos. Details can be found
  at [acceleration.md](docs/acceleration.md).
- 🔧 **Data preprocessing pipeline v1.0**,
  including [downloading](tools/datasets/README.md), [video cutting](tools/scene_cut/README.md),
  and [captioning](tools/caption/README.md) tools. Our data collection plan can be found
  at [datasets.md](docs/datasets.md).
- ✅ We find VQ-VAE from [VideoGPT](https://wilson1yan.github.io/videogpt/index.html) has a low quality and thus adopt a
  better VAE from [Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original). We also find patching in
  the time dimension deteriorates the quality. See our **[report](docs/report_01.md)** for more discussions.
- ✅ We investigate different architectures including DiT, Latte, and our proposed STDiT. Our **STDiT** achieves a better
  trade-off between quality and speed. See our **[report](docs/report_01.md)** for more discussions.
- ✅ Support clip and T5 text conditioning.
- ✅ By viewing images as one-frame videos, our project supports training DiT on both images and videos (e.g., ImageNet &
  UCF101). See [commands.md](docs/commands.md) for more instructions.
- ✅ Support inference with official weights
  from [DiT](https://github.com/facebookresearch/DiT), [Latte](https://github.com/Vchitect/Latte),
  and [PixArt](https://pixart-alpha.github.io/).
- ✅ Refactor the codebase. See [structure.md](docs/structure.md) to learn the project structure and how to use the
  config files.

</details>

### TODO list sorted by priority

<details>
<summary>View more</summary>

- [x] Training Video-VAE and adapt our model to new VAE.
- [x] Scaling model parameters and dataset size.
- [x] Incorporate a better scheduler (rectified flow).
- [x] Evaluation pipeline.
- [x] Complete the data processing pipeline (including dense optical flow, aesthetics scores, text-image similarity, etc.). See [the dataset](/docs/datasets.md) for more information
- [x] Support image and video conditioning.
- [x] Support variable aspect ratios, resolutions, durations.

</details>

## Contents

- [Installation](#installation)
- [Model Weights](#model-weights)
- [Gradio Demo](#gradio-demo)
- [Inference](#inference)
- [Data Processing](#data-processing)
- [Training](#training)
- [Evaluation](#evaluation)
- [VAE Training & Evaluation](#vae-training--evaluation)
- [Contribution](#contribution)
- [Citation](#citation)
- [Acknowledgement](#acknowledgement)

Other useful documents and links are listed below.

- Report: each version is trained from an image base separately (not continuously trained), while a newer version will incorporate the techniques from the previous version.
  - [report 1.2](docs/report_03.md): rectified flow, 3d-VAE, score condition, evaluation, etc.
  - [report 1.1](docs/report_02.md): multi-resolution/length/aspect-ratio, image/video conditioning/editing, data preprocessing, etc.
  - [report 1.0](docs/report_01.md): architecture, captioning, etc.
  - [acceleration.md](docs/acceleration.md)
- Repo structure: [structure.md](docs/structure.md)
- Config file explanation: [config.md](docs/config.md)
- Useful commands: [commands.md](docs/commands.md)
- Data processing pipeline and dataset: [datasets.md](docs/datasets.md)
- Each data processing tool's README: [dataset conventions and management](/tools/datasets/README.md), [scene cutting](/tools/scene_cut/README.md), [scoring](/tools/scoring/README.md), [caption](/tools/caption/README.md)
- Evaluation: [eval/README.md](/eval/README.md)
- Gallery: [gallery](https://hpcaitech.github.io/Open-Sora/)

## Installation

### Install from Source

For CUDA 12.1, you can install the dependencies with the following commands. Otherwise, please refer to [Installation Documentation](docs/installation.md) for more instructions on different CUDA versions, and additional dependencies for data preprocessing, VAE, and model evaluation.

```bash
# create a virtual env and activate (conda as an example)
conda create -n opensora python=3.9
conda activate opensora

# download the repo
git clone https://github.com/hpcaitech/Open-Sora
cd Open-Sora

# install torch, torchvision and xformers
pip install -r requirements/requirements-cu121.txt

# the default installation is for inference only
pip install -v . # for development mode, `pip install -v -e .`
```

(Optional, recommended for fast speed, especially for training) To enable `layernorm_kernel` and `flash_attn`, you need to install `apex` and `flash-attn` with the following commands.

```bash
# install flash attention
# set enable_flash_attn=False in config to disable flash attention
pip install packaging ninja
pip install flash-attn --no-build-isolation

# install apex
# set enable_layernorm_kernel=False in config to disable apex
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
```

### Use Docker

Run the following command to build a docker image from Dockerfile provided.

```bash
docker build -t opensora .
```

Run the following command to start the docker container in interactive mode.

```bash
docker run -ti --gpus all -v .:/workspace/Open-Sora opensora
```

## Model Weights

### Open-Sora 1.2 Model Weights

| Model     | Model Size | Data | #iterations | Batch Size | URL                                                           |
| --------- | ---------- | ---- | ----------- | ---------- | ------------------------------------------------------------- |
| Diffusion | 1.1B       | 30M  | 70k         | Dynamic    | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v3) |
| VAE       | 384M       | 3M   | 1M          | 8          | [:link:](https://huggingface.co/hpcai-tech/OpenSora-VAE-v1.2) |

See our **[report 1.2](docs/report_03.md)** for more information. Weights will be automatically downloaded when you run the inference script.

> For users from mainland China, try `export HF_ENDPOINT=https://hf-mirror.com` to successfully download the weights.

### Open-Sora 1.1 Model Weights

<details>
<summary>View more</summary>

| Resolution         | Model Size | Data                       | #iterations | Batch Size                                        | URL                                                                  |
| ------------------ | ---------- | -------------------------- | ----------- | ------------------------------------------------- | -------------------------------------------------------------------- |
| mainly 144p & 240p | 700M       | 10M videos + 2M images     | 100k        | [dynamic](/configs/opensora-v1-1/train/stage2.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage2) |
| 144p to 720p       | 700M       | 500K HQ videos + 1M images | 4k          | [dynamic](/configs/opensora-v1-1/train/stage3.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage3) |

See our **[report 1.1](docs/report_02.md)** for more information.

:warning: **LIMITATION**: This version contains known issues which we are going to fix in the next version (as we save computation resource for the next release). In addition, the video generation may fail for long duration, and high resolution will have noisy results due to this problem.

</details>

### Open-Sora 1.0 Model Weights

<details>
<summary>View more</summary>

| Resolution | Model Size | Data   | #iterations | Batch Size | GPU days (H800) | URL                                                                                           |
| ---------- | ---------- | ------ | ----------- | ---------- | --------------- | --------------------------------------------------------------------------------------------- |
| 16×512×512 | 700M       | 20K HQ | 20k         | 2×64       | 35              | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x512x512.pth) |
| 16×256×256 | 700M       | 20K HQ | 24k         | 8×64       | 45              | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x256x256.pth) |
| 16×256×256 | 700M       | 366K   | 80k         | 8×64       | 117             | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-16x256x256.pth)    |

Training orders: 16x256x256 $\rightarrow$ 16x256x256 HQ $\rightarrow$ 16x512x512 HQ.

Our model's weight is partially initialized from [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha). The number of
parameters is 724M. More information about training can be found in our **[report](/docs/report_01.md)**. More about
the dataset can be found in [datasets.md](/docs/datasets.md). HQ means high quality.

:warning: **LIMITATION**: Our model is trained on a limited budget. The quality and text alignment are relatively poor.
The model performs badly, especially on generating human beings and cannot follow detailed instructions. We are working
on improving the quality and text alignment.

</details>

## Gradio Demo

🔥 You can experience Open-Sora on our [🤗 Gradio application](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face online.

### Local Deployment

If you want to deploy gradio locally, we have also provided a [Gradio application](./gradio) in this repository. You can use the following command to start an interactive web application to experience video generation with Open-Sora.

```bash
pip install gradio spaces
python gradio/app.py
```

This will launch a Gradio application on your localhost. If you want to know more about the Gradio application, you can refer to the [Gradio README](./gradio/README.md).

To enable prompt enhancement and other language input (e.g., 中文输入), you need to set the `OPENAI_API_KEY` in the environment. Check [OpenAI's documentation](https://platform.openai.com/docs/quickstart) to get your API key.

```bash
export OPENAI_API_KEY=YOUR_API_KEY
```

### Getting Started

In the Gradio application, the basic options are as follows:

![Gradio Demo](assets/readme/gradio_basic.png)

The easiest way to generate a video is to input a text prompt and click the "**Generate video**" button (scroll down if you cannot find). The generated video will be displayed in the right panel. Checking the "**Enhance prompt with GPT4o**" will use GPT-4o to refine the prompt, while "**Random Prompt**" button will generate a random prompt by GPT-4o for you. Due to the OpenAI's API limit, the prompt refinement result has some randomness.

Then, you can choose the **resolution**, **duration**, and **aspect ratio** of the generated video. Different resolutions and video lengths will affect the video generation speed. On an 80G H100 GPU, the generation speed (with `num_sampling_step=30`) and peak memory usage are:

|      | Image   | 2s       | 4s        | 8s        | 16s       |
| ---- | ------- | -------- | --------- | --------- | --------- |
| 360p | 3s, 24G | 18s, 27G | 31s, 27G  | 62s, 28G  | 121s, 33G |
| 480p | 2s, 24G | 29s, 31G | 55s, 30G  | 108s, 32G | 219s, 36G |
| 720p | 6s, 27G | 68s, 41G | 130s, 39G | 260s, 45G | 547s, 67G |

Note that besides text to video, you can also use **image to video generation**. You can upload an image and then click the "**Generate video**" button to generate a video with the image as the first frame. Or you can fill in the text prompt and click the "**Generate image**" button to generate an image with the text prompt, and then click the "**Generate video**" button to generate a video with the image generated with the same model.

![Gradio Demo](assets/readme/gradio_option.png)

Then you can specify more options, including "**Motion Strength**", "**Aesthetic**" and "**Camera Motion**". If "Enable" is not checked or the choice is "none", the information is not passed to the model. Otherwise, the model will generate videos with the specified motion strength, aesthetic score, and camera motion.

For the **aesthetic score**, we recommend using values higher than 6. For **motion strength**, a smaller value will lead to a smoother but less dynamic video, while a larger value will lead to a more dynamic but likely more blurry video. Thus, you can try without it and then adjust it according to the generated video. For the **camera motion**, sometimes the model cannot follow the instruction well, and we are working on improving it.

You can also adjust the "**Sampling steps**", which is directly related to the generation speed as it is the number of denoising steps. A number smaller than 30 usually leads to poor generation results, while a number larger than 100 usually brings no significant improvement. The "**Seed**" is used for reproducibility; you can set it to a fixed number to generate the same video. The "**CFG Scale**" controls how much the model follows the text prompt: a smaller value will lead to a more random video, while a larger value will lead to a more text-following video (7 is recommended).

For more advanced usage, you can refer to [Gradio README](./gradio/README.md#advanced-usage).

## Inference

### Open-Sora 1.2 Command Line Inference

The basic command line inference is as follows:

```bash
# text to video
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
  --num-frames 4s --resolution 720p --aspect-ratio 9:16 \
  --prompt "a beautiful waterfall"
```

You can add more options to the command line to customize the generation.

```bash
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
  --num-frames 4s --resolution 720p --aspect-ratio 9:16 \
  --num-sampling-steps 30 --flow 5 --aes 6.5 \
  --prompt "a beautiful waterfall"
```

For image to video generation and other functionalities, the API is compatible with Open-Sora 1.1. See [here](docs/commands.md) for more instructions.

If your installation does not contain `apex` and `flash-attn`, you need to disable them in the config file, or via the following command.

```bash
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
  --num-frames 4s --resolution 720p \
  --layernorm-kernel False --flash-attn False \
  --prompt "a beautiful waterfall"
```

### Sequence Parallelism Inference

To enable sequence parallelism, you need to use `torchrun` to run the inference script. The following command will run the inference with 2 GPUs.

```bash
# text to video
CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node 2 scripts/inference.py configs/opensora-v1-2/inference/sample.py \
  --num-frames 4s --resolution 720p --aspect-ratio 9:16 \
  --prompt "a beautiful waterfall"
```

:warning: **LIMITATION**: The sequence parallelism is not supported for gradio deployment. For now, the sequence parallelism is only supported when the dimension can be divided by the number of GPUs. Thus, it may fail for some cases. We tested 4 GPUs for 720p and 2 GPUs for 480p.

### GPT-4o Prompt Refinement

We find that GPT-4o can refine the prompt and improve the quality of the generated video. With this feature, you can also use other languages (e.g., Chinese) as the prompt. To enable this feature, you need to prepare your OpenAI API key in the environment:

```bash
export OPENAI_API_KEY=YOUR_API_KEY
```

Then you can run inference with `--llm-refine True` to enable the GPT-4o prompt refinement, or leave the prompt empty to get a random prompt generated by GPT-4o.

```bash
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
  --num-frames 4s --resolution 720p --llm-refine True
```

### Open-Sora 1.1 Command Line Inference

<details>
<summary>View more</summary>

Since Open-Sora 1.1 supports inference with dynamic input size, you can pass the input size as an argument.

```bash
# text to video
python scripts/inference.py configs/opensora-v1-1/inference/sample.py --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854
```

If your installation does not contain `apex` and `flash-attn`, you need to disable them in the config file, or via the following command.

```bash
python scripts/inference.py configs/opensora-v1-1/inference/sample.py --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854 --layernorm-kernel False --flash-attn False
```

See [here](docs/commands.md#inference-with-open-sora-11) for more instructions including text-to-image, image-to-video, video-to-video, and infinite time generation.

</details>

### Open-Sora 1.0 Command Line Inference

<details>
<summary>View more</summary>

We have also provided an offline inference script. Run the following commands to generate samples, the required model weights will be automatically downloaded. To change sampling prompts, modify the txt file passed to `--prompt-path`. See [here](docs/structure.md#inference-config-demos) to customize the configuration.

```bash
# Sample 16x512x512 (20s/sample, 100 time steps, 24 GB memory)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x512x512.py --ckpt-path OpenSora-v1-HQ-16x512x512.pth --prompt-path ./assets/texts/t2v_samples.txt

# Sample 16x256x256 (5s/sample, 100 time steps, 22 GB memory)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path OpenSora-v1-HQ-16x256x256.pth --prompt-path ./assets/texts/t2v_samples.txt

# Sample 64x512x512 (40s/sample, 100 time steps)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt

# Sample 64x512x512 with sequence parallelism (30s/sample, 100 time steps)
# sequence parallelism is enabled automatically when nproc_per_node is larger than 1
torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt
```

The speed is tested on H800 GPUs. For inference with other models, see [here](docs/commands.md) for more instructions.
To lower the memory usage, set a smaller `vae.micro_batch_size` in the config (slightly lower sampling speed).

</details>

## Data Processing

High-quality data is crucial for training good generation models.
To this end, we establish a complete pipeline for data processing, which could seamlessly convert raw videos to high-quality video-text pairs.
The pipeline is shown below. For detailed information, please refer to [data processing](docs/data_processing.md).
Also check out the [datasets](docs/datasets.md) we use.

![Data Processing Pipeline](assets/readme/report_data_pipeline.png)

## Training

### Open-Sora 1.2 Training

The training process is the same as Open-Sora 1.1.

```bash
# one node
torchrun --standalone --nproc_per_node 8 scripts/train.py \
    configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
# multiple nodes
colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py \
    configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```

### Open-Sora 1.1 Training

<details>
<summary>View more</summary>

Once you prepare the data in a `csv` file, run the following commands to launch training on a single node.

```bash
# one node
torchrun --standalone --nproc_per_node 8 scripts/train.py \
    configs/opensora-v1-1/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
# multiple nodes
colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py \
    configs/opensora-v1-1/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```

</details>

### Open-Sora 1.0 Training

<details>
<summary>View more</summary>

Once you prepare the data in a `csv` file, run the following commands to launch training on a single node.

```bash
# 1 GPU, 16x256x256
torchrun --nnodes=1 --nproc_per_node=1 scripts/train.py configs/opensora/train/16x256x256.py --data-path YOUR_CSV_PATH
# 8 GPUs, 64x512x512
torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```

To launch training on multiple nodes, prepare a hostfile according
to [ColossalAI](https://colossalai.org/docs/basics/launch_colossalai/#launch-with-colossal-ai-cli), and run the
following commands.

```bash
colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```

For training other models and advanced usage, see [here](docs/commands.md) for more instructions.

</details>

## Evaluation

We support evaluation based on:

- Validation loss
- [VBench](https://github.com/Vchitect/VBench/tree/master) score
- VBench-i2v score
- Batch generation for human evaluation

All the evaluation code is released in the `eval` folder. Check the [README](/eval/README.md) for more details. Our [report](/docs/report_03.md#evaluation) also provides more information about the evaluation during training. The following table shows that Open-Sora 1.2 greatly improves over Open-Sora 1.0.

| Model          | Total Score | Quality Score | Semantic Score |
| -------------- | ----------- | ------------- | -------------- |
| Open-Sora V1.0 | 75.91%      | 78.81%        | 64.28%         |
| Open-Sora V1.2 | 79.23%      | 80.71%        | 73.30%         |

## VAE Training & Evaluation

We train a VAE pipeline that consists of a spatial VAE followed by a temporal VAE.
For more details, refer to [VAE Documentation](docs/vae.md).
Before you run the following commands, follow our [Installation Documentation](docs/installation.md) to install the required dependencies for VAE and Evaluation.

If you want to train your own VAE, you need to prepare data in the csv following the [data processing](#data-processing) pipeline, then run the following commands.
Note that you need to adjust the number of trained epochs (`epochs`) in the config file accordingly with respect to your own csv data size.

```bash
# stage 1 training, 380k steps, 8 GPUs
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH
# stage 2 training, 260k steps, 8 GPUs
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH
# stage 3 training, 540k steps, 24 GPUs
torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH
```

To evaluate the VAE performance, you need to run VAE inference first to generate the videos, then calculate scores on the generated videos:

```bash
# video generation
torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR
# the original videos will be saved to `YOUR_VIDEO_DIR_ori`
# the reconstructed videos through the pipeline will be saved to `YOUR_VIDEO_DIR_rec`
# the reconstructed videos through the spatial VAE only will be saved to `YOUR_VIDEO_DIR_spatial`

# score calculation
python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
```

## Contribution

Thanks go to these wonderful contributors:

<a href="https://github.com/hpcaitech/Open-Sora/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=hpcaitech/Open-Sora" />
</a>

If you wish to contribute to this project, please refer to the [Contribution Guideline](./CONTRIBUTING.md).

## Acknowledgement

Here we only list a few of the projects. For other works and datasets, please refer to our report.

- [ColossalAI](https://github.com/hpcaitech/ColossalAI): A powerful large model parallel acceleration and optimization
  system.
- [DiT](https://github.com/facebookresearch/DiT): Scalable Diffusion Models with Transformers.
- [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT): An acceleration for DiT training. We adopt valuable acceleration
  strategies for training progress from OpenDiT.
- [PixArt](https://github.com/PixArt-alpha/PixArt-alpha): An open-source DiT-based text-to-image model.
- [Latte](https://github.com/Vchitect/Latte): An attempt to efficiently train DiT for video.
- [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): A powerful image VAE model.
- [CLIP](https://github.com/openai/CLIP): A powerful text-image embedding model.
- [T5](https://github.com/google-research/text-to-text-transfer-transformer): A powerful text encoder.
- [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) and [Yi-34B](https://huggingface.co/01-ai/Yi-34B).
- [PLLaVA](https://github.com/magic-research/PLLaVA): A powerful video captioning model.
- [MiraData](https://github.com/mira-space/MiraData): A large-scale video dataset with long durations and structured caption.

We are grateful for their exceptional work and generous contribution to open source. Special thanks go to the authors of [MiraData](https://github.com/mira-space/MiraData) and [Rectified Flow](https://github.com/gnobitab/RectifiedFlow) for their valuable advice and help. We wish to express gratitude towards AK for sharing this project on social media and Hugging Face for providing free GPU resources for our online Gradio demo.

## Citation

```bibtex
@software{opensora,
  author = {Zangwei Zheng and Xiangyu Peng and Tianji Yang and Chenhui Shen and Shenggui Li and Hongxin Liu and Yukun Zhou and Tianyi Li and Yang You},
  title = {Open-Sora: Democratizing Efficient Video Production for All},
  month = {March},
  year = {2024},
  url = {https://github.com/hpcaitech/Open-Sora}
}
```

## Star History

[![Star History Chart](https://api.star-history.com/svg?repos=hpcaitech/Open-Sora&type=Date)](https://star-history.com/#hpcaitech/Open-Sora&Date)


================================================
FILE: Open-Sora/assets/texts/VBench/all_category.txt
================================================
a black dog wearing halloween costume
spider making a web
bat eating fruits while hanging
a snake crawling on a wooden flooring
a close up video of a dragonfly
macro shot of ladybug on green leaf plant
chameleon eating ant
a bee feeding on nectars
bird nests on a tree captured with moving camera
a squirrel eating nuts
close up video of snail
top view of a hermit crab crawling on a wooden surface
cat licking another cat
red dragonfly perched on green leaf
close up view of a brown caterpillar crawling on green leaf
ants eating dead spider
an eagle on a tree branch
a frog eating an ant
white rabbit near the fence
a gorilla eating a carrot
close up of wolf
a meerkat looking around
a hyena in a zoo
lemur eating grass leaves
an owl being trained by a man
a lizard on a bamboo
brown chicken hunting for its food
video of parrots perched on bird stand
underwater footage of an octopus in a coral reef
a cute pomeranian dog playing with a soccer ball
white fox on rock
close up footage of a horse figurine
giraffe feeding on a tree in a savannah
curious cat sitting and looking around
hummingbird hawk moth flying near pink flowers
close up of a scorpion on a rock
close up on fish in net
koala eating leaves from a branch
a pod of dolphins swirling in the sea catching forage fish
low angle view of a hawk perched on a tree branch
a lion standing on wild grass
deer grazing in the field
elephant herd in a savanna
close up on lobster under water
hedgehog crossing road in forest
a sheep eating yellow flowers from behind a wire fence
twin sisters and a turtle
a pig wallowing in mud
flock of goose eating on the lake water
cow in a field irritated with flies
a close up shot of a fly
cheetah lying on the grass
close up of a lemur
close up shot of a kangaroo itching in the sand
a tortoise covered with algae
turkey in cage
a great blue heron bird in the lakeside
crab with shell in aquarium
a seagull walking on shore
an american crocodile
a tiger walking inside a cage
alligator in the nature
a raccoon climbing a tree
wild rabbit in a green meadow
group of ring tailed lemurs
a clouded leopard on a tree branch
duck grooming its feathers
an african penguin walking on a beach
a video of a peacock
close up shot of a wild bear
baby rhino plays with mom
porcupine climbs tree branches
close up of a natterjack toad on a rock
a sleeping orangutan
mother whale swimming with babies
a bear wearing red jersey
pink jellyfish swimming underwater in a blue sea
beautiful clown fish swimming
animation of disposable objects shaped as a whale
paper cut out of a pair of hands a whale and a heart
vertical video of camel roaming in the field during daytime
a still video of mosquito biting human
a curious sloth hanging from a tree branch
a plastic flamingo bird stumbles from the wind
a wolf in its natural habitat
a monkey sitting in the stone and scratching his head
bat hanging upside down
a red panda eating leaves
snake on ground
a harbour seal swimming near the shore
shark swimming in the sea
otter on branch while eating
goat standing over a rock
a troop of monkey on top of a mountain
a zebra eating grass on the field
a colorful butterfly perching on a bud
a snail crawling on a leaf
zookeeper showering a baby elephant
a beetle emerging from the sand
a nine banded armadillo searching for food
an apartment building with balcony
asian garden and medieval castle
illuminated tower in berlin
a wooden house overseeing the lake
a crowd of people in a plaza in front of a government building
a church interior
jewish friends posing with hanukkah menorah in a cabin house
a destroyed building after a missile attack in ukraine
abandoned building in the woods
drone video of an abandoned school building in pripyat ukraine
elegant university building
architecture and designs of buildings in central london
a pancake tower with chocolate syrup and strawberries on top
an ancient white building
friends hanging out at a coffee house
house front door with christmas decorations
city night dark building
a bird house hanging on a tree branch
sacred sculpture in a temple
high angle shot of a clock tower
modern wooden house interior
the interior of an abandoned building
opera house overlooking sea
a concrete structure near the green trees
dome like building in scotland
low angle shot of a building
tower on hill
a miniature house
eiffel tower from the seine river
low angle footage of an apartment building
island with pier and antique building
asian historic architecture
drone footage of a beautiful mansion
mosque in the middle east
building a tent and hammock in the forest camping site
top view of a high rise building
house covered in snow
skyscraper at night
house in village
a casino with people outside the building
silhouette of a building
a woman climbing a tree house
drone view of house near lake during golden hour
an under construction concrete house
a watch tower by the sea
exterior view of arabic style building
video of a hotel building
red paper lantern decorations hanging outside a building
house on seashore
aerial footage of the palace of culture and science building in warsaw poland
aerial video of stuttgart tv tower in germany
aerial view of the highway and building in a city
drone shot of a skyscraper san francisco california usa
waterfall and house
view of the sky through a building
drone footage of a house on top of the mountain
abandoned house in the nature
clouds hovering over a mansion
light house on the ocean
buddhist temple at sunrise
people walking by a graveyard near a mosque at sunset
view of lifeguard tower on the beach
scenic view of a house in the mountains
the landscape in front of a government building
aerial footage of a building and its surrounding landscape in winter
time lapse of a cloudy sky behind a transmission tower
blue ocean near the brown castle
fog over temple
house in countryside top view
building under construction
turkish flag waving on old tower
the georgian building
close up shot of a steel structure
the atrium and interior design of a multi floor building
city view reflected on a glass building
aerial view of a luxurious house with pool
an unpaved road leading to the house
drone footage of a lookout tower in mountain landscape
wind turbines on hill behind building
time lapse footage of the sun light in front of a small house porch
a building built with lots of stairways
overcast over house on seashore
the view of the sydney opera house from the other side of the harbor
candle on a jar and a house figurine on a surface
video of a farm and house
a dilapidated building made of bricks
a view of a unique building from a moving vehicle
aerial footage of a tall building in cambodia
push in shot of a huge house
a beach house built over a seawall protected from the sea waves
exotic house surrounded by trees
drone video of a house surrounded by tropical vegetation
drone footage of a building beside a pond
observation tower on hill in forest
a tree house in the woods
a video of vessel structure during daytime
fire in front of illuminated building at night
a footage of a wooden house on a wheat field
tilt shot of a solar panel below a light tower
water tower on the desert
freshly baked finger looking cookies
video of fake blood in wine glass
halloween food art
a person slicing a vegetable
a serving of pumpkin dish in a plate
close up view of green leafy vegetable
a birthday cake in the plate
video of a slice papaya fruit
a muffin with a burning candle and a love sign by a ceramic mug
a jack o lantern designed cookie
baked bread with chocolate
a broccoli soup on wooden table
a freshly brewed coffee on a pink mug
grabbing sourdough neapolitan style pizza slices
person cooking mushrooms in frying pan
rice grains placed on a reusable cloth bag
slices of kiwi fruit
grilling a steak on a pan grill
close up of bread popping out of a toaster
man eating noodle
preparing a cocktail drink
close up pasta with bacon on plate
milk and cinnamon rolls
boy getting a dumpling using chopsticks
a mother preparing food with her kids
man using his phone while eating
fresh salmon salad on a plate
cutting cucumbers into long thin slices as ingredient for sushi roll
a steaming cup of tea by the window
a glass filled with beer
a kid eating popcorn while watching tv
close up shot of fried fish on the plate
a man eating a donut
person making a vegetarian dish
spreading cheese on bagel
close up view of a man drinking red wine
a couple having breakfast in a restaurant
a student eating her sandwich
girl peeling a banana
red rice in a small bowl
pancake with blueberry on the top
green apple fruit on white wooden table
a man eating a taco by the bar
making of a burrito
squeezing lemon into salad
a chef cutting sushi rolls
video of a delicious dessert
deep frying a crab on a wok in high fire
close up video of a orange juice
video of a cooked chicken breast
woman holding a pineapple
a woman eating a bar of chocolate
decorating christmas cookie
squeezing a slice of fruit
tuna sashimi on a plate
a strawberry fruit mixed in an alcoholic drink
preparing hot dogs in a grill
a woman cutting a tomato
an orange fruit cut in half
a coconut fruit with drinking straw
woman holding a dragon fruit
a woman pouring hot beverage on a cup
waffles with whipped cream and fruit
focus shot of an insect at the bottom of a fruit
preparing a healthy broccoli dish
man eating snack at picnic
close up video of a grilled shrimp skewer
a woman mixing a smoothie drinks
close up video of woman having a bite of jelly
businessman drinking whiskey at the bar counter of a hotel lounge
cutting an onion with a knife over a wooden chopping board
fresh lemonade in bottles
grilling a meat on a charcoal grill
people enjoying asian cuisine
close up footage of a hot dish on a clay pot
pork ribs dish
waffle with strawberry and syrup for breakfast
tofu dish with rose garnish
uncooked pork meat
egg yolk being dumped over gourmet dish
tasty brunch dish close up
little boy pretending to eat the watermelon
slicing roasted beef
close up of a chef adding teriyaki sauce to a dish
flat lay mexican dish
a person placing an octopus dish on a marble surface
close up of tea leaves brewing in a glass kettle
adding fresh herbs to soup dish
a scoop of roasted coffee beans
fresh dim sum set up on a bamboo steam tray for cooking
a girl putting ketchup on food at the kitchen
cooking on electric stove
a woman with a slice of a pie
grapes and wine on a wooden board
man taking picture of his food
hamburger and fries on restaurant table
close up video of japanese food
a cracker sandwich with cheese filling for snack
barista preparing matcha tea
close up of onion rings being deep fried
people carving a pumpkin
people sitting on a sofa
a man with a muertos face painting
man walking in the dark
men in front of their computer editing photos
men loading christmas tree on tow truck
woman washing the dishes
woman adding honey to the cinnamon rolls
two women kissing and smiling
three women looking at watercolor paintings
a family wearing paper bag masks
a family posing for the camera
a boy covering a rose flower with a dome glass
boy sitting on grass petting a dog
a girl in her tennis sportswear
a girl coloring the cardboard
silhouette of the couple during sunset
couple dancing with body paint
a child playing with water
a woman with her child sitting on a couch in the living room
a group of friend place doing hand gestures of agreement
friends having a group selfie
friends talking while on the basketball court
group of people protesting
a group of campers with a cute dog
a group of photographers taking pictures at the north western gardens in llandudno north wales
a group of students laughing and talking
a group of martial artist warming up
a person playing golf
a person walking on a wet wooden bridge
person doing a leg exercise
ice hockey athlete on rink
a young athlete training in swimming
chess player dusting a chessboard
baseball player holding his bat
a bearded man putting a vinyl record on a vinyl player
an orchestra finishes a performance
people applauding the performance of the kids
band performance at the recording studio
father and his children playing jenga game
people playing a board game
man playing a video game
a man video recording the movie in theater
man and a woman eating while watching a movie
movie crew talking together
a director explaining the movie scene
man and woman listening to music on car
man playing music
couple dancing slow dance with sun glare
a ballerina practicing in the dance studio
father and son holding hands
father and daughter talking together
a mother and her kids engaged in a video call
mother and daughter reading a book together
a mother teaching her daughter playing a violin
kid in a halloween costume
a happy kid playing the ukulele
a chef slicing a cucumber
chef wearing his gloves properly
brother and sister using hammock
girl applying sunblock to her brother
a girl pushing the chair while her sister is on the chair
colleagues talking in office building
fighter practice kicking
a woman fighter in her cosplay costume
an engineer holding blueprints while talking with her colleague
a young woman looking at vr controllers with her friend
workmates teasing a colleague in the work
a male police officer talking on the radio
teacher holding a marker while talking
teacher writing on her notebook
a young student attending her online classes
a student showing his classmates his wand
a male vendor selling fruits
a shirtless male climber
a sound engineer listening to music
female talking to a psychiatrist in a therapy session
young female activist posing with flag
a man in a hoodie and woman with a red bandana talking to each other and smiling
a medium close up of women wearing kimonos
a male interviewer listening to a person talking
a social worker having a conversation with the foster parents
a farm worker harvesting onions
worker packing street food
worker and client at barber shop
elderly man lifting kettlebell
mom assisting son in riding a bicycle
dad watching her daughter eat
young guy with vr headset
pregnant woman exercising with trainer
a fortune teller talking to a client
wizard doing a ritual on a woman
a footage of an actor on a movie scene
a man holding a best actor trophy
a singer of a music band
a young singer performing on stage
young dancer practicing at home
seller showing room to a couple
cab driver talking to passenger
a policeman talking to the car driver
kids celebrating halloween at home
little boy helping mother in kitchen
video of a indoor green plant
a girl arranges a christmas garland hanging by the kitchen cabinet
candle burning in dark room
couple having fun and goofing around the bedroom
girls jumping up and down in the bedroom
woman and man in pajamas working from home
a muslim family sitting and talking in the living room
family enjoying snack time while sitting in the living room
woman holding an animal puppet and a little girl playing together at the living room
kids playing in the indoor tent
young people celebrating new year at the office
a woman writing on the sticky note in the office
a woman exercising at home over a yoga mat
girls preparing easter decorations at home
dog on floor in room
turning on a fluorescent light inside a room
colleagues talking to each other near the office windows
a woman recording herself while exercising at home
music room
different kind of tools kept in a utility room
sofa beds and other furniture
a girl finding her brother reading a book in the bedroom
an elegant ceramic plant pot and hanging plant on indoor
furniture inside a bedroom
interior design of the bar section
living room with party decoration
firewood burning in dark room
a young woman playing the ukulele at home
woman painting at home
a woman in a locker room
video of a bathroom interior
the interior design of a jewish synagogue
a woman in protective suit disinfecting the kitchen
modern minimalist home interior
modern interior design of a coffee shop
person arranging minimalist furniture
aerial shot of interior of the warehouse
a room of a manufacturing facility
interior of catholic
interior design of a restaurant
a female model in a changing room looking herself in mirror
men walking in the office hallway
people sitting in a conference room
the interior design of a shopping mall
chandeliers in room
lucerne railway station interior
a female fencer posing in a foggy room
a toolbox and a paint roller beside a huge package in a room
bedroom in hotel
a woman lying in the operating room
a chef holding and checking kitchen utensils
a couple singing in the shower room together
a woman cleaning mess in the living room
an empty meeting room with natural light
person dancing in a dark room
close up on blood in hospital room
a couple resting on their home floor
a young female staff at courier office
a man entering the gym locker room
a bored man sitting by the tv at home
woman dancing in indoor garden
rubble in the interior of an abandoned house
indoor farm in a greenhouse
man doing handstand in indoor garden
an abandoned indoor swimming pool
home decorations on top of a cabinet
graffiti art on the interior walls of an abandoned mansion
indoor wall climbing activity
sunlight inside a room
teenage girl roller skating at indoor rink
home deco with lighted
baby in the shower room
men enjoying office christmas party
a bedroom with a brick wall
actors prepping in the dressing room
kids playing at an indoor playground
a person sanitizing an office space using smoke machine
mother and daughter choosing clothes at home
a woman sitting by the indoor fire pit
man standing on the corner of the room while looking around
person assembling furniture
a family stacking cardboard boxes in a room
family having fun in the dining room
person disinfecting a room
a woman washing strawberries in the kitchen sink
modern office waiting room
close up view of a person slicing with a kitchen knife
boiling coffee on a stove in the kitchen
modern equipment used in a home studio
interior of a recording studio
people working in a call center office
band performing at a home concert
a group of people watching a concert in a room
people packing their furniture
young employees in office holding a certificate
a criminal inside a dark room handcuffed in a table
couple browsing and looking for furniture in the store
workspace at home
video of a indoor green plant
close up view of a plant
close up shot of a burning plant
plucking leaves from plant
a plant on gold pot with glass lid
a branch of a tree and a plant
a leafless tree
close up shot of fern leaf
close up video of strawberry plant
plant with blooming flowers
close up video of flower petals
watering yellow plant
beautiful flower decoration
cannabis flower in a jar
a footage of the tree leaves
a red leaf plant
close up view of a white christmas tree
snow pouring on a tree
close up shot of white flowers on the tree
leaves in the trees daytime
a dead tree lying on a grass field
tree branches in a flowing river
purple flowers with leaves
a coconut tree by the house
close up on flower in winter
bamboo leaves backlit by the sun
close up video of a wet flower
a man putting a flower in a box
dropping flower petals on a wooden bowl
a close up shot of gypsophila flower
variety of succulent plants on a garden
variety of trees and plants in a botanical garden
forest of deciduous trees
a stack of dried leaves burning in a forest
tall forest trees on a misty morning
close up view of dewdrops on a leaf
close up view of white petaled flower
removing a pineapple leaf
a dragonfly perched on a leaf
butterfly pollinating flower
person visiting and checking a corn plant
woman picking beans from a plant
woman plucking mint leaves
single tree in the middle of farmland
a plant on a soil
drone footage of a tree on farm field
a tractor harvesting lavender flower
people putting christmas ornaments on a christmas tree
jack o lantern hanging on a tree
tree with halloween decoration
flower field near the waterfall
truck carrying the tree logs
raindrops falling on leaves
shot of a palm tree swaying with the wind
squirrels on a tree branch
person holding a flower
a fallen tree trunk
tree with golden leaves
cherry tree
wind blows through leaves of the tree in autumn
a leaf on a glass
the long trunks of tall trees in the forest
trees in the forest during sunny day
close up video of tree bark
reflection of tree branches
trunks of many trees in the forest
tree leaves providing shades from the sun
leaves swaying in the wind
low angle shot of baobab tree
bare trees in forest
a plant surrounded by fallen leaves
a couple preparing food and pruning a plant
a man cutting a tree bark
oranges on a tree branch
plant connected on the stones
video of a sawmill machine cutting tree log
women drying flower petals
macro view of an agave plant
a video of a person tying a plant on a string
green moss in forest nature
coconut tree near sea under blue sky
the canopy of a coconut tree
a man leaning on a tree at the beach
a full grown plant on a pot
candle wax dripping on flower petals
close up of leaves in autumn
a woman opening a book with a flower inside
a man holding leaves looking at the camera
a shadow of a swaying plant
a tree and concrete structure under a blue and cloudy sky
trimming excess leaves on a potted plant
the changing color of the tree leaves during autumn season
a gooseberry tree swayed by the wind
forest trees and a medieval castle at sunset
woman cut down tree
an old oak tree in a park across the street from a hotel
wild flowers growing in a forest ground
a mossy fountain and green plants in a botanical garden
mansion with beautiful garden
ants on a dragon fruit flower
scenery of desert landscape
landscape agriculture farm tractor
burning slash piles in the forest
graveyard at sunset
view of a jack o lantern with pumpkins in a smoky garden
sun view through a spider web
view of the sea from an abandoned building
close up view of a full moon
close up view of lighted candles
close up view of swaying white flowers and leaves
scenery of a relaxing beach
selective focus video of grass during sunny day
aerial view of brown dry landscape
fireworks display in the sky at night
a bonfire near river
mountain view
waterfalls in between mountain
a picturesque view of nature
exotic view of a riverfront city
tall trees in the forest under the clear sky
snow on branches in forest
stream in the nature
an airplane flying above the sea of clouds
scenic video of sunset
view of houses with bush fence under a blue and cloudy sky
scenic view from wooden pathway
scenic view of a tropical beach
drone footage of waves crashing on beach shore
a scenic view of the golden hour at norway
time lapse video of foggy mountain forest
brown mountain during fall season
video of ocean during daytime
boat sailing in the ocean
top view of yachts
beautiful scenery of flowing waterfalls and river
wild ducks paddling on the lake surface
a relaxing scenery of beach view under cloudy sky
natural rock formations on beach under cloudy sky
a palm tree against blue sky
video of sailboat on a lake during sunset
aerial view of snow piles
time lapse of a sunset sky in the countryside
aerial footage of a statue
time lapse video of a farm during sunset
clouds formation in the sky at sunset
aerial shot of a village
drone shot of a beautiful sunrise at the mountains
time lapse video of foggy morning during sunrise
sun shining between tree leaves at sunrise
video of lake during dawn
vehicles traveling on roadway under cloudy sky
view of golden domed church
a monument under the blue sky
firecrackers in the sky
view of fruit signage in the farm
a dark clouds over shadowing the full moon
view of the amazon river
a big river swamp in a dense forest
a blooming cherry blossom tree under a blue sky with white clouds
a river waterfall cascading down the plunge basin
flooded landscape with palm trees
a blurry waterfall background
waterfall in the mountains
aerial footage of a city at night
pond by small waterfall in forest
aerial view of farmlands at the bay of lake
rice terraces in the countryside
a highway built across an agricultural area in the countryside
gloomy morning in the countryside
drone shot of an abandoned coliseum on a snowy mountain top
boat sailing in the middle of ocean
drone shot of the grass field
natural landscape of mountain and sea with islets developed into a community
aerial view of zaporizhia in ukraine
aerial footage of a herd
an aerial footage of a red sky
grass and plants growing in the remains of an abandoned house
view from hill on city
aerial view on orthodox church
aerial view of bay in croatia
a footage of a frozen river
overlooking view of a city at daylight
view outside the cemetery
clear sky with moon over meadow
clouds over railway
aerial footage of moving vehicles on the road at night
aerial view of town and park
top view of skyscrapers
top view of the empire state building in manhattan
top view of the central park in new york city
sheep running in a grass field
clear sky over factory
smoke and fire in birds eye view
view of a pathway with snow melting on its side
ferry under bridge on river near city in malaysia
mountain slopes covered in green vegetation
panoramic view of a town surrounded by snow covered mountains
aerial view of a palace
top view of vehicles driving on the intersection
a graveyard by a church in a mountain landscape
a modern railway station in malaysia use for public transportation
drone footage of amsterdam metro station
train arriving at a station
red vehicle driving on field
close up view of flashing emergency vehicle lighting
vehicle with fertilizer on field
a highway built across an agricultural area in the countryside
drone footage of motorcycles driving on country road between agricultural fields
a road in the woods under fog
footage of a car driving through a wheat field
vehicle stops for an ambulance passing through city traffic
emergency vehicle parked outside the casino
zombies attacking a woman and a boy inside a car
woman seating inside the car while chewing
video of passengers riding a double decker bus during night
traffic in london street at night
elderly couple checking engine of automobile
a green vintage automobile with an open hood parked in a parking area
close up of a prototype automobile with exposed engine on the back seat of the car
aerial view of road in forest
train departing from station
aerial view of a train passing by a bridge
video of a train tracks
video footage of a subway
video of blinking traffic lights
couple walking out on the subway
time lapse of a subway tunnel
monitor board inside the subway
metro train at night
zoom in video of a tram passing by city
young man using laptop in the tram
man reading a book at bus stop
close up shot of a moving taxi
night travel in london street on a public bus
red bus in a rainy city
flow of traffic in the city
close up shot of a yellow taxi turning left
two women calling for a taxi
drone view of an illuminated bridge across a river
policeman in police car talking on radio
airplane taking off at night
view through window in airplane
an airplane in the sky
helicopter landing on the street
a pilot getting out of a helicopter
a helicopter flying under blue sky
boat sailing in the middle of the ocean
girl playing with a toy boat
silhouette of a boat on sea during golden hour
a boat travelling around the lake
road on mountain ridge
ship sailing on danube river
slow motion video of a ship water trail in the sea
drone footage of a wreck ship on shore
a white yacht traveling on a river and passing under the bridge
female teenagers drinking champagne in the yacht
video of yacht sailing in the ocean
red combine harvester on road on field
a woman sitting on a bicycle while using a mobile phone
a woman sitting on a motorcycle looking around
three teenagers fixing a bicycle
a woman in a halloween costume posing on a motorcycle
a parked motorcycle on a foggy roadside
cable car near sea shore
a truck travelling in the road
footage of the road without any traffic
a road sign
love padlocks on a bridge
camera moving at highway construction site
vehicles driving on highway
a motorbike on highway at timelapse mode
point of view of a car driving through a tunnel
time lapse of heavy traffic on an avenue
ferry boat on city canal
black vintage car in museum
a zigzag road across a forest
people crossing the road
video of a kayak boat in a river
a person paddling a wooden boat in a lake
a car charging in the parking area
cars parked on the road
footage of the street with people and vehicle passing by in the rain
traffic on busy city street
a woman getting out of the car to walk with their dog
yacht sailing through the ocean
people in queue to military ship
man wearing motorcycle helmet looking at the camera
empty seats in the bus
empty boat on the water
cargo train traveling on the mountainside
cruise ship in harbor
counting down at traffic lights
pressing the car ignition
fire truck driving on the road
a footage of a broken bicycle
drone footage of an ambulance on the road
slow motion footage of a racing car
ship sailing on sea against sunset
big cargo ship passing on the shore
back view of man and woman walking on unpaved road


================================================
FILE: Open-Sora/assets/texts/VBench/all_dimension.txt
================================================
In a still frame, a stop sign
a toilet, frozen in time
a laptop, frozen in time
A tranquil tableau of alley
A tranquil tableau of bar
A tranquil tableau of barn
A tranquil tableau of bathroom
A tranquil tableau of bedroom
A tranquil tableau of cliff
In a still frame, courtyard
In a still frame, gas station
A tranquil tableau of house
indoor gymnasium, frozen in time
A tranquil tableau of indoor library
A tranquil tableau of kitchen
A tranquil tableau of palace
In a still frame, parking lot
In a still frame, phone booth
A tranquil tableau of restaurant
A tranquil tableau of tower
A tranquil tableau of a bowl
A tranquil tableau of an apple
A tranquil tableau of a bench
A tranquil tableau of a bed
A tranquil tableau of a chair
A tranquil tableau of a cup
A tranquil tableau of a dining table
In a still frame, a pear
A tranquil tableau of a bunch of grapes
A tranquil tableau of a bowl on the kitchen counter
A tranquil tableau of a beautiful, handcrafted ceramic bowl
A tranquil tableau of an antique bowl
A tranquil tableau of an exquisite mahogany dining table
A tranquil tableau of a wooden bench in the park
A tranquil tableau of a beautiful wrought-iron bench surrounded by blooming flowers
In a still frame, a park bench with a view of the lake
A tranquil tableau of a vintage rocking chair was placed on the porch
A tranquil tableau of the jail cell was small and dimly lit, with cold, steel bars
A tranquil tableau of the phone booth was tucked away in a quiet alley
a dilapidated phone booth stood as a relic of a bygone era on the sidewalk, frozen in time
A tranquil tableau of the old red barn stood weathered and iconic against the backdrop of the countryside
A tranquil tableau of a picturesque barn was painted a warm shade of red and nestled in a picturesque meadow
In a still frame, within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm trees and a motionless, glassy pool of water
In a still frame, the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape
In a still frame, the Temple of Hephaestus, with its timeless Doric grace, stands stoically against the backdrop of a quiet Athens
In a still frame, the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels
A tranquil tableau of the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility
In a still frame, in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity
static view on a desert scene with an oasis, palm trees, and a clear, calm pool of water
A tranquil tableau of an ornate Victorian streetlamp standing on a cobblestone street corner, illuminating the empty night
A tranquil tableau of a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water
In a still frame, a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square
In a still frame, a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner
A tranquil tableau of the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy
A tranquil tableau of in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins
A tranquil tableau of in the desolate beauty of the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes
A tranquil tableau of at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved façades
In a still frame, amidst the cobblestone streets, an Art Nouveau lamppost stood tall
A tranquil tableau of in the quaint village square, a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels
A tranquil tableau of the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour
In a still frame, in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting
In a still frame, in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light
A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon
A tranquil tableau of in the Arizona desert, a massive stone bridge arched across a rugged canyon
A tranquil tableau of in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the otherwise simple and elegant space
In a still frame, amidst the hushed ambiance of the traditional tea room, a meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk
In a still frame, nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier
A tranquil tableau of a country estate's library featured elegant wooden shelves
A tranquil tableau of beneath the shade of a solitary oak tree, an old wooden park bench sat patiently
A tranquil tableau of beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm
A tranquil tableau of in the Zen garden, a perfectly raked gravel path led to a serene rock garden
In a still frame, a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface
In a still frame, within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation
A tranquil tableau of a peaceful orchid garden showcased a variety of delicate blooms
A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time
a bird and a cat
a cat and a dog
a dog and a horse
a horse and a sheep
a sheep and a cow
a cow and an elephant
an elephant and a bear
a bear and a zebra
a zebra and a giraffe
a giraffe and a bird
a chair and a couch
a couch and a potted plant
a potted plant and a tv
a tv and a laptop
a laptop and a remote
a remote and a keyboard
a keyboard and a cell phone
a cell phone and a book
a book and a clock
a clock and a backpack
a backpack and an umbrella
an umbrella and a handbag
a handbag and a tie
a tie and a suitcase
a suitcase and a vase
a vase and scissors
scissors and a teddy bear
a teddy bear and a frisbee
a frisbee and skis
skis and a snowboard
a snowboard and a sports ball
a sports ball and a kite
a kite and a baseball bat
a baseball bat and a baseball glove
a baseball glove and a skateboard
a skateboard and a surfboard
a surfboard and a tennis racket
a tennis racket and a bottle
a bottle and a chair
an airplane and a train
a train and a boat
a boat and an airplane
a bicycle and a car
a car and a motorcycle
a motorcycle and a bus
a bus and a traffic light
a traffic light and a fire hydrant
a fire hydrant and a stop sign
a stop sign and a parking meter
a parking meter and a truck
a truck and a bicycle
a toilet and a hair drier
a hair drier and a toothbrush
a toothbrush and a sink
a sink and a toilet
a wine glass and a chair
a cup and a couch
a fork and a potted plant
a knife and a tv
a spoon and a laptop
a bowl and a remote
a banana and a keyboard
an apple and a cell phone
a sandwich and a book
an orange and a clock
broccoli and a backpack
a carrot and an umbrella
a hot dog and a handbag
a pizza and a tie
a donut and a suitcase
a cake and a vase
an oven and scissors
a toaster and a teddy bear
a microwave and a frisbee
a refrigerator and skis
a bicycle and an airplane
a car and a train
a motorcycle and a boat
a person and a toilet
a person and a hair drier
a person and a toothbrush
a person and a sink
A person is riding a bike
A person is marching
A person is roller skating
A person is tasting beer
A person is clapping
A person is drawing
A person is petting animal (not cat)
A person is eating watermelon
A person is playing harp
A person is wrestling
A person is riding scooter
A person is sweeping floor
A person is skateboarding
A person is dunking basketball
A person is playing flute
A person is stretching leg
A person is tying tie
A person is skydiving
A person is shooting goal (soccer)
A person is playing piano
A person is finger snapping
A person is canoeing or kayaking
A person is laughing
A person is digging
A person is clay pottery making
A person is shooting basketball
A person is bending back
A person is shaking hands
A person is bandaging
A person is push up
A person is catching or throwing frisbee
A person is playing trumpet
A person is flying kite
A person is filling eyebrows
A person is shuffling cards
A person is folding clothes
A person is smoking
A person is tai chi
A person is squat
A person is playing controller
A person is throwing axe
A person is giving or receiving award
A person is air drumming
A person is taking a shower
A person is planting trees
A person is sharpening knives
A person is robot dancing
A person is rock climbing
A person is hula hooping
A person is writing
A person is bungee jumping
A person is pushing cart
A person is cleaning windows
A person is cutting watermelon
A person is cheerleading
A person is washing hands
A person is ironing
A person is cutting nails
A person is hugging
A person is trimming or shaving beard
A person is jogging
A person is making bed
A person is washing dishes
A person is grooming dog
A person is doing laundry
A person is knitting
A person is reading book
A person is baby waking up
A person is massaging legs
A person is brushing teeth
A person is crawling baby
A person is motorcycling
A person is driving car
A person is sticking tongue out
A person is shaking head
A person is sword fighting
A person is doing aerobics
A person is strumming guitar
A person is riding or walking with horse
A person is archery
A person is catching or throwing baseball
A person is playing chess
A person is rock scissors paper
A person is using computer
A person is arranging flowers
A person is bending metal
A person is ice skating
A person is climbing a rope
A person is crying
A person is dancing ballet
A person is getting a haircut
A person is running on treadmill
A person is kissing
A person is counting money
A person is barbequing
A person is peeling apples
A person is milking cow
A person is shining shoes
A person is making snowman
A person is sailing
a person swimming in ocean
a person giving a presentation to a room full of colleagues
a person washing the dishes
a person eating a burger
a person walking in the snowstorm
a person drinking coffee in a cafe
a person playing guitar
a bicycle leaning against a tree
a bicycle gliding through a snowy field
a bicycle slowing down to stop
a bicycle accelerating to gain speed
a car stuck in traffic during rush hour
a car turning a corner
a car slowing down to stop
a car accelerating to gain speed
a motorcycle cruising along a coastal highway
a motorcycle turning a corner
a motorcycle slowing down to stop
a motorcycle gliding through a snowy field
a motorcycle accelerating to gain speed
an airplane soaring through a clear blue sky
an airplane taking off
an airplane landing smoothly on a runway
an airplane accelerating to gain speed
a bus turning a corner
a bus stuck in traffic during rush hour
a bus accelerating to gain speed
a train speeding down the tracks
a train crossing over a tall bridge
a train accelerating to gain speed
a truck turning a corner
a truck anchored in a tranquil bay
a truck stuck in traffic during rush hour
a truck slowing down to stop
a truck accelerating to gain speed
a boat sailing smoothly on a calm lake
a boat slowing down to stop
a boat accelerating to gain speed
a bird soaring gracefully in the sky
a bird building a nest from twigs and leaves
a bird flying over a snowy forest
a cat grooming itself meticulously with its tongue
a cat playing in park
a cat drinking water
a cat running happily
a dog enjoying a peaceful walk
a dog playing in park
a dog drinking water
a dog running happily
a horse bending down to drink water from a river
a horse galloping across an open field
a horse taking a peaceful walk
a horse running to join a herd of its kind
a sheep bending down to drink water from a river
a sheep taking a peaceful walk
a sheep running to join a herd of its kind
a cow bending down to drink water from a river
a cow chewing cud while resting in a tranquil barn
a cow running to join a herd of its kind
an elephant spraying itself with water using its trunk to cool down
an elephant taking a peaceful walk
an elephant running to join a herd of its kind
a bear catching a salmon in its powerful jaws
a bear sniffing the air for scents of food
a bear climbing a tree
a bear hunting for prey
a zebra bending down to drink water from a river
a zebra running to join a herd of its kind
a zebra taking a peaceful walk
a giraffe bending down to drink water from a river
a giraffe taking a peaceful walk
a giraffe running to join a herd of its kind
a person
a bicycle
a car
a motorcycle
an airplane
a bus
a train
a truck
a boat
a traffic light
a fire hydrant
a stop sign
a parking meter
a bench
a bird
a cat
a dog
a horse
a sheep
a cow
an elephant
a bear
a zebra
a giraffe
a backpack
an umbrella
a handbag
a tie
a suitcase
a frisbee
skis
a snowboard
a sports ball
a kite
a baseball bat
a baseball glove
a skateboard
a surfboard
a tennis racket
a bottle
a wine glass
a cup
a fork
a knife
a spoon
a bowl
a banana
an apple
a sandwich
an orange
broccoli
a carrot
a hot dog
a pizza
a donut
a cake
a chair
a couch
a potted plant
a bed
a dining table
a toilet
a tv
a laptop
a remote
a keyboard
a cell phone
a microwave
an oven
a toaster
a sink
a refrigerator
a book
a clock
a vase
scissors
a teddy bear
a hair drier
a toothbrush
a red bicycle
a green bicycle
a blue bicycle
a yellow bicycle
an orange bicycle
a purple bicycle
a pink bicycle
a black bicycle
a white bicycle
a red car
a green car
a blue car
a yellow car
an orange car
a purple car
a pink car
a black car
a white car
a red bird
a green bird
a blue bird
a yellow bird
an orange bird
a purple bird
a pink bird
a black bird
a white bird
a black cat
a white cat
an orange cat
a yellow cat
a red umbrella
a green umbrella
a blue umbrella
a yellow umbrella
an orange umbrella
a purple umbrella
a pink umbrella
a black umbrella
a white umbrella
a red suitcase
a green suitcase
a blue suitcase
a yellow suitcase
an orange suitcase
a purple suitcase
a pink suitcase
a black suitcase
a white suitcase
a red bowl
a green bowl
a blue bowl
a yellow bowl
an orange bowl
a purple bowl
a pink bowl
a black bowl
a white bowl
a red chair
a green chair
a blue chair
a yellow chair
an orange chair
a purple chair
a pink chair
a black chair
a white chair
a red clock
a green clock
a blue clock
a yellow clock
an orange clock
a purple clock
a pink clock
a black clock
a white clock
a red vase
a green vase
a blue vase
a yellow vase
an orange vase
a purple vase
a pink vase
a black vase
a white vase
A beautiful coastal beach in spring, waves lapping on sand, Van Gogh style
A beautiful coastal beach in spring, waves lapping on sand, oil painting
A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo
A beautiful coastal beach in spring, waves lapping on sand, black and white
A beautiful coastal beach in spring, waves lapping on sand, pixel art
A beautiful coastal beach in spring, waves lapping on sand, in cyberpunk style
A beautiful coastal beach in spring, waves lapping on sand, animated style
A beautiful coastal beach in spring, waves lapping on sand, watercolor painting
A beautiful coastal beach in spring, waves lapping on sand, surrealism style
The bund Shanghai, Van Gogh style
The bund Shanghai, oil painting
The bund Shanghai by Hokusai, in the style of Ukiyo
The bund Shanghai, black and white
The bund Shanghai, pixel art
The bund Shanghai, in cyberpunk style
The bund Shanghai, animated style
The bund Shanghai, watercolor painting
The bund Shanghai, surrealism style
a shark is swimming in the ocean, Van Gogh style
a shark is swimming in the ocean, oil painting
a shark is swimming in the ocean by Hokusai, in the style of Ukiyo
a shark is swimming in the ocean, black and white
a shark is swimming in the ocean, pixel art
a shark is swimming in the ocean, in cyberpunk style
a shark is swimming in the ocean, animated style
a shark is swimming in the ocean, watercolor painting
a shark is swimming in the ocean, surrealism style
A panda drinking coffee in a cafe in Paris, Van Gogh style
A panda drinking coffee in a cafe in Paris, oil painting
A panda drinking coffee in a cafe in Paris by Hokusai, in the style of Ukiyo
A panda drinking coffee in a cafe in Paris, black and white
A panda drinking coffee in a cafe in Paris, pixel art
A panda drinking coffee in a cafe in Paris, in cyberpunk style
A panda drinking coffee in a cafe in Paris, animated style
A panda drinking coffee in a cafe in Paris, watercolor painting
A panda drinking coffee in a cafe in Paris, surrealism style
A cute happy Corgi playing in park, sunset, Van Gogh style
A cute happy Corgi playing in park, sunset, oil painting
A cute happy Corgi playing in park, sunset by Hokusai, in the style of Ukiyo
A cute happy Corgi playing in park, sunset, black and white
A cute happy Corgi playing in park, sunset, pixel art
A cute happy Corgi playing in park, sunset, in cyberpunk style
A cute happy Corgi playing in park, sunset, animated style
A cute happy Corgi playing in park, sunset, watercolor painting
A cute happy Corgi playing in park, sunset, surrealism style
Gwen Stacy reading a book, Van Gogh style
Gwen Stacy reading a book, oil painting
Gwen Stacy reading a book by Hokusai, in the style of Ukiyo
Gwen Stacy reading a book, black and white
Gwen Stacy reading a book, pixel art
Gwen Stacy reading a book, in cyberpunk style
Gwen Stacy reading a book, animated style
Gwen Stacy reading a book, watercolor painting
Gwen Stacy reading a book, surrealism style
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, Van Gogh style
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, oil painting
A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Hokusai, in the style of Ukiyo
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, black and white
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pixel art
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in cyberpunk style
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, animated style
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, watercolor painting
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, surrealism style
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, Van Gogh style
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, oil painting
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas by Hokusai, in the style of Ukiyo
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, black and white
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pixel art
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in cyberpunk style
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, animated style
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, watercolor painting
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, surrealism style
An astronaut flying in space, Van Gogh style
An astronaut flying in space, oil painting
An astronaut flying in space by Hokusai, in the style of Ukiyo
An astronaut flying in space, black and white
An astronaut flying in space, pixel art
An astronaut flying in space, in cyberpunk style
An astronaut flying in space, animated style
An astronaut flying in space, watercolor painting
An astronaut flying in space, surrealism style
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, Van Gogh style
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, oil painting
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks by Hokusai, in the style of Ukiyo
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, black and white
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pixel art
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in cyberpunk style
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, animated style
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, watercolor painting
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, surrealism style
A beautiful coastal beach in spring, waves lapping on sand, in super slow motion
A beautiful coastal beach in spring, waves lapping on sand, zoom in
A beautiful coastal beach in spring, waves lapping on sand, zoom out
A beautiful coastal beach in spring, waves lapping on sand, pan left
A beautiful coastal beach in spring, waves lapping on sand, pan right
A beautiful coastal beach in spring, waves lapping on sand, tilt up
A beautiful coastal beach in spring, waves lapping on sand, tilt down
A beautiful coastal beach in spring, waves lapping on sand, with an intense shaking effect
A beautiful coastal beach in spring, waves lapping on sand, featuring a steady and smooth perspective
A beautiful coastal beach in spring, waves lapping on sand, racking focus
The bund Shanghai, in super slow motion
The bund Shanghai, zoom in
The bund Shanghai, zoom out
The bund Shanghai, pan left
The bund Shanghai, pan right
The bund Shanghai, tilt up
The bund Shanghai, tilt down
The bund Shanghai, with an intense shaking effect
The bund Shanghai, featuring a steady and smooth perspective
The bund Shanghai, racking focus
a shark is swimming in the ocean, in super slow motion
a shark is swimming in the ocean, zoom in
a shark is swimming in the ocean, zoom out
a shark is swimming in the ocean, pan left
a shark is swimming in the ocean, pan right
a shark is swimming in the ocean, tilt up
a shark is swimming in the ocean, tilt down
a shark is swimming in the ocean, with an intense shaking effect
a shark is swimming in the ocean, featuring a steady and smooth perspective
a shark is swimming in the ocean, racking focus
A panda drinking coffee in a cafe in Paris, in super slow motion
A panda drinking coffee in a cafe in Paris, zoom in
A panda drinking coffee in a cafe in Paris, zoom out
A panda drinking coffee in a cafe in Paris, pan left
A panda drinking coffee in a cafe in Paris, pan right
A panda drinking coffee in a cafe in Paris, tilt up
A panda drinking coffee in a cafe in Paris, tilt down
A panda drinking coffee in a cafe in Paris, with an intense shaking effect
A panda drinking coffee in a cafe in Paris, featuring a steady and smooth perspective
A panda drinking coffee in a cafe in Paris, racking focus
A cute happy Corgi playing in park, sunset, in super slow motion
A cute happy Corgi playing in park, sunset, zoom in
A cute happy Corgi playing in park, sunset, zoom out
A cute happy Corgi playing in park, sunset, pan left
A cute happy Corgi playing in park, sunset, pan right
A cute happy Corgi playing in park, sunset, tilt up
A cute happy Corgi playing in park, sunset, tilt down
A cute happy Corgi playing in park, sunset, with an intense shaking effect
A cute happy Corgi playing in park, sunset, featuring a steady and smooth perspective
A cute happy Corgi playing in park, sunset, racking focus
Gwen Stacy reading a book, in super slow motion
Gwen Stacy reading a book, zoom in
Gwen Stacy reading a book, zoom out
Gwen Stacy reading a book, pan left
Gwen Stacy reading a book, pan right
Gwen Stacy reading a book, tilt up
Gwen Stacy reading a book, tilt down
Gwen Stacy reading a book, with an intense shaking effect
Gwen Stacy reading a book, featuring a steady and smooth perspective
Gwen Stacy reading a book, racking focus
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in super slow motion
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom in
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom out
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan left
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan right
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt up
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt down
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, with an intense shaking effect
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, featuring a steady and smooth perspective
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, racking focus
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in super slow motion
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom in
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom out
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan left
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan right
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt up
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt down
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, with an intense shaking effect
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, featuring a steady and smooth perspective
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, racking focus
An astronaut flying in space, in super slow motion
An astronaut flying in space, zoom in
An astronaut flying in space, zoom out
An astronaut flying in space, pan left
An astronaut flying in space, pan right
An astronaut flying in space, tilt up
An astronaut flying in space, tilt down
An astronaut flying in space, with an intense shaking effect
An astronaut flying in space, featuring a steady and smooth perspective
An astronaut flying in space, racking focus
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in super slow motion
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom in
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom out
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan left
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan right
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt up
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt down
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, with an intense shaking effect
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, featuring a steady and smooth perspective
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, racking focus
Close up of grapes on a rotating table.
Turtle swimming in ocean.
A storm trooper vacuuming the beach.
A panda standing on a surfboard in the ocean in sunset.
An astronaut feeding ducks on a sunny afternoon, reflection from the water.
Two pandas discussing an academic paper.
Sunset time lapse at the beach with moving clouds and colors in the sky.
A fat rabbit wearing a purple robe walking through a fantasy landscape.
A koala bear playing piano in the forest.
An astronaut flying in space.
Fireworks.
An animated painting of fluffy white clouds moving in sky.
Flying through fantasy landscapes.
A bigfoot walking in the snowstorm.
A squirrel eating a burger.
A cat wearing sunglasses and working as a lifeguard at a pool.
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks.
Splash of turquoise water in extreme slow motion, alpha channel included.
an ice cream is melting on the table.
a drone flying over a snowy forest.
a shark is swimming in the ocean.
Aerial panoramic video from a drone of a fantasy land.
a teddy bear is swimming in the ocean.
time lapse of sunrise on mars.
golden fish swimming in the ocean.
An artist brush painting on a canvas close up.
A drone view of celebration with Christmas tree and fireworks, starry sky - background.
happy dog wearing a yellow turtleneck, studio, portrait, facing camera, dark background
Origami dancers in white paper, 3D render, on white background, studio shot, dancing modern dance.
Campfire at night in a snowy forest with starry sky in the background.
a fantasy landscape
A 3D model of a 1800s victorian house.
this is how I do makeup in the morning.
A raccoon that looks like a turtle, digital art.
Robot dancing in Times Square.
Busy freeway at night.
Balloon full of water exploding in extreme slow motion.
An astronaut is riding a horse in the space in a photorealistic style.
Macro slo-mo. Slow motion cropped closeup of roasted coffee beans falling into an empty bowl.
Sewing machine, old sewing machine working.
Motion colour drop in water, ink swirling in water, colourful ink in water, abstraction fancy dream cloud of ink.
Few big purple plums rotating on the turntable. water drops appear on the skin during rotation. isolated on the white background. close-up. macro.
Vampire makeup face of beautiful girl, red contact lenses.
Ashtray full of butts on table, smoke flowing on black background, close-up
Pacific coast, carmel by the sea ocean and waves.
A teddy bear is playing drum kit in NYC Times Square.
A corgi is playing drum kit.
An Iron man is playing the electronic guitar, high electronic guitar.
A raccoon is playing the electronic guitar.
A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Vincent van Gogh
A corgi's head depicted as an explosion of a nebula
A fantasy landscape
A future where humans have achieved teleportation technology
A jellyfish floating through the ocean, with bioluminescent tentacles
A Mars rover moving on Mars
A panda drinking coffee in a cafe in Paris
A space shuttle launching into orbit, with flames and smoke billowing out from the engines
A steam train moving on a mountainside
A super cool giant robot in Cyberpunk Beijing
A tropical beach at sunrise, with palm trees and crystal-clear water in the foreground
Cinematic shot of Van Gogh's selfie, Van Gogh style
Gwen Stacy reading a book
Iron Man flying in the sky
The bund Shanghai, oil painting
Yoda playing guitar on the stage
A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo
A beautiful coastal beach in spring, waves lapping on sand by Vincent van Gogh
A boat sailing leisurely along the Seine River with the Eiffel Tower in background
A car moving slowly on an empty street, rainy evening
A cat eating food out of a bowl
A cat wearing sunglasses at a pool
A confused panda in calculus class
A cute fluffy panda eating Chinese food in a restaurant
A cute happy Corgi playing in park, sunset
A cute raccoon playing guitar in a boat on the ocean
A happy fuzzy panda playing guitar nearby a campfire, snow mountain in the background
A lightning striking atop of eiffel tower, dark clouds in the sky
A modern art museum, with colorful paintings
A panda cooking in the kitchen
A panda playing on a swing set
A polar bear is playing guitar
A raccoon dressed in suit playing the trumpet, stage background
A robot DJ is playing the turntable, in heavy raining futuristic tokyo rooftop cyberpunk night, sci-fi, fantasy
A shark swimming in clear Caribbean ocean
A super robot protecting city
A teddy bear washing the dishes
An epic tornado attacking above a glowing city at night, the tornado is made of smoke
An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with umbrellas
Clown fish swimming through the coral reef
Hyper-realistic spaceship landing on Mars
The bund Shanghai, vibrant color
Vincent van Gogh is painting in the room
Yellow flowers swing in the wind
alley
amusement park
aquarium
arch
art gallery
bathroom
bakery shop
ballroom
bar
barn
basement
beach
bedroom
bridge
botanical garden
cafeteria
campsite
campus
carrousel
castle
cemetery
classroom
cliff
crosswalk
construction site
corridor
courtyard
desert
downtown
driveway
farm
food court
football field
forest road
fountain
gas station
glacier
golf course
indoor gymnasium
harbor
highway
hospital
house
iceberg
industrial area
jail cell
junkyard
kitchen
indoor library
lighthouse
laboratory
mansion
marsh
mountain
indoor movie theater
indoor museum
music studio
nursery
ocean
office
palace
parking lot
pharmacy
phone booth
raceway
restaurant
river
science museum
shower
ski slope
sky
skyscraper
baseball stadium
staircase
street
supermarket
indoor swimming pool
tower
outdoor track
train railway
train station platform
underwater coral reef
valley
volcano
waterfall
windmill
a bicycle on the left of a car, front view
a car on the right of a motorcycle, front view
a motorcycle on the left of a bus, front view
a bus on the right of a traffic light, front view
a traffic light on the left of a fire hydrant, front view
a fire hydrant on the right of a stop sign, front view
a stop sign on the left of a parking meter, front view
a parking meter on the right of a bench, front view
a bench on the left of a truck, front view
a truck on the right of a bicycle, front view
a bird on the left of a cat, front view
a cat on the right of a dog, front view
a dog on the left of a horse, front view
a horse on the right of a sheep, front view
a sheep on the left of a cow, front view
a cow on the right of an elephant, front view
an elephant on the left of a bear, front view
a bear on the right of a zebra, front view
a zebra on the left of a giraffe, front view
a giraffe on the right of a bird, front view
a bottle on the left of a wine glass, front view
a wine glass on the right of a cup, front view
a cup on the left of a fork, front view
a fork on the right of a knife, front view
a knife on the left of a spoon, front view
a spoon on the right of a bowl, front view
a bowl on the left of a bottle, front view
a potted plant on the left of a remote, front view
a remote on the right of a clock, front view
a clock on the left of a vase, front view
a vase on the right of scissors, front view
scissors on the left of a teddy bear, front view
a teddy bear on the right of a potted plant, front view
a frisbee on the left of a sports ball, front view
a sports ball on the right of a baseball bat, front view
a baseball bat on the left of a baseball glove, front view
a baseball glove on the right of a tennis racket, front view
a tennis racket on the left of a frisbee, front view
a toilet on the left of a hair drier, front view
a hair drier on the right of a toothbrush, front view
a toothbrush on the left of a sink, front view
a sink on the right of a toilet, front view
a chair on the left of a couch, front view
a couch on the right of a bed, front view
a bed on the left of a tv, front view
a tv on the right of a dining table, front view
a dining table on the left of a chair, front view
an airplane on the left of a train, front view
a train on the right of a boat, front view
a boat on the left of an airplane, front view
an oven on the top of a toaster, front view
an oven on the bottom of a toaster, front view
a toaster on the top of a microwave, front view
a toaster on the bottom of a microwave, front view
a microwave on the top of an oven, front view
a microwave on the bottom of an oven, front view
a banana on the top of an apple, front view
a banana on the bottom of an apple, front view
an apple on the top of a sandwich, front view
an apple on the bottom of a sandwich, front view
a sandwich on the top of an orange, front view
a sandwich on the bottom of an orange, front view
an orange on the top of a carrot, front view
an orange on the bottom of a carrot, front view
a carrot on the top of a hot dog, front view
a carrot on the bottom of a hot dog, front view
a hot dog on the top of a pizza, front view
a hot dog on the bottom of a pizza, front view
a pizza on the top of a donut, front view
a pizza on the bottom of a donut, front view
a donut on the top of broccoli, front view
a donut on the bottom of broccoli, front view
broccoli on the top of a banana, front view
broccoli on the bottom of a banana, front view
skis on the top of a snowboard, front view
skis on the bottom of a snowboard, front view
a snowboard on the top of a kite, front view
a snowboard on the bottom of a kite, front view
a kite on the top of a skateboard, front view
a kite on the bottom of a skateboard, front view
a skateboard on the top of a surfboard, front view
a skateboard on the bottom of a surfboard, front view
a surfboard on the top of skis, front view
a surfboard on the bottom of skis, front view


================================================
FILE: Open-Sora/assets/texts/VBench/all_i2v.txt
================================================
a close up of a blue and orange liquid{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
a close up of a blue and orange liquid, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
a close up of a blue and orange liquid, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
a close up of a blue and orange liquid, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
a close up of a blue and orange liquid, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
a close up of a blue and orange liquid, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
a close up of a blue and orange liquid, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
a close up of a blue and orange liquid, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
A black and white abstract video featuring mesmerizing bubbles{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
A black and white abstract video featuring mesmerizing bubbles, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
A black and white abstract video featuring mesmerizing bubbles, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
A black and white abstract video featuring mesmerizing bubbles, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
A black and white abstract video featuring mesmerizing bubbles, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
A black and white abstract video featuring mesmerizing bubbles, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
A black and white abstract video featuring mesmerizing bubbles, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
A black and white abstract video featuring mesmerizing bubbles, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
a blue and white smoke is swirly in the dark{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
a blue and white smoke is swirly in the dark, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
a blue and white smoke is swirly in the dark, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
a blue and white smoke is swirly in the dark, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
a blue and white smoke is swirly in the dark, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
a blue and white smoke is swirly in the dark, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
a blue and white smoke is swirly in the dark, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
a blue and white smoke is swirly in the dark, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
a close-up view of a sea fan in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
a close-up view of a sea fan in the water, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
a close-up view of a sea fan in the water, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
a close-up view of a sea fan in the water, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
a close-up view of a sea fan in the water, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
a close-up view of a sea fan in the water, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
a close-up view of a sea fan in the water, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
a close-up view of a sea fan in the water, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
a visually captivating abstract video, rich in color, set against a dramatic black background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
a visually captivating abstract video, rich in color, set against a dramatic black background, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
a visually captivating abstract video, rich in color, set against a dramatic black background, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
a visually captivating abstract video, rich in color, set against a dramatic black background, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
a visually captivating abstract video, rich in color, set against a dramatic black background, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
a visually captivating abstract video, rich in color, set against a dramatic black background, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
a visually captivating abstract video, rich in color, set against a dramatic black background, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
a visually captivating abstract video, rich in color, set against a dramatic black background, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
a purple and yellow abstract painting with a black background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
a purple and yellow abstract painting with a black background, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
a purple and yellow abstract painting with a black background, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
a purple and yellow abstract painting with a black background, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
a purple and yellow abstract painting with a black background, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
a purple and yellow abstract painting with a black background, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
a purple and yellow abstract painting with a black background, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
a purple and yellow abstract painting with a black background, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
a dynamic video of a blurry neon light in the dark, radiating captivating colors{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
a view of a star trail in the night sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
a view of a star trail in the night sky, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
a view of a star trail in the night sky, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
a view of a star trail in the night sky, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
a view of a star trail in the night sky, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
a view of a star trail in the night sky, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
a view of a star trail in the night sky, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
a view of a star trail in the night sky, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
an aerial view of a small town on the edge of the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
an aerial view of a small town on the edge of the ocean, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
an aerial view of a small town on the edge of the ocean, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
an aerial view of a small town on the edge of the ocean, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
an aerial view of a small town on the edge of the ocean, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
an aerial view of a small town on the edge of the ocean, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
an aerial view of a small town on the edge of the ocean, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
an aerial view of a small town on the edge of the ocean, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
Colorful buildings on the seaside cliffs{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
Colorful buildings on the seaside cliffs, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
Colorful buildings on the seaside cliffs, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
Colorful buildings on the seaside cliffs, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
Colorful buildings on the seaside cliffs, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
Colorful buildings on the seaside cliffs, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
Colorful buildings on the seaside cliffs, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
Colorful buildings on the seaside cliffs, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
a bunch of houses that are on a hillside{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
a bunch of houses that are on a hillside, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
a bunch of houses that are on a hillside, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
a bunch of houses that are on a hillside, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
a bunch of houses that are on a hillside, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
a bunch of houses that are on a hillside, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
a bunch of houses that are on a hillside, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
a bunch of houses that are on a hillside, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
a building that is sitting on the side of a pond{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
a building that is sitting on the side of a pond, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
a building that is sitting on the side of a pond, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
a building that is sitting on the side of a pond, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
a building that is sitting on the side of a pond, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
a building that is sitting on the side of a pond, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
a building that is sitting on the side of a pond, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
a building that is sitting on the side of a pond, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
an aerial view of a busy city with a bridge in the background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
an aerial view of a busy city with a bridge in the background, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
an aerial view of a busy city with a bridge in the background, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
an aerial view of a busy city with a bridge in the background, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
an aerial view of a busy city with a bridge in the background, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
an aerial view of a busy city with a bridge in the background, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
an aerial view of a busy city with a bridge in the background, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
an aerial view of a busy city with a bridge in the background, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
a bridge that is over a body of water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
a bridge that is over a body of water, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
a bridge that is over a body of water, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
a bridge that is over a body of water, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
a bridge that is over a body of water, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
a bridge that is over a body of water, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
a bridge that is over a body of water, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
a bridge that is over a body of water, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
a pile of wood sitting next to a log house{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
a pile of wood sitting next to a log house, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
a pile of wood sitting next to a log house, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
a pile of wood sitting next to a log house, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
a pile of wood sitting next to a log house, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
a pile of wood sitting next to a log house, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
a pile of wood sitting next to a log house, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
a pile of wood sitting next to a log house, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
a view of a snowy mountain side with many buildings{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
a view of a snowy mountain side with many buildings, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
a view of a snowy mountain side with many buildings, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
a view of a snowy mountain side with many buildings, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
a view of a snowy mountain side with many buildings, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
a view of a snowy mountain side with many buildings, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
a view of a snowy mountain side with many buildings, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
a view of a snowy mountain side with many buildings, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
san francisco skyline at sunset{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
san francisco skyline at sunset, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
san francisco skyline at sunset, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
san francisco skyline at sunset, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
san francisco skyline at sunset, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
san francisco skyline at sunset, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
san francisco skyline at sunset, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
san francisco skyline at sunset, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
a castle on top of a hill covered in snow{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
a castle on top of a hill covered in snow, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
a castle on top of a hill covered in snow, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
a castle on top of a hill covered in snow, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
a castle on top of a hill covered in snow, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
a castle on top of a hill covered in snow, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
a castle on top of a hill covered in snow, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
a castle on top of a hill covered in snow, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
an aerial view of big ben and the houses of parliament in london{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
an aerial view of big ben and the houses of parliament in london, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
an aerial view of big ben and the houses of parliament in london, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
an aerial view of big ben and the houses of parliament in london, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
an aerial view of big ben and the houses of parliament in london, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
an aerial view of big ben and the houses of parliament in london, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
an aerial view of big ben and the houses of parliament in london, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
an aerial view of big ben and the houses of parliament in london, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
a beach with a lot of buildings on the side of a cliff{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
a beach with a lot of buildings on the side of a cliff, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
a beach with a lot of buildings on the side of a cliff, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
a beach with a lot of buildings on the side of a cliff, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
a beach with a lot of buildings on the side of a cliff, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
a beach with a lot of buildings on the side of a cliff, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
a beach with a lot of buildings on the side of a cliff, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
a beach with a lot of buildings on the side of a cliff, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
an alley way in an old european city{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
an alley way in an old european city, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
an alley way in an old european city, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
an alley way in an old european city, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
an alley way in an old european city, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
an alley way in an old european city, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
an alley way in an old european city, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
an alley way in an old european city, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
the golden gate bridge in san franscisco is lit up by the setting sun{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
the golden gate bridge in san franscisco is lit up by the setting sun, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
the golden gate bridge in san franscisco is lit up by the setting sun, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
the golden gate bridge in san franscisco is lit up by the setting sun, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
the golden gate bridge in san franscisco is lit up by the setting sun, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
the golden gate bridge in san franscisco is lit up by the setting sun, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
the golden gate bridge in san franscisco is lit up by the setting sun, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
the golden gate bridge in san franscisco is lit up by the setting sun, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
the great wall of china in autumn{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
the great wall of china in autumn, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
the great wall of china in autumn, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
the great wall of china in autumn, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
the great wall of china in autumn, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
the great wall of china in autumn, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
the great wall of china in autumn, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
the great wall of china in autumn, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
the town of hallstatt is surrounded by mountains and water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
the town of hallstatt is surrounded by mountains and water, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
the town of hallstatt is surrounded by mountains and water, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
the town of hallstatt is surrounded by mountains and water, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
the town of hallstatt is surrounded by mountains and water, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
the town of hallstatt is surrounded by mountains and water, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
the town of hallstatt is surrounded by mountains and water, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
the town of hallstatt is surrounded by mountains and water, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
tokyo skyline at night{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
tokyo skyline at night, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
tokyo skyline at night, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
tokyo skyline at night, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
tokyo skyline at night, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
tokyo skyline at night, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
tokyo skyline at night, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
tokyo skyline at night, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
a church sits on top of a hill under a cloudy sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
a church sits on top of a hill under a cloudy sky, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
a church sits on top of a hill under a cloudy sky, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
a church sits on top of a hill under a cloudy sky, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
a church sits on top of a hill under a cloudy sky, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
a church sits on top of a hill under a cloudy sky, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
a church sits on top of a hill under a cloudy sky, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
a church sits on top of a hill under a cloudy sky, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
the parthenon in acropolis, greece{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
the parthenon in acropolis, greece, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
the parthenon in acropolis, greece, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
the parthenon in acropolis, greece, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
the parthenon in acropolis, greece, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
the parthenon in acropolis, greece, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
the parthenon in acropolis, greece, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
the parthenon in acropolis, greece, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
a large crowd of people walking in a shopping mall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
a large crowd of people walking in a shopping mall, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
a large crowd of people walking in a shopping mall, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
a large crowd of people walking in a shopping mall, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
a large crowd of people walking in a shopping mall, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
a large crowd of people walking in a shopping mall, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
a large crowd of people walking in a shopping mall, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
a large crowd of people walking in a shopping mall, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
the pyramids of giza, egypt{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
the pyramids of giza, egypt, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
the pyramids of giza, egypt, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
the pyramids of giza, egypt, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
the pyramids of giza, egypt, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
the pyramids of giza, egypt, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
the pyramids of giza, egypt, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
the pyramids of giza, egypt, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
a stage door painted with a star on the side of a brick wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
a stage door painted with a star on the side of a brick wall, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
a stage door painted with a star on the side of a brick wall, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
a stage door painted with a star on the side of a brick wall, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
a stage door painted with a star on the side of a brick wall, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
a stage door painted with a star on the side of a brick wall, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
a stage door painted with a star on the side of a brick wall, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
a stage door painted with a star on the side of a brick wall, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
a light house on the edge of the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
a light house on the edge of the water, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
a light house on the edge of the water, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
a light house on the edge of the water, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
a light house on the edge of the water, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
a light house on the edge of the water, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
a light house on the edge of the water, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
a light house on the edge of the water, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
an asian city street at night with people and bicycles{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
an asian city street at night with people and bicycles, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
an asian city street at night with people and bicycles, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
an asian city street at night with people and bicycles, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
an asian city street at night with people and bicycles, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
an asian city street at night with people and bicycles, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
an asian city street at night with people and bicycles, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
an asian city street at night with people and bicycles, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
a couple of wooden benches in the middle of a street{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
a couple of wooden benches in the middle of a street, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
a couple of wooden benches in the middle of a street, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
a couple of wooden benches in the middle of a street, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
a couple of wooden benches in the middle of a street, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
a couple of wooden benches in the middle of a street, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
a couple of wooden benches in the middle of a street, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
a couple of wooden benches in the middle of a street, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
a pagoda sits on top of a mountain in japan{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
a pagoda sits on top of a mountain in japan, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
a pagoda sits on top of a mountain in japan, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
a pagoda sits on top of a mountain in japan, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
a pagoda sits on top of a mountain in japan, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
a pagoda sits on top of a mountain in japan, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
a pagoda sits on top of a mountain in japan, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
a pagoda sits on top of a mountain in japan, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
a red bus driving down a snowy street at night{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
a red bus driving down a snowy street at night, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
a red bus driving down a snowy street at night, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
a red bus driving down a snowy street at night, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
a red bus driving down a snowy street at night, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
a red bus driving down a snowy street at night, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
a red bus driving down a snowy street at night, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
a red bus driving down a snowy street at night, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
a snow covered street{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
a snow covered street, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
a snow covered street, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
a snow covered street, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
a snow covered street, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
a snow covered street, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
a snow covered street, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
a snow covered street, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
a house with snow on the ground{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
a house with snow on the ground, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
a house with snow on the ground, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
a house with snow on the ground, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
a house with snow on the ground, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
a house with snow on the ground, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
a house with snow on the ground, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
a house with snow on the ground, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
cars parked on the side of the road during a snowstorm{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
cars parked on the side of the road during a snowstorm, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
cars parked on the side of the road during a snowstorm, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
cars parked on the side of the road during a snowstorm, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
cars parked on the side of the road during a snowstorm, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
cars parked on the side of the road during a snowstorm, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
cars parked on the side of the road during a snowstorm, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
cars parked on the side of the road during a snowstorm, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
a group of statues on the side of a building{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
a group of statues on the side of a building, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
a group of statues on the side of a building, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
a group of statues on the side of a building, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
a group of statues on the side of a building, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
a group of statues on the side of a building, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
a group of statues on the side of a building, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
a group of statues on the side of a building, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
a city street at night during a snow storm{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
a city street at night during a snow storm, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
a city street at night during a snow storm, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
a city street at night during a snow storm, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
a city street at night during a snow storm, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
a city street at night during a snow storm, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
a city street at night during a snow storm, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
a city street at night during a snow storm, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
tower bridge in london{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
tower bridge in london, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
tower bridge in london, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
tower bridge in london, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
tower bridge in london, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
tower bridge in london, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
tower bridge in london, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
tower bridge in london, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
chinese pagoda in the middle of a snowy day{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
chinese pagoda in the middle of a snowy day, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
chinese pagoda in the middle of a snowy day, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
chinese pagoda in the middle of a snowy day, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
chinese pagoda in the middle of a snowy day, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
chinese pagoda in the middle of a snowy day, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
chinese pagoda in the middle of a snowy day, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
chinese pagoda in the middle of a snowy day, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
a dark alleyway with a bus driving down it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
a dark alleyway with a bus driving down it, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
a dark alleyway with a bus driving down it, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
a dark alleyway with a bus driving down it, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
a dark alleyway with a bus driving down it, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
a dark alleyway with a bus driving down it, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
a dark alleyway with a bus driving down it, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
a dark alleyway with a bus driving down it, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
a monastery sits on top of a cliff in bhutan{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
a monastery sits on top of a cliff in bhutan, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
a monastery sits on top of a cliff in bhutan, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
a monastery sits on top of a cliff in bhutan, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
a monastery sits on top of a cliff in bhutan, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
a monastery sits on top of a cliff in bhutan, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
a monastery sits on top of a cliff in bhutan, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
a monastery sits on top of a cliff in bhutan, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
the dome of the rock in jerusalem{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
the dome of the rock in jerusalem, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
the dome of the rock in jerusalem, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
the dome of the rock in jerusalem, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
the dome of the rock in jerusalem, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
the dome of the rock in jerusalem, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
the dome of the rock in jerusalem, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
the dome of the rock in jerusalem, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
an aerial view of a futuristic building on a cliff overlooking a body of water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
an aerial view of a futuristic building on a cliff overlooking a body of water, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
an aerial view of a futuristic building on a cliff overlooking a body of water, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
an aerial view of a futuristic building on a cliff overlooking a body of water, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
an aerial view of a futuristic building on a cliff overlooking a body of water, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
an aerial view of a futuristic building on a cliff overlooking a body of water, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
an aerial view of a futuristic building on a cliff overlooking a body of water, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
an aerial view of a futuristic building on a cliff overlooking a body of water, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
a reflection of a city with buildings in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
a reflection of a city with buildings in the water, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
a reflection of a city with buildings in the water, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
a reflection of a city with buildings in the water, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
a reflection of a city with buildings in the water, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
a reflection of a city with buildings in the water, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
a reflection of a city with buildings in the water, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
a reflection of a city with buildings in the water, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
a bar with chairs and a television on the wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
a bar with chairs and a television on the wall, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
a bar with chairs and a television on the wall, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
a bar with chairs and a television on the wall, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
a bar with chairs and a television on the wall, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
a bar with chairs and a television on the wall, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
a bar with chairs and a television on the wall, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
a bar with chairs and a television on the wall, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
a living room filled with lots of books on a wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
a living room filled with lots of books on a wall, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
a living room filled with lots of books on a wall, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
a living room filled with lots of books on a wall, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
a living room filled with lots of books on a wall, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
a living room filled with lots of books on a wall, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
a living room filled with lots of books on a wall, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
a living room filled with lots of books on a wall, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
a living room filled with furniture next to a stone wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
a living room filled with furniture next to a stone wall, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
a living room filled with furniture next to a stone wall, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
a living room filled with furniture next to a stone wall, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
a living room filled with furniture next to a stone wall, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
a living room filled with furniture next to a stone wall, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
a living room filled with furniture next to a stone wall, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
a living room filled with furniture next to a stone wall, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
a table and chairs in a room with sunlight coming through the window{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
a table and chairs in a room with sunlight coming through the window, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
a table and chairs in a room with sunlight coming through the window, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
a table and chairs in a room with sunlight coming through the window, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
a table and chairs in a room with sunlight coming through the window, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
a table and chairs in a room with sunlight coming through the window, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
a table and chairs in a room with sunlight coming through the window, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
a table and chairs in a room with sunlight coming through the window, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
a room filled with lots of shelves filled with books{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
a room filled with lots of shelves filled with books, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
a room filled with lots of shelves filled with books, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
a room filled with lots of shelves filled with books, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
a room filled with lots of shelves filled with books, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
a room filled with lots of shelves filled with books, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
a room filled with lots of shelves filled with books, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
a room filled with lots of shelves filled with books, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
an art gallery with paintings on the walls{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
an art gallery with paintings on the walls, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
an art gallery with paintings on the walls, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
an art gallery with paintings on the walls, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
an art gallery with paintings on the walls, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
an art gallery with paintings on the walls, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
an art gallery with paintings on the walls, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
an art gallery with paintings on the walls, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
a room with a lot of pictures on the walls{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
a room with a lot of pictures on the walls, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
a room with a lot of pictures on the walls, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
a room with a lot of pictures on the walls, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
a room with a lot of pictures on the walls, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
a room with a lot of pictures on the walls, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
a room with a lot of pictures on the walls, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
a room with a lot of pictures on the walls, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
a painting of a cloudy sky next to an easel{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
a painting of a cloudy sky next to an easel, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
a painting of a cloudy sky next to an easel, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
a painting of a cloudy sky next to an easel, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
a painting of a cloudy sky next to an easel, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
a painting of a cloudy sky next to an easel, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
a painting of a cloudy sky next to an easel, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
a painting of a cloudy sky next to an easel, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
a living room with a christmas tree and a rocking chair{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
a living room with a christmas tree and a rocking chair, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
a living room with a christmas tree and a rocking chair, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
a living room with a christmas tree and a rocking chair, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
a living room with a christmas tree and a rocking chair, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
a living room with a christmas tree and a rocking chair, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
a living room with a christmas tree and a rocking chair, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
a living room with a christmas tree and a rocking chair, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
a kitchen with a sink and a lot of glasses on the counter{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
a kitchen with a sink and a lot of glasses on the counter, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
a kitchen with a sink and a lot of glasses on the counter, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
a kitchen with a sink and a lot of glasses on the counter, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
a kitchen with a sink and a lot of glasses on the counter, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
a kitchen with a sink and a lot of glasses on the counter, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
a kitchen with a sink and a lot of glasses on the counter, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
a kitchen with a sink and a lot of glasses on the counter, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
a wooden table in front of a brick wall with bottles on the wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
a wooden table in front of a brick wall with bottles on the wall, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
a wooden table in front of a brick wall with bottles on the wall, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
a wooden table in front of a brick wall with bottles on the wall, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
a wooden table in front of a brick wall with bottles on the wall, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
a wooden table in front of a brick wall with bottles on the wall, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
a wooden table in front of a brick wall with bottles on the wall, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
a wooden table in front of a brick wall with bottles on the wall, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
a room filled with paintings and statues{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
a room filled with paintings and statues, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
a room filled with paintings and statues, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
a room filled with paintings and statues, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
a room filled with paintings and statues, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
a room filled with paintings and statues, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
a room filled with paintings and statues, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
a room filled with paintings and statues, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
an outdoor dining area surrounded by plants and a brick walkway{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
an outdoor dining area surrounded by plants and a brick walkway, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
an outdoor dining area surrounded by plants and a brick walkway, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
an outdoor dining area surrounded by plants and a brick walkway, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
an outdoor dining area surrounded by plants and a brick walkway, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
an outdoor dining area surrounded by plants and a brick walkway, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
an outdoor dining area surrounded by plants and a brick walkway, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
an outdoor dining area surrounded by plants and a brick walkway, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
a room filled with books and teddy bears{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
a room filled with books and teddy bears, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
a room filled with books and teddy bears, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
a room filled with books and teddy bears, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
a room filled with books and teddy bears, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
a room filled with books and teddy bears, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
a room filled with books and teddy bears, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
a room filled with books and teddy bears, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
a table and chairs in a room with a plant in the corner{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
a table and chairs in a room with a plant in the corner, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
a table and chairs in a room with a plant in the corner, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
a table and chairs in a room with a plant in the corner, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
a table and chairs in a room with a plant in the corner, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
a table and chairs in a room with a plant in the corner, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
a table and chairs in a room with a plant in the corner, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
a table and chairs in a room with a plant in the corner, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
a living room with a couch, table, and a window{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
a living room with a couch, table, and a window, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
a living room with a couch, table, and a window, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
a living room with a couch, table, and a window, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
a living room with a couch, table, and a window, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
a living room with a couch, table, and a window, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
a living room with a couch, table, and a window, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
a living room with a couch, table, and a window, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
a modern living room with wood floors and a tv{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
a modern living room with wood floors and a tv, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
a modern living room with wood floors and a tv, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
a modern living room with wood floors and a tv, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
a modern living room with wood floors and a tv, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
a modern living room with wood floors and a tv, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
a modern living room with wood floors and a tv, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
a modern living room with wood floors and a tv, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
a room with a desk and a chair in it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
a room with a desk and a chair in it, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
a room with a desk and a chair in it, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
a room with a desk and a chair in it, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
a room with a desk and a chair in it, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
a room with a desk and a chair in it, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
a room with a desk and a chair in it, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
a room with a desk and a chair in it, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a building{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a building, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a building, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a building, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a building, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a building, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a building, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a building, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
a chair in a room next to some drawings{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
a chair in a room next to some drawings, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
a chair in a room next to some drawings, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
a chair in a room next to some drawings, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
a chair in a room next to some drawings, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
a chair in a room next to some drawings, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
a chair in a room next to some drawings, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
a chair in a room next to some drawings, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
a living room with hardwood floors and a white couch{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
a living room with hardwood floors and a white couch, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
a living room with hardwood floors and a white couch, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
a living room with hardwood floors and a white couch, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
a living room with hardwood floors and a white couch, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
a living room with hardwood floors and a white couch, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
a living room with hardwood floors and a white couch, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
a living room with hardwood floors and a white couch, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
two people in a canoe on a lake with mountains in the background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
two people in a canoe on a lake with mountains in the background, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
two people in a canoe on a lake with mountains in the background, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
two people in a canoe on a lake with mountains in the background, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
two people in a canoe on a lake with mountains in the background, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
two people in a canoe on a lake with mountains in the background, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
two people in a canoe on a lake with mountains in the background, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
two people in a canoe on a lake with mountains in the background, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
an aerial view of a snowy road in a forest{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
an aerial view of a snowy road in a forest, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
an aerial view of a snowy road in a forest, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
an aerial view of a snowy road in a forest, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
an aerial view of a snowy road in a forest, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
an aerial view of a snowy road in a forest, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
an aerial view of a snowy road in a forest, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
an aerial view of a snowy road in a forest, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
a view of a waterfall from a distance{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
a view of a waterfall from a distance, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
a view of a waterfall from a distance, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
a view of a waterfall from a distance, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
a view of a waterfall from a distance, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
a view of a waterfall from a distance, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
a view of a waterfall from a distance, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
a view of a waterfall from a distance, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a valley{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a valley, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a valley, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a valley, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a valley, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a valley, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a valley, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a valley, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
an aerial view of a group of islands in the middle of a lake{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
an aerial view of a group of islands in the middle of a lake, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
an aerial view of a group of islands in the middle of a lake, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
an aerial view of a group of islands in the middle of a lake, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
an aerial view of a group of islands in the middle of a lake, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
an aerial view of a group of islands in the middle of a lake, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
an aerial view of a group of islands in the middle of a lake, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
an aerial view of a group of islands in the middle of a lake, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
an aerial view of a rocky beach in indonesia{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
an aerial view of a rocky beach in indonesia, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
an aerial view of a rocky beach in indonesia, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
an aerial view of a rocky beach in indonesia, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
an aerial view of a rocky beach in indonesia, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
an aerial view of a rocky beach in indonesia, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
an aerial view of a rocky beach in indonesia, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
an aerial view of a rocky beach in indonesia, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
fireworks in the night sky over a city{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
fireworks in the night sky over a city, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
fireworks in the night sky over a city, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
fireworks in the night sky over a city, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
fireworks in the night sky over a city, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
fireworks in the night sky over a city, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
fireworks in the night sky over a city, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
fireworks in the night sky over a city, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse on a stormy day{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse on a stormy day, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse on a stormy day, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse on a stormy day, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse on a stormy day, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse on a stormy day, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse on a stormy day, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
a large wave crashes into a lighthouse on a stormy day, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
a mountain range with a sky background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
a mountain range with a sky background, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
a mountain range with a sky background, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
a mountain range with a sky background, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
a mountain range with a sky background, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
a mountain range with a sky background, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
a mountain range with a sky background, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
a mountain range with a sky background, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
a large bonfire is burning in the night sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
a large bonfire is burning in the night sky, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
a large bonfire is burning in the night sky, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
a large bonfire is burning in the night sky, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
a large bonfire is burning in the night sky, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
a large bonfire is burning in the night sky, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
a large bonfire is burning in the night sky, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
a large bonfire is burning in the night sky, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
a close-up view of the flames of a fireplace{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
a close-up view of the flames of a fireplace, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
a close-up view of the flames of a fireplace, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
a close-up view of the flames of a fireplace, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
a close-up view of the flames of a fireplace, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
a close-up view of the flames of a fireplace, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
a close-up view of the flames of a fireplace, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
a close-up view of the flames of a fireplace, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
a farm in the middle of the day{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
a farm in the middle of the day, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
a farm in the middle of the day, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
a farm in the middle of the day, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
a farm in the middle of the day, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
a farm in the middle of the day, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
a farm in the middle of the day, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
a farm in the middle of the day, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
a flock of birds flying over a tree at sunset{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
a flock of birds flying over a tree at sunset, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
a flock of birds flying over a tree at sunset, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
a flock of birds flying over a tree at sunset, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
a flock of birds flying over a tree at sunset, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
a flock of birds flying over a tree at sunset, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
a flock of birds flying over a tree at sunset, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
a flock of birds flying over a tree at sunset, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
a captivating scene featuring a spiral galaxy shining brilliantly in the night sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
a mountain with snow on it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
a mountain with snow on it, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
a mountain with snow on it, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
a mountain with snow on it, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
a mountain with snow on it, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
a mountain with snow on it, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
a mountain with snow on it, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
a mountain with snow on it, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
a bridge that is in the middle of a river{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
a bridge that is in the middle of a river, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
a bridge that is in the middle of a river, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
a bridge that is in the middle of a river, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
a bridge that is in the middle of a river, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
a bridge that is in the middle of a river, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
a bridge that is in the middle of a river, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
a bridge that is in the middle of a river, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
a group of people standing on top of a green hill{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
a group of people standing on top of a green hill, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
a group of people standing on top of a green hill, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
a group of people standing on top of a green hill, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
a group of people standing on top of a green hill, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
a group of people standing on top of a green hill, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
a group of people standing on top of a green hill, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
a group of people standing on top of a green hill, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
a sandy beach with a wooden pier in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
a sandy beach with a wooden pier in the water, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
a sandy beach with a wooden pier in the water, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
a sandy beach with a wooden pier in the water, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
a sandy beach with a wooden pier in the water, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
a sandy beach with a wooden pier in the water, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
a sandy beach with a wooden pier in the water, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
a sandy beach with a wooden pier in the water, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
a lake surrounded by mountains and flowers{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
a lake surrounded by mountains and flowers, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
a lake surrounded by mountains and flowers, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
a lake surrounded by mountains and flowers, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
a lake surrounded by mountains and flowers, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
a lake surrounded by mountains and flowers, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
a lake surrounded by mountains and flowers, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
a lake surrounded by mountains and flowers, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
a hot-air balloon flying over a desert landscape{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
a hot-air balloon flying over a desert landscape, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
a hot-air balloon flying over a desert landscape, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
a hot-air balloon flying over a desert landscape, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
a hot-air balloon flying over a desert landscape, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
a hot-air balloon flying over a desert landscape, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
a hot-air balloon flying over a desert landscape, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
a hot-air balloon flying over a desert landscape, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
several hot air balloons flying over a city{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
several hot air balloons flying over a city, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
several hot air balloons flying over a city, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
several hot air balloons flying over a city, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
several hot air balloons flying over a city, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
several hot air balloons flying over a city, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
several hot air balloons flying over a city, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
several hot air balloons flying over a city, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a field, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a field, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a field, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a field, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a field, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a field, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
a group of hot air balloons flying over a field, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
a large wave crashes over a rocky cliff{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
a large wave crashes over a rocky cliff, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
a large wave crashes over a rocky cliff, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
a large wave crashes over a rocky cliff, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
a large wave crashes over a rocky cliff, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
a large wave crashes over a rocky cliff, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
a large wave crashes over a rocky cliff, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
a large wave crashes over a rocky cliff, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
the sun is setting over a lake in the mountains{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
the sun is setting over a lake in the mountains, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
the sun is setting over a lake in the mountains, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
the sun is setting over a lake in the mountains, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
the sun is setting over a lake in the mountains, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
the sun is setting over a lake in the mountains, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
the sun is setting over a lake in the mountains, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
the sun is setting over a lake in the mountains, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
a mountain range with snow on the ground{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
a mountain range with snow on the ground, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
a mountain range with snow on the ground, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
a mountain range with snow on the ground, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
a mountain range with snow on the ground, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
a mountain range with snow on the ground, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
a mountain range with snow on the ground, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
a mountain range with snow on the ground, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
sun rays shining through clouds over a lake{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
sun rays shining through clouds over a lake, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
sun rays shining through clouds over a lake, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
sun rays shining through clouds over a lake, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
sun rays shining through clouds over a lake, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
sun rays shining through clouds over a lake, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
sun rays shining through clouds over a lake, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
sun rays shining through clouds over a lake, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
a boat sits on the shore of a lake with mt fuji in the background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
a boat sits on the shore of a lake with mt fuji in the background, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
a boat sits on the shore of a lake with mt fuji in the background, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
a boat sits on the shore of a lake with mt fuji in the background, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
a boat sits on the shore of a lake with mt fuji in the background, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
a boat sits on the shore of a lake with mt fuji in the background, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
a boat sits on the shore of a lake with mt fuji in the background, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
a boat sits on the shore of a lake with mt fuji in the background, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
a foggy road with trees in the distance{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
a foggy road with trees in the distance, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
a foggy road with trees in the distance, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
a foggy road with trees in the distance, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
a foggy road with trees in the distance, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
a foggy road with trees in the distance, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
a foggy road with trees in the distance, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
a foggy road with trees in the distance, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
two swans swimming on a lake in the fog{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
two swans swimming on a lake in the fog, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
two swans swimming on a lake in the fog, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
two swans swimming on a lake in the fog, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
two swans swimming on a lake in the fog, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
two swans swimming on a lake in the fog, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
two swans swimming on a lake in the fog, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
two swans swimming on a lake in the fog, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
the sun is shining through the trees near a waterfall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
the sun is shining through the trees near a waterfall, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
the sun is shining through the trees near a waterfall, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
the sun is shining through the trees near a waterfall, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
the sun is shining through the trees near a waterfall, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
the sun is shining through the trees near a waterfall, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
the sun is shining through the trees near a waterfall, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
the sun is shining through the trees near a waterfall, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
a sandy beach with palm trees on the shore{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
a sandy beach with palm trees on the shore, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
a sandy beach with palm trees on the shore, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
a sandy beach with palm trees on the shore, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
a sandy beach with palm trees on the shore, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
a sandy beach with palm trees on the shore, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
a sandy beach with palm trees on the shore, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
a sandy beach with palm trees on the shore, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
an aerial view of a body of water and a beach{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
an aerial view of a body of water and a beach, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
an aerial view of a body of water and a beach, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
an aerial view of a body of water and a beach, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
an aerial view of a body of water and a beach, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
an aerial view of a body of water and a beach, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
an aerial view of a body of water and a beach, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
an aerial view of a body of water and a beach, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
a foggy field that has trees in the grass{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
a foggy field that has trees in the grass, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
a foggy field that has trees in the grass, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
a foggy field that has trees in the grass, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
a foggy field that has trees in the grass, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
a foggy field that has trees in the grass, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
a foggy field that has trees in the grass, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
a foggy field that has trees in the grass, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
a foggy landscape with trees and hills in the distance{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
a foggy landscape with trees and hills in the distance, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
a foggy landscape with trees and hills in the distance, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
a foggy landscape with trees and hills in the distance, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
a foggy landscape with trees and hills in the distance, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
a foggy landscape with trees and hills in the distance, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
a foggy landscape with trees and hills in the distance, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
a foggy landscape with trees and hills in the distance, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
a large wave in the ocean with a lot of spray coming from it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
a large wave in the ocean with a lot of spray coming from it, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
a large wave in the ocean with a lot of spray coming from it, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
a large wave in the ocean with a lot of spray coming from it, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
a large wave in the ocean with a lot of spray coming from it, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
a large wave in the ocean with a lot of spray coming from it, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
a large wave in the ocean with a lot of spray coming from it, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
a large wave in the ocean with a lot of spray coming from it, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
a waterfall is shown in the middle of a lush green hillside{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a waterfall is shown in the middle of a lush green hillside, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a waterfall is shown in the middle of a lush green hillside, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a waterfall is shown in the middle of a lush green hillside, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a waterfall is shown in the middle of a lush green hillside, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a waterfall is shown in the middle of a lush green hillside, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a waterfall is shown in the middle of a lush green hillside, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a waterfall is shown in the middle of a lush green hillside, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
an aerial view of a curvy road in the middle of a forest{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
an aerial view of a curvy road in the middle of a forest, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
an aerial view of a curvy road in the middle of a forest, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
an aerial view of a curvy road in the middle of a forest, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
an aerial view of a curvy road in the middle of a forest, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
an aerial view of a curvy road in the middle of a forest, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
an aerial view of a curvy road in the middle of a forest, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
an aerial view of a curvy road in the middle of a forest, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
a mountain covered in snow with evergreen trees{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
a mountain covered in snow with evergreen trees, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
a mountain covered in snow with evergreen trees, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
a mountain covered in snow with evergreen trees, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
a mountain covered in snow with evergreen trees, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
a mountain covered in snow with evergreen trees, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
a mountain covered in snow with evergreen trees, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
a mountain covered in snow with evergreen trees, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
a very large waterfall in the middle of the day{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
a very large waterfall in the middle of the day, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
a very large waterfall in the middle of the day, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
a very large waterfall in the middle of the day, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
a very large waterfall in the middle of the day, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
a very large waterfall in the middle of the day, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
a very large waterfall in the middle of the day, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
a very large waterfall in the middle of the day, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a lush green hillside{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a lush green hillside, camera pans left{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a lush green hillside, camera pans right{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a lush green hillside, camera tilts up{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a lush green hillside, camera tilts down{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a lush green hillside, camera zooms in{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a lush green hillside, camera zooms out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a large waterfall in the middle of a lush green hillside, camera static{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
a brown bear in the water with a fish in its mouth{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a brown bear in the water with a fish in its mouth.jpg", "mask_strategy": "0"}
a close-up of a hippopotamus eating grass in a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up of a hippopotamus eating grass in a field.jpg", "mask_strategy": "0"}
a sea turtle swimming in the ocean under the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sea turtle swimming in the ocean under the water.jpg", "mask_strategy": "0"}
two bees are flying over a lavender plant{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two bees are flying over a lavender plant.jpg", "mask_strategy": "0"}
the otter is standing in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the otter is standing in the water.jpg", "mask_strategy": "0"}
a dog carrying a soccer ball in its mouth{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dog carrying a soccer ball in its mouth.jpg", "mask_strategy": "0"}
an eagle is flying over a mountain with trees in the background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an eagle is flying over a mountain with trees in the background.jpg", "mask_strategy": "0"}
a couple of horses are running in the dirt{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of horses are running in the dirt.jpg", "mask_strategy": "0"}
a highland cow with long horns standing in a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a highland cow with long horns standing in a field.jpg", "mask_strategy": "0"}
a monkey is holding a banana in its mouth{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monkey is holding a banana in its mouth.jpg", "mask_strategy": "0"}
a large rhino grazing in the grass near a bush{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large rhino grazing in the grass near a bush.jpg", "mask_strategy": "0"}
a butterfly sits on top of a purple flower{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a butterfly sits on top of a purple flower.jpg", "mask_strategy": "0"}
an alligator is covered in green plants in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an alligator is covered in green plants in the water.jpg", "mask_strategy": "0"}
a red panda eating bamboo in a zoo{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red panda eating bamboo in a zoo.jpg", "mask_strategy": "0"}
a monochromatic video capturing a cat's gaze into the camera{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a monochromatic video capturing a cat's gaze into the camera.jpg", "mask_strategy": "0"}
a frog sitting on top of water lily leaves{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a frog sitting on top of water lily leaves.jpg", "mask_strategy": "0"}
a lion is roaring in the wild{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lion is roaring in the wild.jpg", "mask_strategy": "0"}
a seagull is flying towards a person's hand{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a seagull is flying towards a person's hand.jpg", "mask_strategy": "0"}
a yellow and white jellyfish is floating in the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a yellow and white jellyfish is floating in the ocean.jpg", "mask_strategy": "0"}
a group of jellyfish swimming in an aquarium{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of jellyfish swimming in an aquarium.jpg", "mask_strategy": "0"}
a clown fish hiding in a purple anemone{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a clown fish hiding in a purple anemone.jpg", "mask_strategy": "0"}
a snake sitting on the ground next to a bowl{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snake sitting on the ground next to a bowl.jpg", "mask_strategy": "0"}
a brown and white cow eating hay{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a brown and white cow eating hay.jpg", "mask_strategy": "0"}
a seal swimming in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a seal swimming in the water.jpg", "mask_strategy": "0"}
a panda bear is eating a piece of bamboo{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a panda bear is eating a piece of bamboo.jpg", "mask_strategy": "0"}
a small bird sits on a moss covered branch{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a small bird sits on a moss covered branch.jpg", "mask_strategy": "0"}
a bird with a fish in its beak flying over a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bird with a fish in its beak flying over a field.jpg", "mask_strategy": "0"}
a large flock of birds flying in the sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large flock of birds flying in the sky.jpg", "mask_strategy": "0"}
a bald eagle flying over a tree filled forest{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bald eagle flying over a tree filled forest.jpg", "mask_strategy": "0"}
a giraffe walking in a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a giraffe walking in a field.jpg", "mask_strategy": "0"}
a lioness yawning in a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a lioness yawning in a field.jpg", "mask_strategy": "0"}
a little crab scurried on the sandy beach{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a little crab scurried on the sandy beach.jpg", "mask_strategy": "0"}
a warthog is walking in the grass{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a warthog is walking in the grass.jpg", "mask_strategy": "0"}
a penguin walking on a beach near the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a penguin walking on a beach near the water.jpg", "mask_strategy": "0"}
a tiger walking through a wooded area{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a tiger walking through a wooded area.jpg", "mask_strategy": "0"}
a tiger walking on a dirt path in the woods{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a tiger walking on a dirt path in the woods.jpg", "mask_strategy": "0"}
a small monkey holding a piece of food in it's mouth{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a small monkey holding a piece of food in it's mouth.jpg", "mask_strategy": "0"}
a squirrel sitting on the ground eating a piece of bread{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a squirrel sitting on the ground eating a piece of bread.jpg", "mask_strategy": "0"}
a group of fish swimming over a coral reef{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of fish swimming over a coral reef.jpg", "mask_strategy": "0"}
a toad is sitting on top of some moss{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a toad is sitting on top of some moss.jpg", "mask_strategy": "0"}
a great white shark swimming in the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a great white shark swimming in the ocean.jpg", "mask_strategy": "0"}
a group of camels resting in the desert{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of camels resting in the desert.jpg", "mask_strategy": "0"}
two sheep grazing in the grass next to a wooden bridge{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two sheep grazing in the grass next to a wooden bridge.jpg", "mask_strategy": "0"}
an elephant walking through a forest{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an elephant walking through a forest.jpg", "mask_strategy": "0"}
a white rooster standing in a grassy field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a white rooster standing in a grassy field.jpg", "mask_strategy": "0"}
a zebra walking across a dirt road near a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a zebra walking across a dirt road near a field.jpg", "mask_strategy": "0"}
cars are driving down a street lined with tall trees{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/cars are driving down a street lined with tall trees.jpg", "mask_strategy": "0"}
the cars on the street are waiting for the traffic lights{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the cars on the street are waiting for the traffic lights.jpg", "mask_strategy": "0"}
a bicycle leaning against a fence in the snow{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bicycle leaning against a fence in the snow.jpg", "mask_strategy": "0"}
a blue fishing boat is navigating in the ocean next to a cruise ship{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue fishing boat is navigating in the ocean next to a cruise ship.jpg", "mask_strategy": "0"}
a blue car driving down a dirt road near train tracks{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue car driving down a dirt road near train tracks.jpg", "mask_strategy": "0"}
a sailboat is drifting on the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sailboat is drifting on the ocean.jpg", "mask_strategy": "0"}
a couple of boats floating on a body of water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a couple of boats floating on a body of water.jpg", "mask_strategy": "0"}
a city street with cars driving in the rain{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city street with cars driving in the rain.jpg", "mask_strategy": "0"}
a red and white tram traveling down a snowy street{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red and white tram traveling down a snowy street.jpg", "mask_strategy": "0"}
a city bus driving down a snowy street at night{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a city bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
a green toy car is sitting on the ground{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a green toy car is sitting on the ground.jpg", "mask_strategy": "0"}
a train traveling down tracks through the woods with leaves on the ground{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a train traveling down tracks through the woods with leaves on the ground.jpg", "mask_strategy": "0"}
a man in a small boat fishing in the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man in a small boat fishing in the ocean.jpg", "mask_strategy": "0"}
an airplane is flying through the sky at sunset{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an airplane is flying through the sky at sunset.jpg", "mask_strategy": "0"}
an old rusty car sits in the middle of a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an old rusty car sits in the middle of a field.jpg", "mask_strategy": "0"}
a motorcycle driving down a road{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a motorcycle driving down a road.jpg", "mask_strategy": "0"}
a blue train traveling through a lush green area{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a blue train traveling through a lush green area.jpg", "mask_strategy": "0"}
a white car is swiftly driving on a dirt road near a bush, kicking up dust{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a white car is swiftly driving on a dirt road near a bush, kicking up dust.jpg", "mask_strategy": "0"}
a large cargo ship sailing in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large cargo ship sailing in the water.jpg", "mask_strategy": "0"}
the red Alfa sports car is speeding down the road{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/the red Alfa sports car is speeding down the road.jpg", "mask_strategy": "0"}
two cars that have been involved in a violent collision{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two cars that have been involved in a violent collision.jpg", "mask_strategy": "0"}
a red double decker bus driving down a street{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a red double decker bus driving down a street.jpg", "mask_strategy": "0"}
A red sports car driving through sand, kicking up a large amount of dust{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A red sports car driving through sand, kicking up a large amount of dust.jpg", "mask_strategy": "0"}
a yellow toy car parked on a rock near the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a yellow toy car parked on a rock near the water.jpg", "mask_strategy": "0"}
a space shuttle taking off into the sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a space shuttle taking off into the sky.jpg", "mask_strategy": "0"}
a steam train traveling through the woods{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a steam train traveling through the woods.jpg", "mask_strategy": "0"}
a group of buses parked at a bus station{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of buses parked at a bus station.jpg", "mask_strategy": "0"}
A bunch of cars are driving on a highway{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A bunch of cars are driving on a highway.jpg", "mask_strategy": "0"}
a white and blue airplane flying in the sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a white and blue airplane flying in the sky.jpg", "mask_strategy": "0"}
A space station orbited above the Earth{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A space station orbited above the Earth.jpg", "mask_strategy": "0"}
A yellow boat is cruising in front of a bridge{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A yellow boat is cruising in front of a bridge.jpg", "mask_strategy": "0"}
tangerines in a metal bowl on a table{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/tangerines in a metal bowl on a table.jpg", "mask_strategy": "0"}
a shadow of a hand reaching for a leaf{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a shadow of a hand reaching for a leaf.jpg", "mask_strategy": "0"}
A teddy bear is climbing over a wooden fence{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A teddy bear is climbing over a wooden fence.jpg", "mask_strategy": "0"}
a book on fire with flames coming out of it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a book on fire with flames coming out of it.jpg", "mask_strategy": "0"}
a close-up of a pink rose with water droplets on it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up of a pink rose with water droplets on it.jpg", "mask_strategy": "0"}
a person is cooking meat on a grill with flames{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person is cooking meat on a grill with flames.jpg", "mask_strategy": "0"}
a snowman wearing a santa hat and scarf{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snowman wearing a santa hat and scarf.jpg", "mask_strategy": "0"}
a person holding a sparkler in their hand{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person holding a sparkler in their hand.jpg", "mask_strategy": "0"}
a teddy bear sitting on a moss covered ground{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a teddy bear sitting on a moss covered ground.jpg", "mask_strategy": "0"}
a statue of a lion is sitting on a pedestal{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a statue of a lion is sitting on a pedestal.jpg", "mask_strategy": "0"}
metal balls are suspended in the air{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/metal balls are suspended in the air.jpg", "mask_strategy": "0"}
a close up of a bunch of green grapes{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a bunch of green grapes.jpg", "mask_strategy": "0"}
a close-up view of a green plant with unfurled fronds{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a green plant with unfurled fronds.jpg", "mask_strategy": "0"}
an orange mushroom sitting on top of a tree stump in the woods{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an orange mushroom sitting on top of a tree stump in the woods.jpg", "mask_strategy": "0"}
a stack of pancakes covered in syrup and fruit{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stack of pancakes covered in syrup and fruit.jpg", "mask_strategy": "0"}
a plate of spaghetti with spinach and tomatoes{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a plate of spaghetti with spinach and tomatoes.jpg", "mask_strategy": "0"}
a pink lotus flower in the middle of a pond{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pink lotus flower in the middle of a pond.jpg", "mask_strategy": "0"}
a person holding a sparkler in front of a sunset{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person holding a sparkler in front of a sunset.jpg", "mask_strategy": "0"}
a pink rose is blooming in a garden{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pink rose is blooming in a garden.jpg", "mask_strategy": "0"}
a snow man holding a lantern in the snow{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snow man holding a lantern in the snow.jpg", "mask_strategy": "0"}
a stack of chocolate cookies with a bite taken out of it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a stack of chocolate cookies with a bite taken out of it.jpg", "mask_strategy": "0"}
a white plate topped with eggs, toast, tomatoes, and a sausage{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a white plate topped with eggs, toast, tomatoes, and a sausage.jpg", "mask_strategy": "0"}
a yellow water lily is floating in a pond{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a yellow water lily is floating in a pond.jpg", "mask_strategy": "0"}
an astronaut floating in space with the earth in the background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an astronaut floating in space with the earth in the background.jpg", "mask_strategy": "0"}
A little girl, lost in thought, is quietly sitting on the bus{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A little girl, lost in thought, is quietly sitting on the bus.jpg", "mask_strategy": "0"}
a man holding a tray in front of a brick wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man holding a tray in front of a brick wall.jpg", "mask_strategy": "0"}
an older man playing a saxophone on the street{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an older man playing a saxophone on the street.jpg", "mask_strategy": "0"}
an older man jogging by the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an older man jogging by the water.jpg", "mask_strategy": "0"}
a person riding a skateboard on a concrete floor{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person riding a skateboard on a concrete floor.jpg", "mask_strategy": "0"}
a woman with long black hair is posing for a picture{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman with long black hair is posing for a picture.jpg", "mask_strategy": "0"}
a woman sitting on the ground in front of a guitar{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman sitting on the ground in front of a guitar.jpg", "mask_strategy": "0"}
a little girl wearing a purple helmet riding a blue bike{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a little girl wearing a purple helmet riding a blue bike.jpg", "mask_strategy": "0"}
a young boy is jumping in the mud{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a young boy is jumping in the mud.jpg", "mask_strategy": "0"}
a man sitting in the driver's seat of a car wearing sunglasses{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man sitting in the driver's seat of a car wearing sunglasses.jpg", "mask_strategy": "0"}
a little boy jumping in the air over a puddle of water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a little boy jumping in the air over a puddle of water.jpg", "mask_strategy": "0"}
a woman with afro hair is smiling while wearing earphones{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman with afro hair is smiling while wearing earphones.jpg", "mask_strategy": "0"}
a smiling woman with her hands clasped{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a smiling woman with her hands clasped.jpg", "mask_strategy": "0"}
a young boy standing in a field with horses in the background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a young boy standing in a field with horses in the background.jpg", "mask_strategy": "0"}
a young man is covered in colored powder{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a young man is covered in colored powder.jpg", "mask_strategy": "0"}
a woman with curly hair is drinking a beer{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman with curly hair is drinking a beer.jpg", "mask_strategy": "0"}
an old man standing in the middle of a field holding a bunch of plants{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an old man standing in the middle of a field holding a bunch of plants.jpg", "mask_strategy": "0"}
a man standing on a boat with a net{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man standing on a boat with a net.jpg", "mask_strategy": "0"}
a woman in a hat is putting salt into a basket{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman in a hat is putting salt into a basket.jpg", "mask_strategy": "0"}
a young girl smelling a pink flower{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a young girl smelling a pink flower.jpg", "mask_strategy": "0"}
a young boy leaning on a wooden pole{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a young boy leaning on a wooden pole.jpg", "mask_strategy": "0"}
a man in a hat sitting in front of a brick oven{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man in a hat sitting in front of a brick oven.jpg", "mask_strategy": "0"}
a man in a mexican outfit holding an acoustic guitar{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man in a mexican outfit holding an acoustic guitar.jpg", "mask_strategy": "0"}
a snowboarder is in the air doing a trick{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a snowboarder is in the air doing a trick.jpg", "mask_strategy": "0"}
a man riding a horse with a spear in his hand{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man riding a horse with a spear in his hand.jpg", "mask_strategy": "0"}
a woman carrying a bundle of plants over their head{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman carrying a bundle of plants over their head.jpg", "mask_strategy": "0"}
a person jumping in the air over a fence{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person jumping in the air over a fence.jpg", "mask_strategy": "0"}
a man on a surfboard riding a wave in the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man on a surfboard riding a wave in the ocean.jpg", "mask_strategy": "0"}
a man sitting on steps playing an acoustic guitar{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man sitting on steps playing an acoustic guitar.jpg", "mask_strategy": "0"}
a man swinging a tennis racquet at a tennis ball{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man swinging a tennis racquet at a tennis ball.jpg", "mask_strategy": "0"}
a man riding a mountain bike on top of a rocky hill{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man riding a mountain bike on top of a rocky hill.jpg", "mask_strategy": "0"}
a man riding a bike down a street{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man riding a bike down a street.jpg", "mask_strategy": "0"}
a man is running on a dirt road{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man is running on a dirt road.jpg", "mask_strategy": "0"}
A man in a black suit and a sombrero, shouting loudly{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A man in a black suit and a sombrero, shouting loudly.jpg", "mask_strategy": "0"}
a man standing on top of a sand dune in the desert{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man standing on top of a sand dune in the desert.jpg", "mask_strategy": "0"}
a person riding a motorcycle down a road{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person riding a motorcycle down a road.jpg", "mask_strategy": "0"}
a man standing on top of a mountain with a backpack{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man standing on top of a mountain with a backpack.jpg", "mask_strategy": "0"}
a man with a skull face paint smoking a cigar and holding a guitar{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man with a skull face paint smoking a cigar and holding a guitar.jpg", "mask_strategy": "0"}
a man in sunglasses laying on a wooden bench{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man in sunglasses laying on a wooden bench.jpg", "mask_strategy": "0"}
an older woman sitting in a room with a cigarette in her hand{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an older woman sitting in a room with a cigarette in her hand.jpg", "mask_strategy": "0"}
a man sitting on the ground playing a musical instrument{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man sitting on the ground playing a musical instrument.jpg", "mask_strategy": "0"}
a person riding a horse in a polo match{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person riding a horse in a polo match.jpg", "mask_strategy": "0"}
a woman in a kimono holding an umbrella{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman in a kimono holding an umbrella.jpg", "mask_strategy": "0"}
a person riding a dirt bike{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person riding a dirt bike.jpg", "mask_strategy": "0"}
a person riding an atv on a dirt track{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person riding an atv on a dirt track.jpg", "mask_strategy": "0"}
a person riding a wave on a surfboard{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person riding a wave on a surfboard.jpg", "mask_strategy": "0"}
a woman in a wetsuit is swimming in the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman in a wetsuit is swimming in the ocean.jpg", "mask_strategy": "0"}
a man snorkling in the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man snorkling in the ocean.jpg", "mask_strategy": "0"}
a beautiful woman in a blue sari posing in front of a wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a beautiful woman in a blue sari posing in front of a wall.jpg", "mask_strategy": "0"}
a woman wearing a shawl in front of a mountain{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman wearing a shawl in front of a mountain.jpg", "mask_strategy": "0"}
a woman is making bread in an oven{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman is making bread in an oven.jpg", "mask_strategy": "0"}
a woman smiles while holding a yellow flower{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman smiles while holding a yellow flower.jpg", "mask_strategy": "0"}
A young boy is lifting a bundle of dry grass wrapped in waterproof fabric over his head{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A young boy is lifting a bundle of dry grass wrapped in waterproof fabric over his head.jpg", "mask_strategy": "0"}
two people performing a sword fight in front of a forest{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people performing a sword fight in front of a forest.jpg", "mask_strategy": "0"}
a woman in a colorful shirt is cooking food{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman in a colorful shirt is cooking food.jpg", "mask_strategy": "0"}
an older woman is drinking a bottle of water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an older woman is drinking a bottle of water.jpg", "mask_strategy": "0"}
a smiling woman sitting at a table with food and drinks{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a smiling woman sitting at a table with food and drinks.jpg", "mask_strategy": "0"}
a woman wearing a hijab reading a book on the beach{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman wearing a hijab reading a book on the beach.jpg", "mask_strategy": "0"}
a woman wearing a headscarf is reaching for an olive tree{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman wearing a headscarf is reaching for an olive tree.jpg", "mask_strategy": "0"}
a woman in a white dress jumping in the air in a field of pink flowers{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman in a white dress jumping in the air in a field of pink flowers.jpg", "mask_strategy": "0"}
a woman wearing a conical hat sits on a boat{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman wearing a conical hat sits on a boat.jpg", "mask_strategy": "0"}
an older woman sitting in front of an old building{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an older woman sitting in front of an old building.jpg", "mask_strategy": "0"}
a woman is praying in front of a buddhist temple{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman is praying in front of a buddhist temple.jpg", "mask_strategy": "0"}
a woman with green hair smiling for the camera{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman with green hair smiling for the camera.jpg", "mask_strategy": "0"}
A group of people in a yellow raft is rowing through turbulent waters{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A group of people in a yellow raft is rowing through turbulent waters.jpg", "mask_strategy": "0"}
a man carrying a woman on his back in a field{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man carrying a woman on his back in a field.jpg", "mask_strategy": "0"}
an indian police officer talking to an old woman{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an indian police officer talking to an old woman.jpg", "mask_strategy": "0"}
two people scuba diving in the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two people scuba diving in the ocean.jpg", "mask_strategy": "0"}
A man and woman dressed as sugar skulls in a field of flowers, sharing a loving gaze with each other{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A man and woman dressed as sugar skulls in a field of flowers, sharing a loving gaze with each other.jpg", "mask_strategy": "0"}
a group of people watching a cow race{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people watching a cow race.jpg", "mask_strategy": "0"}
a man and a child riding bumper cars in an amusement park{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man and a child riding bumper cars in an amusement park.jpg", "mask_strategy": "0"}
a group of motorcyclists racing on a dirt track{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of motorcyclists racing on a dirt track.jpg", "mask_strategy": "0"}
a man and a woman are boxing in a boxing ring{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man and a woman are boxing in a boxing ring.jpg", "mask_strategy": "0"}
a man holding a baby in his arms{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man holding a baby in his arms.jpg", "mask_strategy": "0"}
a man and a woman sitting on a bench playing instruments{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man and a woman sitting on a bench playing instruments.jpg", "mask_strategy": "0"}
two men are standing next to each other with a bicycle{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two men are standing next to each other with a bicycle.jpg", "mask_strategy": "0"}
a man and a boy sitting on a beach near the ocean{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man and a boy sitting on a beach near the ocean.jpg", "mask_strategy": "0"}
two men in white clothing standing next to each other{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two men in white clothing standing next to each other.jpg", "mask_strategy": "0"}
a group of men riding horses in a dusty arena{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of men riding horses in a dusty arena.jpg", "mask_strategy": "0"}
a soccer player in a yellow and black shirt is chasing a soccer ball{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a soccer player in a yellow and black shirt is chasing a soccer ball.jpg", "mask_strategy": "0"}
a group of women sitting on the steps of a building{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of women sitting on the steps of a building.jpg", "mask_strategy": "0"}
a group of people gathered around a red checkered blanket{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people gathered around a red checkered blanket.jpg", "mask_strategy": "0"}
a group of people in orange jumpsuits running along a river{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people in orange jumpsuits running along a river.jpg", "mask_strategy": "0"}
a woman walking down a sidewalk with a bag{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman walking down a sidewalk with a bag.jpg", "mask_strategy": "0"}
a busy street with cars and people on motorcycles{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a busy street with cars and people on motorcycles.jpg", "mask_strategy": "0"}
a man in a mask is walking through a crowd of people{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man in a mask is walking through a crowd of people.jpg", "mask_strategy": "0"}
a man and a woman walking under an umbrella next to a brick wall{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a man and a woman walking under an umbrella next to a brick wall.jpg", "mask_strategy": "0"}
a group of people riding bikes down a street{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of people riding bikes down a street.jpg", "mask_strategy": "0"}
An old person is holding a cup on the street, and people around are curiously looking at him{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/An old person is holding a cup on the street, and people around are curiously looking at him.jpg", "mask_strategy": "0"}
two young girls playing with leaves in the woods{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two young girls playing with leaves in the woods.jpg", "mask_strategy": "0"}
One person is riding on the back of a horse led by another person{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/One person is riding on the back of a horse led by another person.jpg", "mask_strategy": "0"}
an older woman and a young girl are knitting together{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/an older woman and a young girl are knitting together.jpg", "mask_strategy": "0"}
three geishas walking down the street in traditional clothing{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/three geishas walking down the street in traditional clothing.jpg", "mask_strategy": "0"}
two men riding bikes down a road near a forest{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two men riding bikes down a road near a forest.jpg", "mask_strategy": "0"}
two women carrying bowls on their heads{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two women carrying bowls on their heads.jpg", "mask_strategy": "0"}
two women eating pizza at a restaurant{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two women eating pizza at a restaurant.jpg", "mask_strategy": "0"}
two young women studying in a library{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two young women studying in a library.jpg", "mask_strategy": "0"}
pink water lilies in a pond with leaves{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/pink water lilies in a pond with leaves.jpg", "mask_strategy": "0"}
a group of succulents in a rock garden{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of succulents in a rock garden.jpg", "mask_strategy": "0"}
a close up view of a bunch of snowdrop flowers{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up view of a bunch of snowdrop flowers.jpg", "mask_strategy": "0"}
a close up of leaves with water droplets on them{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of leaves with water droplets on them.jpg", "mask_strategy": "0"}
a close-up of a sea anemone in the water{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up of a sea anemone in the water.jpg", "mask_strategy": "0"}
a plant with water droplets on it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a plant with water droplets on it.jpg", "mask_strategy": "0"}
a group of cactus plants in the desert{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of cactus plants in the desert.jpg", "mask_strategy": "0"}
a close-up view of a plant with spiky leaves{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a plant with spiky leaves.jpg", "mask_strategy": "0"}
A budding and blossoming flower bud seedling{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A budding and blossoming flower bud seedling.jpg", "mask_strategy": "0"}
a field of orange flowers near the ocean'{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a field of orange flowers near the ocean'.jpg", "mask_strategy": "0"}
a close-up view of a bunch of pink flowers{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close-up view of a bunch of pink flowers.jpg", "mask_strategy": "0"}
pink water lilies in a pond{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/pink water lilies in a pond.jpg", "mask_strategy": "0"}
reeds blowing in the wind against a cloudy sky{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/reeds blowing in the wind against a cloudy sky.jpg", "mask_strategy": "0"}
two tall cacti in the middle of the desert{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two tall cacti in the middle of the desert.jpg", "mask_strategy": "0"}
a sea anemone on a coral reef{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a sea anemone on a coral reef.jpg", "mask_strategy": "0"}
a dandelion blowing in the wind{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a dandelion blowing in the wind.jpg", "mask_strategy": "0"}
A boiling pot cooking vegetables{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A boiling pot cooking vegetables.jpg", "mask_strategy": "0"}
a woman stirring food in a pan on the stove{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a woman stirring food in a pan on the stove.jpg", "mask_strategy": "0"}
two eggs are fried in a frying pan on the stove{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/two eggs are fried in a frying pan on the stove.jpg", "mask_strategy": "0"}
fried onion rings in a basket{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/fried onion rings in a basket.jpg", "mask_strategy": "0"}
a pot is sitting on top of a campfire{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pot is sitting on top of a campfire.jpg", "mask_strategy": "0"}
a chef is preparing a dish with mushrooms on a wooden board{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a chef is preparing a dish with mushrooms on a wooden board.jpg", "mask_strategy": "0"}
a hand holding a slice of pizza{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a hand holding a slice of pizza.jpg", "mask_strategy": "0"}
A person is using tongs to pick up meat from a plate{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A person is using tongs to pick up meat from a plate.jpg", "mask_strategy": "0"}
The meat is picked up from the grill with tongs{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/The meat is picked up from the grill with tongs.jpg", "mask_strategy": "0"}
A person is whisking eggs, and the egg whites and yolks are gently streaming out{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A person is whisking eggs, and the egg whites and yolks are gently streaming out.jpg", "mask_strategy": "0"}
a person is putting sauce on a burger{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person is putting sauce on a burger.jpg", "mask_strategy": "0"}
A person is making dumplings{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A person is making dumplings.jpg", "mask_strategy": "0"}
a pan filled with fried food{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a pan filled with fried food.jpg", "mask_strategy": "0"}
Chopsticks are slowly picking up the buns from the plastic container{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Chopsticks are slowly picking up the buns from the plastic container.jpg", "mask_strategy": "0"}
a basket of french fries in a fryer{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a basket of french fries in a fryer.jpg", "mask_strategy": "0"}
a table with lobsters and drinks on it{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a table with lobsters and drinks on it.jpg", "mask_strategy": "0"}
a person pouring coffee into a pot on a stove{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person pouring coffee into a pot on a stove.jpg", "mask_strategy": "0"}
a kettle is sitting on top of a campfire{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a kettle is sitting on top of a campfire.jpg", "mask_strategy": "0"}
Chopsticks are picking up noodles from the bowl{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/Chopsticks are picking up noodles from the bowl.jpg", "mask_strategy": "0"}
a person is cooking eggs on an outdoor grill{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person is cooking eggs on an outdoor grill.jpg", "mask_strategy": "0"}
a person is cooking food in a wok on a stove{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person is cooking food in a wok on a stove.jpg", "mask_strategy": "0"}
a person is holding up a burger with his hands{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person is holding up a burger with his hands.jpg", "mask_strategy": "0"}
A person is pouring water into a teacup{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/A person is pouring water into a teacup.jpg", "mask_strategy": "0"}
a person pouring seasoning into a pot of food{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person pouring seasoning into a pot of food.jpg", "mask_strategy": "0"}
a person holding a taco in their hand{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person holding a taco in their hand.jpg", "mask_strategy": "0"}
a person slicing salmon on a cutting board{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person slicing salmon on a cutting board.jpg", "mask_strategy": "0"}
a bunch of food is cooking on a grill over an open fire{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a bunch of food is cooking on a grill over an open fire.jpg", "mask_strategy": "0"}
a close up of a piece of sushi on chopsticks{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a close up of a piece of sushi on chopsticks.jpg", "mask_strategy": "0"}
a group of pots on a stove with flames in the background{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a group of pots on a stove with flames in the background.jpg", "mask_strategy": "0"}
a person cooking vegetables in a pan on a stove{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person cooking vegetables in a pan on a stove.jpg", "mask_strategy": "0"}
a large pot of soup filled with vegetables and meat{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a large pot of soup filled with vegetables and meat.jpg", "mask_strategy": "0"}
a person holding chopsticks over a bowl of food{"reference_path": "/mnt/jfs-hdd/sora/data/vbench-i2v/crop/1-1/a person holding chopsticks over a bowl of food.jpg", "mask_strategy": "0"}


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_category/animal.txt
================================================
a black dog wearing halloween costume
spider making a web
bat eating fruits while hanging
a snake crawling on a wooden flooring
a close up video of a dragonfly
macro shot of ladybug on green leaf plant
chameleon eating ant
a bee feeding on nectars
bird nests on a tree captured with moving camera
a squirrel eating nuts
close up video of snail
top view of a hermit crab crawling on a wooden surface
cat licking another cat
red dragonfly perched on green leaf
close up view of a brown caterpillar crawling on green leaf
ants eating dead spider
an eagle on a tree branch
a frog eating an ant
white rabbit near the fence
a gorilla eating a carrot
close up of wolf
a meerkat looking around
a hyena in a zoo
lemur eating grass leaves
an owl being trained by a man
a lizard on a bamboo
brown chicken hunting for its food
video of parrots perched on bird stand
underwater footage of an octopus in a coral reef
a cute pomeranian dog playing with a soccer ball
white fox on rock
close up footage of a horse figurine
giraffe feeding on a tree in a savannah
curious cat sitting and looking around
hummingbird hawk moth flying near pink flowers
close up of a scorpion on a rock
close up on fish in net
koala eating leaves from a branch
a pod of dolphins swirling in the sea catching forage fish
low angle view of a hawk perched on a tree branch
a lion standing on wild grass
deer grazing in the field
elephant herd in a savanna
close up on lobster under water
hedgehog crossing road in forest
a sheep eating yellow flowers from behind a wire fence
twin sisters and a turtle
a pig wallowing in mud
flock of goose eating on the lake water
cow in a field irritated with flies
a close up shot of a fly
cheetah lying on the grass
close up of a lemur
close up shot of a kangaroo itching in the sand
a tortoise covered with algae
turkey in cage
a great blue heron bird in the lakeside
crab with shell in aquarium
a seagull walking on shore
an american crocodile
a tiger walking inside a cage
alligator in the nature
a raccoon climbing a tree
wild rabbit in a green meadow
group of ring tailed lemurs
a clouded leopard on a tree branch
duck grooming its feathers
an african penguin walking on a beach
a video of a peacock
close up shot of a wild bear
baby rhino plays with mom
porcupine climbs tree branches
close up of a natterjack toad on a rock
a sleeping orangutan
mother whale swimming with babies
a bear wearing red jersey
pink jellyfish swimming underwater in a blue sea
beautiful clown fish swimming
animation of disposable objects shaped as a whale
paper cut out of a pair of hands a whale and a heart
vertical video of camel roaming in the field during daytime
a still video of mosquito biting human
a curious sloth hanging from a tree branch
a plastic flamingo bird stumbles from the wind
a wolf in its natural habitat
a monkey sitting in the stone and scratching his head
bat hanging upside down
a red panda eating leaves
snake on ground
a harbour seal swimming near the shore
shark swimming in the sea
otter on branch while eating
goat standing over a rock
a troop of monkey on top of a mountain
a zebra eating grass on the field
a colorful butterfly perching on a bud
a snail crawling on a leaf
zookeeper showering a baby elephant
a beetle emerging from the sand
a nine banded armadillo searching for food


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_category/architecture.txt
================================================
an apartment building with balcony
asian garden and medieval castle
illuminated tower in berlin
a wooden house overseeing the lake
a crowd of people in a plaza in front of a government building
a church interior
jewish friends posing with hanukkah menorah in a cabin house
a destroyed building after a missile attack in ukraine
abandoned building in the woods
drone video of an abandoned school building in pripyat ukraine
elegant university building
architecture and designs of buildings in central london
a pancake tower with chocolate syrup and strawberries on top
an ancient white building
friends hanging out at a coffee house
house front door with christmas decorations
city night dark building
a bird house hanging on a tree branch
sacred sculpture in a temple
high angle shot of a clock tower
modern wooden house interior
the interior of an abandoned building
opera house overlooking sea
a concrete structure near the green trees
dome like building in scotland
low angle shot of a building
tower on hill
a miniature house
eiffel tower from the seine river
low angle footage of an apartment building
island with pier and antique building
asian historic architecture
drone footage of a beautiful mansion
mosque in the middle east
building a tent and hammock in the forest camping site
top view of a high rise building
house covered in snow
skyscraper at night
house in village
a casino with people outside the building
silhouette of a building
a woman climbing a tree house
drone view of house near lake during golden hour
an under construction concrete house
a watch tower by the sea
exterior view of arabic style building
video of a hotel building
red paper lantern decorations hanging outside a building
house on seashore
aerial footage of the palace of culture and science building in warsaw poland
aerial video of stuttgart tv tower in germany
aerial view of the highway and building in a city
drone shot of a skyscraper san francisco california usa
waterfall and house
view of the sky through a building
drone footage of a house on top of the mountain
abandoned house in the nature
clouds hovering over a mansion
light house on the ocean
buddhist temple at sunrise
people walking by a graveyard near a mosque at sunset
view of lifeguard tower on the beach
scenic view of a house in the mountains
the landscape in front of a government building
aerial footage of a building and its surrounding landscape in winter
time lapse of a cloudy sky behind a transmission tower
blue ocean near the brown castle
fog over temple
house in countryside top view
building under construction
turkish flag waving on old tower
the georgian building
close up shot of a steel structure
the atrium and interior design of a multi floor building
city view reflected on a glass building
aerial view of a luxurious house with pool
an unpaved road leading to the house
drone footage of a lookout tower in mountain landscape
wind turbines on hill behind building
time lapse footage of the sun light in front of a small house porch
a building built with lots of stairways
overcast over house on seashore
the view of the sydney opera house from the other side of the harbor
candle on a jar and a house figurine on a surface
video of a farm and house
a dilapidated building made of bricks
a view of a unique building from a moving vehicle
aerial footage of a tall building in cambodia
push in shot of a huge house
a beach house built over a seawall protected from the sea waves
exotic house surrounded by trees
drone video of a house surrounded by tropical vegetation
drone footage of a building beside a pond
observation tower on hill in forest
a tree house in the woods
a video of vessel structure during daytime
fire in front of illuminated building at night
a footage of a wooden house on a wheat field
tilt shot of a solar panel below a light tower
water tower on the desert


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_category/food.txt
================================================
freshly baked finger looking cookies
video of fake blood in wine glass
halloween food art
a person slicing a vegetable
a serving of pumpkin dish in a plate
close up view of green leafy vegetable
a birthday cake in the plate
video of a slice papaya fruit
a muffin with a burning candle and a love sign by a ceramic mug
a jack o lantern designed cookie
baked bread with chocolate
a broccoli soup on wooden table
a freshly brewed coffee on a pink mug
grabbing sourdough neapolitan style pizza slices
person cooking mushrooms in frying pan
rice grains placed on a reusable cloth bag
slices of kiwi fruit
grilling a steak on a pan grill
close up of bread popping out of a toaster
man eating noodle
preparing a cocktail drink
close up pasta with bacon on plate
milk and cinnamon rolls
boy getting a dumpling using chopsticks
a mother preparing food with her kids
man using his phone while eating
fresh salmon salad on a plate
cutting cucumbers into long thin slices as ingredient for sushi roll
a steaming cup of tea by the window
a glass filled with beer
a kid eating popcorn while watching tv
close up shot of fried fish on the plate
a man eating a donut
person making a vegetarian dish
spreading cheese on bagel
close up view of a man drinking red wine
a couple having breakfast in a restaurant
a student eating her sandwich
girl peeling a banana
red rice in a small bowl
pancake with blueberry on the top
green apple fruit on white wooden table
a man eating a taco by the bar
making of a burrito
squeezing lemon into salad
a chef cutting sushi rolls
video of a delicious dessert
deep frying a crab on a wok in high fire
close up video of a orange juice
video of a cooked chicken breast
woman holding a pineapple
a woman eating a bar of chocolate
decorating christmas cookie
squeezing a slice of fruit
tuna sashimi on a plate
a strawberry fruit mixed in an alcoholic drink
preparing hot dogs in a grill
a woman cutting a tomato
an orange fruit cut in half
a coconut fruit with drinking straw
woman holding a dragon fruit
a woman pouring hot beverage on a cup
waffles with whipped cream and fruit
focus shot of an insect at the bottom of a fruit
preparing a healthy broccoli dish
man eating snack at picnic
close up video of a grilled shrimp skewer
a woman mixing a smoothie drinks
close up video of woman having a bite of jelly
businessman drinking whiskey at the bar counter of a hotel lounge
cutting an onion with a knife over a wooden chopping board
fresh lemonade in bottles
grilling a meat on a charcoal grill
people enjoying asian cuisine
close up footage of a hot dish on a clay pot
pork ribs dish
waffle with strawberry and syrup for breakfast
tofu dish with rose garnish
uncooked pork meat
egg yolk being dumped over gourmet dish
tasty brunch dish close up
little boy pretending to eat the watermelon
slicing roasted beef
close up of a chef adding teriyaki sauce to a dish
flat lay mexican dish
a person placing an octopus dish on a marble surface
close up of tea leaves brewing in a glass kettle
adding fresh herbs to soup dish
a scoop of roasted coffee beans
fresh dim sum set up on a bamboo steam tray for cooking
a girl putting ketchup on food at the kitchen
cooking on electric stove
a woman with a slice of a pie
grapes and wine on a wooden board
man taking picture of his food
hamburger and fries on restaurant table
close up video of japanese food
a cracker sandwich with cheese filling for snack
barista preparing matcha tea
close up of onion rings being deep fried


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_category/human.txt
================================================
people carving a pumpkin
people sitting on a sofa
a man with a muertos face painting
man walking in the dark
men in front of their computer editing photos
men loading christmas tree on tow truck
woman washing the dishes
woman adding honey to the cinnamon rolls
two women kissing and smiling
three women looking at watercolor paintings
a family wearing paper bag masks
a family posing for the camera
a boy covering a rose flower with a dome glass
boy sitting on grass petting a dog
a girl in her tennis sportswear
a girl coloring the cardboard
silhouette of the couple during sunset
couple dancing with body paint
a child playing with water
a woman with her child sitting on a couch in the living room
a group of friend place doing hand gestures of agreement
friends having a group selfie
friends talking while on the basketball court
group of people protesting
a group of campers with a cute dog
a group of photographers taking pictures at the north western gardens in llandudno north wales
a group of students laughing and talking
a group of martial artist warming up
a person playing golf
a person walking on a wet wooden bridge
person doing a leg exercise
ice hockey athlete on rink
a young athlete training in swimming
chess player dusting a chessboard
baseball player holding his bat
a bearded man putting a vinyl record on a vinyl player
an orchestra finishes a performance
people applauding the performance of the kids
band performance at the recording studio
father and his children playing jenga game
people playing a board game
man playing a video game
a man video recording the movie in theater
man and a woman eating while watching a movie
movie crew talking together
a director explaining the movie scene
man and woman listening to music on car
man playing music
couple dancing slow dance with sun glare
a ballerina practicing in the dance studio
father and son holding hands
father and daughter talking together
a mother and her kids engaged in a video call
mother and daughter reading a book together
a mother teaching her daughter playing a violin
kid in a halloween costume
a happy kid playing the ukulele
a chef slicing a cucumber
chef wearing his gloves properly
brother and sister using hammock
girl applying sunblock to her brother
a girl pushing the chair while her sister is on the chair
colleagues talking in office building
fighter practice kicking
a woman fighter in her cosplay costume
an engineer holding blueprints while talking with her colleague
a young woman looking at vr controllers with her friend
workmates teasing a colleague in the work
a male police officer talking on the radio
teacher holding a marker while talking
teacher writing on her notebook
a young student attending her online classes
a student showing his classmates his wand
a male vendor selling fruits
a shirtless male climber
a sound engineer listening to music
female talking to a psychiatrist in a therapy session
young female activist posing with flag
a man in a hoodie and woman with a red bandana talking to each other and smiling
a medium close up of women wearing kimonos
a male interviewer listening to a person talking
a social worker having a conversation with the foster parents
a farm worker harvesting onions
worker packing street food
worker and client at barber shop
elderly man lifting kettlebell
mom assisting son in riding a bicycle
dad watching her daughter eat
young guy with vr headset
pregnant woman exercising with trainer
a fortune teller talking to a client
wizard doing a ritual on a woman
a footage of an actor on a movie scene
a man holding a best actor trophy
a singer of a music band
a young singer performing on stage
young dancer practicing at home
seller showing room to a couple
cab driver talking to passenger
a policeman talking to the car driver


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_category/lifestyle.txt
================================================
kids celebrating halloween at home
little boy helping mother in kitchen
video of a indoor green plant
a girl arranges a christmas garland hanging by the kitchen cabinet
candle burning in dark room
couple having fun and goofing around the bedroom
girls jumping up and down in the bedroom
woman and man in pajamas working from home
a muslim family sitting and talking in the living room
family enjoying snack time while sitting in the living room
woman holding an animal puppet and a little girl playing together at the living room
kids playing in the indoor tent
young people celebrating new year at the office
a woman writing on the sticky note in the office
a woman exercising at home over a yoga mat
girls preparing easter decorations at home
dog on floor in room
turning on a fluorescent light inside a room
colleagues talking to each other near the office windows
a woman recording herself while exercising at home
music room
different kind of tools kept in a utility room
sofa beds and other furniture
a girl finding her brother reading a book in the bedroom
an elegant ceramic plant pot and hanging plant on indoor
furniture inside a bedroom
interior design of the bar section
living room with party decoration
firewood burning in dark room
a young woman playing the ukulele at home
woman painting at home
a woman in a locker room
video of a bathroom interior
the interior design of a jewish synagogue
a woman in protective suit disinfecting the kitchen
modern minimalist home interior
modern interior design of a coffee shop
person arranging minimalist furniture
aerial shot of interior of the warehouse
a room of a manufacturing facility
interior of catholic
interior design of a restaurant
a female model in a changing room looking herself in mirror
men walking in the office hallway
people sitting in a conference room
the interior design of a shopping mall
chandeliers in room
lucerne railway station interior
a female fencer posing in a foggy room
a toolbox and a paint roller beside a huge package in a room
bedroom in hotel
a woman lying in the operating room
a chef holding and checking kitchen utensils
a couple singing in the shower room together
a woman cleaning mess in the living room
an empty meeting room with natural light
person dancing in a dark room
close up on blood in hospital room
a couple resting on their home floor
a young female staff at courier office
a man entering the gym locker room
a bored man sitting by the tv at home
woman dancing in indoor garden
rubble in the interior of an abandoned house
indoor farm in a greenhouse
man doing handstand in indoor garden
an abandoned indoor swimming pool
home decorations on top of a cabinet
graffiti art on the interior walls of an abandoned mansion
indoor wall climbing activity
sunlight inside a room
teenage girl roller skating at indoor rink
home deco with lighted
baby in the shower room
men enjoying office christmas party
a bedroom with a brick wall
actors prepping in the dressing room
kids playing at an indoor playground
a person sanitizing an office space using smoke machine
mother and daughter choosing clothes at home
a woman sitting by the indoor fire pit
man standing on the corner of the room while looking around
person assembling furniture
a family stacking cardboard boxes in a room
family having fun in the dining room
person disinfecting a room
a woman washing strawberries in the kitchen sink
modern office waiting room
close up view of a person slicing with a kitchen knife
boiling coffee on a stove in the kitchen
modern equipment used in a home studio
interior of a recording studio
people working in a call center office
band performing at a home concert
a group of people watching a concert in a room
people packing their furniture
young employees in office holding a certificate
a criminal inside a dark room handcuffed in a table
couple browsing and looking for furniture in the store
workspace at home


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_category/plant.txt
================================================
video of a indoor green plant
close up view of a plant
close up shot of a burning plant
plucking leaves from plant
a plant on gold pot with glass lid
a branch of a tree and a plant
a leafless tree
close up shot of fern leaf
close up video of strawberry plant
plant with blooming flowers
close up video of flower petals
watering yellow plant
beautiful flower decoration
cannabis flower in a jar
a footage of the tree leaves
a red leaf plant
close up view of a white christmas tree
snow pouring on a tree
close up shot of white flowers on the tree
leaves in the trees daytime
a dead tree lying on a grass field
tree branches in a flowing river
purple flowers with leaves
a coconut tree by the house
close up on flower in winter
bamboo leaves backlit by the sun
close up video of a wet flower
a man putting a flower in a box
dropping flower petals on a wooden bowl
a close up shot of gypsophila flower
variety of succulent plants on a garden
variety of trees and plants in a botanical garden
forest of deciduous trees
a stack of dried leaves burning in a forest
tall forest trees on a misty morning
close up view of dewdrops on a leaf
close up view of white petaled flower
removing a pineapple leaf
a dragonfly perched on a leaf
butterfly pollinating flower
person visiting and checking a corn plant
woman picking beans from a plant
woman plucking mint leaves
single tree in the middle of farmland
a plant on a soil
drone footage of a tree on farm field
a tractor harvesting lavender flower
people putting christmas ornaments on a christmas tree
jack o lantern hanging on a tree
tree with halloween decoration
flower field near the waterfall
truck carrying the tree logs
raindrops falling on leaves
shot of a palm tree swaying with the wind
squirrels on a tree branch
person holding a flower
a fallen tree trunk
tree with golden leaves
cherry tree
wind blows through leaves of the tree in autumn
a leaf on a glass
the long trunks of tall trees in the forest
trees in the forest during sunny day
close up video of tree bark
reflection of tree branches
trunks of many trees in the forest
tree leaves providing shades from the sun
leaves swaying in the wind
low angle shot of baobab tree
bare trees in forest
a plant surrounded by fallen leaves
a couple preparing food and pruning a plant
a man cutting a tree bark
oranges on a tree branch
plant connected on the stones
video of a sawmill machine cutting tree log
women drying flower petals
macro view of an agave plant
a video of a person tying a plant on a string
green moss in forest nature
coconut tree near sea under blue sky
the canopy of a coconut tree
a man leaning on a tree at the beach
a full grown plant on a pot
candle wax dripping on flower petals
close up of leaves in autumn
a woman opening a book with a flower inside
a man holding leaves looking at the camera
a shadow of a swaying plant
a tree and concrete structure under a blue and cloudy sky
trimming excess leaves on a potted plant
the changing color of the tree leaves during autumn season
a gooseberry tree swayed by the wind
forest trees and a medieval castle at sunset
woman cut down tree
an old oak tree in a park across the street from a hotel
wild flowers growing in a forest ground
a mossy fountain and green plants in a botanical garden
mansion with beautiful garden
ants on a dragon fruit flower


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_category/scenery.txt
================================================
scenery of desert landscape
landscape agriculture farm tractor
burning slash piles in the forest
graveyard at sunset
view of a jack o lantern with pumpkins in a smoky garden
sun view through a spider web
view of the sea from an abandoned building
close up view of a full moon
close up view of lighted candles
close up view of swaying white flowers and leaves
scenery of a relaxing beach
selective focus video of grass during sunny day
aerial view of brown dry landscape
fireworks display in the sky at night
a bonfire near river
mountain view
waterfalls in between mountain
a picturesque view of nature
exotic view of a riverfront city
tall trees in the forest under the clear sky
snow on branches in forest
stream in the nature
an airplane flying above the sea of clouds
scenic video of sunset
view of houses with bush fence under a blue and cloudy sky
scenic view from wooden pathway
scenic view of a tropical beach
drone footage of waves crashing on beach shore
a scenic view of the golden hour at norway
time lapse video of foggy mountain forest
brown mountain during fall season
video of ocean during daytime
boat sailing in the ocean
top view of yachts
beautiful scenery of flowing waterfalls and river
wild ducks paddling on the lake surface
a relaxing scenery of beach view under cloudy sky
natural rock formations on beach under cloudy sky
a palm tree against blue sky
video of sailboat on a lake during sunset
aerial view of snow piles
time lapse of a sunset sky in the countryside
aerial footage of a statue
time lapse video of a farm during sunset
clouds formation in the sky at sunset
aerial shot of a village
drone shot of a beautiful sunrise at the mountains
time lapse video of foggy morning during sunrise
sun shining between tree leaves at sunrise
video of lake during dawn
vehicles traveling on roadway under cloudy sky
view of golden domed church
a monument under the blue sky
firecrackers in the sky
view of fruit signage in the farm
a dark clouds over shadowing the full moon
view of the amazon river
a big river swamp in a dense forest
a blooming cherry blossom tree under a blue sky with white clouds
a river waterfall cascading down the plunge basin
flooded landscape with palm trees
a blurry waterfall background
waterfall in the mountains
aerial footage of a city at night
pond by small waterfall in forest
aerial view of farmlands at the bay of lake
rice terraces in the countryside
a highway built across an agricultural area in the countryside
gloomy morning in the countryside
drone shot of an abandoned coliseum on a snowy mountain top
boat sailing in the middle of ocean
drone shot of the grass field
natural landscape of mountain and sea with islets developed into a community
aerial view of zaporizhia in ukraine
aerial footage of a herd
an aerial footage of a red sky
grass and plants growing in the remains of an abandoned house
view from hill on city
aerial view on orthodox church
aerial view of bay in croatia
a footage of a frozen river
overlooking view of a city at daylight
view outside the cemetery
clear sky with moon over meadow
clouds over railway
aerial footage of moving vehicles on the road at night
aerial view of town and park
top view of skyscrapers
top view of the empire state building in manhattan
top view of the central park in new york city
sheep running in a grass field
clear sky over factory
smoke and fire in birds eye view
view of a pathway with snow melting on its side
ferry under bridge on river near city in malaysia
mountain slopes covered in green vegetation
panoramic view of a town surrounded by snow covered mountains
aerial view of a palace
top view of vehicles driving on the intersection
a graveyard by a church in a mountain landscape


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_category/vehicles.txt
================================================
a modern railway station in malaysia use for public transportation
drone footage of amsterdam metro station
train arriving at a station
red vehicle driving on field
close up view of flashing emergency vehicle lighting
vehicle with fertilizer on field
a highway built across an agricultural area in the countryside
drone footage of motorcycles driving on country road between agricultural fields
a road in the woods under fog
footage of a car driving through a wheat field
vehicle stops for an ambulance passing through city traffic
emergency vehicle parked outside the casino
zombies attacking a woman and a boy inside a car
woman seating inside the car while chewing
video of passengers riding a double decker bus during night
traffic in london street at night
elderly couple checking engine of automobile
a green vintage automobile with an open hood parked in a parking area
close up of a prototype automobile with exposed engine on the back seat of the car
aerial view of road in forest
train departing from station
aerial view of a train passing by a bridge
video of a train tracks
video footage of a subway
video of blinking traffic lights
couple walking out on the subway
time lapse of a subway tunnel
monitor board inside the subway
metro train at night
zoom in video of a tram passing by city
young man using laptop in the tram
man reading a book at bus stop
close up shot of a moving taxi
night travel in london street on a public bus
red bus in a rainy city
flow of traffic in the city
close up shot of a yellow taxi turning left
two women calling for a taxi
drone view of an illuminated bridge across a river
policeman in police car talking on radio
airplane taking off at night
view through window in airplane
an airplane in the sky
helicopter landing on the street
a pilot getting out of a helicopter
a helicopter flying under blue sky
boat sailing in the middle of the ocean
girl playing with a toy boat
silhouette of a boat on sea during golden hour
a boat travelling around the lake
road on mountain ridge
ship sailing on danube river
slow motion video of a ship water trail in the sea
drone footage of a wreck ship on shore
a white yacht traveling on a river and passing under the bridge
female teenagers drinking champagne in the yacht
video of yacht sailing in the ocean
red combine harvester on road on field
a woman sitting on a bicycle while using a mobile phone
a woman sitting on a motorcycle looking around
three teenagers fixing a bicycle
a woman in a halloween costume posing on a motorcycle
a parked motorcycle on a foggy roadside
cable car near sea shore
a truck travelling in the road
footage of the road without any traffic
a road sign
love padlocks on a bridge
camera moving at highway construction site
vehicles driving on highway
a motorbike on highway at timelapse mode
point of view of a car driving through a tunnel
time lapse of heavy traffic on an avenue
ferry boat on city canal
black vintage car in museum
a zigzag road across a forest
people crossing the road
video of a kayak boat in a river
a person paddling a wooden boat in a lake
a car charging in the parking area
cars parked on the road
footage of the street with people and vehicle passing by in the rain
traffic on busy city street
a woman getting out of the car to walk with their dog
yacht sailing through the ocean
people in queue to military ship
man wearing motorcycle helmet looking at the camera
empty seats in the bus
empty boat on the water
cargo train traveling on the mountainside
cruise ship in harbor
counting down at traffic lights
pressing the car ignition
fire truck driving on the road
a footage of a broken bicycle
drone footage of an ambulance on the road
slow motion footage of a racing car
ship sailing on sea against sunset
big cargo ship passing on the shore
back view of man and woman walking on unpaved road


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/appearance_style.txt
================================================
A beautiful coastal beach in spring, waves lapping on sand, Van Gogh style
A beautiful coastal beach in spring, waves lapping on sand, oil painting
A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo
A beautiful coastal beach in spring, waves lapping on sand, black and white
A beautiful coastal beach in spring, waves lapping on sand, pixel art
A beautiful coastal beach in spring, waves lapping on sand, in cyberpunk style
A beautiful coastal beach in spring, waves lapping on sand, animated style
A beautiful coastal beach in spring, waves lapping on sand, watercolor painting
A beautiful coastal beach in spring, waves lapping on sand, surrealism style
The bund Shanghai, Van Gogh style
The bund Shanghai, oil painting
The bund Shanghai by Hokusai, in the style of Ukiyo
The bund Shanghai, black and white
The bund Shanghai, pixel art
The bund Shanghai, in cyberpunk style
The bund Shanghai, animated style
The bund Shanghai, watercolor painting
The bund Shanghai, surrealism style
a shark is swimming in the ocean, Van Gogh style
a shark is swimming in the ocean, oil painting
a shark is swimming in the ocean by Hokusai, in the style of Ukiyo
a shark is swimming in the ocean, black and white
a shark is swimming in the ocean, pixel art
a shark is swimming in the ocean, in cyberpunk style
a shark is swimming in the ocean, animated style
a shark is swimming in the ocean, watercolor painting
a shark is swimming in the ocean, surrealism style
A panda drinking coffee in a cafe in Paris, Van Gogh style
A panda drinking coffee in a cafe in Paris, oil painting
A panda drinking coffee in a cafe in Paris by Hokusai, in the style of Ukiyo
A panda drinking coffee in a cafe in Paris, black and white
A panda drinking coffee in a cafe in Paris, pixel art
A panda drinking coffee in a cafe in Paris, in cyberpunk style
A panda drinking coffee in a cafe in Paris, animated style
A panda drinking coffee in a cafe in Paris, watercolor painting
A panda drinking coffee in a cafe in Paris, surrealism style
A cute happy Corgi playing in park, sunset, Van Gogh style
A cute happy Corgi playing in park, sunset, oil painting
A cute happy Corgi playing in park, sunset by Hokusai, in the style of Ukiyo
A cute happy Corgi playing in park, sunset, black and white
A cute happy Corgi playing in park, sunset, pixel art
A cute happy Corgi playing in park, sunset, in cyberpunk style
A cute happy Corgi playing in park, sunset, animated style
A cute happy Corgi playing in park, sunset, watercolor painting
A cute happy Corgi playing in park, sunset, surrealism style
Gwen Stacy reading a book, Van Gogh style
Gwen Stacy reading a book, oil painting
Gwen Stacy reading a book by Hokusai, in the style of Ukiyo
Gwen Stacy reading a book, black and white
Gwen Stacy reading a book, pixel art
Gwen Stacy reading a book, in cyberpunk style
Gwen Stacy reading a book, animated style
Gwen Stacy reading a book, watercolor painting
Gwen Stacy reading a book, surrealism style
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, Van Gogh style
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, oil painting
A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Hokusai, in the style of Ukiyo
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, black and white
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pixel art
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in cyberpunk style
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, animated style
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, watercolor painting
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, surrealism style
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, Van Gogh style
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, oil painting
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas by Hokusai, in the style of Ukiyo
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, black and white
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pixel art
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in cyberpunk style
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, animated style
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, watercolor painting
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, surrealism style
An astronaut flying in space, Van Gogh style
An astronaut flying in space, oil painting
An astronaut flying in space by Hokusai, in the style of Ukiyo
An astronaut flying in space, black and white
An astronaut flying in space, pixel art
An astronaut flying in space, in cyberpunk style
An astronaut flying in space, animated style
An astronaut flying in space, watercolor painting
An astronaut flying in space, surrealism style
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, Van Gogh style
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, oil painting
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks by Hokusai, in the style of Ukiyo
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, black and white
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pixel art
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in cyberpunk style
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, animated style
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, watercolor painting
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, surrealism style


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/color.txt
================================================
a red bicycle
a green bicycle
a blue bicycle
a yellow bicycle
an orange bicycle
a purple bicycle
a pink bicycle
a black bicycle
a white bicycle
a red car
a green car
a blue car
a yellow car
an orange car
a purple car
a pink car
a black car
a white car
a red bird
a green bird
a blue bird
a yellow bird
an orange bird
a purple bird
a pink bird
a black bird
a white bird
a black cat
a white cat
an orange cat
a yellow cat
a red umbrella
a green umbrella
a blue umbrella
a yellow umbrella
an orange umbrella
a purple umbrella
a pink umbrella
a black umbrella
a white umbrella
a red suitcase
a green suitcase
a blue suitcase
a yellow suitcase
an orange suitcase
a purple suitcase
a pink suitcase
a black suitcase
a white suitcase
a red bowl
a green bowl
a blue bowl
a yellow bowl
an orange bowl
a purple bowl
a pink bowl
a black bowl
a white bowl
a red chair
a green chair
a blue chair
a yellow chair
an orange chair
a purple chair
a pink chair
a black chair
a white chair
a red clock
a green clock
a blue clock
a yellow clock
an orange clock
a purple clock
a pink clock
a black clock
a white clock
a red vase
a green vase
a blue vase
a yellow vase
an orange vase
a purple vase
a pink vase
a black vase
a white vase


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/human_action.txt
================================================
A person is riding a bike
A person is marching
A person is roller skating
A person is tasting beer
A person is clapping
A person is drawing
A person is petting animal (not cat)
A person is eating watermelon
A person is playing harp
A person is wrestling
A person is riding scooter
A person is sweeping floor
A person is skateboarding
A person is dunking basketball
A person is playing flute
A person is stretching leg
A person is tying tie
A person is skydiving
A person is shooting goal (soccer)
A person is playing piano
A person is finger snapping
A person is canoeing or kayaking
A person is laughing
A person is digging
A person is clay pottery making
A person is shooting basketball
A person is bending back
A person is shaking hands
A person is bandaging
A person is push up
A person is catching or throwing frisbee
A person is playing trumpet
A person is flying kite
A person is filling eyebrows
A person is shuffling cards
A person is folding clothes
A person is smoking
A person is tai chi
A person is squat
A person is playing controller
A person is throwing axe
A person is giving or receiving award
A person is air drumming
A person is taking a shower
A person is planting trees
A person is sharpening knives
A person is robot dancing
A person is rock climbing
A person is hula hooping
A person is writing
A person is bungee jumping
A person is pushing cart
A person is cleaning windows
A person is cutting watermelon
A person is cheerleading
A person is washing hands
A person is ironing
A person is cutting nails
A person is hugging
A person is trimming or shaving beard
A person is jogging
A person is making bed
A person is washing dishes
A person is grooming dog
A person is doing laundry
A person is knitting
A person is reading book
A person is baby waking up
A person is massaging legs
A person is brushing teeth
A person is crawling baby
A person is motorcycling
A person is driving car
A person is sticking tongue out
A person is shaking head
A person is sword fighting
A person is doing aerobics
A person is strumming guitar
A person is riding or walking with horse
A person is archery
A person is catching or throwing baseball
A person is playing chess
A person is rock scissors paper
A person is using computer
A person is arranging flowers
A person is bending metal
A person is ice skating
A person is climbing a rope
A person is crying
A person is dancing ballet
A person is getting a haircut
A person is running on treadmill
A person is kissing
A person is counting money
A person is barbequing
A person is peeling apples
A person is milking cow
A person is shining shoes
A person is making snowman
A person is sailing


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/multiple_objects.txt
================================================
a bird and a cat
a cat and a dog
a dog and a horse
a horse and a sheep
a sheep and a cow
a cow and an elephant
an elephant and a bear
a bear and a zebra
a zebra and a giraffe
a giraffe and a bird
a chair and a couch
a couch and a potted plant
a potted plant and a tv
a tv and a laptop
a laptop and a remote
a remote and a keyboard
a keyboard and a cell phone
a cell phone and a book
a book and a clock
a clock and a backpack
a backpack and an umbrella
an umbrella and a handbag
a handbag and a tie
a tie and a suitcase
a suitcase and a vase
a vase and scissors
scissors and a teddy bear
a teddy bear and a frisbee
a frisbee and skis
skis and a snowboard
a snowboard and a sports ball
a sports ball and a kite
a kite and a baseball bat
a baseball bat and a baseball glove
a baseball glove and a skateboard
a skateboard and a surfboard
a surfboard and a tennis racket
a tennis racket and a bottle
a bottle and a chair
an airplane and a train
a train and a boat
a boat and an airplane
a bicycle and a car
a car and a motorcycle
a motorcycle and a bus
a bus and a traffic light
a traffic light and a fire hydrant
a fire hydrant and a stop sign
a stop sign and a parking meter
a parking meter and a truck
a truck and a bicycle
a toilet and a hair drier
a hair drier and a toothbrush
a toothbrush and a sink
a sink and a toilet
a wine glass and a chair
a cup and a couch
a fork and a potted plant
a knife and a tv
a spoon and a laptop
a bowl and a remote
a banana and a keyboard
an apple and a cell phone
a sandwich and a book
an orange and a clock
broccoli and a backpack
a carrot and an umbrella
a hot dog and a handbag
a pizza and a tie
a donut and a suitcase
a cake and a vase
an oven and scissors
a toaster and a teddy bear
a microwave and a frisbee
a refrigerator and skis
a bicycle and an airplane
a car and a train
a motorcycle and a boat
a person and a toilet
a person and a hair drier
a person and a toothbrush
a person and a sink


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/object_class.txt
================================================
a person
a bicycle
a car
a motorcycle
an airplane
a bus
a train
a truck
a boat
a traffic light
a fire hydrant
a stop sign
a parking meter
a bench
a bird
a cat
a dog
a horse
a sheep
a cow
an elephant
a bear
a zebra
a giraffe
a backpack
an umbrella
a handbag
a tie
a suitcase
a frisbee
skis
a snowboard
a sports ball
a kite
a baseball bat
a baseball glove
a skateboard
a surfboard
a tennis racket
a bottle
a wine glass
a cup
a fork
a knife
a spoon
a bowl
a banana
an apple
a sandwich
an orange
broccoli
a carrot
a hot dog
a pizza
a donut
a cake
a chair
a couch
a potted plant
a bed
a dining table
a toilet
a tv
a laptop
a remote
a keyboard
a cell phone
a microwave
an oven
a toaster
a sink
a refrigerator
a book
a clock
a vase
scissors
a teddy bear
a hair drier
a toothbrush


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/overall_consistency.txt
================================================
Close up of grapes on a rotating table.
Turtle swimming in ocean.
A storm trooper vacuuming the beach.
A panda standing on a surfboard in the ocean in sunset.
An astronaut feeding ducks on a sunny afternoon, reflection from the water.
Two pandas discussing an academic paper.
Sunset time lapse at the beach with moving clouds and colors in the sky.
A fat rabbit wearing a purple robe walking through a fantasy landscape.
A koala bear playing piano in the forest.
An astronaut flying in space.
Fireworks.
An animated painting of fluffy white clouds moving in sky.
Flying through fantasy landscapes.
A bigfoot walking in the snowstorm.
A squirrel eating a burger.
A cat wearing sunglasses and working as a lifeguard at a pool.
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks.
Splash of turquoise water in extreme slow motion, alpha channel included.
an ice cream is melting on the table.
a drone flying over a snowy forest.
a shark is swimming in the ocean.
Aerial panoramic video from a drone of a fantasy land.
a teddy bear is swimming in the ocean.
time lapse of sunrise on mars.
golden fish swimming in the ocean.
An artist brush painting on a canvas close up.
A drone view of celebration with Christmas tree and fireworks, starry sky - background.
happy dog wearing a yellow turtleneck, studio, portrait, facing camera, dark background
Origami dancers in white paper, 3D render, on white background, studio shot, dancing modern dance.
Campfire at night in a snowy forest with starry sky in the background.
a fantasy landscape
A 3D model of a 1800s victorian house.
this is how I do makeup in the morning.
A raccoon that looks like a turtle, digital art.
Robot dancing in Times Square.
Busy freeway at night.
Balloon full of water exploding in extreme slow motion.
An astronaut is riding a horse in the space in a photorealistic style.
Macro slo-mo. Slow motion cropped closeup of roasted coffee beans falling into an empty bowl.
Sewing machine, old sewing machine working.
Motion colour drop in water, ink swirling in water, colourful ink in water, abstraction fancy dream cloud of ink.
Few big purple plums rotating on the turntable. water drops appear on the skin during rotation. isolated on the white background. close-up. macro.
Vampire makeup face of beautiful girl, red contact lenses.
Ashtray full of butts on table, smoke flowing on black background, close-up
Pacific coast, carmel by the sea ocean and waves.
A teddy bear is playing drum kit in NYC Times Square.
A corgi is playing drum kit.
An Iron man is playing the electronic guitar, high electronic guitar.
A raccoon is playing the electronic guitar.
A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Vincent van Gogh
A corgi's head depicted as an explosion of a nebula
A fantasy landscape
A future where humans have achieved teleportation technology
A jellyfish floating through the ocean, with bioluminescent tentacles
A Mars rover moving on Mars
A panda drinking coffee in a cafe in Paris
A space shuttle launching into orbit, with flames and smoke billowing out from the engines
A steam train moving on a mountainside
A super cool giant robot in Cyberpunk Beijing
A tropical beach at sunrise, with palm trees and crystal-clear water in the foreground
Cinematic shot of Van Gogh's selfie, Van Gogh style
Gwen Stacy reading a book
Iron Man flying in the sky
The bund Shanghai, oil painting
Yoda playing guitar on the stage
A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo
A beautiful coastal beach in spring, waves lapping on sand by Vincent van Gogh
A boat sailing leisurely along the Seine River with the Eiffel Tower in background
A car moving slowly on an empty street, rainy evening
A cat eating food out of a bowl
A cat wearing sunglasses at a pool
A confused panda in calculus class
A cute fluffy panda eating Chinese food in a restaurant
A cute happy Corgi playing in park, sunset
A cute raccoon playing guitar in a boat on the ocean
A happy fuzzy panda playing guitar nearby a campfire, snow mountain in the background
A lightning striking atop of eiffel tower, dark clouds in the sky
A modern art museum, with colorful paintings
A panda cooking in the kitchen
A panda playing on a swing set
A polar bear is playing guitar
A raccoon dressed in suit playing the trumpet, stage background
A robot DJ is playing the turntable, in heavy raining futuristic tokyo rooftop cyberpunk night, sci-fi, fantasy
A shark swimming in clear Caribbean ocean
A super robot protecting city
A teddy bear washing the dishes
An epic tornado attacking above a glowing city at night, the tornado is made of smoke
An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with umbrellas
Clown fish swimming through the coral reef
Hyper-realistic spaceship landing on Mars
The bund Shanghai, vibrant color
Vincent van Gogh is painting in the room
Yellow flowers swing in the wind


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/scene.txt
================================================
alley
amusement park
aquarium
arch
art gallery
bathroom
bakery shop
ballroom
bar
barn
basement
beach
bedroom
bridge
botanical garden
cafeteria
campsite
campus
carrousel
castle
cemetery
classroom
cliff
crosswalk
construction site
corridor
courtyard
desert
downtown
driveway
farm
food court
football field
forest road
fountain
gas station
glacier
golf course
indoor gymnasium
harbor
highway
hospital
house
iceberg
industrial area
jail cell
junkyard
kitchen
indoor library
lighthouse
laboratory
mansion
marsh
mountain
indoor movie theater
indoor museum
music studio
nursery
ocean
office
palace
parking lot
pharmacy
phone booth
raceway
restaurant
river
science museum
shower
ski slope
sky
skyscraper
baseball stadium
staircase
street
supermarket
indoor swimming pool
tower
outdoor track
train railway
train station platform
underwater coral reef
valley
volcano
waterfall
windmill


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/spatial_relationship.txt
================================================
a bicycle on the left of a car, front view
a car on the right of a motorcycle, front view
a motorcycle on the left of a bus, front view
a bus on the right of a traffic light, front view
a traffic light on the left of a fire hydrant, front view
a fire hydrant on the right of a stop sign, front view
a stop sign on the left of a parking meter, front view
a parking meter on the right of a bench, front view
a bench on the left of a truck, front view
a truck on the right of a bicycle, front view
a bird on the left of a cat, front view
a cat on the right of a dog, front view
a dog on the left of a horse, front view
a horse on the right of a sheep, front view
a sheep on the left of a cow, front view
a cow on the right of an elephant, front view
an elephant on the left of a bear, front view
a bear on the right of a zebra, front view
a zebra on the left of a giraffe, front view
a giraffe on the right of a bird, front view
a bottle on the left of a wine glass, front view
a wine glass on the right of a cup, front view
a cup on the left of a fork, front view
a fork on the right of a knife, front view
a knife on the left of a spoon, front view
a spoon on the right of a bowl, front view
a bowl on the left of a bottle, front view
a potted plant on the left of a remote, front view
a remote on the right of a clock, front view
a clock on the left of a vase, front view
a vase on the right of scissors, front view
scissors on the left of a teddy bear, front view
a teddy bear on the right of a potted plant, front view
a frisbee on the left of a sports ball, front view
a sports ball on the right of a baseball bat, front view
a baseball bat on the left of a baseball glove, front view
a baseball glove on the right of a tennis racket, front view
a tennis racket on the left of a frisbee, front view
a toilet on the left of a hair drier, front view
a hair drier on the right of a toothbrush, front view
a toothbrush on the left of a sink, front view
a sink on the right of a toilet, front view
a chair on the left of a couch, front view
a couch on the right of a bed, front view
a bed on the left of a tv, front view
a tv on the right of a dining table, front view
a dining table on the left of a chair, front view
an airplane on the left of a train, front view
a train on the right of a boat, front view
a boat on the left of an airplane, front view
an oven on the top of a toaster, front view
an oven on the bottom of a toaster, front view
a toaster on the top of a microwave, front view
a toaster on the bottom of a microwave, front view
a microwave on the top of an oven, front view
a microwave on the bottom of an oven, front view
a banana on the top of an apple, front view
a banana on the bottom of an apple, front view
an apple on the top of a sandwich, front view
an apple on the bottom of a sandwich, front view
a sandwich on the top of an orange, front view
a sandwich on the bottom of an orange, front view
an orange on the top of a carrot, front view
an orange on the bottom of a carrot, front view
a carrot on the top of a hot dog, front view
a carrot on the bottom of a hot dog, front view
a hot dog on the top of a pizza, front view
a hot dog on the bottom of a pizza, front view
a pizza on the top of a donut, front view
a pizza on the bottom of a donut, front view
a donut on the top of broccoli, front view
a donut on the bottom of broccoli, front view
broccoli on the top of a banana, front view
broccoli on the bottom of a banana, front view
skis on the top of a snowboard, front view
skis on the bottom of a snowboard, front view
a snowboard on the top of a kite, front view
a snowboard on the bottom of a kite, front view
a kite on the top of a skateboard, front view
a kite on the bottom of a skateboard, front view
a skateboard on the top of a surfboard, front view
a skateboard on the bottom of a surfboard, front view
a surfboard on the top of skis, front view
a surfboard on the bottom of skis, front view


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/subject_consistency.txt
================================================
a person swimming in ocean
a person giving a presentation to a room full of colleagues
a person washing the dishes
a person eating a burger
a person walking in the snowstorm
a person drinking coffee in a cafe
a person playing guitar
a bicycle leaning against a tree
a bicycle gliding through a snowy field
a bicycle slowing down to stop
a bicycle accelerating to gain speed
a car stuck in traffic during rush hour
a car turning a corner
a car slowing down to stop
a car accelerating to gain speed
a motorcycle cruising along a coastal highway
a motorcycle turning a corner
a motorcycle slowing down to stop
a motorcycle gliding through a snowy field
a motorcycle accelerating to gain speed
an airplane soaring through a clear blue sky
an airplane taking off
an airplane landing smoothly on a runway
an airplane accelerating to gain speed
a bus turning a corner
a bus stuck in traffic during rush hour
a bus accelerating to gain speed
a train speeding down the tracks
a train crossing over a tall bridge
a train accelerating to gain speed
a truck turning a corner
a truck anchored in a tranquil bay
a truck stuck in traffic during rush hour
a truck slowing down to stop
a truck accelerating to gain speed
a boat sailing smoothly on a calm lake
a boat slowing down to stop
a boat accelerating to gain speed
a bird soaring gracefully in the sky
a bird building a nest from twigs and leaves
a bird flying over a snowy forest
a cat grooming itself meticulously with its tongue
a cat playing in park
a cat drinking water
a cat running happily
a dog enjoying a peaceful walk
a dog playing in park
a dog drinking water
a dog running happily
a horse bending down to drink water from a river
a horse galloping across an open field
a horse taking a peaceful walk
a horse running to join a herd of its kind
a sheep bending down to drink water from a river
a sheep taking a peaceful walk
a sheep running to join a herd of its kind
a cow bending down to drink water from a river
a cow chewing cud while resting in a tranquil barn
a cow running to join a herd of its kind
an elephant spraying itself with water using its trunk to cool down
an elephant taking a peaceful walk
an elephant running to join a herd of its kind
a bear catching a salmon in its powerful jaws
a bear sniffing the air for scents of food
a bear climbing a tree
a bear hunting for prey
a zebra bending down to drink water from a river
a zebra running to join a herd of its kind
a zebra taking a peaceful walk
a giraffe bending down to drink water from a river
a giraffe taking a peaceful walk
a giraffe running to join a herd of its kind


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/temporal_flickering.txt
================================================
In a still frame, a stop sign
a toilet, frozen in time
a laptop, frozen in time
A tranquil tableau of alley
A tranquil tableau of bar
A tranquil tableau of barn
A tranquil tableau of bathroom
A tranquil tableau of bedroom
A tranquil tableau of cliff
In a still frame, courtyard
In a still frame, gas station
A tranquil tableau of house
indoor gymnasium, frozen in time
A tranquil tableau of indoor library
A tranquil tableau of kitchen
A tranquil tableau of palace
In a still frame, parking lot
In a still frame, phone booth
A tranquil tableau of restaurant
A tranquil tableau of tower
A tranquil tableau of a bowl
A tranquil tableau of an apple
A tranquil tableau of a bench
A tranquil tableau of a bed
A tranquil tableau of a chair
A tranquil tableau of a cup
A tranquil tableau of a dining table
In a still frame, a pear
A tranquil tableau of a bunch of grapes
A tranquil tableau of a bowl on the kitchen counter
A tranquil tableau of a beautiful, handcrafted ceramic bowl
A tranquil tableau of an antique bowl
A tranquil tableau of an exquisite mahogany dining table
A tranquil tableau of a wooden bench in the park
A tranquil tableau of a beautiful wrought-iron bench surrounded by blooming flowers
In a still frame, a park bench with a view of the lake
A tranquil tableau of a vintage rocking chair was placed on the porch
A tranquil tableau of the jail cell was small and dimly lit, with cold, steel bars
A tranquil tableau of the phone booth was tucked away in a quiet alley
a dilapidated phone booth stood as a relic of a bygone era on the sidewalk, frozen in time
A tranquil tableau of the old red barn stood weathered and iconic against the backdrop of the countryside
A tranquil tableau of a picturesque barn was painted a warm shade of red and nestled in a picturesque meadow
In a still frame, within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm trees and a motionless, glassy pool of water
In a still frame, the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape
In a still frame, the Temple of Hephaestus, with its timeless Doric grace, stands stoically against the backdrop of a quiet Athens
In a still frame, the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels
A tranquil tableau of the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility
In a still frame, in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity
static view on a desert scene with an oasis, palm trees, and a clear, calm pool of water
A tranquil tableau of an ornate Victorian streetlamp standing on a cobblestone street corner, illuminating the empty night
A tranquil tableau of a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water
In a still frame, a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square
In a still frame, a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner
A tranquil tableau of the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy
A tranquil tableau of in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins
A tranquil tableau of in the desolate beauty of the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes
A tranquil tableau of at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved façades
In a still frame, amidst the cobblestone streets, an Art Nouveau lamppost stood tall
A tranquil tableau of in the quaint village square, a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels
A tranquil tableau of the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour
In a still frame, in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting
In a still frame, in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light
A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon
A tranquil tableau of in the Arizona desert, a massive stone bridge arched across a rugged canyon
A tranquil tableau of in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the otherwise simple and elegant space
In a still frame, amidst the hushed ambiance of the traditional tea room, a meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk
In a still frame, nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier
A tranquil tableau of a country estate's library featured elegant wooden shelves
A tranquil tableau of beneath the shade of a solitary oak tree, an old wooden park bench sat patiently
A tranquil tableau of beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm
A tranquil tableau of in the Zen garden, a perfectly raked gravel path led to a serene rock garden
In a still frame, a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface
In a still frame, within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation
A tranquil tableau of a peaceful orchid garden showcased a variety of delicate blooms
A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time


================================================
FILE: Open-Sora/assets/texts/VBench/prompts_per_dimension/temporal_style.txt
================================================
A beautiful coastal beach in spring, waves lapping on sand, in super slow motion
A beautiful coastal beach in spring, waves lapping on sand, zoom in
A beautiful coastal beach in spring, waves lapping on sand, zoom out
A beautiful coastal beach in spring, waves lapping on sand, pan left
A beautiful coastal beach in spring, waves lapping on sand, pan right
A beautiful coastal beach in spring, waves lapping on sand, tilt up
A beautiful coastal beach in spring, waves lapping on sand, tilt down
A beautiful coastal beach in spring, waves lapping on sand, with an intense shaking effect
A beautiful coastal beach in spring, waves lapping on sand, featuring a steady and smooth perspective
A beautiful coastal beach in spring, waves lapping on sand, racking focus
The bund Shanghai, in super slow motion
The bund Shanghai, zoom in
The bund Shanghai, zoom out
The bund Shanghai, pan left
The bund Shanghai, pan right
The bund Shanghai, tilt up
The bund Shanghai, tilt down
The bund Shanghai, with an intense shaking effect
The bund Shanghai, featuring a steady and smooth perspective
The bund Shanghai, racking focus
a shark is swimming in the ocean, in super slow motion
a shark is swimming in the ocean, zoom in
a shark is swimming in the ocean, zoom out
a shark is swimming in the ocean, pan left
a shark is swimming in the ocean, pan right
a shark is swimming in the ocean, tilt up
a shark is swimming in the ocean, tilt down
a shark is swimming in the ocean, with an intense shaking effect
a shark is swimming in the ocean, featuring a steady and smooth perspective
a shark is swimming in the ocean, racking focus
A panda drinking coffee in a cafe in Paris, in super slow motion
A panda drinking coffee in a cafe in Paris, zoom in
A panda drinking coffee in a cafe in Paris, zoom out
A panda drinking coffee in a cafe in Paris, pan left
A panda drinking coffee in a cafe in Paris, pan right
A panda drinking coffee in a cafe in Paris, tilt up
A panda drinking coffee in a cafe in Paris, tilt down
A panda drinking coffee in a cafe in Paris, with an intense shaking effect
A panda drinking coffee in a cafe in Paris, featuring a steady and smooth perspective
A panda drinking coffee in a cafe in Paris, racking focus
A cute happy Corgi playing in park, sunset, in super slow motion
A cute happy Corgi playing in park, sunset, zoom in
A cute happy Corgi playing in park, sunset, zoom out
A cute happy Corgi playing in park, sunset, pan left
A cute happy Corgi playing in park, sunset, pan right
A cute happy Corgi playing in park, sunset, tilt up
A cute happy Corgi playing in park, sunset, tilt down
A cute happy Corgi playing in park, sunset, with an intense shaking effect
A cute happy Corgi playing in park, sunset, featuring a steady and smooth perspective
A cute happy Corgi playing in park, sunset, racking focus
Gwen Stacy reading a book, in super slow motion
Gwen Stacy reading a book, zoom in
Gwen Stacy reading a book, zoom out
Gwen Stacy reading a book, pan left
Gwen Stacy reading a book, pan right
Gwen Stacy reading a book, tilt up
Gwen Stacy reading a book, tilt down
Gwen Stacy reading a book, with an intense shaking effect
Gwen Stacy reading a book, featuring a steady and smooth perspective
Gwen Stacy reading a book, racking focus
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in super slow motion
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom in
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom out
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan left
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan right
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt up
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt down
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, with an intense shaking effect
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, featuring a steady and smooth perspective
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, racking focus
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in super slow motion
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom in
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom out
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan left
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan right
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt up
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt down
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, with an intense shaking effect
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, featuring a steady and smooth perspective
A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, racking focus
An astronaut flying in space, in super slow motion
An astronaut flying in space, zoom in
An astronaut flying in space, zoom out
An astronaut flying in space, pan left
An astronaut flying in space, pan right
An astronaut flying in space, tilt up
An astronaut flying in space, tilt down
An astronaut flying in space, with an intense shaking effect
An astronaut flying in space, featuring a steady and smooth perspective
An astronaut flying in space, racking focus
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in super slow motion
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom in
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom out
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan left
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan right
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt up
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt down
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, with an intense shaking effect
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, featuring a steady and smooth perspective
Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, racking focus


================================================
FILE: Open-Sora/assets/texts/imagenet_id.txt
================================================
207
360
387
974
88
979
417
279


================================================
FILE: Open-Sora/assets/texts/imagenet_labels.txt
================================================
golden retriever
otter
lesser panda
geyser
macaw
valley
balloon
arctic fox


================================================
FILE: Open-Sora/assets/texts/rand_types.txt
================================================
随机电影镜头
随机电影镜头
随机电影镜头
随机电影镜头
随机电影镜头
随机任务镜头
随机任务镜头
随机任务镜头
随机任务镜头
随机任务镜头
随机游戏镜头
随机游戏镜头
随机游戏镜头
随机游戏镜头
随机游戏镜头
随机开车镜头
随机开车镜头
随机开车镜头
随机开车镜头
随机开车镜头
随机动物镜头
随机动物镜头
随机动物镜头
随机动物镜头
随机动物镜头
随机森林镜头
随机森林镜头
随机森林镜头
随机森林镜头
随机森林镜头
随机动漫镜头
随机动漫镜头
随机动漫镜头
随机动漫镜头
随机动漫镜头
随机舞蹈镜头
随机舞蹈镜头
随机舞蹈镜头
随机舞蹈镜头
随机舞蹈镜头


================================================
FILE: Open-Sora/assets/texts/t2i_samples.txt
================================================
A small cactus with a happy face in the Sahara desert.
Bright scene, aerial view,ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens.
Nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph.
Poster of a mechanical cat, technical schematics viewed from front.
Luffy from ONEPIECE, handsome face, fantasy.
Real beautiful woman.
An alpaca made of colorful building blocks, cyberpunk.
artistic


================================================
FILE: Open-Sora/assets/texts/t2i_sigma.txt
================================================
Eiffel Tower was Made up of more than 2 million translucent straws to look like a cloud, with the bell tower at the top of the building, Michel installed huge foam-making machines in the forest to blow huge amounts of unpredictable wet clouds in the building's classic architecture.
A gorgeously rendered papercraft world of a coral reef, rife with colorful fish and sea creatures.
Full body shot, a French woman, Photography, French Streets background, backlighting, rim light, Fujifilm.
Close-up photos of models, hazy light and shadow, laser metal hair accessories, soft and beautiful, light gold pupils, white eyelashes, low saturation, real skin details, clear pores and fine lines, light reflection and refraction, ultra-clear, cinematography, award-winning works.
A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in.
Lego model, future rocket station, intricate details, high resolution, unreal engine, UHD
One giant, sharp, metal square mirror in the center of the frame, four young people on the foreground, background sunny palm oil plantation, tropical, realistic style, photography, nostalgic, green tone, mysterious, dreamy, bright color.
Modern luxury contemporary luxury home interiors house, in the style of mimicking ruined materials, ray tracing, haunting houses, and stone, capture the essence of nature, gray and bronze, dynamic outdoor shots.
Over the shoulder game perspective, game screen of Diablo 4, Inside the gorgeous palace is the wet ground, The necromancer knelt before the king, and a horde of skeletons he summoned stood at his side, cinematic light.
A curvy timber house near a sea, designed by Zaha Hadid, represent the image of a cold, modern architecture, at night, white lighting, highly detailed.


================================================
FILE: Open-Sora/assets/texts/t2v_car.txt
================================================
|0|A car driving in the forest.|2|A car driving in the desert.|4|A car driving near the coast.|6|A car driving in the city.|8|A car driving near a mountain.|10|A car driving on the surface of a river.|12|A car driving on the surface of the earth.|14|A car driving in the universe.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,16,0.4"}


================================================
FILE: Open-Sora/assets/texts/t2v_latte.txt
================================================
Yellow and black tropical fish dart through the sea.
An epic tornado attacking above a glowing city at night.
Slow pan upward of blazing oak fire in an indoor fireplace.
a cat wearing sunglasses and working as a lifeguard at pool.
Sunset over the sea.
A dog in astronaut suit and sunglasses floating in space.
An astronaut flying in space, 4k, high resolution


================================================
FILE: Open-Sora/assets/texts/t2v_pllava.txt
================================================
a close-up shot of a woman standing in a room with a white wall and a plant on the left side. the woman has curly hair and is wearing a green tank top. she is looking to the side with a neutral expression on her face. the lighting in the room is soft and appears to be natural, coming from the left side of the frame. the focus is on the woman, with the background being out of focus. there are no texts or other objects in the video. the style of the video is a simple, candid portrait with a shallow depth of field.
a serene scene of a pond filled with water lilies. the water is a deep blue, providing a striking contrast to the pink and white flowers that float on its surface. the flowers, in full bloom, are the main focus of the video. they are scattered across the pond, with some closer to the camera and others further away, creating a sense of depth. the pond is surrounded by lush greenery, adding a touch of nature to the scene. the video is taken from a low angle, looking up at the flowers, which gives a unique perspective and emphasizes their beauty. the overall composition of the video suggests a peaceful and tranquil setting, likely a garden or a park.
a professional setting where a woman is presenting a slide from a presentation. she is standing in front of a projector screen, which displays a bar chart. the chart is colorful, with bars of different heights, indicating some sort of data comparison. the woman is holding a pointer, which she uses to highlight specific parts of the chart. she is dressed in a white blouse and black pants, and her hair is styled in a bun. the room has a modern design, with a sleek black floor and a white ceiling. the lighting is bright, illuminating the woman and the projector screen. the focus of the image is on the woman and the projector screen, with the background being out of focus. there are no texts visible in the image. the relative positions of the objects suggest that the woman is the main subject of the image, and the projector screen is the object of her attention. the image does not provide any information about the content of the presentation or the context of the meeting.
a bustling city street from the perspective of a car. the car, a sleek black sedan, is in motion, driving down the street. the dashboard of the car is visible in the foreground, providing a view of the road ahead. the street is lined with parked cars on both sides, their colors muted in the bright sunlight. buildings rise on either side of the street, their windows reflecting the sunlight. the sky above is a clear blue, and the sun is shining brightly, casting a warm glow on the scene. the street is busy with pedestrians and other vehicles, adding to the dynamic nature of the scene. the video does not contain any text. the relative positions of the objects suggest a typical city street scene with the car in the foreground, the parked cars on either side, and the buildings in the background. the sunlight illuminates the scene, highlighting the colors and details of the objects. the pedestrians and other vehicles are in motion, adding a sense of life and activity to the scene. the buildings provide a sense of depth and scale to the image. the video does not contain any text or countable objects. the
a serene scene in a park. the sun is shining brightly, casting a warm glow on the lush green trees and the grassy field. the camera is positioned low, looking up at the towering trees, which are the main focus of the image. the trees are dense and full of leaves, creating a canopy of green that fills the frame. the sunlight filters through the leaves, creating a beautiful pattern of light and shadow on the ground. the overall atmosphere of the video is peaceful and tranquil, evoking a sense of calm and relaxation.
a moment in a movie theater. a couple is seated in the middle of the theater, engrossed in the movie they are watching. the man is dressed in a casual outfit, complete with a pair of sunglasses, while the woman is wearing a cozy sweater. they are seated on a red theater seat, which stands out against the dark surroundings. the theater itself is dimly lit, with the screen displaying the movie they are watching. the couple appears to be enjoying the movie, their attention completely absorbed by the on-screen action. the theater is mostly empty, with only a few other seats visible in the background. the video does not contain any text or additional objects. the relative positions of the objects are such that the couple is in the foreground, while the screen and the other seats are in the background. the focus of the video is clearly on the couple and their shared experience of watching a movie in a theater.
a scene where a person is examining a dog. the person is wearing a blue shirt with the word "volunteer" printed on it. the dog is lying on its side, and the person is using a stethoscope to listen to the dog's heartbeat. the dog appears to be a golden retriever and is looking directly at the camera. the background is blurred, but it seems to be an indoor setting with a white wall. the person's focus is on the dog, and they seem to be checking its health. the dog's expression is calm, and it seems to be comfortable with the person's touch. the overall atmosphere of the video is calm and professional.
a close-up shot of a woman applying makeup. she is using a black brush to apply a dark powder to her face. the woman has blonde hair and is wearing a black top. the background is black, which contrasts with her skin tone and the makeup. the focus is on her face and the brush, with the rest of her body and the background being out of focus. the lighting is soft and even, highlighting the texture of the makeup and the woman's skin. there are no texts or other objects in the video. the woman's expression is neutral, and she is looking directly at the camera. the video does not contain any action, as it is a still shot of a woman applying makeup. the relative position of the woman and the brush is such that the brush is in her hand and is being used to apply the makeup to her face. the video does not contain any other objects or actions. the woman is the only person in the video, and she is the main subject. the video does not contain any sound. the description is based on the visible content of the video and does not include any assumptions or interpretations.
a young woman is seated in a black gaming chair in a room filled with computer monitors and other gaming equipment. she is wearing a red tank top and black pants, and her hair is styled in loose waves. the room is dimly lit, with the glow of the monitors casting a soft light on her face. she is holding a black game controller in her hands, and her attention is focused on the screen in front of her. the room is filled with other gaming equipment, including keyboards and mice, and there are other chairs and desks scattered around the room. the woman appears to be engrossed in her game, her posture relaxed yet focused. the room is quiet, the only sound coming from the beeps and boops of the game. the woman is the only person in the room, adding a sense of solitude to the scene. the video does not contain any text. the relative positions of the objects suggest a well-organized gaming setup, with the woman at the center, surrounded by her gaming equipment. the video does not contain any action, but the woman's focused expression suggests that she is in the middle of an intense g
a breathtaking aerial view of a coastal landscape at sunset. the sky, painted in hues of orange and pink, serves as a stunning backdrop to the scene. the sun, partially obscured by the horizon, casts a warm glow on the landscape below. the foreground of the image is dominated by a rocky cliff, its rugged surface adding a touch of raw beauty to the scene. the cliff's edge is adorned with patches of green vegetation, providing a stark contrast to the otherwise barren landscape. the middle ground of the image reveals a winding road that hugs the coastline. the road, appearing as a thin line against the vast expanse of the landscape, guides the viewer's eye towards the horizon. in the background, the silhouette of mountains can be seen, their peaks shrouded in a light mist. the mountains, along with the road, add depth to the image, creating a sense of distance and scale. overall, the video presents a serene and majestic coastal landscape, captured at the perfect moment of sunset. the colors


================================================
FILE: Open-Sora/assets/texts/t2v_ref.txt
================================================
Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.
In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave.
Pirate ship in a cosmic maelstrom nebula.
Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.
A sad small cactus in the Sahara desert becomes happy.
A car driving on a road in the middle of a desert.


================================================
FILE: Open-Sora/assets/texts/t2v_samples.txt
================================================
A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures.
A majestic beauty of a waterfall cascading down a cliff into a serene lake. The waterfall, with its powerful flow, is the central focus of the video. The surrounding landscape is lush and green, with trees and foliage adding to the natural beauty of the scene. The camera angle provides a bird's eye view of the waterfall, allowing viewers to appreciate the full height and grandeur of the waterfall. The video is a stunning representation of nature's power and beauty.
A vibrant scene of a snowy mountain landscape. The sky is filled with a multitude of colorful hot air balloons, each floating at different heights, creating a dynamic and lively atmosphere. The balloons are scattered across the sky, some closer to the viewer, others further away, adding depth to the scene.  Below, the mountainous terrain is blanketed in a thick layer of snow, with a few patches of bare earth visible here and there. The snow-covered mountains provide a stark contrast to the colorful balloons, enhancing the visual appeal of the scene.  In the foreground, a few cars can be seen driving along a winding road that cuts through the mountains. The cars are small compared to the vastness of the landscape, emphasizing the grandeur of the surroundings.  The overall style of the video is a mix of adventure and tranquility, with the hot air balloons adding a touch of whimsy to the otherwise serene mountain landscape. The video is likely shot during the day, as the lighting is bright and even, casting soft shadows on the snow-covered mountains.
The vibrant beauty of a sunflower field. The sunflowers, with their bright yellow petals and dark brown centers, are in full bloom, creating a stunning contrast against the green leaves and stems. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. The sun is shining brightly, casting a warm glow on the flowers and highlighting their intricate details. The video is shot from a low angle, looking up at the sunflowers, which adds a sense of grandeur and awe to the scene. The sunflowers are the main focus of the video, with no other objects or people present. The video is a celebration of nature's beauty and the simple joy of a sunny day in the countryside.
A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell, is the main focus of the video, swimming gracefully towards the right side of the frame. The coral reef, teeming with life, is visible in the background, providing a vibrant and colorful backdrop to the turtle's journey. Several small fish, darting around the turtle, add a sense of movement and dynamism to the scene. The video is shot from a slightly elevated angle, providing a comprehensive view of the turtle's surroundings. The overall style of the video is calm and peaceful, capturing the beauty and tranquility of the underwater world.
A vibrant underwater scene. A group of blue fish, with yellow fins, are swimming around a coral reef. The coral reef is a mix of brown and green, providing a natural habitat for the fish. The water is a deep blue, indicating a depth of around 30 feet. The fish are swimming in a circular pattern around the coral reef, indicating a sense of motion and activity. The overall scene is a beautiful representation of marine life.
A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. The scene is a blur of motion, with cars speeding by and pedestrians navigating the crosswalks. The cityscape is a mix of towering buildings and illuminated signs, creating a vibrant and dynamic atmosphere. The perspective of the video is from a high angle, providing a bird's eye view of the street and its surroundings. The overall style of the video is dynamic and energetic, capturing the essence of urban life at night.
A snowy forest landscape with a dirt road running through it. The road is flanked by trees covered in snow, and the ground is also covered in snow. The sun is shining, creating a bright and serene atmosphere. The road appears to be empty, and there are no people or animals visible in the video. The style of the video is a natural landscape shot, with a focus on the beauty of the snowy forest and the peacefulness of the road.
The dynamic movement of tall, wispy grasses swaying in the wind. The sky above is filled with clouds, creating a dramatic backdrop. The sunlight pierces through the clouds, casting a warm glow on the scene. The grasses are a mix of green and brown, indicating a change in seasons. The overall style of the video is naturalistic, capturing the beauty of the landscape in a realistic manner. The focus is on the grasses and their movement, with the sky serving as a secondary element. The video does not contain any human or animal elements.
A serene night scene in a forested area. The first frame shows a tranquil lake reflecting the star-filled sky above. The second frame reveals a beautiful sunset, casting a warm glow over the landscape. The third frame showcases the night sky, filled with stars and a vibrant Milky Way galaxy. The video is a time-lapse, capturing the transition from day to night, with the lake and forest serving as a constant backdrop. The style of the video is naturalistic, emphasizing the beauty of the night sky and the peacefulness of the forest.


================================================
FILE: Open-Sora/assets/texts/t2v_short.txt
================================================
A fat rabbit wearing a purple robe walking through a fantasy landscape
Waves crashing against a lone lighthouse, ominous lighting
A mystical forest showcasing the adventures of travelers who enter
A blue-haired mage singing
A surreal landscape with floating islands and waterfalls in the sky craft
A blue bird standing in water
A young man walks alone by the seaside
Pink rose on a glass surface with droplets, close-up
Drove viewpoint, a subway train coming out of a tunnel
Space with all planets green and pink color with background of bright white stars
A city floating in an astral space, with stars and nebulae
Sunrise on top of a high-rise building
Pink and cyan powder explosions
Deers in the woods gaze into the camera under the sunlight
In a flash of lightning, a wizard appeared from thin air, his long robes billowing in the wind
A futuristic cyberpunk cityscape at night with towering neon-lit skyscrapers
A scene where the trees, flowers, and animals come together to create a symphony of nature
A ghostly ship sailing through the clouds, navigating through a sea under a moonlit sky
A sunset with beautiful beach
A young man walking alone in the forest


================================================
FILE: Open-Sora/assets/texts/t2v_sora.txt
================================================
A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about.
Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.
A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors.
Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.
Animated scene features a close-up of a short fluffy monster kneeling beside a melting red candle. The art style is 3D and realistic, with a focus on lighting and texture. The mood of the painting is one of wonder and curiosity, as the monster gazes at the flame with wide eyes and open mouth. Its pose and expression convey a sense of innocence and playfulness, as if it is exploring the world around it for the first time. The use of warm colors and dramatic lighting further enhances the cozy atmosphere of the image.
A gorgeously rendered papercraft world of a coral reef, rife with colorful fish and sea creatures.
This close-up shot of a Victoria crowned pigeon showcases its striking blue plumage and red chest. Its crest is made of delicate, lacy feathers, while its eye is a striking red color. The bird’s head is tilted slightly to the side, giving the impression of it looking regal and majestic. The background is blurred, drawing attention to the bird’s striking appearance.
Photorealistic closeup video of two pirate ships battling each other as they sail inside a cup of coffee.
A young man at his 20s is sitting on a piece of cloud in the sky, reading a book.
Historical footage of California during the gold rush.
A close up view of a glass sphere that has a zen garden within it. There is a small dwarf in the sphere who is raking the zen garden and creating patterns in the sand.
Extreme close up of a 24 year old woman’s eye blinking, standing in Marrakech during magic hour, cinematic film shot in 70mm, depth of field, vivid colors, cinematic
A cartoon kangaroo disco dances.
A beautiful homemade video showing the people of Lagos, Nigeria in the year 2056. Shot with a mobile phone camera.
A petri dish with a bamboo forest growing within it that has tiny red pandas running around.
The camera rotates around a large stack of vintage televisions all showing different programs — 1950s sci-fi movies, horror movies, news, static, a 1970s sitcom, etc, set inside a large New York museum gallery.
3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies. The creature stops to interact playfully with a group of tiny, fairy-like beings dancing around a mushroom ring. The creature looks up in awe at a large, glowing tree that seems to be the heart of the forest.
The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from it’s tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. The dirt road itself is surrounded by steep hills and mountains, with a clear blue sky above with wispy clouds.
Reflections in the window of a train traveling through the Tokyo suburbs.
A drone camera circles around a beautiful historic church built on a rocky outcropping along the Amalfi Coast, the view showcases historic and magnificent architectural details and tiered pathways and patios, waves are seen crashing against the rocks below as the view overlooks the horizon of the coastal waters and hilly landscapes of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a magical and romantic feeling to the scene, the view is stunning captured with beautiful photography.
A large orange octopus is seen resting on the bottom of the ocean floor, blending in with the sandy and rocky terrain. Its tentacles are spread out around its body, and its eyes are closed. The octopus is unaware of a king crab that is crawling towards it from behind a rock, its claws raised and ready to attack. The crab is brown and spiny, with long legs and antennae. The scene is captured from a wide angle, showing the vastness and depth of the ocean. The water is clear and blue, with rays of sunlight filtering through. The shot is sharp and crisp, with a high dynamic range. The octopus and the crab are in focus, while the background is slightly blurred, creating a depth of field effect.
A flock of paper airplanes flutters through a dense jungle, weaving around trees as if they were migrating birds.
A cat waking up its sleeping owner demanding breakfast. The owner tries to ignore the cat, but the cat tries new tactics and finally the owner pulls out a secret stash of treats from under the pillow to hold the cat off a little longer.
Borneo wildlife on the Kinabatangan River
A Chinese Lunar New Year celebration video with Chinese Dragon.
Tour of an art gallery with many beautiful works of art in different styles.
Beautiful, snowy Tokyo city is bustling. The camera moves through the bustling city street, following several people enjoying the beautiful snowy weather and shopping at nearby stalls. Gorgeous sakura petals are flying through the wind along with snowflakes.
A stop motion animation of a flower growing out of the windowsill of a suburban house.
The story of a robot’s life in a cyberpunk setting.
An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in thought pondering the history of the universe as he sits at a cafe in Paris, his eyes focus on people offscreen as they walk as he sits mostly motionless, he is dressed in a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses and has a very professorial appearance, and the end he offers a subtle closed-mouth smile as if he found the answer to the mystery of life, the lighting is very cinematic with the golden light and the Parisian streets and city in the background, depth of field, cinematic 35mm film.
A beautiful silhouette animation shows a wolf howling at the moon, feeling lonely, until it finds its pack.
New York City submerged like Atlantis. Fish, whales, sea turtles and sharks swim through the streets of New York.
A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in.
Step-printing scene of a person running, cinematic film shot in 35mm.
Five gray wolf pups frolicking and chasing each other around a remote gravel road, surrounded by grass. The pups run and leap, chasing each other, and nipping at each other, playing.
Basketball through hoop then explodes.
Archeologists discover a generic plastic chair in the desert, excavating and dusting it with great care.
A grandmother with neatly combed grey hair stands behind a colorful birthday cake with numerous candles at a wood dining room table, expression is one of pure joy and happiness, with a happy glow in her eye. She leans forward and blows out the candles with a gentle puff, the cake has pink frosting and sprinkles and the candles cease to flicker, the grandmother wears a light blue blouse adorned with floral patterns, several happy friends and family sitting at the table can be seen celebrating, out of focus. The scene is beautifully captured, cinematic, showing a 3/4 view of the grandmother and the dining room. Warm color tones and soft lighting enhance the mood.
The camera directly faces colorful buildings in Burano Italy. An adorable dalmation looks through a window on a building on the ground floor. Many people are walking and cycling along the canal streets in front of the buildings.
An adorable happy otter confidently stands on a surfboard wearing a yellow lifejacket, riding along turquoise tropical waters near lush tropical islands, 3D digital render art style.
This close-up shot of a chameleon showcases its striking color changing capabilities. The background is blurred, drawing attention to the animal’s striking appearance.
A corgi vlogging itself in tropical Maui.
A white and orange tabby cat is seen happily darting through a dense garden, as if chasing something. Its eyes are wide and happy as it jogs forward, scanning the branches, flowers, and leaves as it walks. The path is narrow as it makes its way between all the plants. the scene is captured from a ground-level angle, following the cat closely, giving a low and intimate perspective. The image is cinematic with warm tones and a grainy texture. The scattered daylight between the leaves and plants above creates a warm contrast, accentuating the cat’s orange fur. The shot is clear and sharp, with a shallow depth of field.
Aerial view of Santorini during the blue hour, showcasing the stunning architecture of white Cycladic buildings with blue domes. The caldera views are breathtaking, and the lighting creates a beautiful, serene atmosphere.
Tiltshift of a construction site filled with workers, equipment, and heavy machinery.
A giant, towering cloud in the shape of a man looms over the earth. The cloud man shoots lighting bolts down to the earth.
A Samoyed and a Golden Retriever dog are playfully romping through a futuristic neon city at night. The neon lights emitted from the nearby buildings glistens off of their fur.
The Glenfinnan Viaduct is a historic railway bridge in Scotland, UK, that crosses over the west highland line between the towns of Mallaig and Fort William. It is a stunning sight as a steam train leaves the bridge, traveling over the arch-covered viaduct. The landscape is dotted with lush greenery and rocky mountains, creating a picturesque backdrop for the train journey. The sky is blue and the sun is shining, making for a beautiful day to explore this majestic spot.


================================================
FILE: Open-Sora/assets/texts/ucf101_id.txt
================================================
0
1
2
3
4
5


================================================
FILE: Open-Sora/assets/texts/ucf101_labels.txt
================================================
Apply Eye Makeup
Apply Lipstick
Archery
Baby Crawling
Balance Beam
Band Marching


================================================
FILE: Open-Sora/build/lib/opensora/acceleration/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/opensora/acceleration/checkpoint.py
================================================
from collections.abc import Iterable

import torch.nn as nn
from torch.utils.checkpoint import checkpoint, checkpoint_sequential


def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1):
    """Tag ``model`` and all of its submodules for gradient checkpointing.

    Three attributes are written onto every module visited by ``model.apply``:
    ``grad_checkpointing`` (always ``True``), ``fp32_attention`` and
    ``grad_checkpointing_step``.

    Args:
        model: The ``nn.Module`` whose module tree is tagged.
        use_fp32_attention: Value stored as ``fp32_attention`` on each module.
        gc_step: Value stored as ``grad_checkpointing_step`` on each module.
    """
    assert isinstance(model, nn.Module)

    flags = {
        "grad_checkpointing": True,
        "fp32_attention": use_fp32_attention,
        "grad_checkpointing_step": gc_step,
    }

    def _tag(submodule):
        # Plain attributes, not parameters/buffers: they are only read back
        # through getattr / attribute access.
        for attr_name, attr_value in flags.items():
            setattr(submodule, attr_name, attr_value)

    model.apply(_tag)


def auto_grad_checkpoint(module, *args, **kwargs):
    """Call ``module``, routing through activation checkpointing when it is tagged.

    A module whose ``grad_checkpointing`` attribute is missing or falsy is
    called directly. A tagged iterable module is run with
    ``checkpoint_sequential`` using the ``grad_checkpointing_step`` stored on
    its first element; any other tagged module is run with ``checkpoint``.
    Both checkpoint paths use ``use_reentrant=False``.

    Args:
        module: Callable module to execute.
        *args: Positional arguments forwarded to the module.
        **kwargs: Keyword arguments forwarded to the module.

    Returns:
        Whatever the (possibly checkpointed) module call returns.
    """
    if not getattr(module, "grad_checkpointing", False):
        return module(*args, **kwargs)

    if isinstance(module, Iterable):
        # The step count is read from the first element of the container.
        segments = module[0].grad_checkpointing_step
        return checkpoint_sequential(module, segments, *args, use_reentrant=False, **kwargs)

    return checkpoint(module, *args, use_reentrant=False, **kwargs)


================================================
FILE: Open-Sora/build/lib/opensora/acceleration/communications.py
================================================
import torch
import torch.distributed as dist


# ====================
# All-To-All
# ====================
def _all_to_all(
    input_: torch.Tensor,
    world_size: int,
    group: dist.ProcessGroup,
    scatter_dim: int,
    gather_dim: int,
):
    """Exchange slices of ``input_`` between ranks with ``dist.all_to_all``.

    ``input_`` is cut into ``world_size`` pieces along ``scatter_dim``; the
    received pieces are concatenated along ``gather_dim``.

    Args:
        input_: Local tensor to redistribute.
        world_size: Number of pieces to cut ``input_`` into.
        group: Process group passed to ``dist.all_to_all``.
        scatter_dim: Dimension along which ``input_`` is split.
        gather_dim: Dimension along which received pieces are concatenated.

    Returns:
        A contiguous tensor built from the received pieces.
    """
    send_chunks = []
    for piece in torch.tensor_split(input_, world_size, scatter_dim):
        send_chunks.append(piece.contiguous())

    # Every receive buffer is shaped like the first outgoing piece.
    recv_chunks = [torch.empty_like(send_chunks[0]) for _ in range(world_size)]
    dist.all_to_all(recv_chunks, send_chunks, group=group)

    merged = torch.cat(recv_chunks, dim=gather_dim)
    return merged.contiguous()


class _AllToAll(torch.autograd.Function):
    """Autograd wrapper around the all-to-all exchange.

    The backward pass runs the same exchange with the scatter and gather
    dimensions swapped.

    Args:
        input_: input matrix
        process_group: communication group
        scatter_dim: scatter dimension
        gather_dim: gather dimension
    """

    @staticmethod
    def forward(ctx, input_, process_group, scatter_dim, gather_dim):
        # Stash everything backward() needs on the context object.
        ctx.process_group = process_group
        ctx.scatter_dim = scatter_dim
        ctx.gather_dim = gather_dim
        ctx.world_size = dist.get_world_size(process_group)
        return _all_to_all(input_, ctx.world_size, process_group, scatter_dim, gather_dim)

    @staticmethod
    def backward(ctx, grad_output):
        # Swapped dims: scatter along the forward gather_dim and vice versa.
        grad_input = _all_to_all(
            grad_output,
            ctx.world_size,
            ctx.process_group,
            ctx.gather_dim,
            ctx.scatter_dim,
        )
        # One gradient slot per forward argument; only input_ gets a gradient.
        return grad_input, None, None, None


def all_to_all(
    input_: torch.Tensor,
    process_group: dist.ProcessGroup,
    scatter_dim: int = 2,
    gather_dim: int = 1,
):
    """Differentiable all-to-all exchange of ``input_`` within ``process_group``.

    Args:
        input_: Local tensor to redistribute.
        process_group: Process group used for the exchange.
        scatter_dim: Dimension to split before sending (default 2).
        gather_dim: Dimension to concatenate after receiving (default 1).

    Returns:
        The result of ``_AllToAll.apply`` for these arguments.
    """
    exchange = _AllToAll.apply
    return exchange(input_, process_group, scatter_dim, gather_dim)


def _gather(
    input_: torch.Tensor,
    world_size: int,
    group: dist.ProcessGroup,
    gather_dim: int,
):
    """Gather ``input_`` into a list of ``world_size`` tensors via ``dist.gather``.

    NOTE(review): a second ``def _gather(input_, pg, dim=-1)`` further down
    this module rebinds the name ``_gather``, so after import this version is
    no longer reachable through the module-level name.
    """
    # NOTE(review): `gather_list` is not a parameter and is assigned below, so
    # it is a local name; reading it here before any assignment raises
    # UnboundLocalError if this function is ever executed.
    if gather_list is None:
        gather_list = [torch.empty_like(input_) for _ in range(world_size)]
    # NOTE(review): `gather_dim` is forwarded as a keyword argument; confirm
    # that the installed `torch.distributed.gather` accepts such a keyword.
    dist.gather(input_, gather_list, group=group, gather_dim=gather_dim)
    return gather_list


# ====================
# Gather-Split
# ====================


def _split(input_, pg: dist.ProcessGroup, dim=-1):
    """Return this rank's equal-sized shard of ``input_`` along ``dim``.

    Args:
        input_: Tensor to shard.
        pg: Process group that defines the world size and this rank's index.
        dim: Dimension to shard along (defaults to the last one).

    Returns:
        ``input_`` itself when the group has a single rank, otherwise a
        contiguous copy of the shard indexed by this rank.

    Raises:
        AssertionError: If the size of ``dim`` is not a multiple of the
            group's world size.
    """
    group_size = dist.get_world_size(pg)
    local_rank = dist.get_rank(pg)

    # Nothing to shard when the group has a single member.
    if group_size == 1:
        return input_

    extent = input_.size(dim)
    shard_len, leftover = divmod(extent, group_size)
    assert leftover == 0, (
        f"The dimension to split ({extent}) is not a multiple of world size ({group_size}), "
        f"cannot split tensor evenly"
    )

    shards = torch.split(input_, shard_len, dim=dim)
    return shards[local_rank].contiguous()


def _gather(input_, pg: dist.ProcessGroup, dim=-1):
    """All-gather ``input_`` across ``pg`` and concatenate the pieces along ``dim``.

    Args:
        input_: Local tensor; it is made contiguous before use.
        pg: Process group to gather across.
        dim: Dimension along which gathered tensors are concatenated.

    Returns:
        The contiguous ``input_`` when the group has a single rank, otherwise
        the contiguous concatenation of every rank's tensor.

    Raises:
        AssertionError: If more than one rank is involved and ``input_`` is
            not on a CUDA device.
    """
    input_ = input_.contiguous()
    group_size = dist.get_world_size(pg)
    # The rank is queried but its value is not used anywhere below.
    dist.get_rank(pg)

    # Single-rank group: there is nothing to gather.
    if group_size == 1:
        return input_

    received = [torch.empty_like(input_) for _ in range(group_size)]
    assert input_.device.type == "cuda"
    torch.distributed.all_gather(received, input_, group=pg)

    return torch.cat(received, dim=dim).contiguous()


class _GatherForwardSplitBackward(torch.autograd.Function):
    """All-gather in the forward pass, keep the local shard in the backward pass.

    Args:
        input_: input matrix.
        process_group: parallel mode.
        dim: dimension
        grad_scale: ``"up"`` multiplies and ``"down"`` divides the incoming
            gradient by the group's world size; any other value leaves it
            unscaled.
    """

    @staticmethod
    def symbolic(graph, input_):
        # NOTE(review): called with `input_` only, while the `_gather` defined
        # in this module also declares a required `pg` parameter.
        return _gather(input_)

    @staticmethod
    def forward(ctx, input_, process_group, dim, grad_scale):
        # Remember the group, dimension and scaling mode for backward().
        ctx.mode = process_group
        ctx.dim = dim
        ctx.grad_scale = grad_scale
        return _gather(input_, process_group, dim)

    @staticmethod
    def backward(ctx, grad_output):
        if ctx.grad_scale in ("up", "down"):
            group_size = dist.get_world_size(ctx.mode)
            if ctx.grad_scale == "up":
                grad_output = grad_output * group_size
            else:
                grad_output = grad_output / group_size

        # One gradient slot per forward argument; only input_ gets a gradient.
        local_grad = _split(grad_output, ctx.mode, ctx.dim)
        return local_grad, None, None, None


class _SplitForwardGatherBackward(torch.autograd.Function):
    """
    Split the input and keep only the chunk corresponding to the rank; the
    backward pass all-gathers the gradient.

    Args:
        input_: input matrix.
        process_group: parallel mode.
        dim: dimension
        grad_scale: ``"up"`` multiplies and ``"down"`` divides the incoming
            gradient by the group's world size; any other value leaves it
            unscaled.
    """

    @staticmethod
    def symbolic(graph, input_):
        # NOTE(review): called with `input_` only, while the `_split` defined
        # in this module also declares a required `pg` parameter.
        return _split(input_)

    @staticmethod
    def forward(ctx, input_, process_group, dim, grad_scale):
        # Remember the group, dimension and scaling mode for backward().
        ctx.mode = process_group
        ctx.dim = dim
        ctx.grad_scale = grad_scale
        return _split(input_, process_group, dim)

    @staticmethod
    def backward(ctx, grad_output):
        if ctx.grad_scale in ("up", "down"):
            group_size = dist.get_world_size(ctx.mode)
            if ctx.grad_scale == "up":
                grad_output = grad_output * group_size
            else:
                grad_output = grad_output / group_size

        # One gradient slot per forward argument; only input_ gets a gradient.
        full_grad = _gather(grad_output, ctx.mode, ctx.dim)
        return full_grad, None, None, None


def split_forward_gather_backward(input_, process_group, dim, grad_scale=1.0):
    """Shard ``input_`` along ``dim`` in forward; all-gather its gradient in backward.

    Args:
        input_: Tensor to shard.
        process_group: Process group to shard across.
        dim: Dimension to shard along.
        grad_scale: Passed through to ``_SplitForwardGatherBackward``, whose
            backward only reacts to ``"up"`` and ``"down"``; the default
            ``1.0`` matches neither, so the gradient is left unscaled.

    Returns:
        The result of ``_SplitForwardGatherBackward.apply``.
    """
    split_op = _SplitForwardGatherBackward.apply
    return split_op(input_, process_group, dim, grad_scale)


def gather_forward_split_backward(input_, process_group, dim, grad_scale=None):
    """All-gather ``input_`` along ``dim`` in forward; shard its gradient in backward.

    Args:
        input_: Local tensor to gather.
        process_group: Process group to gather across.
        dim: Dimension along which gathered tensors are concatenated.
        grad_scale: Passed through to ``_GatherForwardSplitBackward``, whose
            backward reacts to ``"up"`` and ``"down"``; the default ``None``
            leaves the gradient unscaled.

    Returns:
        The result of ``_GatherForwardSplitBackward.apply``.
    """
    gather_op = _GatherForwardSplitBackward.apply
    return gather_op(input_, process_group, dim, grad_scale)


================================================
FILE: Open-Sora/build/lib/opensora/acceleration/parallel_states.py
================================================
import torch.distributed as dist

# Module-level registry of process groups, keyed by the strings "data" and
# "sequence" used by the setters/getters in this module.
_GLOBAL_PARALLEL_GROUPS = {}


def set_data_parallel_group(group: dist.ProcessGroup):
    """Store ``group`` under the ``"data"`` key of the module-level registry."""
    _GLOBAL_PARALLEL_GROUPS.update(data=group)


def get_data_parallel_group():
    """Return the registered ``"data"`` group, or ``dist.group.WORLD`` if none is set."""
    # The fallback is looked up on every call, as in a plain dict.get default.
    fallback = dist.group.WORLD
    return _GLOBAL_PARALLEL_GROUPS.get("data", fallback)


def set_sequence_parallel_group(group: dist.ProcessGroup):
    """Store ``group`` under the ``"sequence"`` key of the module-level registry."""
    _GLOBAL_PARALLEL_GROUPS.update(sequence=group)


def get_sequence_parallel_group():
    """Return the registered ``"sequence"`` group, or ``None`` if none is set."""
    return _GLOBAL_PARALLEL_GROUPS.get("sequence")


================================================
FILE: Open-Sora/build/lib/opensora/acceleration/plugin.py
================================================
import random
from typing import Optional

import numpy as np
import torch
from colossalai.booster.plugin import LowLevelZeroPlugin
from colossalai.cluster import ProcessGroupMesh
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

# Axis indices into the (dp_size, sp_size) process-group mesh built by
# ZeroSeqParallelPlugin: axis 0 is data parallel, axis 1 is sequence parallel.
DP_AXIS = 0
SP_AXIS = 1


class ZeroSeqParallelPlugin(LowLevelZeroPlugin):
    """``LowLevelZeroPlugin`` that additionally builds data/sequence parallel groups.

    The world is arranged as a ``(dp_size, sp_size)`` ``ProcessGroupMesh`` where
    ``dp_size = world_size // sp_size``. The resulting groups and this
    process's coordinates are exposed as ``dp_group``, ``sp_group``,
    ``dp_rank`` and ``sp_rank``.

    Args:
        sp_size: Sequence-parallel group size; must divide ``world_size``.

    All remaining constructor arguments are forwarded unchanged to
    ``LowLevelZeroPlugin.__init__``.
    """

    def __init__(
        self,
        sp_size: int = 1,
        stage: int = 2,
        precision: str = "fp16",
        initial_scale: float = 2**32,
        min_scale: float = 1,
        growth_factor: float = 2,
        backoff_factor: float = 0.5,
        growth_interval: int = 1000,
        hysteresis: int = 2,
        max_scale: float = 2**32,
        max_norm: float = 0.0,
        norm_type: float = 2.0,
        reduce_bucket_size_in_m: int = 12,
        communication_dtype: Optional[torch.dtype] = None,
        overlap_communication: bool = True,
        cpu_offload: bool = False,
        master_weights: bool = True,
        verbose: bool = False,
    ) -> None:
        super().__init__(
            stage=stage,
            precision=precision,
            initial_scale=initial_scale,
            min_scale=min_scale,
            growth_factor=growth_factor,
            backoff_factor=backoff_factor,
            growth_interval=growth_interval,
            hysteresis=hysteresis,
            max_scale=max_scale,
            max_norm=max_norm,
            norm_type=norm_type,
            reduce_bucket_size_in_m=reduce_bucket_size_in_m,
            communication_dtype=communication_dtype,
            overlap_communication=overlap_communication,
            cpu_offload=cpu_offload,
            master_weights=master_weights,
            verbose=verbose,
        )
        self.sp_size = sp_size
        assert self.world_size % sp_size == 0, "world_size must be divisible by sp_size"
        self.dp_size = self.world_size // sp_size
        # 2-D mesh: axis DP_AXIS spans data-parallel ranks, SP_AXIS spans
        # sequence-parallel ranks.
        self.pg_mesh = ProcessGroupMesh(self.dp_size, self.sp_size)
        self.dp_group = self.pg_mesh.get_group_along_axis(DP_AXIS)
        self.sp_group = self.pg_mesh.get_group_along_axis(SP_AXIS)
        self.dp_rank = self.pg_mesh.coordinate(DP_AXIS)
        self.sp_rank = self.pg_mesh.coordinate(SP_AXIS)

    def __del__(self):
        """Destroy the process groups in ProcessGroupMesh."""
        # `pg_mesh` is assigned late in __init__. If construction failed before
        # that point (e.g. the sp_size assertion), the attribute is missing and
        # there is nothing to tear down.
        pg_mesh = getattr(self, "pg_mesh", None)
        if pg_mesh is not None:
            pg_mesh.destroy_mesh_process_groups()

    def prepare_dataloader(
        self,
        dataset,
        batch_size,
        shuffle=False,
        seed=1024,
        drop_last=False,
        pin_memory=False,
        num_workers=0,
        distributed_sampler_cls=None,
        **kwargs,
    ):
        """Build a ``DataLoader`` whose sampler shards ``dataset`` over data-parallel ranks.

        Args:
            dataset: Dataset to load from.
            batch_size: Batch size passed to the ``DataLoader``.
            shuffle: Passed to the sampler.
            seed: Seed applied to ``numpy``, ``torch`` and ``random`` in each worker.
            drop_last: Passed to the ``DataLoader``.
            pin_memory: Passed to the ``DataLoader``.
            num_workers: Passed to the ``DataLoader``.
            distributed_sampler_cls: Sampler class to use; defaults to
                ``DistributedSampler``.
            **kwargs: Extra keyword arguments forwarded to the ``DataLoader``.

        Returns:
            The configured ``DataLoader``.
        """
        _kwargs = kwargs.copy()
        distributed_sampler_cls = distributed_sampler_cls or DistributedSampler
        # Shard by data-parallel coordinates, so ranks within one
        # sequence-parallel group share the same dp_rank and thus the same shard.
        sampler = distributed_sampler_cls(dataset, num_replicas=self.dp_size, rank=self.dp_rank, shuffle=shuffle)

        # Deterministic dataloader
        def seed_worker(worker_id):
            # `worker_id` is not used: every worker is seeded with the same value.
            worker_seed = seed
            np.random.seed(worker_seed)
            torch.manual_seed(worker_seed)
            random.seed(worker_seed)

        return DataLoader(
            dataset,
            batch_size=batch_size,
            sampler=sampler,
            worker_init_fn=seed_worker,
            drop_last=drop_last,
            pin_memory=pin_memory,
            num_workers=num_workers,
            **_kwargs,
        )


================================================
FILE: Open-Sora/build/lib/opensora/acceleration/shardformer/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/opensora/acceleration/shardformer/modeling/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/opensora/acceleration/shardformer/modeling/t5.py
================================================
import torch
import torch.nn as nn


class T5LayerNorm(nn.Module):
    """T5-style layer norm: scale by the root mean square, no mean subtraction, no bias."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Create the learnable scale (initialised to ones) and store the epsilon.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalise ``hidden_states`` over the last dimension and apply ``weight``."""
        # T5 only rescales (RMS norm, https://arxiv.org/abs/1910.07467): the
        # "variance" is the mean of squares without centring, and it is
        # accumulated in fp32 even for half-precision inputs.
        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.variance_epsilon)
        normed = hidden_states * inv_rms

        # Cast back down when the scale parameter is kept in half precision.
        if self.weight.dtype in (torch.float16, torch.bfloat16):
            normed = normed.to(self.weight.dtype)

        return self.weight * normed

    @staticmethod
    def from_native_module(module, *args, **kwargs):
        """Build a ``T5LayerNorm`` from a module whose class is named ``FusedRMSNorm``.

        The source module's ``normalized_shape``, ``eps`` and weight values are
        copied, and the result is moved to the source weight's device.
        """
        assert module.__class__.__name__ == "FusedRMSNorm", (
            "Recovering T5LayerNorm requires the original layer to be apex's Fused RMS Norm."
            "Apex's fused norm is automatically used by Hugging Face Transformers https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L265C5-L265C48"
        )

        replacement = T5LayerNorm(module.normalized_shape, eps=module.eps)
        replacement.weight.data.copy_(module.weight.data)
        return replacement.to(module.weight.device)


================================================
FILE: Open-Sora/build/lib/opensora/acceleration/shardformer/policy/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/opensora/acceleration/shardformer/policy/t5_encoder.py
================================================
from colossalai.shardformer.modeling.jit import get_jit_fused_dropout_add_func
from colossalai.shardformer.modeling.t5 import get_jit_fused_T5_layer_ff_forward, get_T5_layer_self_attention_forward
from colossalai.shardformer.policies.base_policy import Policy, SubModuleReplacementDescription


class T5EncoderPolicy(Policy):
    """Shardformer policy describing module/method replacements for a T5 encoder."""

    def config_sanity_check(self):
        """Assert that tensor parallelism and flash attention are both disabled."""
        assert not self.shard_config.enable_tensor_parallelism
        assert not self.shard_config.enable_flash_attention

    def preprocess(self):
        """Return the model unchanged."""
        return self.model

    def module_policy(self):
        """Build the replacement policy keyed by T5 layer classes.

        Layer norms in ``T5LayerFF``, ``T5LayerSelfAttention`` and ``T5Stack``
        are replaced by ``T5LayerNorm`` when that class can be imported. When
        ``shard_config.enable_jit_fused`` is set, the ``forward`` and
        ``dropout_add`` methods of the two layer classes are replaced as well.

        Returns:
            The policy dictionary filled in by the ``append_or_create_*`` helpers.
        """
        from transformers.models.t5.modeling_t5 import T5LayerFF, T5LayerSelfAttention, T5Stack

        replacements = {}

        # Best effort: if the import below fails, no layer-norm replacement is
        # registered.
        try:
            from opensora.acceleration.shardformer.modeling.t5 import T5LayerNorm

            # (submodule suffix, owning layer class) pairs whose norm is swapped.
            norm_targets = (
                ("layer_norm", T5LayerFF),
                ("layer_norm", T5LayerSelfAttention),
                ("final_layer_norm", T5Stack),
            )
            for norm_suffix, owner_cls in norm_targets:
                self.append_or_create_submodule_replacement(
                    description=SubModuleReplacementDescription(suffix=norm_suffix, target_module=T5LayerNorm),
                    policy=replacements,
                    target_key=owner_cls,
                )
        except ImportError:
            pass

        # use jit operator
        if self.shard_config.enable_jit_fused:
            forward_factories = (
                (T5LayerFF, get_jit_fused_T5_layer_ff_forward),
                (T5LayerSelfAttention, get_T5_layer_self_attention_forward),
            )
            for owner_cls, make_forward in forward_factories:
                self.append_or_create_method_replacement(
                    description={
                        "forward": make_forward(),
                        "dropout_add": get_jit_fused_dropout_add_func(),
                    },
                    policy=replacements,
                    target_key=owner_cls,
                )

        return replacements

    def postprocess(self):
        """Return the model unchanged."""
        return self.model


================================================
FILE: Open-Sora/build/lib/opensora/datasets/__init__.py
================================================
from .datasets import IMG_FPS, BatchFeatureDataset, VariableVideoTextDataset, VideoTextDataset
from .utils import get_transforms_image, get_transforms_video, is_img, is_vid, save_sample


================================================
FILE: Open-Sora/build/lib/opensora/datasets/aspect.py
================================================
import math


# computation
def get_h_w(a, ts, eps=1e-4):
    """Compute an integer ``(h, w)`` pair for aspect ratio ``a`` and target size ``ts``.

    ``h`` starts from ``sqrt(ts * a)`` and ``w`` from ``h / a`` (using the
    already rounded ``h``). Each value is nudged up by ``eps`` and then rounded
    to its ceiling when that ceiling is even, otherwise to its floor.

    Args:
        a: Aspect ratio, used as ``h / w``.
        ts: Target size, used as the approximate product ``h * w``.
        eps: Small offset added before rounding.

    Returns:
        Tuple ``(h, w)`` of integers.
    """

    def _snap(value):
        # Prefer the ceiling when it is even; otherwise fall back to the floor.
        upper = math.ceil(value)
        if upper % 2 == 0:
            return upper
        return math.floor(value)

    height = _snap((ts * a) ** 0.5 + eps)
    width = _snap(height / a + eps)
    return height, width


def get_aspect_ratios_dict(ars, ts=360 * 640):
    """Map each aspect ratio in ``ars`` to its ``get_h_w`` result.

    Args:
        ars: Iterable of numeric aspect ratios.
        ts: Target size forwarded to ``get_h_w`` (defaults to ``360 * 640``).

    Returns:
        Dict keyed by the ratio formatted with two decimals (``f"{a:.2f}"``).
    """
    sizes = {}
    for ratio in ars:
        # Ratios that format to the same key overwrite earlier entries.
        sizes[f"{ratio:.2f}"] = get_h_w(ratio, ts)
    return sizes


def get_ar(ratio):
    """Convert an ``"H:W"`` string into the float ``H / W``.

    Raises:
        ValueError: if ``ratio`` does not split into exactly two integer parts.
        ZeroDivisionError: if the width part is ``0``.
    """
    parts = ratio.split(":")
    height_text, width_text = parts
    return int(height_text) / int(width_text)


# Named aspect ratios written as "H:W", mapped to the two-decimal H/W string
# that is used as the key of the per-resolution tables in this module.
ASPECT_RATIO_MAP = {
    "3:8": "0.38",
    "9:21": "0.43",
    "12:25": "0.48",
    "1:2": "0.50",
    "9:17": "0.53",
    "27:50": "0.54",
    "9:16": "0.56",
    "5:8": "0.62",
    "2:3": "0.67",
    "3:4": "0.75",
    "1:1": "1.00",
    "4:3": "1.33",
    "3:2": "1.50",
    "16:9": "1.78",
    "17:9": "1.89",
    "2:1": "2.00",
    "50:27": "2.08",
}


# Float H/W value of every named ratio above, in the same order (evaluated at import time).
AR = [get_ar(ratio) for ratio in ASPECT_RATIO_MAP.keys()]

# 4K table: two-decimal H/W ratio string -> (height, width).
# The original note states these values were computed from the code above.
# S = 8294400 pixels (= 3840 * 2160, the "base" entry)
ASPECT_RATIO_4K = {
    "0.38": (1764, 4704),
    "0.43": (1886, 4400),
    "0.48": (1996, 4158),
    "0.50": (2036, 4072),
    "0.53": (2096, 3960),
    "0.54": (2118, 3918),
    "0.62": (2276, 3642),  # NOTE: listed before "0.56" here, unlike the other tables
    "0.56": (2160, 3840),  # base
    "0.67": (2352, 3528),
    "0.75": (2494, 3326),
    "1.00": (2880, 2880),
    "1.33": (3326, 2494),
    "1.50": (3528, 2352),
    "1.78": (3840, 2160),
    "1.89": (3958, 2096),
    "2.00": (4072, 2036),
    "2.08": (4156, 1994),
}

# 2K table: two-decimal H/W ratio string -> (height, width).
# S = 3686400 pixels (= 2560 * 1440, the "base" entry)
ASPECT_RATIO_2K = {
    "0.38": (1176, 3136),
    "0.43": (1256, 2930),
    "0.48": (1330, 2770),
    "0.50": (1358, 2716),
    "0.53": (1398, 2640),
    "0.54": (1412, 2612),
    "0.56": (1440, 2560),  # base
    "0.62": (1518, 2428),
    "0.67": (1568, 2352),
    "0.75": (1662, 2216),
    "1.00": (1920, 1920),
    "1.33": (2218, 1664),
    "1.50": (2352, 1568),
    "1.78": (2560, 1440),
    "1.89": (2638, 1396),
    "2.00": (2716, 1358),
    "2.08": (2772, 1330),
}

# 1080p table: two-decimal H/W ratio string -> (height, width).
# S = 2073600 pixels (= 1920 * 1080, the "base" entry)
ASPECT_RATIO_1080P = {
    "0.38": (882, 2352),
    "0.43": (942, 2198),
    "0.48": (998, 2080),
    "0.50": (1018, 2036),
    "0.53": (1048, 1980),
    "0.54": (1058, 1958),
    "0.56": (1080, 1920),  # base
    "0.62": (1138, 1820),
    "0.67": (1176, 1764),
    "0.75": (1248, 1664),
    "1.00": (1440, 1440),
    "1.33": (1662, 1246),
    "1.50": (1764, 1176),
    "1.78": (1920, 1080),
    "1.89": (1980, 1048),
    "2.00": (2036, 1018),
    "2.08": (2078, 998),
}

# 720p table: two-decimal H/W ratio string -> (height, width).
# S = 921600 pixels (= 1280 * 720, the "base" entry)
ASPECT_RATIO_720P = {
    "0.38": (588, 1568),
    "0.43": (628, 1466),
    "0.48": (666, 1388),
    "0.50": (678, 1356),
    "0.53": (698, 1318),
    "0.54": (706, 1306),
    "0.56": (720, 1280),  # base
    "0.62": (758, 1212),
    "0.67": (784, 1176),
    "0.75": (832, 1110),
    "1.00": (960, 960),
    "1.33": (1108, 832),
    "1.50": (1176, 784),
    "1.78": (1280, 720),
    "1.89": (1320, 698),
    "2.00": (1358, 680),
    "2.08": (1386, 666),
}

# 480p table: two-decimal H/W ratio string -> (height, width).
# S = 409920 pixels (= 854 * 480, the "base" entry)
ASPECT_RATIO_480P = {
    "0.38": (392, 1046),
    "0.43": (420, 980),
    "0.48": (444, 925),
    "0.50": (452, 904),
    "0.53": (466, 880),
    "0.54": (470, 870),
    "0.56": (480, 854),  # base
    "0.62": (506, 810),
    "0.67": (522, 784),
    "0.75": (554, 738),
    "1.00": (640, 640),
    "1.33": (740, 555),
    "1.50": (784, 522),
    "1.78": (854, 480),
    "1.89": (880, 466),
    "2.00": (906, 454),
    "2.08": (924, 444),
}

# 360p table: two-decimal H/W ratio string -> (height, width).
# S = 230400 pixels (= 640 * 360, the "base" entry)
ASPECT_RATIO_360P = {
    "0.38": (294, 784),
    "0.43": (314, 732),
    "0.48": (332, 692),
    "0.50": (340, 680),
    "0.53": (350, 662),
    "0.54": (352, 652),
    "0.56": (360, 640),  # base
    "0.62": (380, 608),
    "0.67": (392, 588),
    "0.75": (416, 554),
    "1.00": (480, 480),
    "1.33": (554, 416),
    "1.50": (588, 392),
    "1.78": (640, 360),
    "1.89": (660, 350),
    "2.00": (678, 340),
    "2.08": (692, 332),
}

# 240p table: two-decimal H/W ratio string -> (height, width).
# S = 102240 pixels (= 426 * 240, the "base" entry)
ASPECT_RATIO_240P = {
    "0.38": (196, 522),
    "0.43": (210, 490),
    "0.48": (222, 462),
    "0.50": (226, 452),
    "0.53": (232, 438),
    "0.54": (236, 436),
    "0.56": (240, 426),  # base
    "0.62": (252, 404),
    "0.67": (262, 393),
    "0.75": (276, 368),
    "1.00": (320, 320),
    "1.33": (370, 278),
    "1.50": (392, 262),
    "1.78": (426, 240),
    "1.89": (440, 232),
    "2.00": (452, 226),
    "2.08": (462, 222),
}

# 144p table: two-decimal H/W ratio string -> (height, width).
# S = 36864 pixels (= 256 * 144, the "base" entry)
ASPECT_RATIO_144P = {
    "0.38": (117, 312),
    "0.43": (125, 291),
    "0.48": (133, 277),
    "0.50": (135, 270),
    "0.53": (139, 262),
    "0.54": (141, 260),
    "0.56": (144, 256),  # base
    "0.62": (151, 241),
    "0.67": (156, 234),
    "0.75": (166, 221),
    "1.00": (192, 192),
    "1.33": (221, 165),
    "1.50": (235, 156),
    "1.78": (256, 144),
    "1.89": (263, 139),
    "2.00": (271, 135),
    "2.08": (277, 132),
}

# The tables from here on are taken from PixArt (per the original note).
# Each maps a two-decimal H/W ratio string -> (height, width).
# S = 8294400 pixels (= 2880 * 2880, the "1.00" entry)
ASPECT_RATIO_2880 = {
    "0.25": (1408, 5760),
    "0.26": (1408, 5568),
    "0.27": (1408, 5376),
    "0.28": (1408, 5184),
    "0.32": (1600, 4992),
    "0.33": (1600, 4800),
    "0.34": (1600, 4672),
    "0.40": (1792, 4480),
    "0.42": (1792, 4288),
    "0.47": (1920, 4096),
    "0.49": (1920, 3904),
    "0.51": (1920, 3776),
    "0.55": (2112, 3840),
    "0.59": (2112, 3584),
    "0.68": (2304, 3392),
    "0.72": (2304, 3200),
    "0.78": (2496, 3200),
    "0.83": (2496, 3008),
    "0.89": (2688, 3008),
    "0.93": (2688, 2880),
    "1.00": (2880, 2880),
    "1.07": (2880, 2688),
    "1.12": (3008, 2688),
    "1.21": (3008, 2496),
    "1.28": (3200, 2496),
    "1.39": (3200, 2304),
    "1.47": (3392, 2304),
    "1.70": (3584, 2112),
    "1.82": (3840, 2112),
    "2.03": (3904, 1920),
    "2.13": (4096, 1920),
    "2.39": (4288, 1792),
    "2.50": (4480, 1792),
    "2.92": (4672, 1600),
    "3.00": (4800, 1600),
    "3.12": (4992, 1600),
    "3.68": (5184, 1408),
    "3.82": (5376, 1408),
    "3.95": (5568, 1408),
    "4.00": (5760, 1408),
}

# 2048 table: two-decimal H/W ratio string -> (height, width).
# S = 4194304 pixels (= 2048 * 2048, the "1.00" entry)
ASPECT_RATIO_2048 = {
    "0.25": (1024, 4096),
    "0.26": (1024, 3968),
    "0.27": (1024, 3840),
    "0.28": (1024, 3712),
    "0.32": (1152, 3584),
    "0.33": (1152, 3456),
    "0.35": (1152, 3328),
    "0.40": (1280, 3200),
    "0.42": (1280, 3072),
    "0.48": (1408, 2944),
    "0.50": (1408, 2816),
    "0.52": (1408, 2688),
    "0.57": (1536, 2688),
    "0.60": (1536, 2560),
    "0.68": (1664, 2432),
    "0.72": (1664, 2304),
    "0.78": (1792, 2304),
    "0.82": (1792, 2176),
    "0.88": (1920, 2176),
    "0.94": (1920, 2048),
    "1.00": (2048, 2048),
    "1.07": (2048, 1920),
    "1.13": (2176, 1920),
    "1.21": (2176, 1792),
    "1.29": (2304, 1792),
    "1.38": (2304, 1664),
    "1.46": (2432, 1664),
    "1.67": (2560, 1536),
    "1.75": (2688, 1536),
    "2.00": (2816, 1408),
    "2.09": (2944, 1408),
    "2.40": (3072, 1280),
    "2.50": (3200, 1280),
    "2.89": (3328, 1152),
    "3.00": (3456, 1152),
    "3.11": (3584, 1152),
    "3.62": (3712, 1024),
    "3.75": (3840, 1024),
    "3.88": (3968, 1024),
    "4.00": (4096, 1024),
}

# 1024 table: two-decimal H/W ratio string -> (height, width).
# S = 1048576 pixels (= 1024 * 1024, the "1.00" entry)
ASPECT_RATIO_1024 = {
    "0.25": (512, 2048),
    "0.26": (512, 1984),
    "0.27": (512, 1920),
    "0.28": (512, 1856),
    "0.32": (576, 1792),
    "0.33": (576, 1728),
    "0.35": (576, 1664),
    "0.40": (640, 1600),
    "0.42": (640, 1536),
    "0.48": (704, 1472),
    "0.50": (704, 1408),
    "0.52": (704, 1344),
    "0.57": (768, 1344),
    "0.60": (768, 1280),
    "0.68": (832, 1216),
    "0.72": (832, 1152),
    "0.78": (896, 1152),
    "0.82": (896, 1088),
    "0.88": (960, 1088),
    "0.94": (960, 1024),
    "1.00": (1024, 1024),
    "1.07": (1024, 960),
    "1.13": (1088, 960),
    "1.21": (1088, 896),
    "1.29": (1152, 896),
    "1.38": (1152, 832),
    "1.46": (1216, 832),
    "1.67": (1280, 768),
    "1.75": (1344, 768),
    "2.00": (1408, 704),
    "2.09": (1472, 704),
    "2.40": (1536, 640),
    "2.50": (1600, 640),
    "2.89": (1664, 576),
    "3.00": (1728, 576),
    "3.11": (1792, 576),
    "3.62": (1856, 512),
    "3.75": (1920, 512),
    "3.88": (1984, 512),
    "4.00": (2048, 512),
}

# 512 table: two-decimal H/W ratio string -> (height, width).
# S = 262144 pixels (= 512 * 512, the "1.00" entry)
ASPECT_RATIO_512 = {
    "0.25": (256, 1024),
    "0.26": (256, 992),
    "0.27": (256, 960),
    "0.28": (256, 928),
    "0.32": (288, 896),
    "0.33": (288, 864),
    "0.35": (288, 832),
    "0.40": (320, 800),
    "0.42": (320, 768),
    "0.48": (352, 736),
    "0.50": (352, 704),
    "0.52": (352, 672),
    "0.57": (384, 672),
    "0.60": (384, 640),
    "0.68": (416, 608),
    "0.72": (416, 576),
    "0.78": (448, 576),
    "0.82": (448, 544),
    "0.88": (480, 544),
    "0.94": (480, 512),
    "1.00": (512, 512),
    "1.07": (512, 480),
    "1.13": (544, 480),
    "1.21": (544, 448),
    "1.29": (576, 448),
    "1.38": (576, 416),
    "1.46": (608, 416),
    "1.67": (640, 384),
    "1.75": (672, 384),
    "2.00": (704, 352),
    "2.09": (736, 352),
    "2.40": (768, 320),
    "2.50": (800, 320),
    "2.89": (832, 288),
    "3.00": (864, 288),
    "3.11": (896, 288),
    "3.62": (928, 256),
    "3.75": (960, 256),
    "3.88": (992, 256),
    "4.00": (1024, 256),
}

# 256 table: two-decimal H/W ratio string -> (height, width).
# S = 65536 pixels (= 256 * 256, the "1.00" entry)
ASPECT_RATIO_256 = {
    "0.25": (128, 512),
    "0.26": (128, 496),
    "0.27": (128, 480),
    "0.28": (128, 464),
    "0.32": (144, 448),
    "0.33": (144, 432),
    "0.35": (144, 416),
    "0.40": (160, 400),
    "0.42": (160, 384),
    "0.48": (176, 368),
    "0.50": (176, 352),
    "0.52": (176, 336),
    "0.57": (192, 336),
    "0.60": (192, 320),
    "0.68": (208, 304),
    "0.72": (208, 288),
    "0.78": (224, 288),
    "0.82": (224, 272),
    "0.88": (240, 272),
    "0.94": (240, 256),
    "1.00": (256, 256),
    "1.07": (256, 240),
    "1.13": (272, 240),
    "1.21": (272, 224),
    "1.29": (288, 224),
    "1.38": (288, 208),
    "1.46": (304, 208),
    "1.67": (320, 192),
    "1.75": (336, 192),
    "2.00": (352, 176),
    "2.09": (368, 176),
    "2.40": (384, 160),
    "2.50": (400, 160),
    "2.89": (416, 144),
    "3.00": (432, 144),
    "3.11": (448, 144),
    "3.62": (464, 128),
    "3.75": (480, 128),
    "3.88": (496, 128),
    "4.00": (512, 128),
}


def get_closest_ratio(height: float, width: float, ratios: dict):
    """Return the key of ``ratios`` whose float value is nearest to ``height / width``.

    Keys are parsed with ``float``; when two keys are equally close, the one that
    appears first in ``ratios`` wins.

    Raises:
        ZeroDivisionError: if ``width`` is ``0``.
        ValueError: if ``ratios`` is empty.
    """
    target = height / width

    def _gap(key):
        return abs(float(key) - target)

    return min(ratios.keys(), key=_gap)


# Resolution name -> (pixel budget S, ratio table for that resolution).
# Entries are listed by increasing pixel budget; "2880" and "4k" share the same S.
ASPECT_RATIOS = {
    "144p": (36864, ASPECT_RATIO_144P),
    "256": (65536, ASPECT_RATIO_256),
    "240p": (102240, ASPECT_RATIO_240P),
    "360p": (230400, ASPECT_RATIO_360P),
    "512": (262144, ASPECT_RATIO_512),
    "480p": (409920, ASPECT_RATIO_480P),
    "720p": (921600, ASPECT_RATIO_720P),
    "1024": (1048576, ASPECT_RATIO_1024),
    "1080p": (2073600, ASPECT_RATIO_1080P),
    "2k": (3686400, ASPECT_RATIO_2K),
    "2048": (4194304, ASPECT_RATIO_2048),
    "2880": (8294400, ASPECT_RATIO_2880),
    "4k": (8294400, ASPECT_RATIO_4K),
}


def get_num_pixels(name):
    """Return the pixel budget registered for resolution ``name`` in ``ASPECT_RATIOS``.

    Raises:
        KeyError: if ``name`` is not a known resolution.
    """
    num_pixels, _ratio_table = ASPECT_RATIOS[name]
    return num_pixels


def get_image_size(resolution, ar_ratio):
    """Look up the ``(height, width)`` entry for a resolution and an aspect ratio.

    Args:
        resolution: key of ``ASPECT_RATIOS`` (e.g. ``"720p"``).
        ar_ratio: either a named ratio from ``ASPECT_RATIO_MAP`` (e.g. ``"9:16"``)
            or a key of the resolution's ratio table (e.g. ``"0.56"``).

    Raises:
        KeyError: if ``resolution`` is unknown.
        AssertionError: if the ratio is not present in that resolution's table.
    """
    # Named ratios are translated; anything else is used as a table key directly.
    ar_key = ASPECT_RATIO_MAP.get(ar_ratio, ar_ratio)
    _, rs_dict = ASPECT_RATIOS[resolution]
    assert ar_key in rs_dict, f"Aspect ratio {ar_ratio} not found for resolution {resolution}"
    return rs_dict[ar_key]


# Named clip lengths -> number of frames.
# The "Nx" and "Ns" keys are two spellings of the same five frame counts
# (51, 102, 204, 408, 816): "1x" == "2s", "2x" == "4s", and so on.
NUM_FRAMES_MAP = {
    "1x": 51,
    "2x": 102,
    "4x": 204,
    "8x": 408,
    "16x": 816,
    "2s": 51,
    "4s": 102,
    "8s": 204,
    "16s": 408,
    "32s": 816,
}


def get_num_frames(num_frames):
    """Resolve a frame-count spec to an int.

    Named specs found in ``NUM_FRAMES_MAP`` (e.g. ``"2x"``, ``"4s"``) return the
    mapped value; anything else is converted with ``int``.
    """
    is_named = num_frames in NUM_FRAMES_MAP
    return NUM_FRAMES_MAP[num_frames] if is_named else int(num_frames)


================================================
FILE: Open-Sora/build/lib/opensora/datasets/bucket.py
================================================
from collections import OrderedDict

import numpy as np

from opensora.utils.misc import get_logger

from .aspect import ASPECT_RATIOS, get_closest_ratio


def find_approximate_hw(hw, hw_dict, approx=0.8):
    """Return the first key of ``hw_dict`` whose value, scaled by ``approx``, is at most ``hw``.

    Entries are examined in the dict's iteration order; ``None`` is returned when
    no entry satisfies ``hw >= value * approx``.
    """
    matches = (name for name, pixels in hw_dict.items() if hw >= pixels * approx)
    return next(matches, None)


def find_closet_smaller_bucket(t, t_dict, frame_interval):
    """Pick a time bucket key from ``t_dict`` for a sample with ``t`` frames.

    An image (``t == 1``) only fits the bucket keyed ``1``. For a video, the first
    entry (in iteration order) with ``t >= value * frame_interval`` and
    ``value != 1`` is chosen. ``None`` means nothing fits.
    """
    # image: only the dedicated single-frame bucket qualifies
    if t == 1:
        return 1 if 1 in t_dict else None

    # video: first bucket that is short enough and is not the image bucket
    candidates = (
        key for key, length in t_dict.items() if t >= length * frame_interval and length != 1
    )
    return next(candidates, None)


class Bucket:
    """Lookup structure that assigns samples to (resolution, frames, aspect-ratio) buckets.

    ``bucket_config`` maps a resolution name (a key of ``ASPECT_RATIOS``) to a dict
    keyed by frame count; each value is indexed as ``[0]`` for the keep probability
    and ``[1]`` for the batch size. The probability may itself be a 2-tuple, in
    which case ``get_bucket_id`` uses element ``1`` as a per-frame-count gate and
    element ``0`` as the final acceptance probability.

    Resolutions are ordered by decreasing pixel count and frame counts by
    decreasing value, so larger buckets are tried first.
    """

    def __init__(self, bucket_config):
        for key in bucket_config:
            assert key in ASPECT_RATIOS, f"Aspect ratio {key} not found."
        # wrap config with OrderedDict, largest resolution / longest clip first
        bucket_probs = OrderedDict()
        bucket_bs = OrderedDict()
        bucket_names = sorted(bucket_config.keys(), key=lambda x: ASPECT_RATIOS[x][0], reverse=True)
        for key in bucket_names:
            bucket_time_names = sorted(bucket_config[key].keys(), reverse=True)
            bucket_probs[key] = OrderedDict({k: bucket_config[key][k][0] for k in bucket_time_names})
            bucket_bs[key] = OrderedDict({k: bucket_config[key][k][1] for k in bucket_time_names})

        # first level: HW
        num_bucket = 0
        hw_criteria = dict()
        t_criteria = dict()
        ar_criteria = dict()
        bucket_id = OrderedDict()
        bucket_id_cnt = 0
        for k1, v1 in bucket_probs.items():
            hw_criteria[k1] = ASPECT_RATIOS[k1][0]
            t_criteria[k1] = dict()
            ar_criteria[k1] = dict()
            bucket_id[k1] = dict()
            # second level: number of frames
            for k2, _ in v1.items():
                t_criteria[k1][k2] = k2
                # running integer id per (resolution, frames) pair; used as an RNG seed offset
                bucket_id[k1][k2] = bucket_id_cnt
                bucket_id_cnt += 1
                ar_criteria[k1][k2] = dict()
                # third level: aspect ratio -> (height, width)
                for k3, v3 in ASPECT_RATIOS[k1][1].items():
                    ar_criteria[k1][k2][k3] = v3
                    num_bucket += 1

        self.bucket_probs = bucket_probs
        self.bucket_bs = bucket_bs
        self.bucket_id = bucket_id
        self.hw_criteria = hw_criteria
        self.t_criteria = t_criteria
        self.ar_criteria = ar_criteria
        self.num_bucket = num_bucket
        get_logger().info("Number of buckets: %s", num_bucket)

    def get_bucket_id(self, T, H, W, frame_interval=1, seed=None):
        """Choose a bucket for a sample with ``T`` frames of size ``H`` x ``W``.

        Args:
            T: number of frames; ``1`` is treated as an image.
            H: sample height.
            W: sample width.
            frame_interval: multiplier applied to a bucket's frame count before
                comparing it with ``T``.
            seed: integer base seed. Each (resolution, frames) pair draws from a
                generator seeded with ``seed + bucket id``. With ``None`` an
                unseeded generator is used, so the result is not reproducible.

        Returns:
            ``(hw_id, t_id, ar_id)`` for the selected bucket, or ``None`` when no
            bucket accepts the sample.
        """
        resolution = H * W
        approx = 0.8

        def make_rng(bucket_offset):
            # The declared default seed=None used to fail on `None + int`.
            if seed is None:
                return np.random.default_rng()
            return np.random.default_rng(seed + bucket_offset)

        fail = True
        for hw_id, t_criteria in self.bucket_probs.items():
            # skip resolutions the sample is too small for
            if resolution < self.hw_criteria[hw_id] * approx:
                continue

            # if sample is an image
            if T == 1:
                if 1 in t_criteria:
                    rng = make_rng(self.bucket_id[hw_id][1])
                    if rng.random() < t_criteria[1]:
                        fail = False
                        t_id = 1
                        break
                else:
                    continue

            # otherwise, find suitable t_id for video
            t_fail = True
            for t_id, prob in t_criteria.items():
                rng = make_rng(self.bucket_id[hw_id][t_id])
                if isinstance(prob, tuple):
                    # tuple form: element 1 gates this frame count
                    prob_t = prob[1]
                    if rng.random() > prob_t:
                        continue
                if T > t_id * frame_interval and t_id != 1:
                    t_fail = False
                    break
            if t_fail:
                continue

            # leave the loop if prob is high enough
            if isinstance(prob, tuple):
                prob = prob[0]
            if prob >= 1 or rng.random() < prob:
                fail = False
                break
        if fail:
            return None

        # get aspect ratio id
        ar_criteria = self.ar_criteria[hw_id][t_id]
        ar_id = get_closest_ratio(H, W, ar_criteria)
        return hw_id, t_id, ar_id

    def get_thw(self, bucket_id):
        """Return ``(T, H, W)`` for a ``(hw_id, t_id, ar_id)`` bucket id."""
        assert len(bucket_id) == 3
        T = self.t_criteria[bucket_id[0]][bucket_id[1]]
        H, W = self.ar_criteria[bucket_id[0]][bucket_id[1]][bucket_id[2]]
        return T, H, W

    def get_prob(self, bucket_id):
        """Return the configured probability entry for the bucket's (hw_id, t_id)."""
        return self.bucket_probs[bucket_id[0]][bucket_id[1]]

    def get_batch_size(self, bucket_id):
        """Return the configured batch size for the bucket's (hw_id, t_id)."""
        return self.bucket_bs[bucket_id[0]][bucket_id[1]]

    def __len__(self):
        """Number of (resolution, frames, aspect-ratio) buckets."""
        return self.num_bucket


def closet_smaller_bucket(value, bucket):
    """Return the element just before the first element (from index 1 on) that exceeds ``value``.

    ``bucket`` is scanned left to right; if no element after the first is
    strictly greater than ``value``, the last element is returned.

    Raises:
        IndexError: if ``bucket`` is empty.
    """
    for idx in range(len(bucket) - 1):
        if value < bucket[idx + 1]:
            return bucket[idx]
    return bucket[-1]


================================================
FILE: Open-Sora/build/lib/opensora/datasets/dataloader.py
================================================
import collections
import random
from typing import Optional

import numpy as np
import torch
from torch.distributed import ProcessGroup
from torch.distributed.distributed_c10d import _get_default_group
from torch.utils.data import DataLoader

from .datasets import BatchFeatureDataset, VariableVideoTextDataset, VideoTextDataset
from .sampler import BatchDistributedSampler, StatefulDistributedSampler, VariableVideoBatchSampler


# Deterministic dataloader
def get_seed_worker(seed):
    """Create a ``worker_init_fn`` that seeds NumPy, PyTorch and ``random`` with ``seed``.

    NOTE(review): the returned function ignores ``worker_id``, so every worker
    starts from the same RNG state - confirm this is intended.
    """

    def seed_worker(worker_id):
        # same order as before: numpy, then torch, then the stdlib generator
        for seeder in (np.random.seed, torch.manual_seed, random.seed):
            seeder(seed)

    return seed_worker


def prepare_dataloader(
    dataset,
    batch_size=None,
    shuffle=False,
    seed=1024,
    drop_last=False,
    pin_memory=False,
    num_workers=0,
    process_group: Optional[ProcessGroup] = None,
    bucket_config=None,
    num_bucket_build_workers=1,
    prefetch_factor=None,
    **kwargs,
):
    """Build a distributed ``DataLoader`` and its sampler for a supported dataset.

    The sampler and collate function depend on the dataset type:

    * ``VariableVideoTextDataset``: ``VariableVideoBatchSampler`` used as the
      ``batch_sampler`` (``batch_size`` is not used).
    * ``VideoTextDataset``: ``StatefulDistributedSampler`` with ``batch_size``.
    * ``BatchFeatureDataset``: ``BatchDistributedSampler`` with a fixed batch size of 1
      and ``collate_fn_batch``.

    Args:
        dataset: one of the dataset types listed above.
        batch_size: samples per batch for ``VideoTextDataset``.
        shuffle: forwarded to the sampler (not used for ``BatchFeatureDataset``).
        seed: seed for the worker init function, and for the variable-video sampler.
        drop_last: whether to drop the last incomplete batch.
        pin_memory: forwarded to ``DataLoader``.
        num_workers: forwarded to ``DataLoader``.
        process_group: group providing world size and rank; when ``None`` the
            default process group is used for every dataset type.
        bucket_config: bucket configuration for ``VariableVideoBatchSampler``.
        num_bucket_build_workers: forwarded to ``VariableVideoBatchSampler``.
        prefetch_factor: forwarded to ``DataLoader``.
        **kwargs: extra keyword arguments forwarded to ``DataLoader``.

    Returns:
        Tuple ``(dataloader, sampler)``.

    Raises:
        ValueError: if ``dataset`` is not one of the supported types.
    """
    _kwargs = kwargs.copy()

    # Reject unsupported datasets before touching torch.distributed, so the
    # ValueError does not depend on a process group being initialised.
    if not isinstance(dataset, (VariableVideoTextDataset, VideoTextDataset, BatchFeatureDataset)):
        raise ValueError(f"Unsupported dataset type: {type(dataset)}")

    # Previously only the VideoTextDataset branch applied this fallback; the other
    # branches failed on `None.size()` when no group was passed.
    process_group = process_group or _get_default_group()
    num_replicas = process_group.size()
    rank = process_group.rank()

    def _make_loader(**specific):
        # arguments shared by every DataLoader built here
        return DataLoader(
            dataset,
            worker_init_fn=get_seed_worker(seed),
            pin_memory=pin_memory,
            num_workers=num_workers,
            prefetch_factor=prefetch_factor,
            **specific,
            **_kwargs,
        )

    # VariableVideoTextDataset must be tested before its base class VideoTextDataset.
    if isinstance(dataset, VariableVideoTextDataset):
        batch_sampler = VariableVideoBatchSampler(
            dataset,
            bucket_config,
            num_replicas=num_replicas,
            rank=rank,
            shuffle=shuffle,
            seed=seed,
            drop_last=drop_last,
            verbose=True,
            num_bucket_build_workers=num_bucket_build_workers,
        )
        loader = _make_loader(batch_sampler=batch_sampler, collate_fn=collate_fn_default)
        return loader, batch_sampler

    if isinstance(dataset, VideoTextDataset):
        sampler = StatefulDistributedSampler(
            dataset,
            num_replicas=num_replicas,
            rank=rank,
            shuffle=shuffle,
        )
        loader = _make_loader(
            batch_size=batch_size,
            sampler=sampler,
            drop_last=drop_last,
            collate_fn=collate_fn_default,
        )
        return loader, sampler

    # BatchFeatureDataset: each item is already a full batch, hence batch_size=1
    sampler = BatchDistributedSampler(
        dataset,
        num_replicas=num_replicas,
        rank=rank,
    )
    loader = _make_loader(batch_size=1, sampler=sampler, collate_fn=collate_fn_batch)
    return loader, sampler


def collate_fn_default(batch):
    """Collate a list of sample dicts, skipping ``None`` entries.

    When the first remaining sample carries an integer ``"mask"``, the samples are
    treated as holding precomputed text features: ``"mask"`` values are gathered
    into a plain list and ``"text"`` tensors are concatenated along dim 1; both are
    removed from the samples before the default collation and re-attached after.

    Raises:
        IndexError: if every entry of ``batch`` is ``None``.
    """
    samples = [item for item in batch if item is not None]

    # HACK: for loading text features
    head = samples[0]
    has_text_features = "mask" in head and isinstance(head["mask"], int)
    masks = texts = None
    if has_text_features:
        masks = [item.pop("mask") for item in samples]
        texts = torch.cat([item.pop("text") for item in samples], dim=1)

    collated = torch.utils.data.default_collate(samples)
    if not has_text_features:
        return collated

    collated["mask"] = masks
    collated["text"] = texts
    return collated


def collate_fn_batch(batch):
    """
    Collate for loaders whose items are already whole batches (BatchDistributedSampler).

    ``None`` entries are dropped, the rest goes through ``default_collate``, and the
    leading dimension it adds is squeezed away from every tensor in the result.

    Raises:
        TypeError: if the collated result is not a mapping, sequence or tensor.
    """
    kept = [item for item in batch if item is not None]
    res = torch.utils.data.default_collate(kept)

    def _drop_leading_dim(value):
        # non-tensor values are passed through untouched
        return value.squeeze(0) if isinstance(value, torch.Tensor) else value

    if isinstance(res, collections.abc.Mapping):
        for key in list(res.keys()):
            if isinstance(res[key], torch.Tensor):
                res[key] = res[key].squeeze(0)
        return res
    if isinstance(res, collections.abc.Sequence):
        return [_drop_leading_dim(entry) for entry in res]
    if isinstance(res, torch.Tensor):
        return res.squeeze(0)
    raise TypeError


================================================
FILE: Open-Sora/build/lib/opensora/datasets/datasets.py
================================================
import os
from glob import glob

import numpy as np
import torch
from PIL import ImageFile
from torchvision.datasets.folder import IMG_EXTENSIONS, pil_loader

from opensora.registry import DATASETS

from .read_video import read_video
from .utils import VID_EXTENSIONS, get_transforms_image, get_transforms_video, read_file, temporal_random_crop

# Let PIL load image files that are truncated instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# fps value reported for still-image samples by the datasets in this module.
IMG_FPS = 120


@DATASETS.register_module()
class VideoTextDataset(torch.utils.data.Dataset):
    """Fixed-size video/image dataset driven by a metadata table.

    The table is loaded with ``read_file(data_path)`` and must contain a ``path``
    column; captions are returned when a ``text`` column is present.

    Args:
        data_path: location of the metadata file.
        num_frames (int): frames per sample; an image is repeated this many times.
        frame_interval (int): forwarded to ``temporal_random_crop``.
        image_size (tuple): size handed to the image/video transforms.
        transform_name (str): name of the transform family to build.
    """

    def __init__(
        self,
        data_path=None,
        num_frames=16,
        frame_interval=1,
        image_size=(256, 256),
        transform_name="center",
    ):
        self.data_path = data_path
        self.data = read_file(data_path)
        self.get_text = "text" in self.data.columns
        self.num_frames = num_frames
        self.frame_interval = frame_interval
        self.image_size = image_size
        image_transform = get_transforms_image(transform_name, image_size)
        video_transform = get_transforms_video(transform_name, image_size)
        self.transforms = {"image": image_transform, "video": video_transform}

    def _print_data_number(self):
        """Print how many entries are videos and how many are images."""
        kinds = [self.get_type(path) for path in self.data["path"]]
        num_videos = kinds.count("video")
        num_images = len(kinds) - num_videos
        print(f"Dataset contains {num_videos} videos and {num_images} images.")

    def get_type(self, path):
        """Classify ``path`` as ``"video"`` or ``"image"`` by its lower-cased extension."""
        ext = os.path.splitext(path)[-1].lower()
        if ext in VID_EXTENSIONS:
            return "video"
        assert ext in IMG_EXTENSIONS, f"Unsupported file format: {ext}"
        return "image"

    def getitem(self, index):
        """Load and transform the sample at ``index`` without any retry logic."""
        row = self.data.iloc[index]
        path = row["path"]

        if self.get_type(path) == "video":
            # decode, then crop a window of num_frames frames
            vframes, vinfo = read_video(path, backend="av")
            video_fps = vinfo["video_fps"] if "video_fps" in vinfo else 24
            clip = temporal_random_crop(vframes, self.num_frames, self.frame_interval)
            video = self.transforms["video"](clip)  # T C H W
        else:
            image = pil_loader(path)
            video_fps = IMG_FPS
            image = self.transforms["image"](image)
            # tile the still image along a new leading time axis
            video = image.unsqueeze(0).repeat(self.num_frames, 1, 1, 1)

        # TCHW -> CTHW
        video = video.permute(1, 0, 2, 3)

        item = {"video": video, "fps": video_fps}
        if self.get_text:
            item["text"] = row["text"]
        return item

    def __getitem__(self, index):
        """Return a sample, replacing unreadable entries with random ones.

        Up to 10 attempts are made; each failure is printed and a new random index
        is drawn.

        Raises:
            RuntimeError: if all 10 attempts fail.
        """
        max_attempts = 10
        for _attempt in range(max_attempts):
            try:
                return self.getitem(index)
            except Exception as e:
                path = self.data.iloc[index]["path"]
                print(f"data {path}: {e}")
                index = np.random.randint(len(self))
        raise RuntimeError("Too many bad data.")

    def __len__(self):
        """Number of rows in the metadata table."""
        return len(self.data)


@DATASETS.register_module()
class VariableVideoTextDataset(VideoTextDataset):
    """Video/image dataset whose clip length and frame size are chosen per sample.

    The target ``(num_frames, height, width)`` is not fixed at construction time;
    it is encoded into the index passed to ``__getitem__`` as the string
    ``"<row>-<num_frames>-<height>-<width>"``.

    Args:
        data_path: location of the metadata file (see ``VideoTextDataset``).
        num_frames: stored by the base class; the per-sample value comes from the index.
        frame_interval (int): forwarded to ``temporal_random_crop``; video fps is
            floor-divided by it.
        image_size (tuple): stored by the base class.
        transform_name: transform family used to build per-sample transforms.
        dummy_text_feature (bool): when true, replace the caption with a zero
            tensor of shape ``(1, 50, 1152)`` and set ``mask`` to ``50``.
    """

    def __init__(
        self,
        data_path=None,
        num_frames=None,
        frame_interval=1,
        image_size=(None, None),
        transform_name=None,
        dummy_text_feature=False,
    ):
        # The base class receives transform_name=None; transforms are built per sample instead.
        super().__init__(data_path, num_frames, frame_interval, image_size, transform_name=None)
        self.transform_name = transform_name
        self.data["id"] = np.arange(len(self.data))
        self.dummy_text_feature = dummy_text_feature

    def get_data_info(self, index):
        """Return ``(num_frames, height, width)`` from the metadata row at ``index``."""
        T = self.data.iloc[index]["num_frames"]
        H = self.data.iloc[index]["height"]
        W = self.data.iloc[index]["width"]
        return T, H, W

    def getitem(self, index):
        """Load one sample described by an ``"<row>-<num_frames>-<height>-<width>"`` index."""
        # a hack to pass in the (time, height, width) info from sampler
        index, num_frames, height, width = [int(val) for val in index.split("-")]

        sample = self.data.iloc[index]
        path = sample["path"]
        file_type = self.get_type(path)
        ar = height / width

        video_fps = 24  # default fps
        if file_type == "video":
            # loading
            vframes, vinfo = read_video(path, backend="av")
            video_fps = vinfo["video_fps"] if "video_fps" in vinfo else 24

            # Sampling video frames
            video = temporal_random_crop(vframes, num_frames, self.frame_interval)
            # copy the crop so the full decoded tensor can be released
            video = video.clone()
            del vframes

            video_fps = video_fps // self.frame_interval

            # transform
            transform = get_transforms_video(self.transform_name, (height, width))
            video = transform(video)  # T C H W
        else:
            # loading
            image = pil_loader(path)
            video_fps = IMG_FPS

            # transform
            transform = get_transforms_image(self.transform_name, (height, width))
            image = transform(image)

            # single frame: add a leading time axis (no repetition here)
            video = image.unsqueeze(0)

        # TCHW -> CTHW
        video = video.permute(1, 0, 2, 3)
        ret = {
            "video": video,
            "num_frames": num_frames,
            "height": height,
            "width": width,
            "ar": ar,
            "fps": video_fps,
        }
        if self.get_text:
            ret["text"] = sample["text"]
        if self.dummy_text_feature:
            text_len = 50
            ret["text"] = torch.zeros((1, text_len, 1152))
            ret["mask"] = text_len
        return ret

    def __getitem__(self, index):
        """Return the sample for ``index``, or ``None`` if loading it fails.

        Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt`` and
        ``SystemExit`` still propagate.
        """
        try:
            return self.getitem(index)
        except Exception:
            return None


@DATASETS.register_module()
class BatchFeatureDataset(torch.utils.data.Dataset):
    """
    The dataset is composed of multiple .bin files.
    Each .bin file is a list of batch data (like a buffer). All .bin files have the same length.
    In each training iteration, one batch is fetched from the current buffer.
    Once a buffer is consumed, load another one.
    Avoid loading the same .bin on two different GPUs, i.e., one .bin is assigned to one GPU only.

    Args:
        data_path (str): directory searched recursively for ``*.bin`` files.

    Raises:
        FileNotFoundError: if no ``.bin`` file is found under ``data_path``.
    """

    def __init__(self, data_path=None):
        # recursive=True is required for "**" to span any directory depth;
        # without it "**" behaves like "*" and matches exactly one level.
        self.path_list = sorted(glob(data_path + "/**/*.bin", recursive=True))
        if not self.path_list:
            raise FileNotFoundError(f"No .bin files found under {data_path}")

        # NOTE(review): torch.load unpickles the file; only use trusted .bin files.
        # The first buffer is loaded once only to learn the common buffer length.
        self._len_buffer = len(torch.load(self.path_list[0]))
        self._num_buffers = len(self.path_list)
        self.num_samples = self.len_buffer * len(self.path_list)

        # index of the file currently held in memory; -1 means nothing is loaded
        self.cur_file_idx = -1
        self.cur_buffer = None

    @property
    def num_buffers(self):
        """Number of .bin files."""
        return self._num_buffers

    @property
    def len_buffer(self):
        """Number of batches stored in each .bin file (measured on the first file)."""
        return self._len_buffer

    def _load_buffer(self, idx):
        """Make sure the buffer containing global index ``idx`` is loaded."""
        file_idx = idx // self.len_buffer
        if file_idx != self.cur_file_idx:
            self.cur_file_idx = file_idx
            self.cur_buffer = torch.load(self.path_list[file_idx])

    def __len__(self):
        """Total number of batches across all buffers."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return the stored batch at global index ``idx`` with renamed keys."""
        self._load_buffer(idx)

        batch = self.cur_buffer[idx % self.len_buffer]  # dict; keys are {'x', 'fps'} and text related

        ret = {
            "video": batch["x"],
            "text": batch["y"],
            "mask": batch["mask"],
            "fps": batch["fps"],
            "height": batch["height"],
            "width": batch["width"],
            "num_frames": batch["num_frames"],
        }
        return ret


================================================
FILE: Open-Sora/build/lib/opensora/datasets/read_video.py
================================================
import gc
import math
import os
import re
import warnings
from fractions import Fraction
from typing import Any, Dict, List, Optional, Tuple, Union

import av
import cv2
import numpy as np
import torch
from torchvision import get_video_backend
from torchvision.io.video import _check_av_available

# Fallback frame count used to size the frame buffer when the container reports 0 frames.
MAX_NUM_FRAMES = 2500


def read_video_av(
    filename: str,
    start_pts: Union[float, Fraction] = 0,
    end_pts: Optional[Union[float, Fraction]] = None,
    pts_unit: str = "pts",
    output_format: str = "THWC",
) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]:
    """
    Reads the video frames of a file with PyAV; audio is never decoded.

    This method is modified from torchvision.io.video.read_video, with the following changes:

    1. will not extract audio frames and return empty for aframes
    2. remove checks and only support pyav
    3. add container.close() and gc.collect() to avoid thread leakage
    4. try our best to avoid memory leak

    The file is opened twice: once to probe fps, frame size and frame count, and
    once to decode. If decoding raises ``av.AVError`` a warning is printed and the
    preallocated all-zero frame buffer is returned.

    Args:
        filename (str): path to the video file
        start_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional):
            The start presentation time of the video
        end_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional):
            The end presentation time
        pts_unit (str, optional): unit in which start_pts and end_pts values will be interpreted,
            either 'pts' or 'sec'. Defaults to 'pts'.
        output_format (str, optional): The format of the output video tensors. Can be either "THWC" (default) or "TCHW".

    Returns:
        vframes (Tensor[T, H, W, C] or Tensor[T, C, H, W]): the `T` video frames
        aframes (Tensor[1, 0]): always an empty float32 tensor
        info (Dict): metadata; contains ``video_fps`` (float) when the stream reports an average rate

    Raises:
        ValueError: if ``output_format`` is invalid or ``end_pts < start_pts``.
        RuntimeError: if ``filename`` does not exist.
        AssertionError: if the torchvision video backend is not ``"pyav"``.
    """
    # format
    output_format = output_format.upper()
    if output_format not in ("THWC", "TCHW"):
        raise ValueError(f"output_format should be either 'THWC' or 'TCHW', got {output_format}.")
    # file existence
    if not os.path.exists(filename):
        raise RuntimeError(f"File not found: {filename}")
    # backend check
    assert get_video_backend() == "pyav", "pyav backend is required for read_video_av"
    _check_av_available()
    # end_pts check
    if end_pts is None:
        end_pts = float("inf")
    if end_pts < start_pts:
        raise ValueError(f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}")

    # == get video info ==
    info = {}
    # TODO: creating an container leads to memory leak (1G for 8 workers 1 GPU)
    container = av.open(filename, metadata_errors="ignore")
    try:
        # fps
        video_fps = container.streams.video[0].average_rate
        # guard against potentially corrupted files
        if video_fps is not None:
            info["video_fps"] = float(video_fps)
        # decode a single frame only to learn the frame size
        iter_video = container.decode(**{"video": 0})
        frame = next(iter_video).to_rgb().to_ndarray()
        height, width = frame.shape[:2]
        total_frames = container.streams.video[0].frames
        if total_frames == 0:
            total_frames = MAX_NUM_FRAMES
            warnings.warn(f"total_frames is 0, using {MAX_NUM_FRAMES} as a fallback")
    finally:
        # close the probing container even when the probe itself fails
        container.close()
    del container

    # HACK: must create before iterating stream
    # use np.zeros will not actually allocate memory
    # use np.ones will lead to a little memory leak
    video_frames = np.zeros((total_frames, height, width, 3), dtype=np.uint8)

    # == read ==
    try:
        # TODO: The reading has memory leak (4G for 8 workers 1 GPU)
        container = av.open(filename, metadata_errors="ignore")
        assert container.streams.video is not None
        video_frames = _read_from_stream(
            video_frames,
            container,
            start_pts,
            end_pts,
            pts_unit,
            container.streams.video[0],
            {"video": 0},
            filename=filename,
        )
    except av.AVError as e:
        print(f"[Warning] Error while reading video {filename}: {e}")

    # clone so the returned tensor does not share memory with the numpy buffer
    vframes = torch.from_numpy(video_frames).clone()
    del video_frames
    if output_format == "TCHW":
        # [T,H,W,C] --> [T,C,H,W]
        vframes = vframes.permute(0, 3, 1, 2)

    aframes = torch.empty((1, 0), dtype=torch.float32)
    return vframes, aframes, info


def _read_from_stream(
    video_frames,
    container: "av.container.Container",
    start_offset: float,
    end_offset: float,
    pts_unit: str,
    stream: "av.stream.Stream",
    stream_name: Dict[str, Optional[Union[int, Tuple[int, ...], List[int]]]],
    filename: Optional[str] = None,
) -> np.ndarray:
    """Decode frames from ``container`` into ``video_frames`` and return the requested window.

    Args:
        video_frames: pre-allocated array that decoded RGB frames are written into,
            indexed by frame along its first axis. It bounds how many frames are decoded.
        container: opened PyAV container. It is closed by this function.
        start_offset: start of the window (seconds when ``pts_unit == "sec"``, else pts).
        end_offset: end of the window; ``float("inf")`` means "until the end".
        pts_unit: ``"sec"`` converts the offsets with ``stream.time_base``; any other
            value leaves them untouched and emits a warning.
        stream: stream used for seeking and for the DivX packed-B-frame check.
        stream_name: keyword arguments forwarded to ``container.decode``.
        filename: only used in warning messages.

    Returns:
        A copy of the slice of ``video_frames`` that falls inside the window. If seeking
        fails, an empty slice of ``video_frames`` is returned so the caller still receives
        an array.
    """
    if pts_unit == "sec":
        # TODO: we should change all of this from ground up to simply take
        # sec and convert to MS in C++
        start_offset = int(math.floor(start_offset * (1 / stream.time_base)))
        if end_offset != float("inf"):
            end_offset = int(math.ceil(end_offset * (1 / stream.time_base)))
    else:
        warnings.warn("The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'.")

    should_buffer = True
    max_buffer_size = 5
    if stream.type == "video":
        # DivX-style packed B-frames can have out-of-order pts (2 frames in a single pkt)
        # so need to buffer some extra frames to sort everything
        # properly
        extradata = stream.codec_context.extradata
        # overly complicated way of finding if `divx_packed` is set, following
        # https://github.com/FFmpeg/FFmpeg/commit/d5a21172283572af587b3d939eba0091484d3263
        if extradata and b"DivX" in extradata:
            # can't use regex directly because of some weird characters sometimes...
            pos = extradata.find(b"DivX")
            d = extradata[pos:]
            o = re.search(rb"DivX(\d+)Build(\d+)(\w)", d)
            if o is None:
                o = re.search(rb"DivX(\d+)b(\d+)(\w)", d)
            if o is not None:
                should_buffer = o.group(3) == b"p"
    seek_offset = start_offset
    # some files don't seek to the right location, so better be safe here
    seek_offset = max(seek_offset - 1, 0)
    if should_buffer:
        # FIXME this is kind of a hack, but we will jump to the previous keyframe
        # so this will be safe
        seek_offset = max(seek_offset - max_buffer_size, 0)
    try:
        # TODO check if stream needs to always be the video stream here or not
        container.seek(seek_offset, any_frame=False, backward=True, stream=stream)
    except av.AVError as e:
        print(f"[Warning] Error while seeking video {filename}: {e}")
        # Release the container on this path too, and hand back an empty array rather
        # than a list: the caller passes the result to torch.from_numpy.
        container.close()
        return video_frames[:0].copy()

    # == main ==
    buffer_count = 0
    frames_pts = []
    cnt = 0
    try:
        for frame in container.decode(**stream_name):
            frames_pts.append(frame.pts)
            video_frames[cnt] = frame.to_rgb().to_ndarray()
            cnt += 1
            # never write past the pre-allocated buffer
            if cnt >= len(video_frames):
                break
            if frame.pts >= end_offset:
                # keep decoding a few extra frames so out-of-order pts are still captured
                if should_buffer and buffer_count < max_buffer_size:
                    buffer_count += 1
                    continue
                break
    except av.AVError as e:
        print(f"[Warning] Error while reading video {filename}: {e}")

    # garbage collection for thread leakage
    container.close()
    del container
    # NOTE: manually garbage collect to close pyav threads
    gc.collect()

    # ensure that the results are sorted wrt the pts
    # NOTE: here we assume frames_pts is sorted
    start_ptr = 0
    end_ptr = cnt
    while start_ptr < end_ptr and frames_pts[start_ptr] < start_offset:
        start_ptr += 1
    while start_ptr < end_ptr and frames_pts[end_ptr - 1] > end_offset:
        end_ptr -= 1
    if start_offset > 0 and start_offset not in frames_pts[start_ptr:end_ptr]:
        # if there is no frame that exactly matches the pts of start_offset
        # add the last frame smaller than start_offset, to guarantee that
        # we will have all the necessary data. This is most useful for audio
        if start_ptr > 0:
            start_ptr -= 1
    result = video_frames[start_ptr:end_ptr].copy()
    return result


def read_video_cv2(video_path):
    """Decode every frame of a video with OpenCV.

    Args:
        video_path: path handed to ``cv2.VideoCapture``.

    Returns:
        tuple: ``(frames, vinfo)``. ``frames`` is a tensor laid out as [T, C=3, H, W]
        with channels in RGB order; ``vinfo`` is ``{"video_fps": fps}`` as reported by
        ``cv2.CAP_PROP_FPS``.

    Raises:
        ValueError: if the video cannot be opened or no frame could be decoded.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise ValueError(f"Unable to open video: {video_path}")

        vinfo = {
            "video_fps": cap.get(cv2.CAP_PROP_FPS),
        }

        frames = []
        while True:
            # Read a frame from the video
            ret, frame = cap.read()

            # If frame is not read correctly, stop decoding
            if not ret:
                break

            frames.append(frame[:, :, ::-1])  # BGR to RGB
    finally:
        # Always release the capture, including when opening failed.
        # No GUI calls (waitKey / destroyAllWindows) are made: this reader never
        # creates a window, so they are not needed here.
        cap.release()

    if not frames:
        raise ValueError(f"No frames could be decoded from video: {video_path}")

    frames = np.stack(frames)
    frames = torch.from_numpy(frames)  # [T, H, W, C=3]
    frames = frames.permute(0, 3, 1, 2)
    return frames, vinfo


def read_video(video_path, backend="av"):
    """Read a video with the selected backend and return ``(frames, info)``.

    ``backend="cv2"`` delegates to ``read_video_cv2``; ``backend="av"`` delegates to
    ``read_video_av`` with ``pts_unit="sec"`` and ``output_format="TCHW"`` and drops
    the audio frames it returns. Any other backend raises ``ValueError``.
    """
    if backend == "cv2":
        frames, meta = read_video_cv2(video_path)
        return frames, meta

    if backend == "av":
        frames, _audio, meta = read_video_av(filename=video_path, pts_unit="sec", output_format="TCHW")
        return frames, meta

    raise ValueError


================================================
FILE: Open-Sora/build/lib/opensora/datasets/sampler.py
================================================
from collections import OrderedDict, defaultdict
from pprint import pformat
from typing import Iterator, List, Optional

import numpy as np
import torch
import torch.distributed as dist
from torch.utils.data import Dataset, DistributedSampler

from opensora.utils.misc import format_numel_str, get_logger

from .aspect import get_num_pixels
from .bucket import Bucket
from .datasets import VariableVideoTextDataset


# use pandarallel to accelerate bucket processing
# NOTE: pandarallel should only access local variables
def apply(data, method=None, frame_interval=None, seed=None, num_bucket=None):
    """Call ``method`` for one dataset row with a per-row seed.

    ``data`` is indexed by ``"num_frames"``, ``"height"``, ``"width"`` and ``"id"``.
    The seed handed to ``method`` is ``seed + data["id"] * num_bucket``.
    """
    n_frames = data["num_frames"]
    row_height = data["height"]
    row_width = data["width"]
    row_seed = seed + data["id"] * num_bucket
    return method(n_frames, row_height, row_width, frame_interval, row_seed)


class StatefulDistributedSampler(DistributedSampler):
    """``DistributedSampler`` that can skip the first ``start_index`` indices of an epoch."""

    def __init__(
        self,
        dataset: Dataset,
        num_replicas: Optional[int] = None,
        rank: Optional[int] = None,
        shuffle: bool = True,
        seed: int = 0,
        drop_last: bool = False,
    ) -> None:
        super().__init__(dataset, num_replicas, rank, shuffle, seed, drop_last)
        # number of leading per-rank indices to drop on the next iteration
        self.start_index: int = 0

    def __iter__(self) -> Iterator:
        """Iterate over this rank's indices, leaving out the first ``start_index``."""
        remaining = list(super().__iter__())[self.start_index :]
        return iter(remaining)

    def __len__(self) -> int:
        """Number of indices still to be produced for this rank."""
        return self.num_samples - self.start_index

    def reset(self) -> None:
        """Start the next iteration from the beginning again."""
        self.start_index = 0

    def state_dict(self, step) -> dict:
        """Return a state in which ``step`` is recorded as the start index."""
        return {"start_index": step}

    def load_state_dict(self, state_dict: dict) -> None:
        """Copy every entry of ``state_dict`` onto the sampler's attributes."""
        self.__dict__.update(state_dict)


class VariableVideoBatchSampler(DistributedSampler):
    """Batch sampler that groups samples into buckets and yields one micro-batch per step.

    Every sample is assigned a bucket id by ``Bucket.get_bucket_id``; all samples of a
    micro-batch come from the same bucket. Each yielded micro-batch is a list of strings
    ``"{index}-{t}-{h}-{w}"`` where ``t, h, w`` come from ``Bucket.get_thw``.
    """

    def __init__(
        self,
        dataset: VariableVideoTextDataset,
        bucket_config: dict,
        num_replicas: Optional[int] = None,
        rank: Optional[int] = None,
        shuffle: bool = True,
        seed: int = 0,
        drop_last: bool = False,
        verbose: bool = False,
        num_bucket_build_workers: int = 1,
    ) -> None:
        super().__init__(
            dataset=dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle, seed=seed, drop_last=drop_last
        )
        self.dataset = dataset
        self.bucket = Bucket(bucket_config)
        self.verbose = verbose
        # number of micro-batches (summed over all ranks) already consumed in this epoch
        self.last_micro_batch_access_index = 0
        # filled in by _print_bucket_info
        self.approximate_num_batch = None

        # bucket grouping computed by get_num_batch, reused once by the next __iter__
        self._get_num_batch_cached_bucket_sample_dict = None
        self.num_bucket_build_workers = num_bucket_build_workers

    def __iter__(self) -> Iterator[List[int]]:
        """Yield this rank's micro-batches for the current epoch, then reset the progress counter."""
        if self._get_num_batch_cached_bucket_sample_dict is not None:
            bucket_sample_dict = self._get_num_batch_cached_bucket_sample_dict
            self._get_num_batch_cached_bucket_sample_dict = None
        else:
            bucket_sample_dict = self.group_by_bucket()
            if self.verbose:
                self._print_bucket_info(bucket_sample_dict)

        g = torch.Generator()
        g.manual_seed(self.seed + self.epoch)
        bucket_micro_batch_count = OrderedDict()
        bucket_last_consumed = OrderedDict()

        # process the samples
        for bucket_id, data_list in bucket_sample_dict.items():
            # handle droplast
            bs_per_gpu = self.bucket.get_batch_size(bucket_id)
            remainder = len(data_list) % bs_per_gpu

            if remainder > 0:
                if not self.drop_last:
                    # if there is remainder, we pad to make it divisible
                    data_list += data_list[: bs_per_gpu - remainder]
                else:
                    # we just drop the remainder to make it divisible
                    data_list = data_list[:-remainder]
            bucket_sample_dict[bucket_id] = data_list

            # handle shuffle
            if self.shuffle:
                data_indices = torch.randperm(len(data_list), generator=g).tolist()
                data_list = [data_list[i] for i in data_indices]
                bucket_sample_dict[bucket_id] = data_list

            # compute how many micro-batches each bucket has
            num_micro_batches = len(data_list) // bs_per_gpu
            bucket_micro_batch_count[bucket_id] = num_micro_batches

        # compute the bucket access order
        # each bucket may have more than one batch of data
        # thus bucket_id may appear more than 1 time
        bucket_id_access_order = []
        for bucket_id, num_micro_batch in bucket_micro_batch_count.items():
            bucket_id_access_order.extend([bucket_id] * num_micro_batch)

        # randomize the access order
        if self.shuffle:
            bucket_id_access_order_indices = torch.randperm(len(bucket_id_access_order), generator=g).tolist()
            bucket_id_access_order = [bucket_id_access_order[i] for i in bucket_id_access_order_indices]

        # make the number of bucket accesses divisible by dp size
        remainder = len(bucket_id_access_order) % self.num_replicas
        if remainder > 0:
            if self.drop_last:
                bucket_id_access_order = bucket_id_access_order[: len(bucket_id_access_order) - remainder]
            else:
                bucket_id_access_order += bucket_id_access_order[: self.num_replicas - remainder]

        # prepare each batch from its bucket
        # according to the predefined bucket access order
        num_iters = len(bucket_id_access_order) // self.num_replicas
        start_iter_idx = self.last_micro_batch_access_index // self.num_replicas

        # re-compute the micro-batch consumption
        # this is useful when resuming from a state dict with a different number of GPUs
        self.last_micro_batch_access_index = start_iter_idx * self.num_replicas
        for i in range(self.last_micro_batch_access_index):
            bucket_id = bucket_id_access_order[i]
            bucket_bs = self.bucket.get_batch_size(bucket_id)
            bucket_last_consumed[bucket_id] = bucket_last_consumed.get(bucket_id, 0) + bucket_bs

        for i in range(start_iter_idx, num_iters):
            bucket_access_list = bucket_id_access_order[i * self.num_replicas : (i + 1) * self.num_replicas]
            self.last_micro_batch_access_index += self.num_replicas

            # compute the data samples consumed by each access
            bucket_access_boundaries = []
            for bucket_id in bucket_access_list:
                bucket_bs = self.bucket.get_batch_size(bucket_id)
                last_consumed_index = bucket_last_consumed.get(bucket_id, 0)
                bucket_access_boundaries.append([last_consumed_index, last_consumed_index + bucket_bs])

                # update consumption
                bucket_last_consumed[bucket_id] = last_consumed_index + bucket_bs

            # compute the range of data accessed by each GPU
            bucket_id = bucket_access_list[self.rank]
            boundary = bucket_access_boundaries[self.rank]
            cur_micro_batch = bucket_sample_dict[bucket_id][boundary[0] : boundary[1]]

            # encode t, h, w into the sample index
            real_t, real_h, real_w = self.bucket.get_thw(bucket_id)
            cur_micro_batch = [f"{idx}-{real_t}-{real_h}-{real_w}" for idx in cur_micro_batch]
            yield cur_micro_batch

        self.reset()

    def __len__(self) -> int:
        """Approximate number of micro-batches per rank (total batches // world size)."""
        return self.get_num_batch() // dist.get_world_size()

    def group_by_bucket(self) -> dict:
        """Map every bucket id to the list of dataset indices assigned to it.

        Bucket ids are computed in parallel with pandarallel; samples whose bucket id is
        ``None`` are left out.
        """
        bucket_sample_dict = OrderedDict()

        from pandarallel import pandarallel

        pandarallel.initialize(nb_workers=self.num_bucket_build_workers, progress_bar=False)
        get_logger().info("Building buckets...")
        bucket_ids = self.dataset.data.parallel_apply(
            apply,
            axis=1,
            method=self.bucket.get_bucket_id,
            frame_interval=self.dataset.frame_interval,
            seed=self.seed + self.epoch,
            num_bucket=self.bucket.num_bucket,
        )

        # group by bucket
        # each data sample is put into a bucket with a similar image/video size
        for i in range(len(self.dataset)):
            bucket_id = bucket_ids[i]
            if bucket_id is None:
                continue
            if bucket_id not in bucket_sample_dict:
                bucket_sample_dict[bucket_id] = []
            bucket_sample_dict[bucket_id].append(i)
        return bucket_sample_dict

    def get_num_batch(self) -> int:
        """Build the buckets, cache them for the next ``__iter__`` and return the batch count."""
        bucket_sample_dict = self.group_by_bucket()
        self._get_num_batch_cached_bucket_sample_dict = bucket_sample_dict

        # calculate the number of batches
        # The statistics are always collected (logging itself stays gated on verbose):
        # approximate_num_batch is only set there, and returning None would break __len__.
        self._print_bucket_info(bucket_sample_dict)
        return self.approximate_num_batch

    def _print_bucket_info(self, bucket_sample_dict: dict) -> None:
        """Count samples/batches per bucket, store the total batch count and log if verbose."""
        # collect statistics
        total_samples = 0
        total_batch = 0
        num_aspect_dict = defaultdict(lambda: [0, 0])
        num_hwt_dict = defaultdict(lambda: [0, 0])
        for k, v in bucket_sample_dict.items():
            size = len(v)
            num_batch = size // self.bucket.get_batch_size(k[:-1])

            total_samples += size
            total_batch += num_batch

            # the last element of the bucket id is used as the aspect-ratio key,
            # the leading elements as the resolution / frame-count key
            num_aspect_dict[k[-1]][0] += size
            num_aspect_dict[k[-1]][1] += num_batch
            num_hwt_dict[k[:-1]][0] += size
            num_hwt_dict[k[:-1]][1] += num_batch

        self.approximate_num_batch = total_batch
        if not self.verbose:
            # nothing to log; skip the sorting and the rank query
            return

        # sort
        num_aspect_dict = dict(sorted(num_aspect_dict.items(), key=lambda x: x[0]))
        num_hwt_dict = dict(
            sorted(num_hwt_dict.items(), key=lambda x: (get_num_pixels(x[0][0]), x[0][1]), reverse=True)
        )
        num_hwt_img_dict = {k: v for k, v in num_hwt_dict.items() if k[1] == 1}
        num_hwt_vid_dict = {k: v for k, v in num_hwt_dict.items() if k[1] > 1}

        # log
        if dist.get_rank() == 0:
            get_logger().info("Bucket Info:")
            get_logger().info(
                "Bucket [#sample, #batch] by aspect ratio:\n%s", pformat(num_aspect_dict, sort_dicts=False)
            )
            get_logger().info(
                "Image Bucket [#sample, #batch] by HxWxT:\n%s", pformat(num_hwt_img_dict, sort_dicts=False)
            )
            get_logger().info(
                "Video Bucket [#sample, #batch] by HxWxT:\n%s", pformat(num_hwt_vid_dict, sort_dicts=False)
            )
            get_logger().info(
                "#training batch: %s, #training sample: %s, #non empty bucket: %s",
                format_numel_str(total_batch),
                format_numel_str(total_samples),
                len(bucket_sample_dict),
            )

    def reset(self):
        """Forget the progress made inside the current epoch."""
        self.last_micro_batch_access_index = 0

    def state_dict(self, num_steps: int) -> dict:
        """Return the resumable state after ``num_steps`` executed steps."""
        # the last_micro_batch_access_index in the __iter__ is often
        # not accurate during multi-workers and data prefetching
        # thus, we need the user to pass the actual steps which have been executed
        # to calculate the correct last_micro_batch_access_index
        return {"seed": self.seed, "epoch": self.epoch, "last_micro_batch_access_index": num_steps * self.num_replicas}

    def load_state_dict(self, state_dict: dict) -> None:
        """Copy every entry of ``state_dict`` onto the sampler's attributes."""
        self.__dict__.update(state_dict)


class BatchDistributedSampler(DistributedSampler):
    """
    Sampler for BatchDataset: every rank reads one contiguous range of indices.
    With len_buffer == 5, num_buffers == 6 and 3 GPUs:
           | buffer {i}          | buffer {i+1}
    ------ | ------------------- | -------------------
    rank 0 |  0,  1,  2,  3,  4, |  5,  6,  7,  8,  9
    rank 1 | 10, 11, 12, 13, 14, | 15, 16, 17, 18, 19
    rank 2 | 20, 21, 22, 23, 24, | 25, 26, 27, 28, 29
    """

    def __init__(self, dataset: Dataset, **kwargs):
        super().__init__(dataset, **kwargs)
        # offset inside this rank's range at which iteration starts
        self.start_index = 0

    def __iter__(self):
        """Iterate over this rank's indices, starting ``start_index`` positions in."""
        buffers_per_rank = self.dataset.num_buffers // self.num_replicas
        samples_per_rank = self.dataset.len_buffer * buffers_per_rank
        rank_offset = self.rank * samples_per_rank

        local_positions = np.arange(self.start_index, samples_per_rank)
        return iter((local_positions + rank_offset).tolist())

    def reset(self):
        """Start the next iteration from the beginning of the rank's range."""
        self.start_index = 0

    def state_dict(self, step) -> dict:
        """Return a state in which ``step`` is recorded as the start index."""
        return {"start_index": step}

    def load_state_dict(self, state_dict: dict):
        """Resume one position after the stored ``start_index``."""
        self.start_index = state_dict["start_index"] + 1


================================================
FILE: Open-Sora/build/lib/opensora/datasets/utils.py
================================================
import os
import re

import numpy as np
import pandas as pd
import requests
import torch
import torchvision
import torchvision.transforms as transforms
from PIL import Image
from torchvision.datasets.folder import IMG_EXTENSIONS, pil_loader
from torchvision.io import write_video
from torchvision.utils import save_image

from . import video_transforms

# Lower-case file extensions (with the leading dot) that are treated as videos.
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")

# Case-insensitive pattern matching a complete http(s)/ftp(s) URL: scheme, then a
# domain name, "localhost" or a dotted IPv4-style address, an optional port and an
# optional path/query part.
regex = re.compile(
    r"^(?:http|ftp)s?://"  # http:// or https://
    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain...
    r"localhost|"  # localhost...
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
    r"(?::\d+)?"  # optional port
    r"(?:/?|[/?]\S+)$",
    re.IGNORECASE,
)


def is_img(path):
    """Return True when the (lower-cased) extension of ``path`` is in ``IMG_EXTENSIONS``."""
    _, suffix = os.path.splitext(path)
    return suffix.lower() in IMG_EXTENSIONS


def is_vid(path):
    """Return True when the (lower-cased) extension of ``path`` is in ``VID_EXTENSIONS``."""
    _, suffix = os.path.splitext(path)
    return suffix.lower() in VID_EXTENSIONS


def is_url(url):
    """Return True when ``url`` matches the module-level URL pattern ``regex``."""
    matched = regex.match(url)
    return matched is not None


def read_file(input_path):
    """Load a ``.csv`` or ``.parquet`` file with pandas.

    Raises:
        NotImplementedError: for any other file name suffix.
    """
    readers = (
        (".csv", pd.read_csv),
        (".parquet", pd.read_parquet),
    )
    for suffix, reader in readers:
        if input_path.endswith(suffix):
            return reader(input_path)
    raise NotImplementedError(f"Unsupported file format: {input_path}")


def download_url(input_path, output_dir="cache", timeout=60):
    """Download ``input_path`` into ``output_dir`` and return the local file path.

    Args:
        input_path: URL to fetch; its basename becomes the local file name.
        output_dir: directory the file is written to (created if missing).
        timeout: seconds passed to ``requests.get`` so a stalled server cannot
            block the caller forever.

    Returns:
        Path of the downloaded file.

    Raises:
        requests.HTTPError: if the server answers with an error status. Without this
            check the error page would be written to disk as if it were the media file.
    """
    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.basename(input_path)
    output_path = os.path.join(output_dir, base_name)
    response = requests.get(input_path, timeout=timeout)
    response.raise_for_status()
    with open(output_path, "wb") as handler:
        handler.write(response.content)
    print(f"URL {input_path} downloaded to {output_path}")
    return output_path


def temporal_random_crop(vframes, num_frames, frame_interval):
    """Pick ``num_frames`` frames from a random temporal window of ``vframes``.

    The window is drawn by ``video_transforms.TemporalRandomCrop`` with a length of
    ``num_frames * frame_interval``; the frames are then taken at evenly spaced
    integer positions inside it.
    """
    window_sampler = video_transforms.TemporalRandomCrop(num_frames * frame_interval)
    first, last = window_sampler(len(vframes))
    assert last - first >= num_frames, f"Not enough frames to sample, {last} - {first} < {num_frames}"
    picked = np.linspace(first, last - 1, num_frames, dtype=int)
    return vframes[picked]


def get_transforms_video(name="center", image_size=(256, 256)):
    """Build the video transform pipeline called ``name``.

    ``None`` returns ``None``. ``"center"`` (square ``image_size`` only) and
    ``"resize_crop"`` both convert to a float tensor, apply their spatial crop and
    normalize with mean/std 0.5. Any other name raises ``NotImplementedError``.
    """
    if name is None:
        return None

    if name == "center":
        assert image_size[0] == image_size[1], "image_size must be square for center crop"
        # video_transforms.RandomHorizontalFlipVideo() is intentionally not part of the pipeline
        spatial_cls, spatial_arg = video_transforms.UCFCenterCropVideo, image_size[0]
    elif name == "resize_crop":
        spatial_cls, spatial_arg = video_transforms.ResizeCrop, image_size
    else:
        raise NotImplementedError(f"Transform {name} not implemented")

    return transforms.Compose(
        [
            video_transforms.ToTensorVideo(),  # TCHW
            spatial_cls(spatial_arg),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ]
    )


def get_transforms_image(name="center", image_size=(256, 256)):
    """Build the image transform pipeline called ``name``.

    ``None`` returns ``None``. ``"center"`` (square ``image_size`` only) crops with
    ``center_crop_arr``; ``"resize_crop"`` crops with ``resize_crop_to_fill``. Both
    then convert to a tensor and normalize with mean/std 0.5. Any other name raises
    ``NotImplementedError``.
    """
    if name is None:
        return None

    if name == "center":
        assert image_size[0] == image_size[1], "Image size must be square for center crop"
        # transforms.RandomHorizontalFlip() is intentionally not part of the pipeline
        spatial_fn = lambda pil_image: center_crop_arr(pil_image, image_size[0])  # noqa: E731
    elif name == "resize_crop":
        spatial_fn = lambda pil_image: resize_crop_to_fill(pil_image, image_size)  # noqa: E731
    else:
        raise NotImplementedError(f"Transform {name} not implemented")

    return transforms.Compose(
        [
            transforms.Lambda(spatial_fn),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ]
    )


def read_image_from_path(path, transform=None, transform_name="center", num_frames=1, image_size=(256, 256)):
    """Load an image, transform it and repeat it ``num_frames`` times as a clip.

    When ``transform`` is ``None`` the pipeline from ``get_transforms_image`` is used.
    The transformed image is stacked ``num_frames`` times along a new leading axis and
    the first two axes are then swapped.
    """
    pil_image = pil_loader(path)
    if transform is None:
        transform = get_transforms_image(image_size=image_size, name=transform_name)
    frame = transform(pil_image)
    clip = frame.unsqueeze(0).repeat(num_frames, 1, 1, 1)
    return clip.permute(1, 0, 2, 3)


def read_video_from_path(path, transform=None, transform_name="center", image_size=(256, 256)):
    """Read a video with torchvision, transform it and swap its first two axes.

    Frames are requested in ``TCHW`` layout; when ``transform`` is ``None`` the
    pipeline from ``get_transforms_video`` is used.
    """
    frames, _audio, _meta = torchvision.io.read_video(filename=path, pts_unit="sec", output_format="TCHW")
    if transform is None:
        transform = get_transforms_video(image_size=image_size, name=transform_name)
    transformed = transform(frames)  # T C H W
    return transformed.permute(1, 0, 2, 3)


def read_from_path(path, image_size, transform_name="center"):
    """Load an image or a video from a local path or a URL.

    URLs are downloaded first. Files with a video extension go through
    ``read_video_from_path``; everything else must have an image extension and goes
    through ``read_image_from_path``.
    """
    if is_url(path):
        path = download_url(path)

    ext = os.path.splitext(path)[-1].lower()
    if ext in VID_EXTENSIONS:
        return read_video_from_path(path, image_size=image_size, transform_name=transform_name)

    assert ext in IMG_EXTENSIONS, f"Unsupported file format: {ext}"
    return read_image_from_path(path, image_size=image_size, transform_name=transform_name)


def save_sample(x, save_path=None, fps=8, normalize=True, value_range=(-1, 1), force_video=False, verbose=True):
    """Save a sample as a ``.png`` image (single frame) or an ``.mp4`` video.

    Args:
        x (Tensor): shape [C, T, H, W]. It is not modified.
        save_path (str): output path without extension; required despite the default.
        fps (int): frame rate used when writing a video.
        normalize (bool): map ``value_range`` to [0, 1] before saving.
        value_range (tuple): ``(low, high)`` range of the values in ``x``.
        force_video (bool): write a video even when ``T == 1``.
        verbose (bool): print the final path.

    Returns:
        str: ``save_path`` with the extension that was appended.

    Raises:
        TypeError: if ``save_path`` is ``None``.
    """
    assert x.ndim == 4
    if save_path is None:
        raise TypeError("save_path must be a path prefix (str), got None")

    if not force_video and x.shape[1] == 1:  # T = 1: save as image
        save_path += ".png"
        x = x.squeeze(1)
        save_image([x], save_path, normalize=normalize, value_range=value_range)
    else:
        save_path += ".mp4"
        if normalize:
            low, high = value_range
            # clamp() returns a new tensor, so the in-place ops below never touch the
            # caller's data (saving the same tensor twice gives the same result)
            x = x.clamp(min=low, max=high)
            x.sub_(low).div_(max(high - low, 1e-5))

        x = x.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 3, 0).to("cpu", torch.uint8)
        write_video(save_path, x, fps=fps, video_codec="h264")
    if verbose:
        print(f"Saved to {save_path}")
    return save_path


def center_crop_arr(pil_image, image_size):
    """
    Resize so the short side equals ``image_size`` and cut out the central square.

    Center cropping implementation from ADM.
    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
    """
    # halve with a box filter while the short side is still at least twice the target
    while min(*pil_image.size) >= 2 * image_size:
        halved = tuple(side // 2 for side in pil_image.size)
        pil_image = pil_image.resize(halved, resample=Image.BOX)

    scale = image_size / min(*pil_image.size)
    scaled = tuple(round(side * scale) for side in pil_image.size)
    pil_image = pil_image.resize(scaled, resample=Image.BICUBIC)

    pixels = np.array(pil_image)
    top = (pixels.shape[0] - image_size) // 2
    left = (pixels.shape[1] - image_size) // 2
    window = pixels[top : top + image_size, left : left + image_size]
    return Image.fromarray(window)


def resize_crop_to_fill(pil_image, image_size):
    """Resize a PIL image so it covers ``image_size`` (height, width), then center-crop it.

    The larger of the two scale factors is used so that both target sides are covered;
    the overflowing side is cropped symmetrically.
    """
    src_w, src_h = pil_image.size  # PIL is (W, H)
    out_h, out_w = image_size
    ratio_h, ratio_w = out_h / src_h, out_w / src_w

    if ratio_h > ratio_w:
        # height is the limiting side: match it and crop the width
        new_h, new_w = out_h, round(src_w * ratio_h)
        resized = pil_image.resize((new_w, new_h), Image.BICUBIC)
        top, left = 0, int(round((new_w - out_w) / 2.0))
    else:
        # width is the limiting side: match it and crop the height
        new_h, new_w = round(src_h * ratio_w), out_w
        resized = pil_image.resize((new_w, new_h), Image.BICUBIC)
        top, left = int(round((new_h - out_h) / 2.0)), 0

    pixels = np.array(resized)
    assert top + out_h <= pixels.shape[0] and left + out_w <= pixels.shape[1]
    return Image.fromarray(pixels[top : top + out_h, left : left + out_w])


================================================
FILE: Open-Sora/build/lib/opensora/datasets/video_transforms.py
================================================
# Copyright 2024 Vchitect/Latte

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Modified from Latte

# - This file is adapted from https://github.com/Vchitect/Latte/blob/main/datasets/video_transforms.py


import numbers
import random

import numpy as np
import torch


def _is_tensor_video_clip(clip):
    """Validate that ``clip`` is a 4D tensor.

    Returns True on success; raises ``TypeError`` for non-tensors and ``ValueError``
    for tensors that are not 4-dimensional.
    """
    is_tensor = torch.is_tensor(clip)
    if not is_tensor:
        raise TypeError("clip should be Tensor. Got %s" % type(clip))

    if clip.ndimension() != 4:
        raise ValueError("clip should be 4D. Got %dD" % clip.dim())

    return True


def crop(clip, i, j, h, w):
    """
    Cut the window of height ``h`` and width ``w`` whose top-left corner is ``(i, j)``
    out of the last two axes of ``clip``.

    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
    """
    rank = len(clip.size())
    if rank != 4:
        raise ValueError("clip should be a 4D tensor")
    rows = slice(i, i + h)
    cols = slice(j, j + w)
    return clip[..., rows, cols]


def resize(clip, target_size, interpolation_mode):
    """Interpolate ``clip`` to ``target_size`` (height, width) with ``align_corners=False``."""
    if len(target_size) == 2:
        return torch.nn.functional.interpolate(
            clip,
            size=target_size,
            mode=interpolation_mode,
            align_corners=False,
        )
    raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")


def resize_scale(clip, target_size, interpolation_mode):
    """Rescale ``clip`` by ``target_size[0] / short_side``, keeping the aspect ratio."""
    if len(target_size) != 2:
        raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
    height = clip.size(-2)
    width = clip.size(-1)
    factor = target_size[0] / min(height, width)
    return torch.nn.functional.interpolate(
        clip,
        scale_factor=factor,
        mode=interpolation_mode,
        align_corners=False,
    )


def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
    """
    Crop a window out of the video clip, then resize the window.
    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        i (int): row of the upper left corner of the window.
        j (int): column of the upper left corner of the window.
        h (int): Height of the cropped region.
        w (int): Width of the cropped region.
        size (tuple(int, int)): height and width of resized clip
        interpolation_mode (str): mode forwarded to ``resize``.
    Returns:
        clip (torch.tensor): Resized and cropped clip. Size is (T, C, H, W)
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    window = crop(clip, i, j, h, w)
    return resize(window, size, interpolation_mode)


def center_crop(clip, crop_size):
    """Cut the central ``crop_size`` (height, width) window out of ``clip``.

    Raises ``ValueError`` when the clip is smaller than the requested window.
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")

    src_h, src_w = clip.size(-2), clip.size(-1)
    out_h, out_w = crop_size
    if src_h < out_h or src_w < out_w:
        raise ValueError("height and width must be no smaller than crop_size")

    top = int(round((src_h - out_h) / 2.0))
    left = int(round((src_w - out_w) / 2.0))
    return crop(clip, top, left, out_h, out_w)


def center_crop_using_short_edge(clip):
    """Cut the central square whose side equals the shorter of height and width."""
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")

    height, width = clip.size(-2), clip.size(-1)
    side = min(height, width)
    # the offset along the short axis is always 0 because that axis already equals `side`
    top = int(round((height - side) / 2.0))
    left = int(round((width - side) / 2.0))
    return crop(clip, top, left, side, side)


def resize_crop_to_fill(clip, target_size):
    """Resize ``clip`` so it covers ``target_size`` and center-crop the overflow.

    Args:
        clip (torch.tensor): Video clip. Size is (T, C, H, W)
        target_size: ``(height, width)`` of the result.

    Returns:
        torch.tensor: clip whose last two axes are exactly ``target_size``.
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    h, w = clip.size(-2), clip.size(-1)
    th, tw = target_size[0], target_size[1]
    rh, rw = th / h, tw / w
    if rh > rw:
        # height is the limiting side: match it and crop the width
        sh, sw = th, round(w * rh)
        clip = resize(clip, (sh, sw), "bilinear")
        i = 0
        # round the half-difference (not the difference) so the offset matches
        # center_crop instead of always truncating
        j = int(round((sw - tw) / 2.0))
    else:
        # width is the limiting side: match it and crop the height
        sh, sw = round(h * rw), tw
        clip = resize(clip, (sh, sw), "bilinear")
        i = int(round((sh - th) / 2.0))
        j = 0
    assert i + th <= clip.size(-2) and j + tw <= clip.size(-1)
    return crop(clip, i, j, th, tw)


def random_shift_crop(clip):
    """
    Cut a square with the short edge as its side at a random position along the long edge
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")

    height, width = clip.size(-2), clip.size(-1)
    side = height if height <= width else width

    # one draw per axis; along the short axis the only possible offset is 0
    top = torch.randint(0, height - side + 1, size=(1,)).item()
    left = torch.randint(0, width - side + 1, size=(1,)).item()
    return crop(clip, top, left, side, side)


def to_tensor(clip):
    """
    Convert a uint8 clip to float and divide its values by 255.0.
    The axis order is left unchanged.
    Args:
        clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
    Return:
        clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
    """
    _is_tensor_video_clip(clip)
    if clip.dtype != torch.uint8:
        raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
    as_float = clip.float()
    return as_float / 255.0


def normalize(clip, mean, std, inplace=False):
    """
    Subtract the per-channel mean and divide by the per-channel std.
    Args:
        clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
        mean (tuple): pixel RGB mean. Size is (3)
        std (tuple): pixel standard deviation. Size is (3)
        inplace (bool): modify ``clip`` itself instead of a copy.
    Returns:
        normalized clip (torch.tensor): Size is (T, C, H, W)
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    if not inplace:
        clip = clip.clone()
    mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
    std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
    # Shape (C, 1, 1) broadcasts along the channel axis of a (T, C, H, W) clip.
    # A (C, 1, 1, 1) shape would line the statistics up with the time axis instead.
    clip.sub_(mean[:, None, None]).div_(std[:, None, None])
    return clip


def hflip(clip):
    """
    Mirror a clip left-to-right by reversing its last dimension.
    Args:
        clip (torch.tensor): Video clip to be flipped. Size is (T, C, H, W)
    Returns:
        flipped clip (torch.tensor): Size is (T, C, H, W)
    Raises:
        ValueError: if ``_is_tensor_video_clip`` rejects the clip.
    """
    if _is_tensor_video_clip(clip):
        return torch.flip(clip, dims=(-1,))
    raise ValueError("clip should be a 4D torch.tensor")


class ResizeCrop:
    """
    Callable wrapper around ``resize_crop_to_fill`` with a fixed target size.
    Args:
        size (number or sequence): a single number is expanded to the square
            (int(size), int(size)); anything else is stored as given.
    """

    def __init__(self, size):
        is_scalar = isinstance(size, numbers.Number)
        self.size = (int(size), int(size)) if is_scalar else size

    def __call__(self, clip):
        return resize_crop_to_fill(clip, self.size)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size})"


class RandomCropVideo:
    """
    Crop a window of a fixed size at a random spatial position.
    Args:
        size (number or sequence): a single number is expanded to the square
            (int(size), int(size)); anything else is stored as given and
            unpacked as (height, width).
    """

    def __init__(self, size):
        is_scalar = isinstance(size, numbers.Number)
        self.size = (int(size), int(size)) if is_scalar else size

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: randomly cropped video clip.
                size is (T, C, OH, OW)
        """
        top, left, out_h, out_w = self.get_params(clip)
        return crop(clip, top, left, out_h, out_w)

    def get_params(self, clip):
        """
        Pick the crop window for ``clip``.
        Returns:
            tuple: (top, left, height, width) of the window.
        Raises:
            ValueError: if the requested size exceeds the clip in either
                spatial dimension.
        """
        h, w = clip.shape[-2:]
        th, tw = self.size

        if th > h or tw > w:
            raise ValueError(f"Required crop size {(th, tw)} is larger than input image size {(h, w)}")

        # Exact fit: nothing to randomise, and no random number is drawn.
        if (h, w) == (th, tw):
            return 0, 0, h, w

        top = torch.randint(0, h - th + 1, size=(1,)).item()
        left = torch.randint(0, w - tw + 1, size=(1,)).item()
        return top, left, th, tw

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size})"


class CenterCropResizeVideo:
    """
    First use the short side for cropping length,
    center crop video, then resize to the specified size
    Args:
        size (tuple or int): target (height, width); a non-tuple value is
            expanded to the square (size, size).
        interpolation_mode (str): forwarded to ``resize``.
    Raises:
        ValueError: if ``size`` is a tuple whose length is not 2.
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: scale resized / center cropped video clip.
                size is (T, C, crop_size, crop_size)
        """
        clip_center_crop = center_crop_using_short_edge(clip)
        clip_center_crop_resize = resize(
            clip_center_crop, target_size=self.size, interpolation_mode=self.interpolation_mode
        )
        return clip_center_crop_resize

    def __repr__(self) -> str:
        # The closing parenthesis was missing, leaving the repr unbalanced.
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"


class UCFCenterCropVideo:
    """
    First scale to the specified size in equal proportion to the short edge,
    then center cropping
    Args:
        size (tuple or int): target (height, width); a non-tuple value is
            expanded to the square (size, size).
        interpolation_mode (str): forwarded to ``resize_scale``.
    Raises:
        ValueError: if ``size`` is a tuple whose length is not 2.
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: scale resized / center cropped video clip.
                size is (T, C, crop_size, crop_size)
        """
        clip_resize = resize_scale(clip=clip, target_size=self.size, interpolation_mode=self.interpolation_mode)
        clip_center_crop = center_crop(clip_resize, self.size)
        return clip_center_crop

    def __repr__(self) -> str:
        # The closing parenthesis was missing, leaving the repr unbalanced.
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"


class KineticsRandomCropResizeVideo:
    """
    Slide along the long edge, with the short edge as crop size. And resize to the desired size.
    Args:
        size (tuple or int): target (height, width); a non-tuple value is
            expanded to the square (size, size).
        interpolation_mode (str): forwarded to ``resize``.
    Raises:
        ValueError: if ``size`` is a tuple whose length is not 2.
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Randomly square-crop the clip with ``random_shift_crop``, then resize
        the crop to ``self.size``.
        """
        clip_random_crop = random_shift_crop(clip)
        clip_resize = resize(clip_random_crop, self.size, self.interpolation_mode)
        return clip_resize

    def __repr__(self) -> str:
        # Added for parity with the other crop/resize transforms in this module.
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"


class CenterCropVideo:
    """
    Center crop a clip to a fixed size via ``center_crop``.
    Args:
        size (tuple or int): crop (height, width); a non-tuple value is
            expanded to the square (size, size).
        interpolation_mode (str): stored and shown in the repr; ``__call__``
            does not use it.
    Raises:
        ValueError: if ``size`` is a tuple whose length is not 2.
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: center cropped video clip.
                size is (T, C, crop_size, crop_size)
        """
        clip_center_crop = center_crop(clip, self.size)
        return clip_center_crop

    def __repr__(self) -> str:
        # The closing parenthesis was missing, leaving the repr unbalanced.
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"


class NormalizeVideo:
    """
    Callable wrapper around ``normalize``: subtract the mean and divide by the
    standard deviation.
    Args:
        mean (3-tuple): pixel RGB mean
        std (3-tuple): pixel RGB standard deviation
        inplace (boolean): whether do in-place normalization
    """

    def __init__(self, mean, std, inplace=False):
        self.mean, self.std, self.inplace = mean, std, inplace

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): video clip must be normalized. Size is (C, T, H, W)
        """
        normalized = normalize(clip, self.mean, self.std, self.inplace)
        return normalized

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"{cls_name}(mean={self.mean}, std={self.std}, inplace={self.inplace})"


class ToTensorVideo:
    """
    Callable wrapper around ``to_tensor``: convert a uint8 clip to float and
    divide its values by 255.0. Takes no configuration.
    """

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
        Return:
            clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
        """
        converted = to_tensor(clip)
        return converted

    def __repr__(self) -> str:
        return type(self).__name__


class RandomHorizontalFlipVideo:
    """
    Apply ``hflip`` to the clip with probability ``p``; otherwise return the
    clip untouched.
    Args:
        p (float): probability of the clip being flipped. Default value is 0.5
    """

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Size is (T, C, H, W)
        Return:
            clip (torch.tensor): Size is (T, C, H, W)
        """
        # Exactly one draw from the ``random`` module per call.
        should_flip = random.random() < self.p
        return hflip(clip) if should_flip else clip

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"{cls_name}(p={self.p})"


#  ------------------------------------------------------------
#  ---------------------  Sampling  ---------------------------
#  ------------------------------------------------------------
class TemporalRandomCrop(object):
    """Temporally crop the given frame indices at a random location.

    Args:
            size (int): Desired length of frames will be seen in the model.
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, total_frames):
        """
        Args:
            total_frames (int): number of frames available.
        Returns:
            tuple: (begin_index, end_index) with ``end_index`` exclusive.
                The window is ``size`` frames long, or ``total_frames`` long
                when fewer than ``size`` frames are available.
        """
        # random.randint is inclusive on both ends and end_index is exclusive,
        # so the last valid start is total_frames - size. The previous
        # "- 1" meant the final frame could never be part of the window.
        rand_end = max(0, total_frames - self.size)
        begin_index = random.randint(0, rand_end)
        end_index = min(begin_index + self.size, total_frames)
        return begin_index, end_index


if __name__ == "__main__":
    # Manual smoke test: read a sample video, sample a temporal window, run
    # the transform pipeline and write the result back as a video plus
    # individual PNG frames.
    import os

    import numpy as np
    import torchvision.io as io
    from torchvision import transforms
    from torchvision.utils import save_image

    vframes, aframes, info = io.read_video(filename="./v_Archery_g01_c03.avi", pts_unit="sec", output_format="TCHW")

    trans = transforms.Compose(
        [
            ToTensorVideo(),
            RandomHorizontalFlipVideo(),
            UCFCenterCropVideo(512),
            # torchvision's Normalize is used here instead of NormalizeVideo.
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ]
    )

    target_video_len = 32
    frame_interval = 1
    total_frames = len(vframes)
    print(total_frames)

    temporal_sample = TemporalRandomCrop(target_video_len * frame_interval)

    # Sampling video frames
    start_frame_ind, end_frame_ind = temporal_sample(total_frames)
    assert end_frame_ind - start_frame_ind >= target_video_len
    # end_frame_ind is exclusive, hence the "- 1".
    frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, target_video_len, dtype=int)
    print(frame_indice)

    select_vframes = vframes[frame_indice]
    print(select_vframes.shape)
    print(select_vframes.dtype)

    select_vframes_trans = trans(select_vframes)
    print(select_vframes_trans.shape)
    print(select_vframes_trans.dtype)

    # Undo the (0.5, 0.5) normalisation and go back to uint8 for writing.
    select_vframes_trans_int = ((select_vframes_trans * 0.5 + 0.5) * 255).to(dtype=torch.uint8)
    print(select_vframes_trans_int.dtype)
    print(select_vframes_trans_int.permute(0, 2, 3, 1).shape)

    io.write_video("./test.avi", select_vframes_trans_int.permute(0, 2, 3, 1), fps=8)

    # save_image does not create missing directories, so make sure the
    # output folder exists before writing the individual frames.
    frames_dir = "./test000"
    os.makedirs(frames_dir, exist_ok=True)
    for i in range(target_video_len):
        save_image(
            select_vframes_trans[i], os.path.join(frames_dir, "%04d.png" % i), normalize=True, value_range=(-1, 1)
        )


================================================
FILE: Open-Sora/build/lib/opensora/models/cache_functions/__init__.py
================================================
from .cache_cutfresh import cache_cutfresh
from .fresh_ratio_scheduler import fresh_ratio_scheduler
from .score_evaluate import score_evaluate
from .global_force_fresh import global_force_fresh
from .cache_cutfresh import cache_cutfresh
from .update_cache import update_cache
from .force_init import force_init
from .attention import cached_attention_forward
from .cache_init import cache_init

================================================
FILE: Open-Sora/build/lib/opensora/models/cache_functions/attention.py
================================================
# Besides, re-arrange the attention module
from torch.jit import Final
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Union
from xformers.ops.fmha.attn_bias import BlockDiagonalMask
def cached_attention_forward(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: Optional[Union[torch.Tensor, "BlockDiagonalMask"]] = None,
    p: float = 0.0,
    scale: Optional[float] = None
) -> "tuple[torch.Tensor, torch.Tensor]":
    """
    Plain (non-fused) scaled dot-product attention that also returns the
    attention map, so it can be cached and used for token scoring.

    Dimensions 1 and 2 of ``query``/``key``/``value`` are swapped before the
    matmuls (assumes the xformers layout (batch, seq, heads, head_dim) --
    TODO confirm against callers).

    Args:
        query, key, value: 4D tensors in the same layout.
        attn_bias: optional bias added to the attention logits. A plain tensor
            is added as-is (it must broadcast against the logits); any other
            object is expected to provide ``materialize(shape, dtype, device)``.
        p: dropout probability applied to the attention weights.
        scale: multiplier applied to ``query``; defaults to
            ``1 / sqrt(query.shape[-1])`` when ``None``.

    Returns:
        (output, attn_map): ``output`` is in the input layout; ``attn_map`` is
        the softmaxed attention averaged over dimension 1 of the transposed
        tensors, taken before dropout.
    """
    # Previously the argument was unconditionally overwritten, so a caller's
    # explicit scale was silently ignored.
    if scale is None:
        scale = 1.0 / query.shape[-1] ** 0.5
    query = query * scale
    query = query.transpose(1, 2)
    key = key.transpose(1, 2)
    value = value.transpose(1, 2)

    attn = torch.matmul(query, key.transpose(-2, -1))
    if attn_bias is not None:
        if not isinstance(attn_bias, torch.Tensor):
            # Structured bias objects must be turned into a dense tensor first.
            attn_bias = attn_bias.materialize(shape=attn.shape, dtype=attn.dtype, device=attn.device)
        attn = attn + attn_bias

    attn_map = attn.softmax(-1)
    # NOTE: F.dropout defaults to training=True, so dropout is active whenever p > 0.
    attn = F.dropout(attn_map, p)
    attn = torch.matmul(attn, value)

    return attn.transpose(1, 2).contiguous(), attn_map.mean(dim=1)

================================================
FILE: Open-Sora/build/lib/opensora/models/cache_functions/cache_cutfresh.py
================================================
from .fresh_ratio_scheduler import fresh_ratio_scheduler
from .score_evaluate import score_evaluate
#from .token_merge import token_merge
import torch
def cache_cutfresh(cache_dic, tokens, current):
    """
    Pick the highest-scoring tokens to recompute ("fresh" tokens) and update
    the staleness counters for the current module.

    indices: (B, N), the index tensor for the fresh tokens, tell where the 1st, 2nd, 3rd... tokens are
    fresh_indices: (B, fresh_ratio * N), top fresh_ratio cut for indices
    fresh_tokens: (B, fresh_ratio * N, D), the fresh tokens

    Args:
        cache_dic: cache state; ``cache_dic['cache_index']`` is updated in place.
        tokens: token tensor indexed along dim 1; the last dim is the feature dim.
        current: dict providing at least 'layer', 'module' and 'flag', plus
            whatever the ratio scheduler and score function read.

    Returns:
        (fresh_indices, fresh_tokens)

    Raises:
        ValueError: if ``current['module']`` is not 'mlp', 'attn' or
            'cross-attn'. The counters have already been updated at that point.
    """
    layer = current['layer']
    module = current['module']

    fresh_ratio = fresh_ratio_scheduler(cache_dic, current)
    # Schedulers may return values outside [0, 1]; clamp before use.
    fresh_ratio = torch.clamp(torch.tensor(fresh_ratio, device=tokens.device), min=0, max=1)

    # Rank tokens by score, highest first, and keep the top fresh_ratio share.
    score = score_evaluate(cache_dic, tokens, current)
    indices = score.argsort(dim=-1, descending=True)
    topk = int(fresh_ratio * score.shape[1])
    fresh_indices = indices[:, :topk]  # (B, fresh_ratio * N)

    # Every token ages by one in each ***module***; fresh tokens are reset to 0.
    reset_values = torch.zeros_like(fresh_indices, dtype=torch.int, device=fresh_indices.device)
    cache_dic['cache_index'][current['flag']][layer][module] += 1
    cache_dic['cache_index'][current['flag']][layer][module].scatter_(dim=1, index=fresh_indices, src=reset_values)
    cache_dic['cache_index']['layer_index'][module] += 1
    cache_dic['cache_index']['layer_index'][module].scatter_(dim=1, index=fresh_indices, src=reset_values)

    if module in ['mlp', 'attn', 'cross-attn']:
        # Select the fresh tokens out.
        fresh_indices_expand = fresh_indices.unsqueeze(-1).expand(-1, -1, tokens.shape[-1])
        fresh_tokens = torch.gather(input=tokens, dim=1, index=fresh_indices_expand)
        return fresh_indices, fresh_tokens
    else:
        raise ValueError("Unrecognized module?", module)
    
import torch
from einops import rearrange

def local_selection_with_space_time_bonus(cache_dic, score, bonus_ratio, grid_size=2, time_mean = False):
    """
    Boost the best-scoring token inside every grid_size x grid_size spatial
    block of every frame, so that selected tokens are spread out spatially.

    Args:
        cache_dic: must provide ``cache_dic['dynamic_size']`` as (B, T, H, W).
        score: flat score tensor of shape (B, T*H*W).
        bonus_ratio: the block maximum gets ``max_score * bonus_ratio`` added.
        grid_size: side length of the square spatial blocks.
        time_mean: if True, scores are averaged over T and the average is
            broadcast back to every frame.

    Returns:
        Modified scores of shape (B, T*H*W). Scores are softmax-normalised over
        each frame before the bonus is applied; zero-padded cells take part in
        that softmax when H or W is not a multiple of grid_size.
    """
    # Read the tensor shape from cache_dic.
    B, T, H, W = cache_dic['dynamic_size']

    # Reshape score to [B, T, H, W].
    score = rearrange(score, "B (T H W) -> B T H W", T=T, H=H, W=W)

    # Amount of zero padding needed so that H and W are divisible by grid_size.
    pad_h = (grid_size - H % grid_size) % grid_size  # number of zeros to add along H
    pad_w = (grid_size - W % grid_size) % grid_size  # number of zeros to add along W

    # Zero-pad the H and W dimensions.
    if pad_h > 0 or pad_w > 0:
        score = torch.nn.functional.pad(score, (0, pad_w, 0, pad_h))  # (pad_w after the end of W, pad_h after the end of H)

    # H and W after padding.
    H_padded, W_padded = score.shape[2], score.shape[3]

    # Step 1: normalise over the H*W dimension so every frame carries the same total weight.
    score = score.view(B, T, -1)  # merge H and W into one dimension [B, T, H*W]
    score = torch.nn.functional.softmax(score, dim=-1)  # normalise over H*W
    score = score.view(B, T, H_padded, W_padded)  # back to [B, T, H_padded, W_padded]

    # Step 2: split every spatial slice (i.e. every frame) into blocks.
    block_size = grid_size * grid_size
    assert (H_padded * W_padded) % block_size == 0, f"H_padded * W_padded 必须能被块大小整除, shape: {B},{T},{H_padded},{W_padded}; block:{grid_size}*{grid_size};" 

    # Reshape score so that the entries of each block are grouped together.
    score_reshaped = score.view(B, T, H_padded // grid_size, grid_size, W_padded // grid_size, grid_size)
    score_reshaped = score_reshaped.permute(0, 1, 2, 4, 3, 5).contiguous()  # [B, T, H//grid_size, W//grid_size, grid_size, grid_size]
    score_reshaped = score_reshaped.view(B, T, -1, block_size)  # [B, T, num_blocks, block_size]

    # Step 3: find the maximum score in every block.
    max_scores, max_indices = score_reshaped.max(dim=-1, keepdim=True)  # [B, T, num_blocks, 1]

    # Step 4: build a mask marking the max-score token of each block.
    mask = torch.zeros_like(score_reshaped)
    mask.scatter_(-1, max_indices, 1)  # set the mask to 1 at the arg-max position

    # Step 5: apply the bonus to the max-score tokens only.
    score_reshaped = score_reshaped + (mask * max_scores * bonus_ratio)  # bonus only where mask == 1

    # Step 6: undo the block grouping to get back the original layout.
    score_modified = score_reshaped.view(B, T, H_padded // grid_size, W_padded // grid_size, grid_size, grid_size)
    score_modified = score_modified.permute(0, 1, 2, 4, 3, 5).contiguous()
    score_modified = score_modified.view(B, T, H_padded, W_padded)

    # Step 7: drop the zero padding.
    if pad_h > 0 or pad_w > 0:
        score_modified = score_modified[:, :, :H, :W]  # remove the padded zeros

    if time_mean:
        score_modified = score_modified.mean(dim = 1)
        score_modified = score_modified.unsqueeze(1).expand(B, T, H, W)
    # Finally flatten score back to its original shape [B, (T H W)].
    score_modified = rearrange(score_modified, "B T H W -> B (T H W)")

    return score_modified


================================================
FILE: Open-Sora/build/lib/opensora/models/cache_functions/cache_init.py
================================================
def cache_init(model_kwargs, num_steps, num_layers=28):
    '''
    Build the empty cache dictionary and the ``current`` state dictionary.

    Every per-layer store is keyed first by a flag (-1 or 0) and then by layer
    index, each entry starting as an empty dict.

    Args:
        model_kwargs: must provide 'cache_type', 'ratio_scheduler',
            'fresh_ratio', 'fresh_threshold', 'force_fresh' and
            'soft_fresh_weight'.
        num_steps: total number of sampling steps, stored in ``current``.
        num_layers: number of layers to allocate entries for (default 28,
            the previously hard-coded value).

    Returns:
        (cache_dic, current)
    '''
    flags = (-1, 0)

    def per_layer_store():
        # A fresh {flag: {layer: {}}} structure with no shared dict objects.
        return {flag: {layer: {} for layer in range(num_layers)} for flag in flags}

    cache = per_layer_store()
    # Previously indices_cache[flag] was reset to {} on every loop iteration,
    # so it never received per-layer entries like the other stores.
    indices_cache = per_layer_store()
    cache_index = per_layer_store()
    cache_index['layer_index'] = {}

    cache_dic = {}
    cache_dic['attn_map'] = per_layer_store()
    cache_dic['cross_attn_map'] = per_layer_store()

    cache_dic['cache_type'] = model_kwargs['cache_type']
    cache_dic['cache_index'] = cache_index
    cache_dic['cache'] = cache
    cache_dic['indices_cache'] = indices_cache
    cache_dic['fresh_ratio_schedule'] = model_kwargs['ratio_scheduler']
    cache_dic['fresh_ratio'] = model_kwargs['fresh_ratio']
    cache_dic['fresh_threshold'] = model_kwargs['fresh_threshold']
    cache_dic['force_fresh'] = model_kwargs['force_fresh']
    cache_dic['soft_fresh_weight'] = model_kwargs['soft_fresh_weight']

    current = {}
    current['num_steps'] = num_steps
    return cache_dic, current
    

================================================
FILE: Open-Sora/build/lib/opensora/models/cache_functions/force_init.py
================================================
import torch
from .force_scheduler import force_scheduler
def force_init(cache_dic, current, tokens):
    '''
    Reset the staleness counters for the current flag/layer/module to zero and
    refresh the force-fresh thresholds.

    The counters are int tensors of shape (tokens.shape[0], tokens.shape[1])
    on the same device as ``tokens``. On layer 0 the shared 'layer_index'
    counter of the module is reset as well.
    '''
    def zero_counters():
        return torch.zeros(tokens.shape[0], tokens.shape[1], dtype=torch.int, device=tokens.device)

    module = current['module']
    cache_dic['cache_index'][current['flag']][current['layer']][module] = zero_counters()
    force_scheduler(cache_dic, current)
    if current['layer'] == 0:
        cache_dic['cache_index']['layer_index'][module] = zero_counters()

================================================
FILE: Open-Sora/build/lib/opensora/models/cache_functions/force_scheduler.py
================================================
import torch
def force_scheduler(cache_dic, current):
    '''
    Store the per-module force-fresh thresholds in ``cache_dic['cal_threshold']``.

    The thresholds are currently fixed at 1 for every module type; neither
    ``cache_dic['fresh_threshold']`` nor ``current`` influences them. The
    earlier step-dependent threshold was computed on every call but its
    result was never used (and it divided by ``current['num_steps']``), so
    that dead computation has been removed.

    Args:
        cache_dic: cache state; 'cal_threshold' is (re)written in place.
        current: accepted for interface compatibility; not read.
    '''
    cache_dic['cal_threshold'] = {
        'spat-attn': 1,
        'temp-attn': 1,
        'cross-attn': 1,
        'mlp': 1,
    }
    #return threshold

================================================
FILE: Open-Sora/build/lib/opensora/models/cache_functions/fresh_ratio_scheduler.py
================================================
import torch
def fresh_ratio_scheduler(cache_dic, current):
    '''
    Compute the fresh ratio for the current step/layer/module.

    The base ratio ``cache_dic['fresh_ratio']`` is modulated according to
    ``cache_dic['fresh_ratio_schedule']``:
      'constant'         -- the base ratio unchanged
      'linear'           -- decreases linearly with the step
      'exp'              -- decays as weight ** (step / num_steps)
      'linear-mode'      -- 'linear' plus a small term depending on step % fresh_threshold
      'layerwise'        -- decreases linearly with the layer index (over 27)
      'linear-layerwise' -- product of step, layer, module and flag factors

    The result is not clamped here and may fall outside [0, 1].

    Raises:
        ValueError: for an unknown schedule name.
    '''
    base_ratio = cache_dic['fresh_ratio']
    schedule = cache_dic['fresh_ratio_schedule']
    step = current['step']
    num_steps = current['num_steps']
    threshold = cache_dic['fresh_threshold']
    weight = 0.9

    if schedule == 'constant':
        return base_ratio

    if schedule == 'linear':
        return base_ratio * (1 + weight - 2 * weight * step / num_steps)

    if schedule == 'exp':
        return base_ratio * (weight ** (step / num_steps))

    if schedule == 'linear-mode':
        # Position of the step inside its threshold-long cycle, centred on 0.
        mode = (step % threshold)/threshold - 0.5
        mode_weight = 0.1
        return base_ratio * (1 + weight - 2 * weight * step / num_steps + mode_weight * mode)

    if schedule == 'layerwise':
        return base_ratio * (1 + weight - 2 * weight * current['layer'] / 27)

    if schedule == 'linear-layerwise':
        step_weight = 0.0
        step_factor = 1 + step_weight - 2 * step_weight * step / num_steps

        layer_weight = 0.0
        layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27

        module_weight = 1.5
        module_time_weight = 0.33
        if current['module'] == 'cross-attn':
            module_factor = (1 - (1-module_time_weight) * module_weight)
        else:
            module_factor = (1 + module_time_weight * module_weight)

        type_weight = 0.0
        if current['flag'] == -1:
            type_factor = 1 + type_weight
        else:
            type_factor = 1 - type_weight

        return base_ratio * layer_factor * step_factor * module_factor * type_factor

    raise ValueError("unrecognized fresh ratio schedule", schedule)


================================================
FILE: Open-Sora/build/lib/opensora/models/cache_functions/global_force_fresh.py
================================================
from .force_scheduler import force_scheduler
def global_force_fresh(cache_dic, current):
    '''
    Decide whether a full (forced) refresh is due at the current step.

    Returns:
        For ``force_fresh == 'global'``: a dict with boolean entries 'attn',
        'cross-attn' and 'mlp'. Each is True during steps 0-2 and whenever
        the step is a multiple of the module's threshold. 'attn' uses the
        'temp-attn' threshold when ``current['flag'] == -1`` and 'spat-attn'
        otherwise.
        For 'local' and 'none': a single bool, True only on step 0.

    Raises:
        ValueError: for any other ``force_fresh`` value.
    '''
    module_names = ('spat-attn', 'temp-attn', 'cross-attn', 'mlp')

    step = current['step']
    first_step = (step == 0)
    first_3steps = (step <= 2)
    # Not consulted by any strategy below; still evaluated so 'num_steps' is read as before.
    last_step = step == current['num_steps'] - 1
    strategy = cache_dic['force_fresh']

    # Step 0 uses the configured threshold for every module; later steps use
    # the per-module values stored in cache_dic['cal_threshold'].
    if first_step:
        periods = {name: cache_dic['fresh_threshold'] for name in module_names}
    else:
        periods = {name: cache_dic['cal_threshold'][name] for name in module_names}

    if strategy == 'global':
        def refresh_due(name):
            return (first_3steps or (step % periods[name] == 0))

        attn_name = 'temp-attn' if current['flag'] == -1 else 'spat-attn'
        return {
            'attn': refresh_due(attn_name),
            'cross-attn': refresh_due('cross-attn'),
            'mlp': refresh_due('mlp'),
        }

    if strategy in ('local', 'none'):
        return first_step

    raise ValueError("unrecognized force fresh strategy", strategy)

================================================
FILE: Open-Sora/build/lib/opensora/models/cache_functions/score_evaluate.py
================================================
import torch
import torch.nn as nn
from .scores import attn_score, similarity_score, norm_score
def score_evaluate(cache_dic, tokens, current) -> torch.Tensor:
    '''
    Return the score tensor (B, N) for the given tokens.

    The base score is chosen by ``cache_dic['cache_type']``:
      'random'     -- uniform random, identical for both halves of the batch
      'straight'   -- all ones
      'attention'  -- ``attn_score``
      'similarity' -- ``similarity_score``
      'norm'       -- ``norm_score``
      'compress'   -- average of a random score and the normalised column sums
                      of the cached attention map

    With ``force_fresh == 'local'`` (and no forced refresh in progress) tokens
    whose staleness counter reached ``2 * fresh_threshold`` get their score
    overwritten with 1. With ``force_fresh == 'global'`` a staleness term
    weighted by ``soft_fresh_weight`` is added.

    Raises:
        ValueError: for an unknown ``cache_type`` (previously this surfaced as
            an UnboundLocalError on ``score``).
    '''
    # if/elif is used instead of match/case, which needs Python 3.10+.
    cache_type = cache_dic['cache_type']
    # When is_force_fresh is True nothing is cut, so no local forcing is needed.
    local_force = (not current['is_force_fresh']) and (cache_dic['force_fresh'] == 'local')

    if local_force:
        # Find the tokens whose cache age has reached the threshold.
        # Factor 2 because the threshold is expressed in steps, not modules.
        force_fresh_mask = torch.as_tensor((cache_dic['cache_index'][current['flag']][current['layer']][current['module']] >= 2 * cache_dic['fresh_threshold']), dtype = int)
        force_len = force_fresh_mask.sum(dim=1)
        # Keep the same number of forced tokens per sample (the batch minimum).
        force_indices = force_fresh_mask.argsort(dim = -1, descending = True)[:, :force_len.min()]
        # Randomly permute along the last dimension.
        force_indices = force_indices[:, torch.randperm(force_indices.shape[1])]

    if cache_type == 'random':
        score = torch.rand(int(tokens.shape[0]*0.5), tokens.shape[1], device=tokens.device)
        score = torch.cat([score, score], dim=0).to(tokens.device)

    elif cache_type == 'straight':
        score = torch.ones(tokens.shape[0], tokens.shape[1]).to(tokens.device)

    elif cache_type == 'attention':
        score = attn_score(cache_dic, current)

    elif cache_type == 'similarity':
        score = similarity_score(cache_dic, current, tokens)

    elif cache_type == 'norm':
        score = norm_score(cache_dic, current, tokens)

    elif cache_type == 'compress':
        score1 = torch.rand(int(tokens.shape[0]*0.5), tokens.shape[1])
        score1 = torch.cat([score1, score1], dim=0).to(tokens.device)
        score2 = cache_dic['attn_map'][current['flag']][current['layer']].sum(dim=1)  # (B, N)
        # normalize
        score2 = score2 / score2.max(dim=1, keepdim=True)[0]
        score = 0.5 * score1 + 0.5 * score2

    else:
        raise ValueError("unrecognized cache type", cache_type)

    if local_force:
        score.scatter_(dim=1, index=force_indices, src=torch.ones_like(force_indices, dtype=torch.float32,
                                                                           device=force_indices.device))

    if cache_dic['force_fresh'] == 'global':
        # The longer a token has been served from cache, the larger its bonus.
        soft_step_score = cache_dic['cache_index'][current['flag']][current['layer']][current['module']].float() / (cache_dic['fresh_threshold'])
        score = score + cache_dic['soft_fresh_weight'] * soft_step_score

    return score.to(tokens.device)

================================================
FILE: Open-Sora/build/lib/opensora/models/cache_functions/scores.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

def attn_score(cache_dic, current):
    '''
    Score tokens by the entropy of their cached cross-attention distribution.

    The cached map for the current flag/layer is split along dim 0 into chunks
    of half its length (named cond / uncond here), which are averaged with
    equal weight. The entropy over the last dimension is shifted by 1,
    L2-normalised along dim 1 and repeated twice along dim 0 so that it
    matches the full batch again.
    '''
    cross_map = cache_dic['cross_attn_map'][current['flag']][current['layer']]
    half_batch = len(cross_map) // 2
    cond_cmap, uncond_cmap = torch.split(cross_map, half_batch, dim=0)

    cond_weight = 0.5
    blended = cond_weight * cond_cmap + (1 - cond_weight) * uncond_cmap

    # 1e-7 keeps log() finite where the attention weight is exactly zero.
    entropy = -(blended * torch.log(blended + 1e-7)).sum(dim=-1)
    half_score = F.normalize(1 + entropy, dim=1, p=2)
    return half_score.repeat(2, 1)

def similarity_score(cache_dic, current, tokens):
    '''
    Score tokens by how much they differ from their cached counterpart:
    1 - cosine similarity along the last dimension, L2-normalised over the
    last dimension of the result.
    '''
    cached_tokens = cache_dic['cache'][current['flag']][current['layer']][current['module']]
    dissimilarity = 1 - F.cosine_similarity(tokens, cached_tokens, dim=-1)
    return F.normalize(dissimilarity, dim=-1, p=2)

def norm_score(cache_dic, current, tokens):
    """Score tokens by their L2 norm, normalized along the last remaining axis.

    ``cache_dic`` and ``current`` are accepted but not read by this function.

    Args:
        cache_dic (dict): unused.
        current (dict): unused.
        tokens (torch.Tensor): tokens whose last dimension is reduced to a norm.

    Returns:
        torch.Tensor: per-token norms, L2-normalized along the last dimension.
    """
    token_norms = torch.norm(tokens, p=2, dim=-1)
    return F.normalize(token_norms, dim=-1, p=2)


================================================
FILE: Open-Sora/build/lib/opensora/models/cache_functions/token_merge.py
================================================
import torch
def token_merge(cache_dic, tokens, current, fresh_indices, stale_indices):
    """Match stale tokens to their most similar fresh token and record the pairs.

    Fresh and stale tokens are gathered from ``tokens`` along dim 1. Each stale
    token is paired with the fresh token of highest cosine similarity. Only
    stale tokens whose best similarity exceeds 0.995 are considered; the number
    kept per sample is the smallest such count across dim 0, so every row keeps
    the same number of entries. Results are written to ``cache_dic``; nothing
    is returned.

    Args:
        cache_dic (dict): receives ``'merged_stale_fresh_indices'`` and
            ``'merged_stale_sequence'``.
        tokens (torch.Tensor): tokens indexed along dim 1 by the index tensors.
        current (dict): only ``'layer'`` is read.
        fresh_indices (torch.Tensor): dim-1 positions of the fresh tokens.
        stale_indices (torch.Tensor): dim-1 positions of the stale tokens.
    """
    # ``% 1`` is zero for every integer layer id, so this gate never triggers
    # for integer layers; it is kept so the per-layer gate stays easy to adjust.
    if current['layer'] % 1 != 0:
        return

    feature_dim = tokens.shape[-1]
    fresh_gather_index = fresh_indices.unsqueeze(-1).expand(-1, -1, feature_dim)
    stale_gather_index = stale_indices.unsqueeze(-1).expand(-1, -1, feature_dim)
    fresh_tokens = torch.gather(tokens, 1, fresh_gather_index)
    stale_tokens = torch.gather(tokens, 1, stale_gather_index)

    # Matching criterion; 'distance' is an alternative that is currently unused.
    method = 'similarity'
    if method == 'distance':
        descending = False
        pairwise = torch.cdist(stale_tokens, fresh_tokens, p=1)
        best_match = torch.min(pairwise, dim=2)
    elif method == 'similarity':
        descending = True
        fresh_unit = torch.nn.functional.normalize(fresh_tokens, p=2, dim=-1)
        stale_unit = torch.nn.functional.normalize(stale_tokens, p=2, dim=-1)
        pairwise = torch.matmul(stale_unit, fresh_unit.transpose(1, 2))
        best_match = torch.max(pairwise, dim=2)
    stale_fresh_dist, nearest_fresh_slot = best_match

    # Adaptive threshold: keep as many stale tokens as the least-matching
    # sample has above 0.995, so the gathered index tensors stay rectangular.
    saved_topk_stale = int((stale_fresh_dist > 0.995).sum(dim=1).min())
    ranking = torch.sort(stale_fresh_dist, dim=1, descending=descending)[1]
    kept_slots = ranking[:, :saved_topk_stale]

    matched_fresh_slots = nearest_fresh_slot.gather(1, kept_slots)

    # Positions (in the full token sequence) of the fresh partner of each kept
    # stale token, ordered from best to worst match.
    cache_dic['merged_stale_fresh_indices'] = fresh_indices.gather(1, matched_fresh_slots)
    # Positions (in the full token sequence) of the kept stale tokens, same order.
    cache_dic['merged_stale_sequence'] = stale_indices.gather(1, kept_slots)


================================================
FILE: Open-Sora/build/lib/opensora/models/cache_functions/update_cache.py
================================================
import torch
def update_cache(fresh_indices, fresh_tokens, cache_dic, current, fresh_attn_map=None):
    """
    Overwrite cached entries at the given token positions with fresh values.

    The token cache ``cache_dic['cache'][flag][layer][module]`` is always
    updated in place. For the ``'attn'`` and ``'cross-attn'`` modules the
    matching attention-map cache (``'attn_map'`` / ``'cross_attn_map'``) is
    updated in place as well, using the same positions.

    Args:
        fresh_indices (torch.Tensor): dim-1 positions to overwrite; row ``i`` of
            ``fresh_tokens`` / ``fresh_attn_map`` is written to position
            ``fresh_indices[..., i]``.
        fresh_tokens (torch.Tensor): new token values for those positions.
        cache_dic (dict): cache state holding ``'cache'`` and, for attention
            modules, ``'attn_map'`` / ``'cross_attn_map'``.
        current (dict): supplies the ``'flag'``, ``'layer'`` and ``'module'`` keys.
        fresh_attn_map (torch.Tensor, optional): new attention-map rows, already
            ordered like ``fresh_indices``. Required when the module is
            ``'attn'`` or ``'cross-attn'``; ignored otherwise.
    """
    flag = current['flag']
    layer = current['layer']
    module = current['module']

    # Attention modules additionally cache a per-token attention map.
    attn_map_keys = {'attn': 'attn_map', 'cross-attn': 'cross_attn_map'}
    map_key = attn_map_keys.get(module)
    if map_key is not None:
        map_index = fresh_indices.unsqueeze(-1).expand(-1, -1, fresh_attn_map.shape[-1])
        cache_dic[map_key][flag][layer].scatter_(dim=1, index=map_index, src=fresh_attn_map)

    # The token cache is refreshed for every module type. Previously the index
    # was only bound for three hard-coded module names, so any other name
    # failed before reaching this point.
    token_index = fresh_indices.unsqueeze(-1).expand(-1, -1, fresh_tokens.shape[-1])
    cache_dic['cache'][flag][layer][module].scatter_(dim=1, index=token_index, src=fresh_tokens)


================================================
FILE: Open-Sora/build/lib/opensora/models/dit/__init__.py
================================================
from .dit import DiT, DiT_XL_2, DiT_XL_2x2


================================================
FILE: Open-Sora/build/lib/opensora/models/dit/dit.py
================================================
# Modified from Meta DiT

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DiT:   https://github.com/facebookresearch/DiT/tree/main
# GLIDE: https://github.com/openai/glide-text2im
# MAE:   https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------

import numpy as np
import torch
import torch.nn as nn
import torch.utils.checkpoint
from einops import rearrange
from timm.models.vision_transformer import Mlp

from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.models.layers.blocks import (
    Attention,
    CaptionEmbedder,
    FinalLayer,
    LabelEmbedder,
    PatchEmbed3D,
    TimestepEmbedder,
    approx_gelu,
    get_1d_sincos_pos_embed,
    get_2d_sincos_pos_embed,
    get_layernorm,
    modulate,
)
from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint


class DiTBlock(nn.Module):
    """
    Transformer block conditioned through adaptive layer norm (adaLN-Zero).

    A conditioning vector is projected to six chunks (shift, scale and gate for
    the attention branch and for the MLP branch). Each branch is applied to the
    modulated, normalized input and added back through its gate.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.enable_flash_attn = enable_flash_attn

        def make_norm():
            # Non-affine layer norm; scale and shift come from the conditioning instead.
            return get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)

        # Submodules are created in a fixed order: norm1, attn, norm2, mlp, adaLN.
        self.norm1 = make_norm()
        self.attn = Attention(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
        )
        self.norm2 = make_norm()
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=int(hidden_size * mlp_ratio),
            act_layer=approx_gelu,
            drop=0,
        )
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))

    def forward(self, x, c):
        """Apply the gated attention and MLP branches to ``x`` under condition ``c``."""
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)

        attn_out = self.attn(modulate(self.norm1, x, shift_msa, scale_msa))
        x = x + gate_msa.unsqueeze(1) * attn_out

        mlp_out = self.mlp(modulate(self.norm2, x, shift_mlp, scale_mlp))
        return x + gate_mlp.unsqueeze(1) * mlp_out


@MODELS.register_module()
class DiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    The input is embedded into 3D patches, spatial and (optionally) temporal
    positional embeddings are added, and the tokens pass through ``depth``
    ``DiTBlock`` layers conditioned on the sum of the timestep embedding and
    the label/caption embedding. A final layer plus ``unpatchify`` map the
    tokens back to a ``(B, out_channels, T, H, W)`` tensor.
    """

    def __init__(
        self,
        input_size=(16, 32, 32),
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        learn_sigma=True,
        condition="text",
        no_temporal_pos_emb=False,
        caption_channels=512,
        model_max_length=77,
        dtype=torch.float32,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.learn_sigma = learn_sigma
        self.in_channels = in_channels
        # With learn_sigma the model emits twice as many output channels.
        self.out_channels = in_channels * 2 if learn_sigma else in_channels
        self.hidden_size = hidden_size
        self.patch_size = patch_size
        self.input_size = input_size
        num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)])
        self.num_patches = num_patches
        self.num_temporal = input_size[0] // patch_size[0]
        self.num_spatial = num_patches // self.num_temporal
        self.num_heads = num_heads
        self.dtype = dtype
        # Any condition string starting with "label" selects the label embedder,
        # parsed below as "label_<num_classes>"; everything else uses captions.
        self.use_text_encoder = not condition.startswith("label")
        if enable_flash_attn:
            assert dtype in [
                torch.float16,
                torch.bfloat16,
            ], f"Flash attention only supports float16 and bfloat16, but got {self.dtype}"
        self.no_temporal_pos_emb = no_temporal_pos_emb
        self.mlp_ratio = mlp_ratio
        self.depth = depth
        assert enable_sequence_parallelism is False, "Sequence parallelism is not supported in DiT"

        self.register_buffer("pos_embed_spatial", self.get_spatial_pos_embed())
        self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed())

        self.x_embedder = PatchEmbed3D(patch_size, in_channels, embed_dim=hidden_size)
        if not self.use_text_encoder:
            num_classes = int(condition.split("_")[-1])
            self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        else:
            self.y_embedder = CaptionEmbedder(
                in_channels=caption_channels,
                hidden_size=hidden_size,
                uncond_prob=class_dropout_prob,
                act_layer=approx_gelu,
                token_num=1,  # pooled token
            )
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.blocks = nn.ModuleList(
            [
                DiTBlock(
                    hidden_size,
                    num_heads,
                    mlp_ratio=mlp_ratio,
                    enable_flash_attn=enable_flash_attn,
                    enable_layernorm_kernel=enable_layernorm_kernel,
                )
                for _ in range(depth)
            ]
        )
        self.final_layer = FinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels)

        self.initialize_weights()
        self.enable_flash_attn = enable_flash_attn
        self.enable_layernorm_kernel = enable_layernorm_kernel

    def get_spatial_pos_embed(self):
        """Build the fixed spatial sin-cos embedding as a ``(1, ...)`` float tensor.

        The grid size passed to ``get_2d_sincos_pos_embed`` is
        ``input_size[1] // patch_size[1]`` only; the width axis is not consulted.
        """
        pos_embed = get_2d_sincos_pos_embed(
            self.hidden_size,
            self.input_size[1] // self.patch_size[1],
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def get_temporal_pos_embed(self):
        """Build the fixed temporal sin-cos embedding as a ``(1, ...)`` float tensor."""
        pos_embed = get_1d_sincos_pos_embed(
            self.hidden_size,
            self.input_size[0] // self.patch_size[0],
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def unpatchify(self, x):
        """Fold per-patch outputs back into a ``(N, C, T, H, W)`` tensor.

        ``x`` is reshaped to ``(N, t, h, w, pt, ph, pw, c)`` using the patch grid
        derived from ``input_size`` and ``patch_size``, then the patch axes are
        interleaved with the grid axes.
        """
        c = self.out_channels
        t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        pt, ph, pw = self.patch_size

        x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
        x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
        return imgs

    def forward(self, x, t, y):
        """
        Forward pass of DiT.
        x: (B, C, T, H, W) tensor of inputs
        t: (B,) tensor of diffusion timesteps
        y: conditioning input handed to ``self.y_embedder``; cast to
           ``self.dtype`` first when the caption embedder is in use
        Returns a float32 tensor of shape (B, out_channels, T, H, W).
        """
        # origin inputs should be float32, cast to specified dtype
        x = x.to(self.dtype)
        if self.use_text_encoder:
            y = y.to(self.dtype)

        # embedding
        x = self.x_embedder(x)  # (B, N, D)
        x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
        x = x + self.pos_embed_spatial
        if not self.no_temporal_pos_emb:
            # Swap to (b, s, t, d) so the temporal embedding broadcasts over t.
            x = rearrange(x, "b t s d -> b s t d")
            x = x + self.pos_embed_temporal
            x = rearrange(x, "b s t d -> b (t s) d")
        else:
            x = rearrange(x, "b t s d -> b (t s) d")

        t = self.t_embedder(t, dtype=x.dtype)  # (N, D)
        y = self.y_embedder(y, self.training)  # (N, D)
        if self.use_text_encoder:
            y = y.squeeze(1).squeeze(1)
        condition = t + y

        # blocks: every block receives the same conditioning vector
        for block in self.blocks:
            x = auto_grad_checkpoint(block, x, condition)  # (B, N, D)

        # final process
        x = self.final_layer(x, condition)  # (B, N, num_patches * out_channels)
        x = self.unpatchify(x)  # (B, out_channels, T, H, W)

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x

    def initialize_weights(self):
        """Initialize all sub-module weights (Xavier, then targeted overrides)."""

        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                # ``requires_grad`` is the boolean flag. The previous check used
                # ``requires_grad_``, a bound method that is always truthy, so
                # frozen weights were never actually skipped.
                if module.weight.requires_grad:
                    torch.nn.init.xavier_uniform_(module.weight)
                    if module.bias is not None:
                        nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        nn.init.constant_(self.x_embedder.proj.bias, 0)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

        # Initialize the caption projection with small normal weights
        # (these layers are NOT zeroed):
        if self.use_text_encoder:
            nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
            nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)


@MODELS.register_module("DiT-XL/2")
def DiT_XL_2(from_pretrained=None, **kwargs):
    """Build a ``DiT`` with depth 28, hidden size 1152, 16 heads and patch size (1, 2, 2).

    Args:
        from_pretrained: when not ``None``, passed with the new model to
            ``load_checkpoint``.
        **kwargs: forwarded to the ``DiT`` constructor.

    Returns:
        The constructed (and possibly checkpoint-loaded) model.
    """
    model = DiT(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return model
    load_checkpoint(model, from_pretrained)
    return model


@MODELS.register_module("DiT-XL/2x2")
def DiT_XL_2x2(from_pretrained=None, **kwargs):
    """Build a ``DiT`` with depth 28, hidden size 1152, 16 heads and patch size (2, 2, 2).

    Args:
        from_pretrained: when not ``None``, passed with the new model to
            ``load_checkpoint``.
        **kwargs: forwarded to the ``DiT`` constructor.

    Returns:
        The constructed (and possibly checkpoint-loaded) model.
    """
    model = DiT(depth=28, hidden_size=1152, patch_size=(2, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return model
    load_checkpoint(model, from_pretrained)
    return model


================================================
FILE: Open-Sora/build/lib/opensora/models/latte/__init__.py
================================================
from .latte import Latte, Latte_XL_2, Latte_XL_2x2


================================================
FILE: Open-Sora/build/lib/opensora/models/latte/latte.py
================================================
# Copyright 2024 Vchitect/Latte
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from Latte
#
#
# This file is modified from https://github.com/Vchitect/Latte/blob/main/models/latte.py
#
# With references to:
# Latte:  https://github.com/Vchitect/Latte
# DiT:    https://github.com/facebookresearch/DiT/tree/main


import torch
from einops import rearrange, repeat

from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.models.dit import DiT
from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint


@MODELS.register_module()
class Latte(DiT):
    """``DiT`` variant whose blocks alternate between spatial and temporal token grouping."""

    def forward(self, x, t, y):
        """
        Forward pass of Latte.
        x: (B, C, T, H, W) tensor of inputs
        t: (B,) tensor of diffusion timesteps
        y: conditioning input handed to ``self.y_embedder``

        Even-indexed blocks run on tokens grouped per frame ("(b t) s d"),
        odd-indexed blocks on tokens grouped per spatial position
        ("(b s) t d"). The temporal positional embedding is added once, right
        before the block with index 1.
        """
        grid = dict(t=self.num_temporal, s=self.num_spatial)

        # origin inputs should be float32, cast to specified dtype
        x = x.to(self.dtype)

        # embedding: only the spatial positional embedding is added up front
        x = self.x_embedder(x)  # (B, N, D)
        x = rearrange(x, "b (t s) d -> b t s d", **grid)
        x = x + self.pos_embed_spatial
        x = rearrange(x, "b t s d -> b (t s) d")

        t = self.t_embedder(t, dtype=x.dtype)  # (N, D)
        y = self.y_embedder(y, self.training)  # (N, D)
        if self.use_text_encoder:
            y = y.squeeze(1).squeeze(1)
        condition = t + y
        # The condition is repeated so it lines up with the regrouped batch axis.
        condition_spatial = repeat(condition, "b d -> (b t) d", t=self.num_temporal)
        condition_temporal = repeat(condition, "b d -> (b s) d", s=self.num_spatial)

        # blocks
        for index, block in enumerate(self.blocks):
            is_spatial = index % 2 == 0
            if is_spatial:
                x = rearrange(x, "b (t s) d -> (b t) s d", **grid)
                c = condition_spatial
                restore_pattern = "(b t) s d -> b (t s) d"
            else:
                x = rearrange(x, "b (t s) d -> (b s) t d", **grid)
                c = condition_temporal
                restore_pattern = "(b s) t d -> b (t s) d"
                if index == 1:
                    x = x + self.pos_embed_temporal

            x = auto_grad_checkpoint(block, x, c)  # (B, N, D)
            x = rearrange(x, restore_pattern, **grid)

        # final process
        x = self.final_layer(x, condition)  # (B, N, num_patches * out_channels)
        x = self.unpatchify(x)  # (B, out_channels, T, H, W)

        # cast to float32 for better accuracy
        return x.to(torch.float32)


@MODELS.register_module("Latte-XL/2")
def Latte_XL_2(from_pretrained=None, **kwargs):
    """Build a ``Latte`` with depth 28, hidden size 1152, 16 heads and patch size (1, 2, 2).

    Args:
        from_pretrained: when not ``None``, passed with the new model to
            ``load_checkpoint``.
        **kwargs: forwarded to the ``Latte`` constructor.

    Returns:
        The constructed (and possibly checkpoint-loaded) model.
    """
    model = Latte(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return model
    load_checkpoint(model, from_pretrained)
    return model


@MODELS.register_module("Latte-XL/2x2")
def Latte_XL_2x2(from_pretrained=None, **kwargs):
    """Build a ``Latte`` with depth 28, hidden size 1152, 16 heads and patch size (2, 2, 2).

    Args:
        from_pretrained: when not ``None``, passed with the new model to
            ``load_checkpoint``.
        **kwargs: forwarded to the ``Latte`` constructor.

    Returns:
        The constructed (and possibly checkpoint-loaded) model.
    """
    model = Latte(depth=28, hidden_size=1152, patch_size=(2, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return model
    load_checkpoint(model, from_pretrained)
    return model


================================================
FILE: Open-Sora/build/lib/opensora/models/layers/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/opensora/models/layers/blocks.py
================================================
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# PixArt: https://github.com/PixArt-alpha/PixArt-alpha
# Latte:  https://github.com/Vchitect/Latte
# DiT:    https://github.com/facebookresearch/DiT/tree/main
# GLIDE:  https://github.com/openai/glide-text2im
# MAE:    https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------

import functools
import math
from typing import Optional

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
import xformers.ops
from einops import rearrange
from timm.models.vision_transformer import Mlp

from opensora.acceleration.communications import all_to_all, split_forward_gather_backward
from opensora.acceleration.parallel_states import get_sequence_parallel_group

from ..cache_functions.attention import cached_attention_forward

def approx_gelu():
    """Return a new ``nn.GELU`` module that uses the tanh approximation."""
    return nn.GELU(approximate="tanh")


class LlamaRMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learnable scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalize in float32, cast back to the input dtype, then apply ``weight``."""
        original_dtype = hidden_states.dtype
        as_fp32 = hidden_states.float()
        # Mean of squares over the feature axis (no mean subtraction).
        mean_square = as_fp32.pow(2).mean(-1, keepdim=True)
        normalized = as_fp32 * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(original_dtype)


def get_layernorm(hidden_size: int, eps: float, affine: bool, use_kernel: bool):
    """Create a layer-norm module, optionally backed by apex's fused kernel.

    Args:
        hidden_size: size of the normalized (last) dimension.
        eps: epsilon passed to the layer norm.
        affine: whether the layer norm has learnable scale and bias.
        use_kernel: when true, build ``apex.normalization.FusedLayerNorm``;
            otherwise build ``torch.nn.LayerNorm``.

    Returns:
        The constructed normalization module.

    Raises:
        RuntimeError: if ``use_kernel`` is true and an ``ImportError`` is raised
            while importing or constructing ``FusedLayerNorm``.
    """
    if not use_kernel:
        return nn.LayerNorm(hidden_size, eps, elementwise_affine=affine)

    try:
        from apex.normalization import FusedLayerNorm

        return FusedLayerNorm(hidden_size, elementwise_affine=affine, eps=eps)
    except ImportError as err:
        # Chain the original error so the underlying import failure stays visible.
        raise RuntimeError("FusedLayerNorm not available. Please install apex.") from err


def modulate(norm_func, x, shift, scale):
    """Normalize ``x`` in float32, then apply a per-sample scale and shift.

    The intended shapes, per the original note, are x: (B, N, D),
    shift: (B, D), scale: (B, D); shift and scale get a singleton axis
    inserted at dim 1 so they broadcast across it.

    Args:
        norm_func: callable applied to the float32 copy of ``x``.
        x (torch.Tensor): input tensor; the result is returned in its dtype.
        shift (torch.Tensor): additive term.
        scale (torch.Tensor): multiplicative term, applied as ``1 + scale``.

    Returns:
        torch.Tensor: modulated tensor with the same dtype as ``x``.
    """
    out_dtype = x.dtype
    normed = norm_func(x.to(torch.float32)).to(out_dtype)
    modulated = normed * (scale.unsqueeze(1) + 1) + shift.unsqueeze(1)
    # Mixed-precision shift/scale may have promoted the result; cast it back.
    return modulated.to(out_dtype)


def t2i_modulate(x, shift, scale):
    """Return ``x * (1 + scale) + shift`` using ordinary broadcasting."""
    gain = 1 + scale
    return x * gain + shift


# ===============================================
# General-purpose Layers
# ===============================================


class PatchEmbed3D(nn.Module):
    """Video to Patch Embedding.

    Zero-pads the input so each of its last three axes is a multiple of the
    patch size, then projects non-overlapping 3D patches with a strided
    ``nn.Conv3d``.

    Args:
        patch_size (int): Patch token size. Default: (2,4,4).
        in_chans (int): Number of input video channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
        flatten (bool): If true, return tokens as (B, N, C) instead of the
            5-D projection output. Default: True
    """

    def __init__(
        self,
        patch_size=(2, 4, 4),
        in_chans=3,
        embed_dim=96,
        norm_layer=None,
        flatten=True,
    ):
        super().__init__()
        self.patch_size = patch_size
        self.flatten = flatten

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = None if norm_layer is None else norm_layer(embed_dim)

    def forward(self, x):
        """Forward function."""
        _, _, D, H, W = x.size()

        # F.pad consumes (left, right) pairs starting from the last axis, so
        # padding the end of patch axis ``axis`` (0=D, 1=H, 2=W) needs
        # 2 * (2 - axis) + 1 leading zeros before the pad amount.
        # Order matches the original: W first, then H, then D.
        for axis, size in ((2, W), (1, H), (0, D)):
            patch = self.patch_size[axis]
            remainder = size % patch
            if remainder != 0:
                pad_spec = [0] * (2 * (2 - axis) + 1) + [patch - remainder]
                x = F.pad(x, pad_spec)

        x = self.proj(x)  # (B C T H W)
        if self.norm is not None:
            # Normalize over channels in token layout, then restore the 5-D grid.
            D, Wh, Ww = x.shape[2:5]
            tokens = self.norm(x.flatten(2).transpose(1, 2))
            x = tokens.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCTHW -> BNC
        return x


class Attention(nn.Module):
    """Multi-head self-attention with optional q/k normalization and rotary embedding.

    NOTE: ``enable_flash_attn`` is accepted but not honoured; the instance
    attribute is always set to ``False``, so the explicit softmax path in
    ``forward`` is the one that runs unless the attribute is changed afterwards.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = LlamaRMSNorm,
        enable_flash_attn: bool = False,
        rope=None,
        qk_norm_legacy: bool = False,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        # The constructor argument is deliberately not used here.
        self.enable_flash_attn = False

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.qk_norm_legacy = qk_norm_legacy
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        # ``self.rope`` is a flag; the rotary module itself lives in ``rotary_emb``.
        self.rope = rope is not None
        if self.rope:
            self.rotary_emb = rope

        self.is_causal = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply self-attention to ``x`` of shape (B, N, C) and return the same shape."""
        B, N, C = x.shape
        # flash attn is not memory efficient for small sequences, this is empirical
        use_flash = self.enable_flash_attn and (N > B)

        # (B, N, 3*C) -> (3, B, heads, N, head_dim)
        packed = self.qkv(x).view((B, N, 3, self.num_heads, self.head_dim))
        q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)

        if self.qk_norm_legacy:
            # WARNING: this may be a bug (legacy order: rotary first, then norm)
            if self.rope:
                q = self.rotary_emb(q)
                k = self.rotary_emb(k)
            q, k = self.q_norm(q), self.k_norm(k)
        else:
            q, k = self.q_norm(q), self.k_norm(k)
            if self.rope:
                q = self.rotary_emb(q)
                k = self.rotary_emb(k)

        if use_flash:
            from flash_attn import flash_attn_func

            # (B, #heads, N, #dim) -> (B, N, #heads, #dim)
            q = q.permute(0, 2, 1, 3)
            k = k.permute(0, 2, 1, 3)
            v = v.permute(0, 2, 1, 3)
            x = flash_attn_func(
                q,
                k,
                v,
                dropout_p=self.attn_drop.p if self.training else 0.0,
                softmax_scale=self.scale,
                causal=self.is_causal,
            )
        else:
            dtype = q.dtype
            q = q * self.scale
            # Scores are softmaxed in float32 and cast back afterwards.
            attn = torch.matmul(q, k.transpose(-2, -1))
            attn = attn.to(torch.float32)
            if self.is_causal:
                causal_mask = torch.tril(torch.ones_like(attn), diagonal=0)
                causal_mask = torch.where(causal_mask.bool(), 0, float('-inf'))
                attn += causal_mask
            attn = attn.softmax(dim=-1)
            attn = attn.to(dtype)  # cast back attn to original dtype
            attn = self.attn_drop(attn)
            x = torch.matmul(attn, v)
            # (B, #heads, N, #dim) -> (B, N, #heads, #dim)
            x = x.transpose(1, 2)

        x = x.reshape((B, N, C))
        x = self.proj(x)
        return self.proj_drop(x)


class KVCompressAttention(nn.Module):
    """Multi-head self-attention with optional spatial down-sampling of keys/values.

    When ``sr_ratio > 1`` the keys and values are reduced with
    ``downsample_2d`` before attention, so queries attend over fewer tokens.
    Three attention back-ends are available: flash-attn, xformers
    memory-efficient attention, and an explicit softmax implementation.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = LlamaRMSNorm,
        enable_flash_attn: bool = False,
        sampling="conv",
        sr_ratio=1,
        mem_eff_attention=False,
        attn_half=False,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        self.enable_flash_attn = enable_flash_attn

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)

        self.sr_ratio = sr_ratio
        self.sampling = sampling
        if sr_ratio > 1 and sampling == "conv":
            # Avg Conv Init.
            self.sr = nn.Conv2d(dim, dim, groups=dim, kernel_size=sr_ratio, stride=sr_ratio)
            self.sr.weight.data.fill_(1 / sr_ratio**2)
            self.sr.bias.data.zero_()
            self.norm = nn.LayerNorm(dim)

        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.mem_eff_attention = mem_eff_attention
        self.attn_half = attn_half

    def downsample_2d(self, tensor, H, W, scale_factor, sampling=None):
        """Reduce a (B, N, C) token tensor by ``scale_factor``.

        Supported ``sampling`` values: ``"uniform_every"`` (stride over the
        token axis), ``"ave"`` (``F.interpolate`` with ``mode="nearest"``),
        ``"uniform"`` (strided pick on the H x W grid) and ``"conv"`` (the
        ``self.sr`` convolution followed by ``self.norm``).

        Returns:
            ``(tensor, new_N)`` for the sampling modes above.
            NOTE: when ``sampling is None`` or ``scale_factor == 1`` only the
            unchanged tensor is returned (no tuple); callers must account for
            that difference.

        Raises:
            ValueError: for an unrecognised ``sampling`` value.
        """
        if sampling is None or scale_factor == 1:
            return tensor
        B, N, C = tensor.shape

        if sampling == "uniform_every":
            return tensor[:, ::scale_factor], int(N // scale_factor)

        # (B, N, C) -> (B, C, H, W) so 2-D operators can be applied.
        tensor = tensor.reshape(B, H, W, C).permute(0, 3, 1, 2)
        new_H, new_W = int(H / scale_factor), int(W / scale_factor)
        new_N = new_H * new_W

        if sampling == "ave":
            tensor = F.interpolate(tensor, scale_factor=1 / scale_factor, mode="nearest").permute(0, 2, 3, 1)
        elif sampling == "uniform":
            tensor = tensor[:, :, ::scale_factor, ::scale_factor].permute(0, 2, 3, 1)
        elif sampling == "conv":
            tensor = self.sr(tensor).reshape(B, C, -1).permute(0, 2, 1)
            tensor = self.norm(tensor)
        else:
            raise ValueError

        return tensor.reshape(B, new_N, C).contiguous(), new_N

    def forward(self, x: torch.Tensor, mask=None, HW=None, block_id=None, **kwargs) -> torch.Tensor:
        """Apply attention to ``x`` of shape (B, N, C) and return the same shape.

        Args:
            x: input tokens, (B, N, C).
            mask: optional mask, only used by the xformers branch.
            HW: ``(H, W)`` token-grid size; only required when ``sr_ratio > 1``.
            block_id: accepted but not used by this method.
            **kwargs: accepted but not used by this method.
        """
        B, N, C = x.shape
        new_N = N
        # flash attn is not memory efficient for small sequences, this is empirical
        enable_flash_attn = self.enable_flash_attn and (N > B)

        qkv = self.qkv(x).reshape(B, N, 3, C)
        q, k, v = qkv.unbind(2)
        dtype = q.dtype
        # KV compression
        if self.sr_ratio > 1:
            # The grid size is only needed here, so the default HW=None is
            # valid whenever no compression is configured.
            H, W = HW
            k, new_N = self.downsample_2d(k, H, W, self.sr_ratio, sampling=self.sampling)
            v, new_N = self.downsample_2d(v, H, W, self.sr_ratio, sampling=self.sampling)

        # All three tensors are in (B, tokens, #heads, #dim) layout from here.
        q = q.reshape(B, N, self.num_heads, C // self.num_heads).to(dtype)
        k = k.reshape(B, new_N, self.num_heads, C // self.num_heads).to(dtype)
        v = v.reshape(B, new_N, self.num_heads, C // self.num_heads).to(dtype)

        q, k = self.q_norm(q), self.k_norm(k)

        if enable_flash_attn:
            from flash_attn import flash_attn_func

            # Output stays in (B, N, #heads, #dim).
            x = flash_attn_func(
                q,
                k,
                v,
                dropout_p=self.attn_drop.p if self.training else 0.0,
                softmax_scale=self.scale,
            )

        elif self.mem_eff_attention:
            attn_bias = None
            if mask is not None:
                attn_bias = torch.zeros([B * self.num_heads, q.shape[1], k.shape[1]], dtype=q.dtype, device=q.device)
                attn_bias.masked_fill_(mask.squeeze(1).repeat(self.num_heads, 1, 1) == 0, float("-inf"))
            # xformers takes and returns (B, N, #heads, #dim), so no transpose
            # is needed afterwards. The previous shared transpose scrambled
            # tokens and heads on this branch.
            x = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)
        else:
            # (B, N, #heads, #dim) -> (B, #heads, N, #dim)
            q = q.permute(0, 2, 1, 3)
            k = k.permute(0, 2, 1, 3)
            v = v.permute(0, 2, 1, 3)
            dtype = q.dtype
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)  # translate attn to float32
            if not self.attn_half:
                attn = attn.to(torch.float32)
            attn = attn.softmax(dim=-1)
            attn = attn.to(dtype)  # cast back attn to original dtype
            attn = self.attn_drop(attn)
            x = attn @ v
            # (B, #heads, N, #dim) -> (B, N, #heads, #dim)
            x = x.transpose(1, 2)

        # Every branch now yields (B, N, #heads, #dim); merge the heads.
        x_output_shape = (B, N, C)
        x = x.reshape(x_output_shape)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class SeqParallelAttention(Attention):
    """
    Self-attention for sequence-parallel execution.

    The input of `forward` holds only a local slice of the token sequence. An all-to-all exchange
    gathers the sequence dimension and scatters the head dimension before attention is computed,
    and a second all-to-all restores the original layout afterwards.

    Rotary position embeddings are rejected by the constructor (`rope` must be None).
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = LlamaRMSNorm,
        enable_flash_attn: bool = False,
        rope=None,
    ) -> None:
        # `rope` exists only for signature compatibility; it is never forwarded to the parent.
        assert rope is None, "Rope is not supported in SeqParallelAttention"
        super().__init__(
            dim=dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            norm_layer=norm_layer,
            enable_flash_attn=enable_flash_attn,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply self-attention to a local sequence shard.

        :param x: tensor of shape [B, N, C], where N is the local (per-rank) sequence length.
        :return: tensor of shape [B, N, C] after attention, output projection and dropout.
        """
        B, N, C = x.shape  # for sequence parallel here, the N is a local sequence length
        qkv = self.qkv(x)
        qkv_shape = (B, N, 3, self.num_heads, self.head_dim)
        qkv = qkv.view(qkv_shape)

        sp_group = get_sequence_parallel_group()

        # apply all_to_all to gather sequence and split attention heads
        # [B, SUB_N, 3, NUM_HEAD, HEAD_DIM] -> [B, N, 3, NUM_HEAD_PER_DEVICE, HEAD_DIM]
        qkv = all_to_all(qkv, sp_group, scatter_dim=3, gather_dim=1)

        # The two attention paths below expect different layouts: the flash-attention path keeps
        # the sequence axis before the head axis, the matmul path puts heads first.
        if self.enable_flash_attn:
            qkv_permute_shape = (
                2,
                0,
                1,
                3,
                4,
            )  # [3, B, N, NUM_HEAD_PER_DEVICE, HEAD_DIM]
        else:
            qkv_permute_shape = (
                2,
                0,
                3,
                1,
                4,
            )  # [3, B, NUM_HEAD_PER_DEVICE, N, HEAD_DIM]
        qkv = qkv.permute(qkv_permute_shape)

        # NOTE(review): the original authors flagged this spot with "ERROR: Should qk_norm first".
        # As written, q/k are normalized only after the all-to-all and the permute above.
        q, k, v = qkv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)
        if self.enable_flash_attn:
            from flash_attn import flash_attn_func

            # Dropout is only active in training mode on this path.
            x = flash_attn_func(
                q,
                k,
                v,
                dropout_p=self.attn_drop.p if self.training else 0.0,
                softmax_scale=self.scale,
            )
        else:
            dtype = q.dtype
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            attn = attn.to(torch.float32)  # softmax is evaluated in float32
            attn = attn.softmax(dim=-1)
            attn = attn.to(dtype)  # cast back attn to original dtype
            attn = self.attn_drop(attn)
            x = attn @ v

        # The matmul path produced [B, heads, N, head_dim]; swap to [B, N, heads, head_dim]
        # so both paths share the same layout for the second all-to-all.
        if not self.enable_flash_attn:
            x = x.transpose(1, 2)

        # apply all to all to gather back attention heads and split sequence
        # [B, N, NUM_HEAD_PER_DEVICE, HEAD_DIM]  -> [B, SUB_N, NUM_HEAD, HEAD_DIM]
        x = all_to_all(x, sp_group, scatter_dim=1, gather_dim=2)

        # reshape outputs back to [B, N, C]
        x_output_shape = (B, N, C)
        x = x.reshape(x_output_shape)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class MultiHeadCrossAttention(nn.Module):
    """
    Multi-head cross-attention whose queries come from `x` and whose keys/values come from `cond`.

    Besides the attended tokens, `forward` also returns the attention map produced by
    `cached_attention_forward`.
    """

    def __init__(self, d_model, num_heads, attn_drop=0.0, proj_drop=0.0):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        # Sub-modules are created in this exact order so that parameter
        # initialization and state_dict layout stay unchanged.
        self.q_linear = nn.Linear(d_model, d_model)
        self.kv_linear = nn.Linear(d_model, d_model * 2)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(d_model, d_model)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, cond, mask=None):
        """
        :param x: query tokens of shape [B, N, C].
        :param cond: condition tokens; projected to keys and values.
        :param mask: optional; when given it is passed as the second argument of
            `BlockDiagonalMask.from_seqlens`, next to B query lengths of N each.
        :return: tuple `(out, cross_attn_map)` with `out` viewed as [B, -1, C] and projected,
            and `cross_attn_map` viewed as [B, -1, last_dim].
        """
        B, N, C = x.shape
        per_head = (self.num_heads, self.head_dim)

        # Batch and sequence are packed into one axis (leading dim of 1) for the attention call.
        q = self.q_linear(x).view(1, -1, *per_head)
        k, v = self.kv_linear(cond).view(1, -1, 2, *per_head).unbind(2)

        if mask is None:
            attn_bias = None
        else:
            attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([N] * B, mask)

        attended, attn_map = cached_attention_forward(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)

        attended = attended.view(B, -1, C)
        attn_map = attn_map.view(B, -1, attn_map.shape[-1])
        attended = self.proj_drop(self.proj(attended))
        return attended, attn_map


class SeqParallelMultiHeadCrossAttention(MultiHeadCrossAttention):
    """
    Cross-attention for sequence-parallel execution.

    Queries are exchanged with an all-to-all (sequence gathered, heads scattered), keys/values
    are split along the head axis, and the attention output is exchanged back afterwards.
    Unlike `MultiHeadCrossAttention.forward`, this `forward` returns only the output tensor.
    """

    def __init__(
        self,
        d_model,
        num_heads,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        super().__init__(d_model=d_model, num_heads=num_heads, attn_drop=attn_drop, proj_drop=proj_drop)

    def forward(self, x, cond, mask=None):
        """
        :param x: local query tokens of shape [B, SUB_N, C].
        :param cond: condition tokens; projected to keys and values.
        :param mask: optional; when given it is passed as the second argument of
            `BlockDiagonalMask.from_seqlens`, next to B query lengths of SUB_N * sp_size each.
        :return: tensor viewed as [B, -1, C] after output projection and dropout.
        """
        sp_group = get_sequence_parallel_group()
        sp_size = dist.get_world_size(sp_group)
        B, SUB_N, C = x.shape  # [B, TS/p, C]
        full_len = SUB_N * sp_size
        local_heads = self.num_heads // sp_size

        # q: [B, SUB_N, NUM_HEADS, HEAD_DIM]; kv is packed with a leading dim of 1
        q = self.q_linear(x).view(B, -1, self.num_heads, self.head_dim)
        kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)
        kv = split_forward_gather_backward(kv, get_sequence_parallel_group(), dim=3, grad_scale="down")
        k, v = kv.unbind(2)

        # gather the sequence and split the attention heads for the queries
        q = all_to_all(q, sp_group, scatter_dim=2, gather_dim=1)

        packed = (1, -1, local_heads, self.head_dim)
        q = q.view(packed)
        k = k.view(packed)
        v = v.view(packed)

        # compute attention
        if mask is None:
            attn_bias = None
        else:
            attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([full_len] * B, mask)
        out = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)

        # gather the attention heads back and scatter the sequence again
        out = out.view(B, -1, local_heads, self.head_dim)
        out = all_to_all(out, sp_group, scatter_dim=1, gather_dim=2)

        # output projection
        out = out.view(B, -1, C)
        return self.proj_drop(self.proj(out))


class FinalLayer(nn.Module):
    """
    The final layer of DiT: a non-affine LayerNorm modulated by a conditioning vector,
    followed by a linear projection to `num_patch * out_channels` features.
    """

    def __init__(self, hidden_size, num_patch, out_channels):
        super().__init__()
        out_features = num_patch * out_channels
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, out_features, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True),
        )

    def forward(self, x, c):
        # The conditioning vector yields a shift half and a scale half along dim 1.
        modulation = self.adaLN_modulation(c)
        shift, scale = modulation.chunk(2, dim=1)
        normed = modulate(self.norm_final, x, shift, scale)
        return self.linear(normed)


class T2IFinalLayer(nn.Module):
    """
    The final layer of PixArt: non-affine LayerNorm, modulation from a learned
    scale/shift table plus the timestep embedding, then a linear projection.
    """

    def __init__(self, hidden_size, num_patch, out_channels, d_t=None, d_s=None):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, num_patch * out_channels, bias=True)
        self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size**0.5)
        self.out_channels = out_channels
        # Default temporal / spatial sizes used by forward() when T / S are not supplied.
        self.d_t = d_t
        self.d_s = d_s

    def t_mask_select(self, x_mask, x, masked_x, T, S):
        """
        Pick `x` where `x_mask` is true and `masked_x` elsewhere, per (batch, frame).

        x, masked_x: [B, (T, S), C]
        x_mask: [B, T]
        """
        split = "B (T S) C -> B T S C"
        x = rearrange(x, split, T=T, S=S)
        masked_x = rearrange(masked_x, split, T=T, S=S)
        chosen = torch.where(x_mask[:, :, None, None], x, masked_x)
        return rearrange(chosen, "B T S C -> B (T S) C")

    def forward(self, x, t, x_mask=None, t0=None, T=None, S=None):
        T = self.d_t if T is None else T
        S = self.d_s if S is None else S

        table = self.scale_shift_table[None]
        shift, scale = (table + t[:, None]).chunk(2, dim=1)
        x = t2i_modulate(self.norm_final(x), shift, scale)

        if x_mask is not None:
            shift_zero, scale_zero = (table + t0[:, None]).chunk(2, dim=1)
            # NOTE(review): `x` was already normalized and modulated above, so this branch
            # normalizes the modulated tensor a second time. Kept as-is to preserve outputs.
            x_zero = t2i_modulate(self.norm_final(x), shift_zero, scale_zero)
            x = self.t_mask_select(x_mask, x, x_zero, T, S)

        return self.linear(x)


# ===============================================
# Embedding Layers for Timesteps and Class Labels
# ===============================================


class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations:
    sinusoidal features followed by a two-layer MLP with SiLU.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        layers = [
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        ]
        self.mlp = nn.Sequential(*layers)
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                          These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings, laid out as
                 [cos | sin], plus one trailing zero column when dim is odd.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        exponent = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
        # Frequencies are computed on the default device first, then moved next to `t`.
        freqs = torch.exp(exponent).to(device=t.device)
        args = t[:, None].float() * freqs[None]

        pieces = [torch.cos(args), torch.sin(args)]
        if dim % 2:
            # Odd output width: pad with a zero column.
            pieces.append(torch.zeros_like(args[:, :1]))
        return torch.cat(pieces, dim=-1)

    def forward(self, t, dtype):
        # `.to(dtype)` is a no-op when the dtype already matches.
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(dtype)
        return self.mlp(t_freq)


class LabelEmbedder(nn.Module):
    """
    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.

    When `dropout_prob > 0` the embedding table gets one extra row at index `num_classes`;
    dropped labels are replaced by that index.
    """

    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        use_cfg_embedding = dropout_prob > 0
        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """
        Drops labels to enable classifier-free guidance.

        :param labels: 1-D tensor of label indices.
        :param force_drop_ids: optional tensor; entries equal to 1 mark labels to drop.
            When None, each label is dropped with probability `dropout_prob`.
        :return: labels with dropped entries replaced by `num_classes`.
        """
        if force_drop_ids is None:
            # Sample on the default device as before, then move the draw next to `labels`
            # instead of unconditionally calling .cuda(); this keeps CPU runs working and
            # avoids a device mismatch when `labels` is not on the current CUDA device.
            drop_ids = torch.rand(labels.shape[0]).to(labels.device) < self.dropout_prob
        else:
            drop_ids = force_drop_ids == 1
        labels = torch.where(drop_ids, self.num_classes, labels)
        return labels

    def forward(self, labels, train, force_drop_ids=None):
        """
        :param labels: 1-D tensor of label indices.
        :param train: whether random label dropout may be applied.
        :param force_drop_ids: optional explicit drop mask; applied regardless of `train`.
        :return: embeddings looked up from the table for the (possibly dropped) labels.
        """
        use_dropout = self.dropout_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            labels = self.token_drop(labels, force_drop_ids)
        return self.embedding_table(labels)


class SizeEmbedder(TimestepEmbedder):
    """
    Embeds one or more scalar values per sample (e.g. size information) into a vector by
    reusing the sinusoidal + MLP scheme of TimestepEmbedder and concatenating the
    per-scalar embeddings.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__(hidden_size=hidden_size, frequency_embedding_size=frequency_embedding_size)
        # The MLP is re-created here (the parent constructor already built one of the
        # same shape); kept so construction behaves exactly as before.
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size
        self.outdim = hidden_size

    @property
    def dtype(self):
        # dtype of the first registered parameter
        return next(self.parameters()).dtype

    def forward(self, s, bs):
        """
        :param s: tensor of shape [b] or [b, d] holding d scalars per sample.
        :param bs: target batch size; `s` is tiled along dim 0 when its batch differs.
        :return: tensor of shape [bs, d * outdim].
        """
        if s.ndim == 1:
            s = s[:, None]
        assert s.ndim == 2

        rows = s.shape[0]
        if rows != bs:
            s = s.repeat(bs // rows, 1)
            assert s.shape[0] == bs

        b, dims = s.shape[0], s.shape[1]
        # Embed every scalar independently, then regroup per sample.
        flat = rearrange(s, "b d -> (b d)")
        freq = self.timestep_embedding(flat, self.frequency_embedding_size).to(self.dtype)
        emb = self.mlp(freq)
        return rearrange(emb, "(b d) d2 -> b (d d2)", b=b, d=dims, d2=self.outdim)


class CaptionEmbedder(nn.Module):
    """
    Projects caption features with an MLP. Also handles caption dropout for classifier-free
    guidance: dropped captions are replaced by the `y_embedding` buffer.
    """

    def __init__(
        self,
        in_channels,
        hidden_size,
        uncond_prob,
        act_layer=nn.GELU(approximate="tanh"),
        token_num=120,
    ):
        super().__init__()
        self.y_proj = Mlp(
            in_features=in_channels,
            hidden_features=hidden_size,
            out_features=hidden_size,
            act_layer=act_layer,
            drop=0,
        )
        # Replacement embedding used for dropped captions, shape [token_num, in_channels].
        self.register_buffer(
            "y_embedding",
            torch.randn(token_num, in_channels) / in_channels**0.5,
        )
        self.uncond_prob = uncond_prob

    def token_drop(self, caption, force_drop_ids=None):
        """
        Drops captions to enable classifier-free guidance.

        :param caption: 4-D tensor whose trailing two dims match `y_embedding`.
        :param force_drop_ids: optional tensor; entries equal to 1 mark samples to drop.
            When None, each sample is dropped with probability `uncond_prob`.
        :return: caption with dropped samples replaced by `y_embedding`.
        """
        if force_drop_ids is None:
            # Sample on the default device as before, then move the draw next to `caption`
            # instead of unconditionally calling .cuda(); this keeps CPU runs working and
            # avoids a device mismatch when `caption` is not on the current CUDA device.
            drop_ids = torch.rand(caption.shape[0]).to(caption.device) < self.uncond_prob
        else:
            drop_ids = force_drop_ids == 1
        # Broadcast the per-sample decision over the remaining three dims.
        caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
        return caption

    def forward(self, caption, train, force_drop_ids=None):
        """
        :param caption: caption features; in training the trailing dims must equal
            the shape of `y_embedding`.
        :param train: whether random caption dropout may be applied.
        :param force_drop_ids: optional explicit drop mask; applied regardless of `train`.
        :return: projected caption features.
        """
        if train:
            assert caption.shape[2:] == self.y_embedding.shape
        use_dropout = self.uncond_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            caption = self.token_drop(caption, force_drop_ids)
        caption = self.y_proj(caption)
        return caption


class PositionEmbedding2D(nn.Module):
    """
    2-D sinusoidal position embedding computed on the fly and memoized per
    (device, dtype, h, w, scale, base_size).
    """

    def __init__(self, dim: int) -> None:
        super().__init__()
        self.dim = dim
        assert dim % 4 == 0, "dim must be divisible by 4"
        half_dim = dim // 2
        exponents = torch.arange(0, half_dim, 2).float() / half_dim
        # Non-persistent: the frequencies are not written to the state_dict.
        self.register_buffer("inv_freq", 1.0 / (10000**exponents), persistent=False)

    def _get_sin_cos_emb(self, t: torch.Tensor):
        # Outer product of positions and inverse frequencies, then [sin | cos].
        angles = torch.einsum("i,d->id", t, self.inv_freq)
        return torch.cat((torch.sin(angles), torch.cos(angles)), dim=-1)

    @functools.lru_cache(maxsize=512)
    def _get_cached_emb(
        self,
        device: torch.device,
        dtype: torch.dtype,
        h: int,
        w: int,
        scale: float = 1.0,
        base_size: Optional[int] = None,
    ):
        coords_h = torch.arange(h, device=device) / scale
        coords_w = torch.arange(w, device=device) / scale
        if base_size is not None:
            coords_h *= base_size / h
            coords_w *= base_size / w

        # here w goes first: the first meshgrid output varies with the w coordinates.
        first, second = torch.meshgrid(coords_w, coords_h, indexing="ij")
        emb_first = self._get_sin_cos_emb(first.t().reshape(-1))
        emb_second = self._get_sin_cos_emb(second.t().reshape(-1))
        return torch.concat([emb_first, emb_second], dim=-1).unsqueeze(0).to(dtype)

    def forward(
        self,
        x: torch.Tensor,
        h: int,
        w: int,
        scale: Optional[float] = 1.0,
        base_size: Optional[int] = None,
    ) -> torch.Tensor:
        # Only the device and dtype of `x` are used; its values are ignored.
        return self._get_cached_emb(x.device, x.dtype, h, w, scale, base_size)


# ===============================================
# Sine/Cosine Positional Embedding Functions
# ===============================================
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py


def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, scale=1.0, base_size=None):
    """
    Build a fixed 2-D sine/cosine position embedding.

    grid_size: a tuple (height, width); any non-tuple value is used for both sides
    scale: divisor applied to the grid coordinates
    base_size: if given, coordinates are additionally multiplied by base_size / side
    return:
    pos_embed: [grid_h*grid_w, embed_dim], or [extra_tokens+grid_h*grid_w, embed_dim]
               when cls_token is set and extra_tokens > 0 (zero rows are prepended)
    """
    if not isinstance(grid_size, tuple):
        grid_size = (grid_size, grid_size)
    n_rows, n_cols = grid_size[0], grid_size[1]

    grid_h = np.arange(n_rows, dtype=np.float32) / scale
    grid_w = np.arange(n_cols, dtype=np.float32) / scale
    if base_size is not None:
        grid_h *= base_size / n_rows
        grid_w *= base_size / n_cols

    # here w goes first
    grid = np.stack(np.meshgrid(grid_w, grid_h), axis=0)
    grid = grid.reshape([2, 1, n_cols, n_rows])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token and extra_tokens > 0:
        leading = np.zeros([extra_tokens, embed_dim])
        pos_embed = np.concatenate([leading, pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """
    Encode a stacked coordinate grid: half of the channels come from grid[0],
    the other half from grid[1].

    return: (H*W, D)
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2

    # each entry is (H*W, D/2); grid[0] is encoded first, then grid[1]
    per_axis = [get_1d_sincos_pos_embed_from_grid(half_dim, axis_grid) for axis_grid in (grid[0], grid[1])]
    return np.concatenate(per_axis, axis=1)  # (H*W, D)


def get_1d_sincos_pos_embed(embed_dim, length, scale=1.0):
    """
    Sine/cosine embedding for positions 0..length-1, each divided by `scale`.

    return: (length, embed_dim)
    """
    positions = np.arange(0, length)[:, None] / scale  # column vector (length, 1)
    return get_1d_sincos_pos_embed_from_grid(embed_dim, positions)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position (must be even)
    pos: array of positions to be encoded; it is flattened to size (M,)
    out: (M, D), laid out as [sin | cos]
    """
    assert embed_dim % 2 == 0

    # frequencies 1 / 10000^(i / (D/2)) for i in [0, D/2), in float64
    exponents = np.arange(embed_dim // 2, dtype=np.float64) / (embed_dim / 2.0)
    omega = 1.0 / 10000**exponents  # (D/2,)

    angles = np.outer(pos.reshape(-1), omega)  # (M, D/2)
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)


================================================
FILE: Open-Sora/build/lib/opensora/models/pixart/__init__.py
================================================
from .pixart import PixArt, PixArt_1B_2, PixArt_XL_2
from .pixart_sigma import PixArt_Sigma_XL_2


================================================
FILE: Open-Sora/build/lib/opensora/models/pixart/pixart.py
================================================
# Adapted from PixArt
#
# Copyright (C) 2023  PixArt-alpha/PixArt-alpha
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# PixArt: https://github.com/PixArt-alpha/PixArt-alpha
# DiT:    https://github.com/facebookresearch/DiT/tree/main
# --------------------------------------------------------

import numpy as np
import torch
import torch.nn as nn
from einops import rearrange
from timm.models.layers import DropPath
from timm.models.vision_transformer import Mlp

# from .builder import MODELS
from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.models.layers.blocks import (
    Attention,
    CaptionEmbedder,
    MultiHeadCrossAttention,
    PatchEmbed3D,
    SeqParallelAttention,
    SeqParallelMultiHeadCrossAttention,
    SizeEmbedder,
    T2IFinalLayer,
    TimestepEmbedder,
    approx_gelu,
    get_1d_sincos_pos_embed,
    get_2d_sincos_pos_embed,
    get_layernorm,
    t2i_modulate,
)
from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint


class PixArtBlock(nn.Module):
    """
    A PixArt block with adaptive layer norm (adaLN-single) conditioning.

    The block applies, in order: modulated self-attention, cross-attention to the
    condition tokens, and a modulated MLP, each added residually to the input.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self._enable_sequence_parallelism = enable_sequence_parallelism

        # Pick the attention implementations matching the parallelism mode.
        if enable_sequence_parallelism:
            self.attn_cls = SeqParallelAttention
            self.mha_cls = SeqParallelMultiHeadCrossAttention
        else:
            self.attn_cls = Attention
            self.mha_cls = MultiHeadCrossAttention

        self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.attn = self.attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
        )
        self.cross_attn = self.mha_cls(hidden_size, num_heads)
        self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.mlp = Mlp(
            in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        # Learned per-block offsets for the six modulation vectors (shift/scale/gate x 2).
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

    def forward(self, x, y, t, mask=None):
        """
        :param x: tokens of shape [B, N, C].
        :param y: condition tokens handed to the cross-attention module.
        :param t: conditioning tensor, reshaped to [B, 6, -1] and added to `scale_shift_table`.
        :param mask: optional, forwarded unchanged to the cross-attention module.
        :return: tensor of shape [B, N, C].
        """
        B, N, C = x.shape

        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + t.reshape(B, 6, -1)
        ).chunk(6, dim=1)
        x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C))

        # MultiHeadCrossAttention.forward returns `(output, cross_attn_map)` while the
        # sequence-parallel variant returns a bare tensor. Adding a tuple to a tensor
        # raises TypeError, so only the attended output is used for the residual.
        cross_out = self.cross_attn(x, y, mask)
        if isinstance(cross_out, tuple):
            cross_out = cross_out[0]
        x = x + cross_out

        x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))

        return x


@MODELS.register_module()
class PixArt(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    The model patch-embeds the input with PatchEmbed3D, adds fixed sine/cosine spatial and
    (optionally) temporal position embeddings, runs `depth` PixArtBlock layers conditioned on
    the timestep embedding and caption tokens, and maps the tokens back with T2IFinalLayer
    followed by `unpatchify`.
    """

    def __init__(
        self,
        input_size=(1, 32, 32),
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        pred_sigma=True,
        drop_path: float = 0.0,
        no_temporal_pos_emb=False,
        caption_channels=4096,
        model_max_length=120,
        dtype=torch.float32,
        freeze=None,
        space_scale=1.0,
        time_scale=1.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
        base_size=None,
    ):
        super().__init__()
        # Sequence parallelism is rejected outright; the flag is not forwarded to the blocks.
        assert enable_sequence_parallelism is False, "Sequence parallelism is not supported in this version."
        self.pred_sigma = pred_sigma
        self.in_channels = in_channels
        # Output channels are doubled when pred_sigma is set.
        self.out_channels = in_channels * 2 if pred_sigma else in_channels
        self.hidden_size = hidden_size
        self.patch_size = patch_size
        self.input_size = input_size
        # Token counts: total, temporal (first axis), and spatial (total / temporal).
        num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)])
        self.num_patches = num_patches
        self.num_temporal = input_size[0] // patch_size[0]
        self.num_spatial = num_patches // self.num_temporal
        if base_size is None:
            self.base_size = int(np.sqrt(self.num_spatial))
        else:
            self.base_size = base_size // patch_size[1]
        self.num_heads = num_heads
        self.dtype = dtype
        self.no_temporal_pos_emb = no_temporal_pos_emb
        self.depth = depth
        self.mlp_ratio = mlp_ratio
        self.enable_flash_attn = enable_flash_attn
        self.enable_layernorm_kernel = enable_layernorm_kernel
        self.space_scale = space_scale
        self.time_scale = time_scale

        self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size)
        self.t_embedder = TimestepEmbedder(hidden_size)
        # Maps the timestep embedding to the 6 * hidden_size modulation input of the blocks.
        self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
        self.y_embedder = CaptionEmbedder(
            in_channels=caption_channels,
            hidden_size=hidden_size,
            uncond_prob=class_dropout_prob,
            act_layer=approx_gelu,
            token_num=model_max_length,
        )

        # Fixed (non-learned) position embeddings, stored as buffers.
        self.register_buffer("pos_embed", self.get_spatial_pos_embed())
        self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed())

        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList(
            [
                PixArtBlock(
                    hidden_size,
                    num_heads,
                    mlp_ratio=mlp_ratio,
                    drop_path=drop_path[i],
                    enable_flash_attn=enable_flash_attn,
                    enable_layernorm_kernel=enable_layernorm_kernel,
                )
                for i in range(depth)
            ]
        )
        self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels)

        self.initialize_weights()
        # The only supported freeze mode is "text", which freezes cross-attention parameters.
        if freeze is not None:
            assert freeze in ["text"]
            if freeze == "text":
                self.freeze_text()

    def forward(self, x, timestep, y, mask=None, **kwargs):
        """
        Forward pass of PixArt.
        x: tensor of latent inputs; it is patch-embedded by PatchEmbed3D and the result of
           unpatchify is (N, out_channels, T, H, W) -- presumably x is (N, C, T, H, W), TODO confirm
        timestep: (N,) tensor of diffusion timesteps
        y: caption embeddings, (N, 1, L, C) after embedding per the inline comment below
        mask: optional caption mask; when given, only tokens with a non-zero mask entry are kept
        kwargs: accepted and ignored
        returns: float32 tensor produced by unpatchify
        """
        dtype = self.x_embedder.proj.weight.dtype
        # NOTE(review): B is not referenced again in this method.
        B = x.size(0)
        x = x.to(dtype)
        timestep = timestep.to(dtype)
        y = y.to(dtype)

        # embedding
        x = self.x_embedder(x)  # (B, N, D)
        x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
        x = x + self.pos_embed
        if not self.no_temporal_pos_emb:
            # Move the temporal axis next to the channel axis so the temporal embedding broadcasts.
            x = rearrange(x, "b t s d -> b s t d")
            x = x + self.pos_embed_temporal
            x = rearrange(x, "b s t d -> b (t s) d")
        else:
            x = rearrange(x, "b t s d -> b (t s) d")

        t = self.t_embedder(timestep, dtype=x.dtype)  # (N, D)
        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # Keep only unmasked caption tokens, packed into a single sequence of shape (1, sum(y_lens), D).
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            # No mask: every sample contributes all of its caption tokens.
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])

        # blocks
        # y_lens is handed over in the position of the block's `mask` argument
        # (assuming auto_grad_checkpoint forwards positional arguments unchanged -- TODO confirm).
        for block in self.blocks:
            x = auto_grad_checkpoint(block, x, y, t0, y_lens)

        # final process
        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x

    def unpatchify(self, x):
        """
        Fold per-token patch values back into a dense tensor.

        x: tensor reshapeable to (N, t, h, w, pt, ph, pw, c), where t/h/w are the patch counts
           derived from input_size and patch_size
        returns: (N, c, t * pt, h * ph, w * pw)
        """
        c = self.out_channels
        t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        pt, ph, pw = self.patch_size

        x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
        x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
        return imgs

    def get_spatial_pos_embed(self, grid_size=None):
        """
        Build the fixed 2-D sine/cosine spatial position embedding.

        grid_size: (H, W) before patching; defaults to input_size[1:]
        returns: float tensor with a leading dimension of 1, not requiring grad
        """
        if grid_size is None:
            grid_size = self.input_size[1:]
        pos_embed = get_2d_sincos_pos_embed(
            self.hidden_size,
            (grid_size[0] // self.patch_size[1], grid_size[1] // self.patch_size[2]),
            scale=self.space_scale,
            base_size=self.base_size,
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def get_temporal_pos_embed(self):
        """
        Build the fixed 1-D sine/cosine temporal position embedding for
        input_size[0] // patch_size[0] positions.

        returns: float tensor with a leading dimension of 1, not requiring grad
        """
        pos_embed = get_1d_sincos_pos_embed(
            self.hidden_size,
            self.input_size[0] // self.patch_size[0],
            scale=self.time_scale,
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def freeze_text(self):
        """Disable gradients for every parameter whose name contains "cross_attn"."""
        for n, p in self.named_parameters():
            if "cross_attn" in n:
                p.requires_grad = False

    def initialize_weights(self):
        """Initialize all weights; later steps deliberately override the generic Linear init."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)

        # Initialize caption embedding MLP:
        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)

        # Zero-out the cross-attention output projection in every PixArt block:
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.proj.weight, 0)
            nn.init.constant_(block.cross_attn.proj.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)


@MODELS.register_module()
class PixArtMS(PixArt):
    """PixArt variant whose timestep conditioning also includes size and aspect-ratio embeddings.

    Two ``SizeEmbedder`` modules embed ``data_info["hw"]`` and ``data_info["ar"]``;
    ``forward`` concatenates their outputs and adds them to the timestep embedding.
    """

    def __init__(self, *args, **kwargs):
        # All constructor arguments are forwarded unchanged to PixArt.
        super().__init__(*args, **kwargs)

        assert self.hidden_size % 3 == 0, "hidden_size must be divisible by 3"
        # Both embedders are built with hidden_size // 3; presumably their concatenated
        # outputs total hidden_size so they can be added to the timestep embedding
        # -- TODO confirm against SizeEmbedder.
        self.csize_embedder = SizeEmbedder(self.hidden_size // 3)
        self.ar_embedder = SizeEmbedder(self.hidden_size // 3)

    def forward(self, x, timestep, y, mask=None, data_info=None):
        """
        Forward pass of PixArtMS.

        x: latent input; its last two dimensions are used as height and width
           when building the spatial position embedding
        timestep: diffusion timesteps, fed to the timestep embedder
        y: caption input passed to the caption embedder
           (annotated below as (N, 1, L, D) after embedding)
        mask: optional token mask; when given, only tokens with a non-zero mask
           entry are kept and packed into a single sequence
        data_info: mapping that must provide the keys "hw" and "ar". It is indexed
           unconditionally, so the default of None fails at the first lookup.

        Returns the unpatchified model output cast to float32.
        """
        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)

        c_size = data_info["hw"]
        ar = data_info["ar"]
        # The spatial table is rebuilt on every call from the actual input height/width.
        pos_embed = self.get_spatial_pos_embed((x.shape[-2], x.shape[-1])).to(x.dtype)

        # embedding
        x = self.x_embedder(x)  # (B, N, D)
        x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
        x = x + pos_embed.to(x.device)
        if not self.no_temporal_pos_emb:
            # Move the temporal axis next to the channel axis before adding the temporal table.
            x = rearrange(x, "b t s d -> b s t d")
            x = x + self.pos_embed_temporal
            x = rearrange(x, "b s t d -> b (t s) d")
        else:
            x = rearrange(x, "b t s d -> b (t s) d")

        t = self.t_embedder(timestep, dtype=x.dtype)  # (N, D)
        B = x.shape[0]
        csize = self.csize_embedder(c_size, B)
        ar = self.ar_embedder(ar, B)
        # Size and aspect-ratio embeddings are concatenated along dim 1 and added to t.
        t = t + torch.cat([csize, ar], dim=1)

        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                # Tile the mask along the batch axis to match y's batch size.
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # Keep only tokens with a non-zero mask entry, packed as one (1, total, D) sequence.
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            # Number of kept tokens per sample.
            y_lens = mask.sum(dim=1).tolist()
        else:
            # No mask: every sample contributes all of its tokens.
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])

        # blocks
        for block in self.blocks:
            x = block(x, y, t0, y_lens)

        # final process
        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x


@MODELS.register_module("PixArt-XL/2")
def PixArt_XL_2(from_pretrained=None, **kwargs):
    """Build the PixArt-XL/2 configuration; load a checkpoint when ``from_pretrained`` is given."""
    net = PixArt(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return net
    load_checkpoint(net, from_pretrained)
    return net


@MODELS.register_module("PixArt-1B/2")
def PixArt_1B_2(from_pretrained=None, **kwargs):
    """Build the PixArt-1B/2 configuration; load a checkpoint when ``from_pretrained`` is given."""
    net = PixArt(depth=28, hidden_size=1872, patch_size=(1, 2, 2), num_heads=26, **kwargs)
    if from_pretrained is None:
        return net
    load_checkpoint(net, from_pretrained)
    return net


@MODELS.register_module("PixArtMS-XL/2")
def PixArtMS_XL_2(from_pretrained=None, **kwargs):
    """Build the PixArtMS-XL/2 configuration; load a checkpoint when ``from_pretrained`` is given."""
    net = PixArtMS(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return net
    load_checkpoint(net, from_pretrained)
    return net


================================================
FILE: Open-Sora/build/lib/opensora/models/pixart/pixart_sigma.py
================================================
# Adapted from PixArt
#
# Copyright (C) 2023  PixArt-alpha/PixArt-alpha
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# PixArt: https://github.com/PixArt-alpha/PixArt-alpha
# DiT:    https://github.com/facebookresearch/DiT/tree/main
# --------------------------------------------------------

import numpy as np
import torch
import torch.nn as nn
from einops import rearrange
from timm.models.layers import DropPath
from timm.models.vision_transformer import Mlp

# from .builder import MODELS
from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.models.layers.blocks import (
    CaptionEmbedder,
    KVCompressAttention,
    MultiHeadCrossAttention,
    PatchEmbed3D,
    T2IFinalLayer,
    TimestepEmbedder,
    approx_gelu,
    get_1d_sincos_pos_embed,
    get_2d_sincos_pos_embed,
    get_layernorm,
    t2i_modulate,
)
from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint


class PixArtBlock(nn.Module):
    """
    A PixArt block with adaptive layer norm (adaLN-single) conditioning.

    Each block applies, in order: modulated self-attention (KVCompressAttention),
    cross-attention against the caption tokens, and a modulated MLP. All three
    are added residually to the input.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
        qk_norm=False,
        sampling="conv",
        sr_ratio=1,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self._enable_sequence_parallelism = enable_sequence_parallelism
        # Sequence parallelism is rejected outright; the flag exists only for signature parity.
        assert not enable_sequence_parallelism, "Sequence parallelism is not supported in this version."

        self.attn_cls = KVCompressAttention
        self.mha_cls = MultiHeadCrossAttention

        # Layer norms are created with affine=False; scale/shift come from the modulation in forward().
        self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.attn = self.attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
            qk_norm=qk_norm,
            sr_ratio=sr_ratio,
            sampling=sampling,
            attn_half=True,
        )
        self.cross_attn = self.mha_cls(hidden_size, num_heads)
        self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.mlp = Mlp(
            in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0
        )
        # Stochastic depth is only instantiated when the rate is positive.
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        # Learned per-block offset added to the six modulation vectors derived from t.
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)
        self.sampling = sampling
        self.sr_ratio = sr_ratio

    def forward(self, x, y, t, hw, mask=None):
        """Run one block.

        x: token sequence, unpacked as (B, N, C)
        y: caption tokens handed to cross-attention
        t: conditioning tensor, reshaped to (B, 6, -1) to form the six modulation vectors
        hw: passed to self-attention as ``HW``
        mask: passed through to cross-attention as its third argument
        """
        B, N, C = x.shape

        # Six modulation tensors: shift/scale/gate for attention, then shift/scale/gate for the MLP.
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + t.reshape(B, 6, -1)
        ).chunk(6, dim=1)
        # Gated, modulated self-attention residual.
        x = x + self.drop_path(
            gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa), HW=hw).reshape(B, N, C)
        )
        # Cross-attention residual (no gating, no drop-path).
        x = x + self.cross_attn(x, y, mask)
        # Gated, modulated MLP residual.
        x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))

        return x


@MODELS.register_module()
class PixArt_Sigma(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    Built from a 3-D patch embedder, timestep and caption embedders, a stack of
    ``PixArtBlock`` layers (optionally with key/value compression on selected
    layers via ``kv_compress_config``), and a final projection layer.
    """

    def __init__(
        self,
        input_size=(1, 32, 32),
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        pred_sigma=True,
        drop_path: float = 0.0,
        no_temporal_pos_emb=False,
        caption_channels=4096,
        model_max_length=120,
        dtype=torch.float32,
        freeze=None,
        qk_norm=False,
        space_scale=1.0,
        time_scale=1.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
        kv_compress_config=None,
    ):
        super().__init__()
        assert enable_sequence_parallelism is False, "Sequence parallelism is not supported in this version."
        self.pred_sigma = pred_sigma
        self.in_channels = in_channels
        # The output channel count doubles when pred_sigma is set.
        self.out_channels = in_channels * 2 if pred_sigma else in_channels
        self.hidden_size = hidden_size
        self.patch_size = patch_size
        self.input_size = input_size
        # Patch counts are derived from the three-axis input_size and patch_size.
        num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)])
        self.num_patches = num_patches
        self.num_temporal = input_size[0] // patch_size[0]
        self.num_spatial = num_patches // self.num_temporal
        # base_size is forwarded to get_2d_sincos_pos_embed in get_spatial_pos_embed().
        self.base_size = int(np.sqrt(self.num_spatial))
        self.num_heads = num_heads
        self.dtype = dtype
        self.no_temporal_pos_emb = no_temporal_pos_emb
        self.depth = depth
        self.mlp_ratio = mlp_ratio
        self.enable_flash_attn = enable_flash_attn
        self.enable_layernorm_kernel = enable_layernorm_kernel
        self.space_scale = space_scale
        self.time_scale = time_scale

        self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size)
        self.t_embedder = TimestepEmbedder(hidden_size)
        # Projects the timestep embedding to 6 * hidden_size; blocks reshape it to (B, 6, -1).
        self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
        self.y_embedder = CaptionEmbedder(
            in_channels=caption_channels,
            hidden_size=hidden_size,
            uncond_prob=class_dropout_prob,
            act_layer=approx_gelu,
            token_num=model_max_length,
        )

        # Fixed position tables stored as buffers (forward() recomputes the spatial one per call).
        self.register_buffer("pos_embed", self.get_spatial_pos_embed())
        self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed())

        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule

        self.kv_compress_config = kv_compress_config
        if kv_compress_config is None:
            # Default: no layer uses key/value compression.
            self.kv_compress_config = {
                "sampling": None,
                "scale_factor": 1,
                "kv_compress_layer": [],
            }

        self.blocks = nn.ModuleList(
            [
                PixArtBlock(
                    hidden_size,
                    num_heads,
                    mlp_ratio=mlp_ratio,
                    drop_path=drop_path[i],
                    enable_flash_attn=enable_flash_attn,
                    enable_layernorm_kernel=enable_layernorm_kernel,
                    qk_norm=qk_norm,
                    # Only layers listed in "kv_compress_layer" get the configured scale factor.
                    sr_ratio=(
                        int(self.kv_compress_config["scale_factor"])
                        if i in self.kv_compress_config["kv_compress_layer"]
                        else 1
                    ),
                    sampling=self.kv_compress_config["sampling"],
                )
                for i in range(depth)
            ]
        )
        self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels)

        self.initialize_weights()
        if freeze is not None:
            assert freeze in ["text"]
            if freeze == "text":
                self.freeze_text()

    def forward(self, x, timestep, y, mask=None):
        """
        Forward pass of PixArt_Sigma.

        x: latent input; its last two dimensions are used as height and width
           (presumably (N, C, T, H, W) given the 3-D patch embedder -- TODO confirm)
        timestep: diffusion timesteps, fed to the timestep embedder
        y: caption input passed to the caption embedder
           (annotated below as (N, 1, L, D) after embedding)
        mask: optional token mask; when given, only tokens with a non-zero mask
           entry are kept and packed into a single sequence

        Returns the unpatchified model output cast to float32.
        """
        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)
        # Spatial table and patch-grid size are derived from the actual input height/width.
        pos_embed = self.get_spatial_pos_embed((x.shape[-2], x.shape[-1])).to(x.dtype)
        hw = (x.shape[-2] // self.patch_size[-2], x.shape[-1] // self.patch_size[-1])

        # embedding
        x = self.x_embedder(x)  # (B, N, D)
        x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
        x = x + pos_embed.to(x.device)
        if not self.no_temporal_pos_emb:
            # Move the temporal axis next to the channel axis before adding the temporal table.
            x = rearrange(x, "b t s d -> b s t d")
            x = x + self.pos_embed_temporal
            x = rearrange(x, "b s t d -> b (t s) d")
        else:
            x = rearrange(x, "b t s d -> b (t s) d")

        t = self.t_embedder(timestep, dtype=x.dtype)  # (N, D)
        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                # Tile the mask along the batch axis to match y's batch size.
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # Keep only tokens with a non-zero mask entry, packed as one (1, total, D) sequence.
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            # Number of kept tokens per sample.
            y_lens = mask.sum(dim=1).tolist()
        else:
            # No mask: every sample contributes all of its tokens.
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])

        # blocks
        for block in self.blocks:
            x = auto_grad_checkpoint(block, x, y, t0, hw, y_lens)

        # final process
        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, T, H, W) -- see unpatchify()

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x

    def unpatchify(self, x):
        """Reassemble per-patch outputs into a (N, out_channels, t*pt, h*ph, w*pw) tensor.

        The patch grid (t, h, w) is taken from the configured ``input_size``,
        not from the tensor passed to ``forward``.
        """
        c = self.out_channels
        t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        pt, ph, pw = self.patch_size

        x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
        # Interleave each grid axis with its within-patch axis, channels first.
        x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
        return imgs

    def get_spatial_pos_embed(self, grid_size=None):
        """Return the 2-D sin-cos spatial position table as a float tensor with a leading dim of 1.

        grid_size: (height, width) before patching; defaults to ``input_size[1:]``.
        """
        if grid_size is None:
            grid_size = self.input_size[1:]
        pos_embed = get_2d_sincos_pos_embed(
            self.hidden_size,
            (grid_size[0] // self.patch_size[1], grid_size[1] // self.patch_size[2]),
            scale=self.space_scale,
            base_size=self.base_size,
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def get_temporal_pos_embed(self):
        """Return the 1-D sin-cos temporal position table as a float tensor with a leading dim of 1."""
        pos_embed = get_1d_sincos_pos_embed(
            self.hidden_size,
            self.input_size[0] // self.patch_size[0],
            scale=self.time_scale,
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def freeze_text(self):
        """Disable gradients for every parameter whose name contains "cross_attn"."""
        for n, p in self.named_parameters():
            if "cross_attn" in n:
                p.requires_grad = False

    def initialize_weights(self):
        """Initialise all weights in place; called once from ``__init__``."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)

        # Initialize caption embedding MLP:
        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)

        # Zero-out the cross-attention output projection in each block
        # (these are not adaLN modulation layers, despite the earlier wording):
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.proj.weight, 0)
            nn.init.constant_(block.cross_attn.proj.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)


@MODELS.register_module("PixArt-Sigma-XL/2")
def PixArt_Sigma_XL_2(from_pretrained=None, **kwargs):
    """Build the PixArt-Sigma-XL/2 configuration; load a checkpoint when ``from_pretrained`` is given."""
    net = PixArt_Sigma(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return net
    load_checkpoint(net, from_pretrained)
    return net


================================================
FILE: Open-Sora/build/lib/opensora/models/stdit/__init__.py
================================================
from .stdit import STDiT
from .stdit2 import STDiT2
from .stdit3 import STDiT3


================================================
FILE: Open-Sora/build/lib/opensora/models/stdit/stdit.py
================================================
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
from einops import rearrange
from timm.models.layers import DropPath
from timm.models.vision_transformer import Mlp

from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.acceleration.communications import gather_forward_split_backward, split_forward_gather_backward
from opensora.acceleration.parallel_states import get_sequence_parallel_group
from opensora.models.layers.blocks import (
    Attention,
    CaptionEmbedder,
    MultiHeadCrossAttention,
    PatchEmbed3D,
    SeqParallelAttention,
    SeqParallelMultiHeadCrossAttention,
    T2IFinalLayer,
    TimestepEmbedder,
    approx_gelu,
    get_1d_sincos_pos_embed,
    get_2d_sincos_pos_embed,
    get_layernorm,
    t2i_modulate,
)
from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint


class STDiTBlock(nn.Module):
    """Transformer block with separate spatial and temporal self-attention.

    Each block applies, in order: modulated spatial attention (per frame),
    temporal attention (per spatial location), cross-attention against the
    caption tokens, and a modulated MLP. When ``x_mask`` is supplied, a second
    set of modulation vectors derived from ``t0`` is used for the frames where
    the mask is False.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        d_s=None,
        d_t=None,
        mlp_ratio=4.0,
        drop_path=0.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self._enable_sequence_parallelism = enable_sequence_parallelism

        # Pick the attention implementations once; both attention modules below use attn_cls.
        if enable_sequence_parallelism:
            self.attn_cls = SeqParallelAttention
            self.mha_cls = SeqParallelMultiHeadCrossAttention
        else:
            self.attn_cls = Attention
            self.mha_cls = MultiHeadCrossAttention

        # Layer norms are created with affine=False; scale/shift come from the modulation in forward().
        self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.attn = self.attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
        )
        self.cross_attn = self.mha_cls(hidden_size, num_heads)
        self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.mlp = Mlp(
            in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0
        )
        # Stochastic depth is only instantiated when the rate is positive.
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        # Learned per-block offset added to the six modulation vectors derived from t (and t0).
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

        # temporal attention
        self.d_s = d_s
        self.d_t = d_t

        if self._enable_sequence_parallelism:
            sp_size = dist.get_world_size(get_sequence_parallel_group())
            # make sure d_t is divisible by sp_size
            assert d_t % sp_size == 0
            # Each rank then works with d_t // sp_size temporal positions.
            self.d_t = d_t // sp_size

        self.attn_temp = self.attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=self.enable_flash_attn,
        )

    def t_mask_select(self, x, masked_x, x_mask):
        """Pick ``x`` where ``x_mask`` is True and ``masked_x`` elsewhere, frame by frame."""
        # x: [B, (T, S), C]
        # masked_x: [B, (T, S), C]
        # x_mask: [B, T]
        x = rearrange(x, "B (T S) C -> B T S C", T=self.d_t, S=self.d_s)
        masked_x = rearrange(masked_x, "B (T S) C -> B T S C", T=self.d_t, S=self.d_s)
        # The mask is broadcast over the spatial and channel axes.
        x = torch.where(x_mask[:, :, None, None], x, masked_x)
        x = rearrange(x, "B T S C -> B (T S) C")
        return x

    def forward(self, x, y, t, mask=None, tpe=None, x_mask=None, t0=None):
        """Run one block.

        x: token sequence, unpacked as (B, N, C) with N laid out as (T S)
        y: caption tokens handed to cross-attention
        t: conditioning tensor, reshaped to (B, 6, -1) to form the six modulation vectors
        mask: passed through to cross-attention as its third argument
        tpe: optional temporal position embedding added before temporal attention
        x_mask: optional [B, T] mask; where False, the t0-derived modulation is used
        t0: conditioning tensor used for masked-out frames; read only when x_mask is given
        """
        B, N, C = x.shape

        # Six modulation tensors: shift/scale/gate for attention, then shift/scale/gate for the MLP.
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + t.reshape(B, 6, -1)
        ).chunk(6, dim=1)
        x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
        if x_mask is not None:
            # Second modulation set from t0, applied to frames where x_mask is False.
            shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = (
                self.scale_shift_table[None] + t0.reshape(B, 6, -1)
            ).chunk(6, dim=1)
            x_m_zero = t2i_modulate(self.norm1(x), shift_msa_zero, scale_msa_zero)
            x_m = self.t_mask_select(x_m, x_m_zero, x_mask)

        # spatial branch
        # Fold time into the batch axis so attention runs over the S tokens of each frame.
        x_s = rearrange(x_m, "B (T S) C -> (B T) S C", T=self.d_t, S=self.d_s)
        x_s = self.attn(x_s)
        x_s = rearrange(x_s, "(B T) S C -> B (T S) C", T=self.d_t, S=self.d_s)

        if x_mask is not None:
            x_s_zero = gate_msa_zero * x_s
            x_s = gate_msa * x_s
            x_s = self.t_mask_select(x_s, x_s_zero, x_mask)
        else:
            x_s = gate_msa * x_s

        x = x + self.drop_path(x_s)

        # temporal branch
        # Fold space into the batch axis so attention runs over the T tokens of each location.
        x_t = rearrange(x, "B (T S) C -> (B S) T C", T=self.d_t, S=self.d_s)
        if tpe is not None:
            x_t = x_t + tpe
        x_t = self.attn_temp(x_t)
        x_t = rearrange(x_t, "(B S) T C -> B (T S) C", T=self.d_t, S=self.d_s)
        # The temporal branch takes x without modulation, reuses gate_msa, and applies
        # no x_mask/t0 selection.
        x = x + self.drop_path(gate_msa * x_t)

        # cross attn
        x = x + self.cross_attn(x, y, mask)

        # mlp
        x_m = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
        if x_mask is not None:
            x_m_zero = t2i_modulate(self.norm2(x), shift_mlp_zero, scale_mlp_zero)
            x_m = self.t_mask_select(x_m, x_m_zero, x_mask)

        x_mlp = self.mlp(x_m)
        if x_mask is not None:
            x_mlp_zero = gate_mlp_zero * x_mlp
            x_mlp = gate_mlp * x_mlp
            x_mlp = self.t_mask_select(x_mlp, x_mlp_zero, x_mask)
        else:
            x_mlp = gate_mlp * x_mlp

        x = x + self.drop_path(x_mlp)

        return x


@MODELS.register_module()
class STDiT(nn.Module):
    """Spatial-temporal diffusion transformer built from a stack of ``STDiTBlock`` layers.

    Combines a 3-D patch embedder, timestep and caption embedders, fixed sin-cos
    spatial/temporal position tables, and a final projection layer. Optionally
    shards the token sequence across a sequence-parallel process group.
    """

    def __init__(
        self,
        input_size=(1, 32, 32),
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        pred_sigma=True,
        drop_path=0.0,
        no_temporal_pos_emb=False,
        caption_channels=4096,
        model_max_length=120,
        dtype=torch.float32,
        space_scale=1.0,
        time_scale=1.0,
        freeze=None,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.pred_sigma = pred_sigma
        self.in_channels = in_channels
        # The output channel count doubles when pred_sigma is set.
        self.out_channels = in_channels * 2 if pred_sigma else in_channels
        self.hidden_size = hidden_size
        self.patch_size = patch_size
        self.input_size = input_size
        # Patch counts are derived from the three-axis input_size and patch_size.
        num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)])
        self.num_patches = num_patches
        self.num_temporal = input_size[0] // patch_size[0]
        self.num_spatial = num_patches // self.num_temporal
        self.num_heads = num_heads
        self.dtype = dtype
        # NOTE(review): stored but not read anywhere in this class; forward() always
        # passes the temporal table to the first block.
        self.no_temporal_pos_emb = no_temporal_pos_emb
        self.depth = depth
        self.mlp_ratio = mlp_ratio
        self.enable_flash_attn = enable_flash_attn
        self.enable_layernorm_kernel = enable_layernorm_kernel
        self.space_scale = space_scale
        self.time_scale = time_scale

        # Fixed position tables stored as buffers.
        self.register_buffer("pos_embed", self.get_spatial_pos_embed())
        self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed())

        self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size)
        self.t_embedder = TimestepEmbedder(hidden_size)
        # Projects the timestep embedding to 6 * hidden_size; blocks reshape it to (B, 6, -1).
        self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
        self.y_embedder = CaptionEmbedder(
            in_channels=caption_channels,
            hidden_size=hidden_size,
            uncond_prob=class_dropout_prob,
            act_layer=approx_gelu,
            token_num=model_max_length,
        )

        # Drop-path rate increases linearly from 0 to drop_path across the depth.
        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]
        self.blocks = nn.ModuleList(
            [
                STDiTBlock(
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=self.mlp_ratio,
                    drop_path=drop_path[i],
                    enable_flash_attn=self.enable_flash_attn,
                    enable_layernorm_kernel=self.enable_layernorm_kernel,
                    enable_sequence_parallelism=enable_sequence_parallelism,
                    d_t=self.num_temporal,
                    d_s=self.num_spatial,
                )
                for i in range(self.depth)
            ]
        )
        self.final_layer = T2IFinalLayer(
            hidden_size,
            np.prod(self.patch_size),
            self.out_channels,
            d_t=self.num_temporal,
            d_s=self.num_spatial,
        )

        # init model
        self.initialize_weights()
        # Temporal attention output projections are zeroed after the general init.
        self.initialize_temporal()
        if freeze is not None:
            assert freeze in ["not_temporal", "text"]
            if freeze == "not_temporal":
                self.freeze_not_temporal()
            elif freeze == "text":
                self.freeze_text()

        # sequence parallel related configs
        self.enable_sequence_parallelism = enable_sequence_parallelism
        if enable_sequence_parallelism:
            self.sp_rank = dist.get_rank(get_sequence_parallel_group())
        else:
            self.sp_rank = None

    def forward(self, x, timestep, y, mask=None, x_mask=None, **kwargs):
        """
        Forward pass of STDiT.
        Args:
            x (torch.Tensor): latent representation of video; of shape [B, C, T, H, W]
            timestep (torch.Tensor): diffusion time steps; of shape [B]
            y (torch.Tensor): representation of prompts; of shape [B, 1, N_token, C]
            mask (torch.Tensor): mask for selecting prompt tokens; of shape [B, N_token]
            x_mask (torch.Tensor): optional mask passed to every block and to the final
                layer; when given, a second timestep embedding for timestep zero is
                computed and passed alongside it
            **kwargs: accepted and ignored

        Returns:
            x (torch.Tensor): output latent representation; of shape [B, C, T, H, W]
        """
        # Inputs are cast to the patch embedder's weight dtype rather than self.dtype.
        dtype = self.x_embedder.proj.weight.dtype
        x = x.to(dtype)
        timestep = timestep.to(dtype)
        y = y.to(dtype)

        # embedding
        x = self.x_embedder(x)  # [B, N, C]
        x = rearrange(x, "B (T S) C -> B T S C", T=self.num_temporal, S=self.num_spatial)
        # Uses the precomputed buffer, so the spatial grid must match the configured input_size.
        x = x + self.pos_embed
        x = rearrange(x, "B T S C -> B (T S) C")

        # shard over the sequence dim if sp is enabled
        if self.enable_sequence_parallelism:
            x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="down")

        t = self.t_embedder(timestep, dtype=x.dtype)  # [B, C]
        t_mlp = self.t_block(t)  # [B, C]
        if x_mask is not None:
            # Embedding of timestep zero, used by the blocks for frames where x_mask is False.
            t0_timestep = torch.zeros_like(timestep)
            t0 = self.t_embedder(t0_timestep, dtype=x.dtype)
            t0_mlp = self.t_block(t0)
        else:
            t0 = None
            t0_mlp = None
        y = self.y_embedder(y, self.training)  # [B, 1, N_token, C]

        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                # Tile the mask along the batch axis to match y's batch size.
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # Keep only tokens with a non-zero mask entry, packed as one (1, total, C) sequence.
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            # Number of kept tokens per sample.
            y_lens = mask.sum(dim=1).tolist()
        else:
            # No mask: every sample contributes all of its tokens.
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])

        # blocks
        for i, block in enumerate(self.blocks):
            # Only the first block receives the temporal position embedding.
            if i == 0:
                if self.enable_sequence_parallelism:
                    # Each rank takes its own chunk of the temporal table along dim 1.
                    tpe = torch.chunk(
                        self.pos_embed_temporal, dist.get_world_size(get_sequence_parallel_group()), dim=1
                    )[self.sp_rank].contiguous()
                else:
                    tpe = self.pos_embed_temporal
            else:
                tpe = None
            x = auto_grad_checkpoint(block, x, y, t_mlp, y_lens, tpe, x_mask, t0_mlp)

        if self.enable_sequence_parallelism:
            x = gather_forward_split_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="up")
        # x.shape: [B, N, C]

        # final process
        x = self.final_layer(x, t, x_mask, t0)  # [B, N, C=T_p * H_p * W_p * C_out]
        x = self.unpatchify(x)  # [B, C_out, T, H, W]

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x

    def unpatchify(self, x):
        """
        Args:
            x (torch.Tensor): of shape [B, N, C]

        Return:
            x (torch.Tensor): of shape [B, C_out, T, H, W]
        """

        # The patch grid comes from the configured input_size, not from x.
        N_t, N_h, N_w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        T_p, H_p, W_p = self.patch_size
        x = rearrange(
            x,
            "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)",
            N_t=N_t,
            N_h=N_h,
            N_w=N_w,
            T_p=T_p,
            H_p=H_p,
            W_p=W_p,
            C_out=self.out_channels,
        )
        return x

    def unpatchify_old(self, x):
        """Reshape-based variant of ``unpatchify``; not called anywhere in this class."""
        c = self.out_channels
        t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        pt, ph, pw = self.patch_size

        x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
        x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
        return imgs

    def get_spatial_pos_embed(self, grid_size=None):
        """Return the 2-D sin-cos spatial position table as a float tensor with a leading dim of 1.

        grid_size: (height, width) before patching; defaults to ``input_size[1:]``.
        """
        if grid_size is None:
            grid_size = self.input_size[1:]
        pos_embed = get_2d_sincos_pos_embed(
            self.hidden_size,
            (grid_size[0] // self.patch_size[1], grid_size[1] // self.patch_size[2]),
            scale=self.space_scale,
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def get_temporal_pos_embed(self):
        """Return the 1-D sin-cos temporal position table as a float tensor with a leading dim of 1."""
        pos_embed = get_1d_sincos_pos_embed(
            self.hidden_size,
            self.input_size[0] // self.patch_size[0],
            scale=self.time_scale,
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def freeze_not_temporal(self):
        """Disable gradients for every parameter whose name does not contain "attn_temp"."""
        for n, p in self.named_parameters():
            if "attn_temp" not in n:
                p.requires_grad = False

    def freeze_text(self):
        """Disable gradients for every parameter whose name contains "cross_attn"."""
        for n, p in self.named_parameters():
            if "cross_attn" in n:
                p.requires_grad = False

    def initialize_temporal(self):
        """Zero the output projection (weight and bias) of each block's temporal attention."""
        for block in self.blocks:
            nn.init.constant_(block.attn_temp.proj.weight, 0)
            nn.init.constant_(block.attn_temp.proj.bias, 0)

    def initialize_weights(self):
        """Initialise all weights in place; called once from ``__init__``."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)

        # Initialize caption embedding MLP:
        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)

        # Zero-out the cross-attention output projection in each block
        # (these are not adaLN modulation layers, despite the earlier wording):
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.proj.weight, 0)
            nn.init.constant_(block.cross_attn.proj.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)


@MODELS.register_module("STDiT-XL/2")
def STDiT_XL_2(from_pretrained=None, **kwargs):
    """Build an ``STDiT`` with depth 28, hidden size 1152, patch size (1, 2, 2) and 16 heads.

    Args:
        from_pretrained: optional checkpoint reference passed to
            ``load_checkpoint``; when ``None`` no weights are loaded.
        **kwargs: forwarded unchanged to the ``STDiT`` constructor.

    Returns:
        The constructed model.
    """
    model = STDiT(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return model
    load_checkpoint(model, from_pretrained)
    return model


================================================
FILE: Open-Sora/build/lib/opensora/models/stdit/stdit2.py
================================================
import os

import numpy as np
import torch
import torch.nn as nn
from einops import rearrange
from rotary_embedding_torch import RotaryEmbedding
from timm.models.layers import DropPath
from timm.models.vision_transformer import Mlp
from transformers import PretrainedConfig, PreTrainedModel

from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.models.layers.blocks import (
    Attention,
    CaptionEmbedder,
    MultiHeadCrossAttention,
    PatchEmbed3D,
    PositionEmbedding2D,
    SizeEmbedder,
    T2IFinalLayer,
    TimestepEmbedder,
    approx_gelu,
    get_2d_sincos_pos_embed,
    get_layernorm,
    t2i_modulate,
)
from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint


class STDiT2Block(nn.Module):
    """Transformer block with spatial attention, temporal attention, cross-attention and an MLP.

    Each self-attention/MLP branch is wrapped by a modulation (shift, scale)
    before the sub-layer and a gate after it.  The modulation parameters are
    the sum of a learned table and a conditioning vector passed to ``forward``.
    When a frame mask ``x_mask`` is supplied, frames whose mask entry is False
    use a second set of parameters derived from ``t0`` / ``t0_tmp`` instead.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
        rope=None,
        qk_norm=False,
        qk_norm_legacy=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        # Stored only; nothing in this class reads it.
        self._enable_sequence_parallelism = enable_sequence_parallelism

        # spatial branch
        self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.attn = Attention(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
            qk_norm=qk_norm,
            qk_norm_legacy=qk_norm_legacy,
        )
        # Rows: shift/scale/gate for spatial attention, then shift/scale/gate for the MLP.
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

        # cross attn
        self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads)

        # mlp branch
        self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.mlp = Mlp(
            in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        # temporal branch
        self.norm_temp = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)  # new
        self.attn_temp = Attention(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=self.enable_flash_attn,
            rope=rope,
            qk_norm=qk_norm,
            qk_norm_legacy=qk_norm_legacy,
        )
        # Rows: shift/scale/gate for temporal attention.
        self.scale_shift_table_temporal = nn.Parameter(torch.randn(3, hidden_size) / hidden_size**0.5)  # new

    def t_mask_select(self, x_mask, x, masked_x, T, S):
        """Per frame, pick ``x`` where ``x_mask`` is True and ``masked_x`` elsewhere.

        Args:
            x_mask: frame mask of shape [B, T].
            x: tensor of shape [B, (T S), C].
            masked_x: tensor of shape [B, (T S), C].
            T: number of frames.
            S: number of spatial tokens per frame.

        Returns:
            Tensor of shape [B, (T S), C].
        """
        x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
        masked_x = rearrange(masked_x, "B (T S) C -> B T S C", T=T, S=S)
        x = torch.where(x_mask[:, :, None, None], x, masked_x)
        x = rearrange(x, "B T S C -> B (T S) C")
        return x

    def _modulate(self, normed, shift, scale, x_mask, shift_zero, scale_zero, T, S):
        """Modulate an already-normalised tensor, using the ``*_zero`` parameters on masked-out frames.

        ``normed`` is shared by both modulations so the layer norm runs once.
        NOTE(review): this assumes ``t2i_modulate`` does not modify its input
        in place -- confirm against its definition.
        """
        out = t2i_modulate(normed, shift, scale)
        if x_mask is not None:
            out_zero = t2i_modulate(normed, shift_zero, scale_zero)
            out = self.t_mask_select(x_mask, out, out_zero, T, S)
        return out

    def _gate(self, h, gate, gate_zero, x_mask, T, S):
        """Scale a branch output by ``gate``, using ``gate_zero`` on masked-out frames."""
        gated = gate * h
        if x_mask is not None:
            gated = self.t_mask_select(x_mask, gated, gate_zero * h, T, S)
        return gated

    def forward(self, x, y, t, t_tmp, mask=None, x_mask=None, t0=None, t0_tmp=None, T=None, S=None):
        """Run the block.

        Args:
            x: input tokens of shape [B, N, C]; N is rearranged as (T S).
            y: conditioning sequence passed to cross-attention.
            t: conditioning vector reshaped to [B, 6, -1] for the spatial/MLP modulation.
            t_tmp: conditioning vector reshaped to [B, 3, -1] for the temporal modulation.
            mask: passed through to cross-attention.
            x_mask: optional frame mask of shape [B, T]; when given, ``t0`` and
                ``t0_tmp`` must be provided as well.
            t0: counterpart of ``t`` used for frames where ``x_mask`` is False.
            t0_tmp: counterpart of ``t_tmp`` used for frames where ``x_mask`` is False.
            T: number of frames.
            S: number of spatial tokens per frame.

        Returns:
            Tensor with the same shape as ``x``.
        """
        B, _, _ = x.shape  # also enforces a 3-D input

        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + t.reshape(B, 6, -1)
        ).chunk(6, dim=1)
        shift_tmp, scale_tmp, gate_tmp = (
            self.scale_shift_table_temporal[None] + t_tmp.reshape(B, 3, -1)
        ).chunk(3, dim=1)

        if x_mask is not None:
            shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = (
                self.scale_shift_table[None] + t0.reshape(B, 6, -1)
            ).chunk(6, dim=1)
            shift_tmp_zero, scale_tmp_zero, gate_tmp_zero = (
                self.scale_shift_table_temporal[None] + t0_tmp.reshape(B, 3, -1)
            ).chunk(3, dim=1)
        else:
            # Never read when x_mask is None; defined so the helper calls below are uniform.
            shift_msa_zero = scale_msa_zero = gate_msa_zero = None
            shift_mlp_zero = scale_mlp_zero = gate_mlp_zero = None
            shift_tmp_zero = scale_tmp_zero = gate_tmp_zero = None

        # spatial branch: attention over the S tokens of each frame
        x_m = self._modulate(self.norm1(x), shift_msa, scale_msa, x_mask, shift_msa_zero, scale_msa_zero, T, S)
        x_s = rearrange(x_m, "B (T S) C -> (B T) S C", T=T, S=S)
        x_s = self.attn(x_s)
        x_s = rearrange(x_s, "(B T) S C -> B (T S) C", T=T, S=S)
        x = x + self.drop_path(self._gate(x_s, gate_msa, gate_msa_zero, x_mask, T, S))

        # temporal branch: attention over the T frames at each spatial position
        x_m = self._modulate(self.norm_temp(x), shift_tmp, scale_tmp, x_mask, shift_tmp_zero, scale_tmp_zero, T, S)
        x_t = rearrange(x_m, "B (T S) C -> (B S) T C", T=T, S=S)
        x_t = self.attn_temp(x_t)
        x_t = rearrange(x_t, "(B S) T C -> B (T S) C", T=T, S=S)
        x = x + self.drop_path(self._gate(x_t, gate_tmp, gate_tmp_zero, x_mask, T, S))

        # cross attn (no modulation, no gate, no drop path)
        x = x + self.cross_attn(x, y, mask)

        # mlp branch
        x_m = self._modulate(self.norm2(x), shift_mlp, scale_mlp, x_mask, shift_mlp_zero, scale_mlp_zero, T, S)
        x_mlp = self.mlp(x_m)
        x = x + self.drop_path(self._gate(x_mlp, gate_mlp, gate_mlp_zero, x_mask, T, S))

        return x


class STDiT2Config(PretrainedConfig):
    """Configuration for ``STDiT2``.

    Every named constructor argument is stored as an attribute of the same
    name; any remaining keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.
    """

    model_type = "STDiT2"

    def __init__(
        self,
        input_size=(None, None, None),
        input_sq_size=32,
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        pred_sigma=True,
        drop_path=0.0,
        no_temporal_pos_emb=False,
        caption_channels=4096,
        model_max_length=120,
        freeze=None,
        qk_norm=False,
        qk_norm_legacy=False,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        **kwargs,
    ):
        # Attributes are set before the base initialiser runs, in signature order.
        settings = {
            "input_size": input_size,
            "input_sq_size": input_sq_size,
            "in_channels": in_channels,
            "patch_size": patch_size,
            "hidden_size": hidden_size,
            "depth": depth,
            "num_heads": num_heads,
            "mlp_ratio": mlp_ratio,
            "class_dropout_prob": class_dropout_prob,
            "pred_sigma": pred_sigma,
            "drop_path": drop_path,
            "no_temporal_pos_emb": no_temporal_pos_emb,
            "caption_channels": caption_channels,
            "model_max_length": model_max_length,
            "freeze": freeze,
            "qk_norm": qk_norm,
            "qk_norm_legacy": qk_norm_legacy,
            "enable_flash_attn": enable_flash_attn,
            "enable_layernorm_kernel": enable_layernorm_kernel,
        }
        for attr_name, attr_value in settings.items():
            setattr(self, attr_name, attr_value)
        super().__init__(**kwargs)


@MODELS.register_module()
class STDiT2(PreTrainedModel):
    """Spatial-temporal diffusion transformer built from ``STDiT2Block`` layers.

    The model patch-embeds a video latent, adds a 2-D positional embedding,
    runs ``depth`` blocks conditioned on timestep, resolution, aspect ratio,
    frame count and fps embeddings, and projects back to latent space.
    """

    config_class = STDiT2Config

    def __init__(self, config):
        super().__init__(config)
        hidden_size = config.hidden_size

        # Plain hyper-parameters mirrored from the config.
        self.pred_sigma = config.pred_sigma
        self.in_channels = config.in_channels
        if config.pred_sigma:
            # Twice the input channels when pred_sigma is set.
            self.out_channels = config.in_channels * 2
        else:
            self.out_channels = config.in_channels
        self.hidden_size = hidden_size
        self.num_heads = config.num_heads
        self.no_temporal_pos_emb = config.no_temporal_pos_emb
        self.depth = config.depth
        self.mlp_ratio = config.mlp_ratio
        self.enable_flash_attn = config.enable_flash_attn
        self.enable_layernorm_kernel = config.enable_layernorm_kernel

        # Dynamic-input support: sizes are resolved per call in forward().
        self.patch_size = config.patch_size
        self.input_size = config.input_size
        self.input_sq_size = config.input_sq_size
        self.pos_embed = PositionEmbedding2D(hidden_size)

        # Embedders. Submodules are created in a fixed order so parameter
        # registration and random initialisation stay reproducible.
        self.x_embedder = PatchEmbed3D(config.patch_size, config.in_channels, hidden_size)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
        self.t_block_temp = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 3 * hidden_size, bias=True))
        self.y_embedder = CaptionEmbedder(
            in_channels=config.caption_channels,
            hidden_size=hidden_size,
            uncond_prob=config.class_dropout_prob,
            act_layer=approx_gelu,
            token_num=config.model_max_length,
        )

        # Drop-path rate grows linearly from 0 to config.drop_path across the blocks.
        drop_rates = torch.linspace(0, config.drop_path, config.depth).tolist()
        self.rope = RotaryEmbedding(dim=self.hidden_size // self.num_heads)
        self.blocks = nn.ModuleList(
            [
                STDiT2Block(
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=self.mlp_ratio,
                    drop_path=rate,
                    enable_flash_attn=self.enable_flash_attn,
                    enable_layernorm_kernel=self.enable_layernorm_kernel,
                    rope=self.rope.rotate_queries_or_keys,
                    qk_norm=config.qk_norm,
                    qk_norm_legacy=config.qk_norm_legacy,
                )
                for rate in drop_rates
            ]
        )
        self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels)

        # Multi-resolution conditioning: size and aspect-ratio embedders each
        # use a third of the hidden size; frame-length and fps use all of it.
        assert self.hidden_size % 3 == 0, "hidden_size must be divisible by 3"
        third = self.hidden_size // 3
        self.csize_embedder = SizeEmbedder(third)
        self.ar_embedder = SizeEmbedder(third)
        self.fl_embedder = SizeEmbedder(self.hidden_size)
        self.fps_embedder = SizeEmbedder(self.hidden_size)

        # Weight initialisation, then optional freezing.
        self.initialize_weights()
        self.initialize_temporal()
        freeze_mode = config.freeze
        if freeze_mode is not None:
            assert freeze_mode in ["not_temporal", "text"]
            if freeze_mode == "not_temporal":
                self.freeze_not_temporal()
            elif freeze_mode == "text":
                self.freeze_text()

    def get_dynamic_size(self, x):
        """Return the patch-grid size ``(T, H, W)`` for a 5-D input.

        Each of the last three dimensions of ``x`` is divided by the matching
        patch size, rounding up.
        """
        _, _, T, H, W = x.size()
        pt, ph, pw = self.patch_size[0], self.patch_size[1], self.patch_size[2]
        # (n + p - 1) // p is the ceiling of n / p for positive p.
        return ((T + pt - 1) // pt, (H + ph - 1) // ph, (W + pw - 1) // pw)

    def forward(
        self, x, timestep, y, mask=None, x_mask=None, num_frames=None, height=None, width=None, ar=None, fps=None
    ):
        """
        Forward pass of STDiT2.
        Args:
            x (torch.Tensor): latent representation of video; of shape [B, C, T, H, W]
            timestep (torch.Tensor): diffusion time steps; of shape [B]
            y (torch.Tensor): representation of prompts; of shape [B, 1, N_token, C]
            mask (torch.Tensor): mask for selecting prompt tokens; of shape [B, N_token]
            x_mask (torch.Tensor): optional frame mask forwarded to every block and the
                final layer; when given, a second conditioning path is computed from a
                zero timestep
            num_frames, height, width, ar, fps (torch.Tensor): per-sample conditioning
                values; they default to None in the signature but are used
                unconditionally, so callers must supply them

        Returns:
            x (torch.Tensor): output latent representation; of shape [B, C, T, H, W]
        """
        batch = x.shape[0]
        param_dtype = self.x_embedder.proj.weight.dtype
        x = x.to(param_dtype)
        timestep = timestep.to(param_dtype)
        y = y.to(param_dtype)

        # === conditioning embeddings ===
        # Resolution and aspect ratio are concatenated for the spatial path.
        size_hw = torch.cat([height[:, None], width[:, None]], dim=1)
        # Only the first sample's height/width set the positional-embedding scale.
        resolution = (height[0].item() * width[0].item()) ** 0.5
        size_emb = self.csize_embedder(size_hw, batch)
        ar_emb = self.ar_embedder(ar.unsqueeze(1), batch)
        data_info = torch.cat([size_emb, ar_emb], dim=1)

        # Frame count and fps are summed for the temporal path.
        frame_emb = self.fl_embedder(num_frames.unsqueeze(1), batch)
        frame_emb = frame_emb + self.fps_embedder(fps.unsqueeze(1), batch)

        # === dynamic shape ===
        _, _, Tx, Hx, Wx = x.size()
        T, H, W = self.get_dynamic_size(x)
        S = H * W
        pos_emb = self.pos_embed(
            x,
            H,
            W,
            scale=resolution / self.input_sq_size,
            base_size=round(S**0.5),
        )

        # === patch embedding plus positional embedding ===
        x = self.x_embedder(x)  # [B, N, C]
        x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
        x = x + pos_emb
        x = rearrange(x, "B T S C -> B (T S) C")

        # === modulation inputs ===
        t = self.t_embedder(timestep, dtype=x.dtype)  # [B, C]
        t_spc_mlp = self.t_block(t + data_info)  # [B, 6*C]
        t_tmp_mlp = self.t_block_temp(t + frame_emb)  # [B, 3*C]
        t0_spc = t0_spc_mlp = t0_tmp_mlp = None
        if x_mask is not None:
            # Conditioning for masked-out frames comes from timestep zero.
            t0 = self.t_embedder(torch.zeros_like(timestep), dtype=x.dtype)
            t0_spc = t0 + data_info
            t0_spc_mlp = self.t_block(t0_spc)
            t0_tmp_mlp = self.t_block_temp(t0 + frame_emb)

        # === caption tokens, flattened to one sequence of per-sample lengths ===
        y = self.y_embedder(y, self.training)  # [B, 1, N_token, C]
        if mask is None:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])
        else:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()

        # === transformer blocks ===
        for block in self.blocks:
            x = auto_grad_checkpoint(
                block,
                x,
                y,
                t_spc_mlp,
                t_tmp_mlp,
                y_lens,
                x_mask,
                t0_spc_mlp,
                t0_tmp_mlp,
                T,
                S,
            )
            # x.shape: [B, N, C]

        # === output head ===
        x = self.final_layer(x, t, x_mask, t0_spc, T, S)  # [B, N, C=T_p * H_p * W_p * C_out]
        x = self.unpatchify(x, T, H, W, Tx, Hx, Wx)  # [B, C_out, T, H, W]

        # cast to float32 for better accuracy
        return x.to(torch.float32)

    def unpatchify(self, x, N_t, N_h, N_w, R_t, R_h, R_w):
        """
        Args:
            x (torch.Tensor): of shape [B, N, C]
            N_t, N_h, N_w (int): patch-grid size along time, height and width
            R_t, R_h, R_w (int): sizes to crop the result to along time, height and width

        Return:
            x (torch.Tensor): of shape [B, C_out, T, H, W]
        """
        T_p, H_p, W_p = self.patch_size
        unfolded = rearrange(
            x,
            "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)",
            N_t=N_t,
            N_h=N_h,
            N_w=N_w,
            T_p=T_p,
            H_p=H_p,
            W_p=W_p,
            C_out=self.out_channels,
        )
        # Crop away anything beyond the requested sizes.
        return unfolded[:, :, :R_t, :R_h, :R_w]

    def unpatchify_old(self, x):
        """Unpatchify using the fixed ``self.input_size`` grid instead of per-call sizes."""
        channels = self.out_channels
        grid_t, grid_h, grid_w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        pt, ph, pw = self.patch_size

        x = x.reshape(shape=(x.shape[0], grid_t, grid_h, grid_w, pt, ph, pw, channels))
        x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
        return x.reshape(shape=(x.shape[0], channels, grid_t * pt, grid_h * ph, grid_w * pw))

    def get_spatial_pos_embed(self, H, W, scale=1.0, base_size=None):
        """Return a non-trainable float tensor holding a 2-D sin-cos embedding, with a leading unit dimension."""
        grid_embed = get_2d_sincos_pos_embed(
            self.hidden_size,
            (H, W),
            scale=scale,
            base_size=base_size,
        )
        embed = torch.from_numpy(grid_embed).float().unsqueeze(0)
        return embed.requires_grad_(False)

    def freeze_not_temporal(self):
        """Disable gradients for every parameter whose name lacks ``"attn_temp"``."""
        for name, param in self.named_parameters():
            if "attn_temp" in name:
                continue
            param.requires_grad = False

    def freeze_text(self):
        """Disable gradients for every parameter whose name contains ``"cross_attn"``."""
        for name, param in self.named_parameters():
            if "cross_attn" not in name:
                continue
            param.requires_grad = False

    def initialize_temporal(self):
        """Zero the weight and bias of each block's ``attn_temp.proj`` layer."""
        for blk in self.blocks:
            out_proj = blk.attn_temp.proj
            for tensor in (out_proj.weight, out_proj.bias):
                nn.init.constant_(tensor, 0)

    def initialize_weights(self):
        """Initialise the model's parameters in place.

        Steps, in order: Xavier-uniform weights and zero biases for every
        ``nn.Linear``; Xavier-uniform for the patch-embedding projection viewed
        as a 2-D matrix; normal(std=0.02) weights for the timestep MLP, both
        modulation projections and the caption MLP; zeros for each block's
        cross-attention output projection and for the final linear layer.
        """

        def _init_linear(layer):
            # Only nn.Linear layers are touched by the generic pass.
            if not isinstance(layer, nn.Linear):
                return
            torch.nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)

        self.apply(_init_linear)

        # Flatten the patch-embedding kernel to (out_channels, -1) so it is
        # initialised like an nn.Linear weight; the view shares storage.
        patch_weight = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(patch_weight.view([patch_weight.shape[0], -1]))

        # Timestep MLP, modulation projections, then caption MLP. The sequence
        # is fixed so random draws are consumed in the same order.
        caption_proj = self.y_embedder.y_proj
        normal_layers = (
            self.t_embedder.mlp[0],
            self.t_embedder.mlp[2],
            self.t_block[1],
            self.t_block_temp[1],
            caption_proj.fc1,
            caption_proj.fc2,
        )
        for layer in normal_layers:
            nn.init.normal_(layer.weight, std=0.02)

        # Zero the cross-attention output projection of every block.
        for blk in self.blocks:
            cross_proj = blk.cross_attn.proj
            nn.init.constant_(cross_proj.weight, 0)
            nn.init.constant_(cross_proj.bias, 0)

        # Zero the output layer.
        head = self.final_layer.linear
        nn.init.constant_(head.weight, 0)
        nn.init.constant_(head.bias, 0)


@MODELS.register_module("STDiT2-XL/2")
def STDiT2_XL_2(from_pretrained=None, **kwargs):
    """Build an ``STDiT2`` with depth 28, hidden size 1152, patch size (1, 2, 2) and 16 heads.

    Args:
        from_pretrained: ``None`` to create a freshly initialised model; an
            existing directory or file path to load a checkpoint manually with
            ``load_checkpoint``; any other value is handed to
            ``STDiT2.from_pretrained``.
        **kwargs: extra ``STDiT2Config`` arguments. They are used when the
            model is built locally and are NOT forwarded on the
            ``STDiT2.from_pretrained`` path.

    Returns:
        The constructed (and possibly loaded) model.
    """
    is_local = from_pretrained is not None and (
        os.path.isdir(from_pretrained) or os.path.isfile(from_pretrained)
    )
    if from_pretrained is not None and not is_local:
        # Not a path on disk: delegate to from_pretrained.
        return STDiT2.from_pretrained(from_pretrained)

    config = STDiT2Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    model = STDiT2(config)
    if is_local:
        # A directory or a file: load the checkpoint manually.
        load_checkpoint(model, from_pretrained)
    return model


================================================
FILE: Open-Sora/build/lib/opensora/models/stdit/stdit3 copy.py
================================================
import os

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from rotary_embedding_torch import RotaryEmbedding
from timm.models.layers import DropPath
from timm.models.vision_transformer import Mlp
from transformers import PretrainedConfig, PreTrainedModel

from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.acceleration.communications import gather_forward_split_backward, split_forward_gather_backward
from opensora.acceleration.parallel_states import get_sequence_parallel_group
from opensora.models.layers.blocks import (
    Attention,
    CaptionEmbedder,
    MultiHeadCrossAttention,
    PatchEmbed3D,
    PositionEmbedding2D,
    SeqParallelAttention,
    SeqParallelMultiHeadCrossAttention,
    SizeEmbedder,
    T2IFinalLayer,
    TimestepEmbedder,
    approx_gelu,
    get_layernorm,
    t2i_modulate,
)
from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint

from ...models.cache_functions import global_force_fresh, cache_cutfresh, update_cache, force_init, score_evaluate

class STDiT3Block(nn.Module):
    """Attention / cross-attention / MLP block with a token cache.

    A block is either spatial or temporal (``temporal`` flag); the flag decides
    how tokens are grouped for self-attention and which cache slot is used.
    ``forward`` either recomputes everything and refreshes the cache, or
    recomputes cross-attention and the MLP only for a subset of tokens chosen
    by ``cache_cutfresh`` and reads the rest from the cache.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        rope=None,
        qk_norm=False,
        temporal=False,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.temporal = temporal
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self.enable_sequence_parallelism = enable_sequence_parallelism

        # Sequence-parallel attention classes are used only for spatial blocks.
        if self.enable_sequence_parallelism and not temporal:
            attn_cls = SeqParallelAttention
            mha_cls = SeqParallelMultiHeadCrossAttention
        else:
            attn_cls = Attention
            mha_cls = MultiHeadCrossAttention

        self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.attn = attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            qk_norm=qk_norm,
            rope=rope,
            enable_flash_attn=enable_flash_attn,
        )
        self.cross_attn = mha_cls(hidden_size, num_heads)
        self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.mlp = Mlp(
            in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        # Rows: shift/scale/gate for attention, then shift/scale/gate for the MLP.
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

    def t_mask_select(self, x_mask, x, masked_x, T, S):
        """Per frame, pick ``x`` where ``x_mask`` is True and ``masked_x`` elsewhere."""
        # x: [B, (T, S), C]
        # masked_x: [B, (T, S), C]
        # x_mask: [B, T]
        x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
        masked_x = rearrange(masked_x, "B (T S) C -> B T S C", T=T, S=S)
        x = torch.where(x_mask[:, :, None, None], x, masked_x)
        x = rearrange(x, "B T S C -> B (T S) C")
        return x

    def forward(
        self,
        x,
        y,
        t,
        current,
        cache_dic,
        mask=None,  # text mask
        x_mask=None,  # temporal mask
        t0=None,  # t with timestamp=0
        T=None,  # number of frames
        S=None,  # number of pixel patches
    ):
        """Run the block, refreshing or reusing cached sub-layer outputs.

        Args:
            x: input tokens of shape [B, N, C]; N is rearranged as (T S).
            y: conditioning sequence passed to cross-attention.
            t: conditioning vector reshaped to [B, 6, -1] for modulation.
            current: mutable dict describing the current position; this method
                writes ``'flag'``, ``'is_force_fresh'`` and ``'module'`` and
                reads ``'layer'``.
            cache_dic: mutable dict; ``cache_dic['cache'][flag][layer][module]``
                holds the cached sub-layer output and
                ``cache_dic['cross_attn_map'][flag][layer]`` the cross-attention map.
            mask: text mask passed to cross-attention.
            x_mask: optional frame mask of shape [B, T]; requires ``t0``.
            t0: counterpart of ``t`` used for frames where ``x_mask`` is False.
            T: number of frames.
            S: number of spatial tokens per frame.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # prepare modulate parameters
        B, N, C = x.shape
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + t.reshape(B, 6, -1)
        ).chunk(6, dim=1)
        if x_mask is not None:
            shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = (
                self.scale_shift_table[None] + t0.reshape(B, 6, -1)
            ).chunk(6, dim=1)
        #attn_tick = torch.cuda.Event(enable_timing=True)
        #cross_attn_tick = torch.cuda.Event(enable_timing=True)
        #end_cross_attn_tick = torch.cuda.Event(enable_timing=True)
        #mlp_tick = torch.cuda.Event(enable_timing=True)
        #end = torch.cuda.Event(enable_timing=True)
        # Cache slot selector: -1 for temporal blocks, 0 for spatial blocks.
        if self.temporal:
            current['flag'] = -1
        else:
            current['flag'] = 0
        # The helper decides whether this call recomputes everything; the
        # decision is also published in `current` for the other cache helpers.
        is_force_fresh = global_force_fresh(cache_dic, current)
        current['is_force_fresh'] = is_force_fresh
        #print(is_force_fresh)
        if is_force_fresh:
            # Full-compute path: every sub-layer runs on all tokens and its
            # output is written to the cache, followed by force_init.
            # modulate (attention)
            current['module'] = 'attn'
            #attn_tick.record()
            x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
            if x_mask is not None:
                x_m_zero = t2i_modulate(self.norm1(x), shift_msa_zero, scale_msa_zero)
                x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)

            # attention
            if self.temporal:
                # attend across the T frames at each spatial position
                x_m = rearrange(x_m, "B (T S) C -> (B S) T C", T=T, S=S)
                x_m = self.attn(x_m)
                x_m = rearrange(x_m, "(B S) T C -> B (T S) C", T=T, S=S)
            else:
                # attend across the S tokens of each frame
                x_m = rearrange(x_m, "B (T S) C -> (B T) S C", T=T, S=S)
                x_m = self.attn(x_m)
                x_m = rearrange(x_m, "(B T) S C -> B (T S) C", T=T, S=S)

            # The ungated attention output is what gets cached.
            cache_dic['cache'][current['flag']][current['layer']][current['module']] = x_m
            force_init(cache_dic, current, x)
            # modulate (attention)
            x_m_s = gate_msa * x_m
            if x_mask is not None:
                x_m_s_zero = gate_msa_zero * x_m
                x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S)

            # residual
            x = x + self.drop_path(x_m_s)

            # cross attention
            current['module'] = 'cross-attn'
            #cross_attn_tick.record()
            # cross_attn returns a pair here: the output tokens and the attention map.
            cache_dic['cache'][current['flag']][current['layer']][current['module']], cache_dic['cross_attn_map'][current['flag']][current['layer']] = self.cross_attn(x, y, mask)
            force_init(cache_dic, current, x)

            x = x + cache_dic['cache'][current['flag']][current['layer']][current['module']]

            # modulate (MLP)
            current['module'] = 'mlp'
            #mlp_tick.record()
            x_m = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
            if x_mask is not None:
                x_m_zero = t2i_modulate(self.norm2(x), shift_mlp_zero, scale_mlp_zero)
                x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)

            # MLP
            x_m = self.mlp(x_m)
            cache_dic['cache'][current['flag']][current['layer']][current['module']] = x_m
            # modulate (MLP)
            x_m_s = gate_mlp * x_m
            if x_mask is not None:
                x_m_s_zero = gate_mlp_zero * x_m
                x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S)

            # residual
            # force_init is given x as it is before the MLP residual is added.
            force_init(cache_dic, current, x)
            x = x + self.drop_path(x_m_s)
            #end.record()
            #torch.cuda.synchronize()
            #print(attn_tick.elapsed_time(cross_attn_tick),cross_attn_tick.elapsed_time(mlp_tick),mlp_tick.elapsed_time(end))
        else:
            # Cached path: self-attention is still recomputed in full, while
            # cross-attention and the MLP run only on the tokens returned by
            # cache_cutfresh; the cache is then updated and read back.
            # modulate (attention)
            current['module'] = 'attn'
            #attn_tick.record()
            #cal_attn = current['step'] % cache_dic['cal_threshold'] == 1
            # Hard-coded to True, so the attention below always runs.
            cal_attn = True
            if cal_attn:
                x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
                if x_mask is not None:
                    x_m_zero = t2i_modulate(self.norm1(x), shift_msa_zero, scale_msa_zero)
                    x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)

                # attention
                if self.temporal:
                    x_m = rearrange(x_m, "B (T S) C -> (B S) T C", T=T, S=S)
                    x_m = self.attn(x_m)
                    x_m = rearrange(x_m, "(B S) T C -> B (T S) C", T=T, S=S)
                else:
                    x_m = rearrange(x_m, "B (T S) C -> (B T) S C", T=T, S=S)
                    x_m = self.attn(x_m)
                    x_m = rearrange(x_m, "(B T) S C -> B (T S) C", T=T, S=S)

                cache_dic['cache'][current['flag']][current['layer']][current['module']] = x_m
            
            x_m = cache_dic['cache'][current['flag']][current['layer']][current['module']]
            
            # modulate (attention)
            x_m_s = gate_msa * x_m
            if x_mask is not None:
                x_m_s_zero = gate_msa_zero * x_m
                x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S)

            # residual
            x = x + self.drop_path(x_m_s)

            # cross attention
            current['module'] = 'cross-attn'

            #cache_dic['cache'][flag][current['layer']][current['module']] = self.cross_attn(x, y, mask)
            #x = x + cache_dic['cache'][flag][current['layer']][current['module']]

            # Select the tokens to recompute, run cross-attention on just
            # those, and hand the results to update_cache.
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x, current) # 0.6ms

            fresh_tokens, fresh_cross_attn_map = self.cross_attn(fresh_tokens, y, mask) # 0.45ms
            #cross_attn_tick.record()
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current, fresh_attn_map=fresh_cross_attn_map) # 0.3ms
            #cache_dic['cache'][-1][current['layer']][current['module']] = self.cross_attn(x, y, mask)
            x = x + cache_dic['cache'][current['flag']][current['layer']][current['module']] 

            # modulate (MLP)
            current['module'] = 'mlp'
            #mlp_tick.record()
            x_m = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
            if x_mask is not None:
                x_m_zero = t2i_modulate(self.norm2(x), shift_mlp_zero, scale_mlp_zero)
                x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)
            # MLP
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x_m, current)
            fresh_tokens = self.mlp(fresh_tokens)
            # NOTE(review): fresh_attn_map here is still the map from the
            # cross-attention step above, computed for that step's
            # fresh_indices -- confirm update_cache ignores it for 'mlp'.
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current, fresh_attn_map=fresh_cross_attn_map)

            x_m = cache_dic['cache'][current['flag']][current['layer']][current['module']]
            # modulate (MLP)
            x_m_s = gate_mlp * x_m
            if x_mask is not None:
                x_m_s_zero = gate_mlp_zero * x_m
                x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S)

            # residual
            x = x + self.drop_path(x_m_s)
            #end.record()
            #torch.cuda.synchronize()
            #print("Cached:",attn_tick.elapsed_time(cross_attn_tick),cross_attn_tick.elapsed_time(mlp_tick),mlp_tick.elapsed_time(end))
            #print(cross_attn_tick.elapsed_time(end_cross_attn_tick))
        return x


class STDiT3Config(PretrainedConfig):
    """Configuration for ``STDiT3``.

    Every named constructor argument is stored as an attribute of the same
    name; any remaining keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.
    """

    model_type = "STDiT3"

    def __init__(
        self,
        input_size=(None, None, None),
        input_sq_size=512,
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        pred_sigma=True,
        drop_path=0.0,
        caption_channels=4096,
        model_max_length=300,
        qk_norm=True,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
        only_train_temporal=False,
        freeze_y_embedder=False,
        skip_y_embedder=False,
        **kwargs,
    ):
        # Attributes are set before the base initialiser runs, in signature order.
        settings = {
            "input_size": input_size,
            "input_sq_size": input_sq_size,
            "in_channels": in_channels,
            "patch_size": patch_size,
            "hidden_size": hidden_size,
            "depth": depth,
            "num_heads": num_heads,
            "mlp_ratio": mlp_ratio,
            "class_dropout_prob": class_dropout_prob,
            "pred_sigma": pred_sigma,
            "drop_path": drop_path,
            "caption_channels": caption_channels,
            "model_max_length": model_max_length,
            "qk_norm": qk_norm,
            "enable_flash_attn": enable_flash_attn,
            "enable_layernorm_kernel": enable_layernorm_kernel,
            "enable_sequence_parallelism": enable_sequence_parallelism,
            "only_train_temporal": only_train_temporal,
            "freeze_y_embedder": freeze_y_embedder,
            "skip_y_embedder": skip_y_embedder,
        }
        for attr_name, attr_value in settings.items():
            setattr(self, attr_name, attr_value)
        super().__init__(**kwargs)


class STDiT3(PreTrainedModel):
    config_class = STDiT3Config

    def __init__(self, config):
        """Assemble the STDiT3 network described by ``config``.

        Sub-modules are created in a fixed order (positional embeddings,
        embedders, spatial blocks, temporal blocks, final layer), weights are
        initialised, and parameters are then frozen according to
        ``config.only_train_temporal`` and ``config.freeze_y_embedder``.
        """
        super().__init__(config)
        hidden = config.hidden_size

        # output head: twice the input channels when ``pred_sigma`` is set
        self.pred_sigma = config.pred_sigma
        self.in_channels = config.in_channels
        if config.pred_sigma:
            self.out_channels = config.in_channels * 2
        else:
            self.out_channels = config.in_channels

        # model size
        self.depth = config.depth
        self.mlp_ratio = config.mlp_ratio
        self.hidden_size = hidden
        self.num_heads = config.num_heads

        # compute options
        self.drop_path = config.drop_path
        self.enable_flash_attn = config.enable_flash_attn
        self.enable_layernorm_kernel = config.enable_layernorm_kernel
        self.enable_sequence_parallelism = config.enable_sequence_parallelism

        # input size and positional information
        self.patch_size = config.patch_size
        self.input_sq_size = config.input_sq_size
        self.pos_embed = PositionEmbedding2D(hidden)
        self.rope = RotaryEmbedding(dim=self.hidden_size // self.num_heads)

        # embedders
        self.x_embedder = PatchEmbed3D(config.patch_size, config.in_channels, hidden)
        self.t_embedder = TimestepEmbedder(hidden)
        self.fps_embedder = SizeEmbedder(self.hidden_size)
        self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden, 6 * hidden, bias=True))
        self.y_embedder = CaptionEmbedder(
            in_channels=config.caption_channels,
            hidden_size=hidden,
            uncond_prob=config.class_dropout_prob,
            act_layer=approx_gelu,
            token_num=config.model_max_length,
        )

        def _build_blocks(**extra):
            # One block per layer; the drop-path rate grows linearly from 0
            # to ``self.drop_path`` across the stack.
            rates = [rate.item() for rate in torch.linspace(0, self.drop_path, config.depth)]
            return nn.ModuleList(
                [
                    STDiT3Block(
                        hidden_size=hidden,
                        num_heads=config.num_heads,
                        mlp_ratio=config.mlp_ratio,
                        drop_path=rate,
                        qk_norm=config.qk_norm,
                        enable_flash_attn=config.enable_flash_attn,
                        enable_layernorm_kernel=config.enable_layernorm_kernel,
                        enable_sequence_parallelism=config.enable_sequence_parallelism,
                        **extra,
                    )
                    for rate in rates
                ]
            )

        # spatial blocks first, then temporal blocks (which also get RoPE)
        self.spatial_blocks = _build_blocks()
        self.temporal_blocks = _build_blocks(temporal=True, rope=self.rope.rotate_queries_or_keys)

        # final layer
        self.final_layer = T2IFinalLayer(hidden, np.prod(self.patch_size), self.out_channels)

        self.initialize_weights()

        if config.only_train_temporal:
            # freeze everything, then re-enable only the temporal blocks
            self.requires_grad_(False)
            self.temporal_blocks.requires_grad_(True)

        if config.freeze_y_embedder:
            self.y_embedder.requires_grad_(False)

    def initialize_weights(self):
        """Initialise the model's weights in place.

        Every ``nn.Linear`` gets Xavier-uniform weights and a zero bias. The
        fps embedder and the output projections of the temporal blocks are
        then overridden.
        """

        def _init_linear(layer):
            if not isinstance(layer, nn.Linear):
                return
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

        self.apply(_init_linear)

        # fps embedder: small-normal first layer, all-zero last layer
        fps_mlp = self.fps_embedder.mlp
        nn.init.normal_(fps_mlp[0].weight, std=0.02)
        nn.init.zeros_(fps_mlp[0].bias)
        nn.init.zeros_(fps_mlp[2].weight)
        nn.init.zeros_(fps_mlp[2].bias)

        # temporal blocks: zero the weight of each branch's output projection
        for block in self.temporal_blocks:
            for proj in (block.attn.proj, block.cross_attn.proj, block.mlp.fc2):
                nn.init.zeros_(proj.weight)

    def get_dynamic_size(self, x):
        """Return the patch-grid size ``(T, H, W)`` for a five-dimensional ``x``.

        Each of the last three dimensions of ``x`` is divided by the matching
        entry of ``self.patch_size``, rounding up so that a partial patch at
        the end still counts as one patch.
        """
        _, _, frames, rows, cols = x.size()
        extents = (frames, rows, cols)
        # -(-a // b) is ceiling division for positive integers
        return tuple(-(-extent // self.patch_size[axis]) for axis, extent in enumerate(extents))

    def encode_text(self, y, mask=None):
        """Embed the caption features and pack all samples into one sequence.

        Args:
            y: caption features handed to ``self.y_embedder``.
            mask: optional token mask; non-zero entries mark tokens to keep.
                If its batch size differs from the embeddings' it is tiled
                along the batch axis.

        Returns:
            ``(packed, lengths)`` where ``packed`` is viewed as
            ``[1, total_tokens, hidden_size]`` and ``lengths`` is a list with
            the number of tokens kept per sample.
        """
        embedded = self.y_embedder(y, self.training)  # [B, 1, N_token, C]
        batch = embedded.shape[0]

        if mask is None:
            # no mask: every sample keeps all of its tokens
            lengths = [embedded.shape[2]] * batch
            packed = embedded.squeeze(1).view(1, -1, self.hidden_size)
            return packed, lengths

        if mask.shape[0] != batch:
            mask = mask.repeat(batch // mask.shape[0], 1)
        mask = mask.squeeze(1).squeeze(1)
        keep = mask.unsqueeze(-1) != 0
        packed = embedded.squeeze(1).masked_select(keep).view(1, -1, self.hidden_size)
        lengths = mask.sum(dim=1).tolist()
        return packed, lengths

    def forward(self, x, timestep, y, mask=None, x_mask=None, fps=None, height=None, width=None, cache_dic=None, current=None, **kwargs):
        """Run the cached STDiT3 forward pass and return the unpatchified output.

        Args:
            x: input tensor; it is unpacked as five dimensions and cast to the
                dtype of the patch-embedding weight.
            timestep: timestep tensor, cast to the same dtype.
            y: text conditioning, cast to the same dtype; passed through
                ``encode_text`` unless ``config.skip_y_embedder`` is set.
            mask: forwarded to ``encode_text``; when ``skip_y_embedder`` is
                set it is used directly as the per-sample text lengths.
            x_mask: optional mask; when given, a second conditioning computed
                from an all-zero timestep is built and passed to every block.
            fps: tensor fed to ``fps_embedder``. Required in practice: the
                ``None`` default fails on ``fps.unsqueeze``.
            height, width: indexable tensors of which only element 0 is read.
                Required in practice.
            cache_dic: mutable mapping holding the cache state; it is written
                here and handed to every block. Required in practice.
            current: mutable mapping; ``current['layer']`` is set before each
                spatial/temporal block pair. Required in practice.
            **kwargs: accepted and ignored.

        Returns:
            ``float32`` tensor produced by ``unpatchify``, cropped to the last
            three dimensions of the input.
        """
        dtype = self.x_embedder.proj.weight.dtype
        B = x.size(0)
        x = x.to(dtype)
        timestep = timestep.to(dtype)
        y = y.to(dtype)

        # === get pos embed ===
        # Tx/Hx/Wx are the raw input extents, kept for the final crop.
        _, _, Tx, Hx, Wx = x.size()
        T, H, W = self.get_dynamic_size(x)
        # NOTE(review): recorded before the sequence-parallel padding below can
        # enlarge H — confirm the cache functions expect the unpadded grid.
        cache_dic['dynamic_size'] = (B,T,H,W)
        # adjust for sequence parallelism
        # we need to ensure H * W is divisible by sequence parallel size
        # for simplicity, we can adjust the height to make it divisible
        if self.enable_sequence_parallelism:
            sp_size = dist.get_world_size(get_sequence_parallel_group())
            if H % sp_size != 0:
                h_pad_size = sp_size - H % sp_size
            else:
                h_pad_size = 0

            if h_pad_size > 0:
                # padding is counted in patches; convert to input rows
                hx_pad_size = h_pad_size * self.patch_size[1]

                # pad x along the H dimension
                H += h_pad_size
                x = F.pad(x, (0, 0, 0, hx_pad_size))

        S = H * W
        base_size = round(S**0.5)
        # only the first sample's height/width are used for the scale
        resolution_sq = (height[0].item() * width[0].item()) ** 0.5
        scale = resolution_sq / self.input_sq_size
        pos_emb = self.pos_embed(x, H, W, scale=scale, base_size=base_size)

        # === get timestep embed ===
        t = self.t_embedder(timestep, dtype=x.dtype)  # [B, C]
        fps = self.fps_embedder(fps.unsqueeze(1), B)
        t = t + fps
        t_mlp = self.t_block(t)
        t0 = t0_mlp = None
        if x_mask is not None:
            # second conditioning built from an all-zero timestep
            t0_timestep = torch.zeros_like(timestep)
            t0 = self.t_embedder(t0_timestep, dtype=x.dtype)
            t0 = t0 + fps
            t0_mlp = self.t_block(t0)

        # === get y embed ===
        if self.config.skip_y_embedder:
            # y is used as given; mask is interpreted as the text lengths
            y_lens = mask
            if isinstance(y_lens, torch.Tensor):
                y_lens = y_lens.long().tolist()
        else:
            y, y_lens = self.encode_text(y, mask)

        # === get x embed ===
        x = self.x_embedder(x)  # [B, N, C]
        x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
        x = x + pos_emb

        # shard over the sequence dim if sp is enabled
        if self.enable_sequence_parallelism:
            x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=2, grad_scale="down")
            S = S // dist.get_world_size(get_sequence_parallel_group())

        x = rearrange(x, "B T S C -> B (T S) C", T=T, S=S)

        # === blocks ===
        for i, (spatial_block, temporal_block) in enumerate(zip(self.spatial_blocks, self.temporal_blocks)):
            # the blocks read the layer index from ``current``
            current['layer'] = i
            #x = auto_grad_checkpoint(spatial_block,  x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S)
            #x = auto_grad_checkpoint(temporal_block, x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S)
            x = spatial_block(x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S)
            x = temporal_block(x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S)

        if self.enable_sequence_parallelism:
            # undo the sharding so the final layer sees the full sequence
            x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
            x = gather_forward_split_backward(x, get_sequence_parallel_group(), dim=2, grad_scale="up")
            S = S * dist.get_world_size(get_sequence_parallel_group())
            x = rearrange(x, "B T S C -> B (T S) C", T=T, S=S)

        # === final layer ===
        x = self.final_layer(x, t, x_mask, t0, T, S)
        x = self.unpatchify(x, T, H, W, Tx, Hx, Wx)

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x

    def unpatchify(self, x, N_t, N_h, N_w, R_t, R_h, R_w):
        """Fold a token sequence back into a video tensor and crop it.

        Args:
            x (torch.Tensor): of shape [B, N, C]
            N_t, N_h, N_w: number of patches along T, H and W.
            R_t, R_h, R_w: extents to keep along T, H and W after folding.

        Return:
            x (torch.Tensor): of shape [B, C_out, T, H, W]
        """
        patch_t, patch_h, patch_w = self.patch_size
        pattern = "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)"
        axis_sizes = dict(
            N_t=N_t,
            N_h=N_h,
            N_w=N_w,
            T_p=patch_t,
            H_p=patch_h,
            W_p=patch_w,
            C_out=self.out_channels,
        )
        video = rearrange(x, pattern, **axis_sizes)
        # drop whatever lies beyond the requested extents (unpad)
        return video[:, :, :R_t, :R_h, :R_w]


@MODELS.register_module("STDiT3-XL/2")
def STDiT3_XL_2(from_pretrained=None, **kwargs):
    """Create the STDiT3-XL/2 model.

    ``STDiT3.from_pretrained`` is used when ``force_huggingface`` is passed or
    when ``from_pretrained`` is given but is not an existing local path.
    Otherwise the model is built from a fresh config (depth 28, hidden size
    1152, 16 heads, patch size (1, 2, 2)) and, if ``from_pretrained`` is given,
    the local checkpoint is loaded into it.
    """
    force_huggingface = kwargs.pop("force_huggingface", False)
    load_via_from_pretrained = force_huggingface or (
        from_pretrained is not None and not os.path.exists(from_pretrained)
    )
    if load_via_from_pretrained:
        return STDiT3.from_pretrained(from_pretrained, **kwargs)

    model = STDiT3(STDiT3Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs))
    if from_pretrained is not None:
        load_checkpoint(model, from_pretrained)
    return model


@MODELS.register_module("STDiT3-3B/2")
def STDiT3_3B_2(from_pretrained=None, **kwargs):
    """Create the STDiT3-3B/2 model.

    ``STDiT3.from_pretrained`` is used when ``force_huggingface`` is passed or
    when ``from_pretrained`` is given but is not an existing local path.
    Otherwise the model is built from a fresh config (depth 28, hidden size
    1872, 26 heads, patch size (1, 2, 2)) and, if ``from_pretrained`` is given,
    the local checkpoint is loaded into it.
    """
    force_huggingface = kwargs.pop("force_huggingface", False)
    load_via_from_pretrained = force_huggingface or (
        from_pretrained is not None and not os.path.exists(from_pretrained)
    )
    if load_via_from_pretrained:
        return STDiT3.from_pretrained(from_pretrained, **kwargs)

    model = STDiT3(STDiT3Config(depth=28, hidden_size=1872, patch_size=(1, 2, 2), num_heads=26, **kwargs))
    if from_pretrained is not None:
        load_checkpoint(model, from_pretrained)
    return model


================================================
FILE: Open-Sora/build/lib/opensora/models/stdit/stdit3.py
================================================
import os

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from rotary_embedding_torch import RotaryEmbedding
from timm.models.layers import DropPath
from timm.models.vision_transformer import Mlp
from transformers import PretrainedConfig, PreTrainedModel

from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.acceleration.communications import gather_forward_split_backward, split_forward_gather_backward
from opensora.acceleration.parallel_states import get_sequence_parallel_group
from opensora.models.layers.blocks import (
    Attention,
    CaptionEmbedder,
    MultiHeadCrossAttention,
    PatchEmbed3D,
    PositionEmbedding2D,
    SeqParallelAttention,
    SeqParallelMultiHeadCrossAttention,
    SizeEmbedder,
    T2IFinalLayer,
    TimestepEmbedder,
    approx_gelu,
    get_layernorm,
    t2i_modulate,
)
from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint

from ...models.cache_functions import global_force_fresh, cache_cutfresh, update_cache, force_init, score_evaluate

class STDiT3Block(nn.Module):
    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        rope=None,
        qk_norm=False,
        temporal=False,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        """Create one STDiT3 block: self-attention, cross-attention and MLP.

        The sequence-parallel attention classes are used only for
        non-temporal blocks when ``enable_sequence_parallelism`` is set.
        """
        super().__init__()
        self.temporal = temporal
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self.enable_sequence_parallelism = enable_sequence_parallelism

        use_seq_parallel = self.enable_sequence_parallelism and not temporal
        attn_cls = SeqParallelAttention if use_seq_parallel else Attention
        mha_cls = SeqParallelMultiHeadCrossAttention if use_seq_parallel else MultiHeadCrossAttention

        def _make_norm():
            # non-affine layer norm; scale and shift come from the modulation
            return get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)

        self.norm1 = _make_norm()
        self.attn = attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            qk_norm=qk_norm,
            rope=rope,
            enable_flash_attn=enable_flash_attn,
        )
        self.cross_attn = mha_cls(hidden_size, num_heads)
        self.norm2 = _make_norm()

        mlp_hidden = int(hidden_size * mlp_ratio)
        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden, act_layer=approx_gelu, drop=0)

        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        # six learned rows: shift/scale/gate for attention and for the MLP
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

    def t_mask_select(self, x_mask, x, masked_x, T, S):
        """Pick, frame by frame, between ``x`` and ``masked_x``.

        Args:
            x_mask: [B, T]; where it is true the frame comes from ``x``,
                elsewhere from ``masked_x``.
            x: [B, (T, S), C]
            masked_x: [B, (T, S), C]
            T: number of frames.
            S: number of tokens per frame.

        Returns:
            The merged tensor, flattened back to [B, (T, S), C].
        """
        split_frames = "B (T S) C -> B T S C"
        x_frames = rearrange(x, split_frames, T=T, S=S)
        masked_frames = rearrange(masked_x, split_frames, T=T, S=S)
        # broadcast the per-frame mask over the S and C axes
        frame_mask = x_mask[:, :, None, None]
        merged = torch.where(frame_mask, x_frames, masked_frames)
        return rearrange(merged, "B T S C -> B (T S) C")

    def forward(
        self,
        x,
        y,
        t,
        current,
        cache_dic,
        mask=None,  # text mask
        x_mask=None,  # temporal mask
        t0=None,  # t with timestamp=0
        T=None,  # number of frames
        S=None,  # number of pixel patches
    ):
        """Run one STDiT3 block with per-module feature caching.

        For each of the three sub-modules ('attn', 'cross-attn', 'mlp') the
        decision returned by ``global_force_fresh`` selects the path:

        * force-fresh: the module runs on all tokens, its output is stored in
          ``cache_dic['cache'][flag][layer][module]`` and ``force_init`` is
          called;
        * otherwise: attention reuses the cached output as is, while
          cross-attention and the MLP run only on the tokens returned by
          ``cache_cutfresh`` and hand them to ``update_cache``.

        Args:
            x: token tensor of shape [B, N, C].
            y: text conditioning passed to cross-attention.
            t: conditioning reshaped to [B, 6, -1] and added to
                ``scale_shift_table``.
            current: mutable mapping; ``'flag'``, ``'module'`` and
                ``'is_force_fresh'`` are written here, ``'layer'`` is read.
            cache_dic: mutable mapping holding ``'cache'`` and
                ``'cross_attn_map'``.
            mask: text mask passed to cross-attention.
            x_mask: optional temporal mask; when given, ``t0`` is required and
                frames where the mask is false use the ``t0`` modulation.
            t0: conditioning for timestep zero, same layout as ``t``.
            T: number of frames.
            S: number of tokens per frame.

        Returns:
            The updated token tensor, same shape as ``x``.
        """
        # prepare modulate parameters
        B, N, C = x.shape
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + t.reshape(B, 6, -1)
        ).chunk(6, dim=1)
        if x_mask is not None:
            shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = (
                self.scale_shift_table[None] + t0.reshape(B, 6, -1)
            ).chunk(6, dim=1)

        # cache slots are keyed by block type: -1 for temporal, 0 for spatial
        if self.temporal:
            current['flag'] = -1
        else:
            current['flag'] = 0
        is_force_fresh = global_force_fresh(cache_dic, current)
        current['is_force_fresh'] = is_force_fresh

        # ---- self-attention ----
        current['module'] = 'attn'

        if is_force_fresh[current['module']]:
            x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
            if x_mask is not None:
                x_m_zero = t2i_modulate(self.norm1(x), shift_msa_zero, scale_msa_zero)
                x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)

            # temporal blocks attend across frames, spatial blocks within a frame
            if self.temporal:
                x_m = rearrange(x_m, "B (T S) C -> (B S) T C", T=T, S=S)
                x_m = self.attn(x_m)
                x_m = rearrange(x_m, "(B S) T C -> B (T S) C", T=T, S=S)
            else:
                x_m = rearrange(x_m, "B (T S) C -> (B T) S C", T=T, S=S)
                x_m = self.attn(x_m)
                x_m = rearrange(x_m, "(B T) S C -> B (T S) C", T=T, S=S)

            cache_dic['cache'][current['flag']][current['layer']][current['module']] = x_m
            force_init(cache_dic, current, x)
        else:
            # attention output is reused from the cache without any refresh
            x_m = cache_dic['cache'][current['flag']][current['layer']][current['module']]

        # gate the attention output and add it to the residual stream
        x_m_s = gate_msa * x_m
        if x_mask is not None:
            x_m_s_zero = gate_msa_zero * x_m
            x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S)
        x = x + self.drop_path(x_m_s)

        # ---- cross-attention ----
        current['module'] = 'cross-attn'

        if is_force_fresh[current['module']]:
            cache_dic['cache'][current['flag']][current['layer']][current['module']], cache_dic['cross_attn_map'][current['flag']][current['layer']] = self.cross_attn(x, y, mask)
            force_init(cache_dic, current, x)
        else:
            # recompute only the selected tokens and merge them into the cache
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x, current)
            fresh_tokens, fresh_cross_attn_map = self.cross_attn(fresh_tokens, y, mask)
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current, fresh_attn_map=fresh_cross_attn_map)
        x = x + cache_dic['cache'][current['flag']][current['layer']][current['module']]

        # ---- MLP ----
        current['module'] = 'mlp'

        x_m = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
        if x_mask is not None:
            x_m_zero = t2i_modulate(self.norm2(x), shift_mlp_zero, scale_mlp_zero)
            x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)

        if is_force_fresh[current['module']]:
            x_m = self.mlp(x_m)
            cache_dic['cache'][current['flag']][current['layer']][current['module']] = x_m
            force_init(cache_dic, current, x)
        else:
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x_m, current)
            fresh_tokens = self.mlp(fresh_tokens)
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current)

        # Both gates must scale the MLP *output*. On the cached path ``x_m``
        # still holds the pre-MLP modulated input, so always read the output
        # back from the cache (on the fresh path this is the same tensor).
        mlp_out = cache_dic['cache'][current['flag']][current['layer']][current['module']]
        x_m_s = gate_mlp * mlp_out
        if x_mask is not None:
            x_m_s_zero = gate_mlp_zero * mlp_out
            x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S)

        # residual
        x = x + self.drop_path(x_m_s)
        return x


class STDiT3Config(PretrainedConfig):
    """Hyper-parameter container for :class:`STDiT3`.

    Each explicit keyword argument is stored on the instance under its own
    name; every other keyword argument is forwarded to ``PretrainedConfig``.
    """

    model_type = "STDiT3"

    def __init__(
        self,
        input_size=(None, None, None),
        input_sq_size=512,
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        pred_sigma=True,
        drop_path=0.0,
        caption_channels=4096,
        model_max_length=300,
        qk_norm=True,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
        only_train_temporal=False,
        freeze_y_embedder=False,
        skip_y_embedder=False,
        **kwargs,
    ):
        own_fields = (
            ("input_size", input_size),
            ("input_sq_size", input_sq_size),
            ("in_channels", in_channels),
            ("patch_size", patch_size),
            ("hidden_size", hidden_size),
            ("depth", depth),
            ("num_heads", num_heads),
            ("mlp_ratio", mlp_ratio),
            ("class_dropout_prob", class_dropout_prob),
            ("pred_sigma", pred_sigma),
            ("drop_path", drop_path),
            ("caption_channels", caption_channels),
            ("model_max_length", model_max_length),
            ("qk_norm", qk_norm),
            ("enable_flash_attn", enable_flash_attn),
            ("enable_layernorm_kernel", enable_layernorm_kernel),
            ("enable_sequence_parallelism", enable_sequence_parallelism),
            ("only_train_temporal", only_train_temporal),
            ("freeze_y_embedder", freeze_y_embedder),
            ("skip_y_embedder", skip_y_embedder),
        )
        # Store the model-specific fields first, in declaration order, then
        # let the base class consume whatever keyword arguments remain.
        for field_name, field_value in own_fields:
            setattr(self, field_name, field_value)
        super().__init__(**kwargs)


class STDiT3(PreTrainedModel):
    config_class = STDiT3Config

    def __init__(self, config):
        """Assemble the STDiT3 network described by ``config``.

        Sub-modules are created in a fixed order (positional embeddings,
        embedders, spatial blocks, temporal blocks, final layer), weights are
        initialised, and parameters are then frozen according to
        ``config.only_train_temporal`` and ``config.freeze_y_embedder``.
        """
        super().__init__(config)
        hidden = config.hidden_size

        # output head: twice the input channels when ``pred_sigma`` is set
        self.pred_sigma = config.pred_sigma
        self.in_channels = config.in_channels
        if config.pred_sigma:
            self.out_channels = config.in_channels * 2
        else:
            self.out_channels = config.in_channels

        # model size
        self.depth = config.depth
        self.mlp_ratio = config.mlp_ratio
        self.hidden_size = hidden
        self.num_heads = config.num_heads

        # compute options
        self.drop_path = config.drop_path
        self.enable_flash_attn = config.enable_flash_attn
        self.enable_layernorm_kernel = config.enable_layernorm_kernel
        self.enable_sequence_parallelism = config.enable_sequence_parallelism

        # input size and positional information
        self.patch_size = config.patch_size
        self.input_sq_size = config.input_sq_size
        self.pos_embed = PositionEmbedding2D(hidden)
        self.rope = RotaryEmbedding(dim=self.hidden_size // self.num_heads)

        # embedders
        self.x_embedder = PatchEmbed3D(config.patch_size, config.in_channels, hidden)
        self.t_embedder = TimestepEmbedder(hidden)
        self.fps_embedder = SizeEmbedder(self.hidden_size)
        self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden, 6 * hidden, bias=True))
        self.y_embedder = CaptionEmbedder(
            in_channels=config.caption_channels,
            hidden_size=hidden,
            uncond_prob=config.class_dropout_prob,
            act_layer=approx_gelu,
            token_num=config.model_max_length,
        )

        def _build_blocks(**extra):
            # One block per layer; the drop-path rate grows linearly from 0
            # to ``self.drop_path`` across the stack.
            rates = [rate.item() for rate in torch.linspace(0, self.drop_path, config.depth)]
            return nn.ModuleList(
                [
                    STDiT3Block(
                        hidden_size=hidden,
                        num_heads=config.num_heads,
                        mlp_ratio=config.mlp_ratio,
                        drop_path=rate,
                        qk_norm=config.qk_norm,
                        enable_flash_attn=config.enable_flash_attn,
                        enable_layernorm_kernel=config.enable_layernorm_kernel,
                        enable_sequence_parallelism=config.enable_sequence_parallelism,
                        **extra,
                    )
                    for rate in rates
                ]
            )

        # spatial blocks first, then temporal blocks (which also get RoPE)
        self.spatial_blocks = _build_blocks()
        self.temporal_blocks = _build_blocks(temporal=True, rope=self.rope.rotate_queries_or_keys)

        # final layer
        self.final_layer = T2IFinalLayer(hidden, np.prod(self.patch_size), self.out_channels)

        self.initialize_weights()

        if config.only_train_temporal:
            # freeze everything, then re-enable only the temporal blocks
            self.requires_grad_(False)
            self.temporal_blocks.requires_grad_(True)

        if config.freeze_y_embedder:
            self.y_embedder.requires_grad_(False)

    def initialize_weights(self):
        """Initialise the model's weights in place.

        Every ``nn.Linear`` gets Xavier-uniform weights and a zero bias. The
        fps embedder and the output projections of the temporal blocks are
        then overridden.
        """

        def _init_linear(layer):
            if not isinstance(layer, nn.Linear):
                return
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

        self.apply(_init_linear)

        # fps embedder: small-normal first layer, all-zero last layer
        fps_mlp = self.fps_embedder.mlp
        nn.init.normal_(fps_mlp[0].weight, std=0.02)
        nn.init.zeros_(fps_mlp[0].bias)
        nn.init.zeros_(fps_mlp[2].weight)
        nn.init.zeros_(fps_mlp[2].bias)

        # temporal blocks: zero the weight of each branch's output projection
        for block in self.temporal_blocks:
            for proj in (block.attn.proj, block.cross_attn.proj, block.mlp.fc2):
                nn.init.zeros_(proj.weight)

    def get_dynamic_size(self, x):
        """Return the patch-grid size ``(T, H, W)`` for a five-dimensional ``x``.

        Each of the last three dimensions of ``x`` is divided by the matching
        entry of ``self.patch_size``, rounding up so that a partial patch at
        the end still counts as one patch.
        """
        _, _, frames, rows, cols = x.size()
        extents = (frames, rows, cols)
        # -(-a // b) is ceiling division for positive integers
        return tuple(-(-extent // self.patch_size[axis]) for axis, extent in enumerate(extents))

    def encode_text(self, y, mask=None):
        """Embed the caption features and pack all samples into one sequence.

        Args:
            y: caption features handed to ``self.y_embedder``.
            mask: optional token mask; non-zero entries mark tokens to keep.
                If its batch size differs from the embeddings' it is tiled
                along the batch axis.

        Returns:
            ``(packed, lengths)`` where ``packed`` is viewed as
            ``[1, total_tokens, hidden_size]`` and ``lengths`` is a list with
            the number of tokens kept per sample.
        """
        embedded = self.y_embedder(y, self.training)  # [B, 1, N_token, C]
        batch = embedded.shape[0]

        if mask is None:
            # no mask: every sample keeps all of its tokens
            lengths = [embedded.shape[2]] * batch
            packed = embedded.squeeze(1).view(1, -1, self.hidden_size)
            return packed, lengths

        if mask.shape[0] != batch:
            mask = mask.repeat(batch // mask.shape[0], 1)
        mask = mask.squeeze(1).squeeze(1)
        keep = mask.unsqueeze(-1) != 0
        packed = embedded.squeeze(1).masked_select(keep).view(1, -1, self.hidden_size)
        lengths = mask.sum(dim=1).tolist()
        return packed, lengths

    def forward(self, x, timestep, y, mask=None, x_mask=None, fps=None, height=None, width=None, cache_dic=None, current=None, **kwargs):
        """Run the cached STDiT3 forward pass and return the unpatchified output.

        Args:
            x: input tensor; it is unpacked as five dimensions and cast to the
                dtype of the patch-embedding weight.
            timestep: timestep tensor, cast to the same dtype.
            y: text conditioning, cast to the same dtype; passed through
                ``encode_text`` unless ``config.skip_y_embedder`` is set.
            mask: forwarded to ``encode_text``; when ``skip_y_embedder`` is
                set it is used directly as the per-sample text lengths.
            x_mask: optional mask; when given, a second conditioning computed
                from an all-zero timestep is built and passed to every block.
            fps: tensor fed to ``fps_embedder``. Required in practice: the
                ``None`` default fails on ``fps.unsqueeze``.
            height, width: indexable tensors of which only element 0 is read.
                Required in practice.
            cache_dic: mutable mapping holding the cache state; it is written
                here and handed to every block. Required in practice.
            current: mutable mapping; ``current['layer']`` is set before each
                spatial/temporal block pair. Required in practice.
            **kwargs: accepted and ignored.

        Returns:
            ``float32`` tensor produced by ``unpatchify``, cropped to the last
            three dimensions of the input.
        """
        dtype = self.x_embedder.proj.weight.dtype
        B = x.size(0)
        x = x.to(dtype)
        timestep = timestep.to(dtype)
        y = y.to(dtype)

        # === get pos embed ===
        # Tx/Hx/Wx are the raw input extents, kept for the final crop.
        _, _, Tx, Hx, Wx = x.size()
        T, H, W = self.get_dynamic_size(x)
        # NOTE(review): recorded before the sequence-parallel padding below can
        # enlarge H — confirm the cache functions expect the unpadded grid.
        cache_dic['dynamic_size'] = (B,T,H,W)
        # adjust for sequence parallelism
        # we need to ensure H * W is divisible by sequence parallel size
        # for simplicity, we can adjust the height to make it divisible
        if self.enable_sequence_parallelism:
            sp_size = dist.get_world_size(get_sequence_parallel_group())
            if H % sp_size != 0:
                h_pad_size = sp_size - H % sp_size
            else:
                h_pad_size = 0

            if h_pad_size > 0:
                # padding is counted in patches; convert to input rows
                hx_pad_size = h_pad_size * self.patch_size[1]

                # pad x along the H dimension
                H += h_pad_size
                x = F.pad(x, (0, 0, 0, hx_pad_size))

        S = H * W
        base_size = round(S**0.5)
        # only the first sample's height/width are used for the scale
        resolution_sq = (height[0].item() * width[0].item()) ** 0.5
        scale = resolution_sq / self.input_sq_size
        pos_emb = self.pos_embed(x, H, W, scale=scale, base_size=base_size)

        # === get timestep embed ===
        t = self.t_embedder(timestep, dtype=x.dtype)  # [B, C]
        fps = self.fps_embedder(fps.unsqueeze(1), B)
        t = t + fps
        t_mlp = self.t_block(t)
        t0 = t0_mlp = None
        if x_mask is not None:
            # second conditioning built from an all-zero timestep
            t0_timestep = torch.zeros_like(timestep)
            t0 = self.t_embedder(t0_timestep, dtype=x.dtype)
            t0 = t0 + fps
            t0_mlp = self.t_block(t0)

        # === get y embed ===
        if self.config.skip_y_embedder:
            # y is used as given; mask is interpreted as the text lengths
            y_lens = mask
            if isinstance(y_lens, torch.Tensor):
                y_lens = y_lens.long().tolist()
        else:
            y, y_lens = self.encode_text(y, mask)

        # === get x embed ===
        x = self.x_embedder(x)  # [B, N, C]
        x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
        x = x + pos_emb

        # shard over the sequence dim if sp is enabled
        if self.enable_sequence_parallelism:
            x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=2, grad_scale="down")
            S = S // dist.get_world_size(get_sequence_parallel_group())

        x = rearrange(x, "B T S C -> B (T S) C", T=T, S=S)

        # === blocks ===
        for i, (spatial_block, temporal_block) in enumerate(zip(self.spatial_blocks, self.temporal_blocks)):
            # the blocks read the layer index from ``current``
            current['layer'] = i
            #x = auto_grad_checkpoint(spatial_block,  x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S)
            #x = auto_grad_checkpoint(temporal_block, x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S)
            x = spatial_block(x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S)
            x = temporal_block(x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S)

        if self.enable_sequence_parallelism:
            # undo the sharding so the final layer sees the full sequence
            x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
            x = gather_forward_split_backward(x, get_sequence_parallel_group(), dim=2, grad_scale="up")
            S = S * dist.get_world_size(get_sequence_parallel_group())
            x = rearrange(x, "B T S C -> B (T S) C", T=T, S=S)

        # === final layer ===
        x = self.final_layer(x, t, x_mask, t0, T, S)
        x = self.unpatchify(x, T, H, W, Tx, Hx, Wx)

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x

    def unpatchify(self, x, N_t, N_h, N_w, R_t, R_h, R_w):
        """Fold a token sequence back into a video tensor and crop it.

        Args:
            x (torch.Tensor): of shape [B, N, C]
            N_t, N_h, N_w: number of patches along T, H and W.
            R_t, R_h, R_w: extents to keep along T, H and W after folding.

        Return:
            x (torch.Tensor): of shape [B, C_out, T, H, W]
        """
        patch_t, patch_h, patch_w = self.patch_size
        pattern = "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)"
        axis_sizes = dict(
            N_t=N_t,
            N_h=N_h,
            N_w=N_w,
            T_p=patch_t,
            H_p=patch_h,
            W_p=patch_w,
            C_out=self.out_channels,
        )
        video = rearrange(x, pattern, **axis_sizes)
        # drop whatever lies beyond the requested extents (unpad)
        return video[:, :, :R_t, :R_h, :R_w]


@MODELS.register_module("STDiT3-XL/2")
def STDiT3_XL_2(from_pretrained=None, **kwargs):
    """Create the STDiT3-XL/2 model.

    ``STDiT3.from_pretrained`` is used when ``force_huggingface`` is passed or
    when ``from_pretrained`` is given but is not an existing local path.
    Otherwise the model is built from a fresh config (depth 28, hidden size
    1152, 16 heads, patch size (1, 2, 2)) and, if ``from_pretrained`` is given,
    the local checkpoint is loaded into it.
    """
    force_huggingface = kwargs.pop("force_huggingface", False)
    load_via_from_pretrained = force_huggingface or (
        from_pretrained is not None and not os.path.exists(from_pretrained)
    )
    if load_via_from_pretrained:
        return STDiT3.from_pretrained(from_pretrained, **kwargs)

    model = STDiT3(STDiT3Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs))
    if from_pretrained is not None:
        load_checkpoint(model, from_pretrained)
    return model


@MODELS.register_module("STDiT3-3B/2")
def STDiT3_3B_2(from_pretrained=None, **kwargs):
    """Build the STDiT3-3B/2 model (depth 28, hidden size 1872, 26 heads, patch size (1, 2, 2)).

    Args:
        from_pretrained: optional checkpoint identifier. If it is an existing local
            path the weights are loaded with ``load_checkpoint``; otherwise the call
            is delegated to ``STDiT3.from_pretrained``.
        **kwargs: forwarded to ``STDiT3Config`` or ``STDiT3.from_pretrained``. The
            ``force_huggingface`` key (default False) is removed first and forces the
            ``STDiT3.from_pretrained`` route.

    Returns:
        The constructed model.
    """
    force_huggingface = kwargs.pop("force_huggingface", False)
    has_checkpoint = from_pretrained is not None
    delegate_to_from_pretrained = force_huggingface or (
        has_checkpoint and not os.path.exists(from_pretrained)
    )
    if delegate_to_from_pretrained:
        return STDiT3.from_pretrained(from_pretrained, **kwargs)

    model = STDiT3(STDiT3Config(depth=28, hidden_size=1872, patch_size=(1, 2, 2), num_heads=26, **kwargs))
    if has_checkpoint:
        load_checkpoint(model, from_pretrained)
    return model


================================================
FILE: Open-Sora/build/lib/opensora/models/text_encoder/__init__.py
================================================
from .classes import ClassEncoder
from .clip import ClipEncoder
from .t5 import T5Encoder


================================================
FILE: Open-Sora/build/lib/opensora/models/text_encoder/classes.py
================================================
import torch

from opensora.registry import MODELS


@MODELS.register_module("classes")
class ClassEncoder:
    """Encoder for class-label conditioning: turns label strings into an integer tensor."""

    def __init__(self, num_classes, model_max_length=None, device="cuda", dtype=torch.float):
        self.device = device
        self.num_classes = num_classes
        self.model_max_length = model_max_length

        # Initialised to None here; nothing in this class sets them afterwards.
        self.y_embedder = None
        self.output_dim = None

    def encode(self, text):
        """Convert each item of ``text`` with ``int()`` and return ``{"y": tensor}`` on ``self.device``."""
        labels = torch.tensor([int(item) for item in text])
        return {"y": labels.to(self.device)}

    def null(self, n):
        """Return ``n`` copies of the index ``num_classes`` as a tensor on ``self.device``."""
        null_labels = torch.tensor([self.num_classes] * n)
        return null_labels.to(self.device)


================================================
FILE: Open-Sora/build/lib/opensora/models/text_encoder/clip.py
================================================
# Copyright 2024 Vchitect/Latte
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from Latte
#
# This file is adapted from the Latte project.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# Latte: https://github.com/Vchitect/Latte
# DiT:   https://github.com/facebookresearch/DiT/tree/main
# --------------------------------------------------------


import torch
import torch.nn as nn
import transformers
from transformers import CLIPTextModel, CLIPTokenizer

from opensora.registry import MODELS

transformers.logging.set_verbosity_error()


class AbstractEncoder(nn.Module):
    """Base ``nn.Module`` for encoders; subclasses must override :meth:`encode`."""

    def encode(self, *args, **kwargs):
        """Encode the given inputs. Always raises here; subclasses provide the implementation."""
        raise NotImplementedError


class FrozenCLIPEmbedder(AbstractEncoder):
    """Uses the CLIP transformer encoder for text (from Hugging Face)"""

    def __init__(self, path="openai/clip-vit-huge-patch14", device="cuda", max_length=77):
        super().__init__()
        self.device = device
        self.max_length = max_length
        self.tokenizer = CLIPTokenizer.from_pretrained(path)
        self.transformer = CLIPTextModel.from_pretrained(path)
        self._freeze()

    def _freeze(self):
        """Put the text model in eval mode and turn off gradients for every parameter."""
        self.transformer.eval()
        for weight in self.parameters():
            weight.requires_grad_(False)

    def forward(self, text):
        """Tokenize ``text`` (padded/truncated to ``max_length``) and run the CLIP text model.

        Returns:
            Tuple ``(last_hidden_state, pooler_output)`` taken from the model output.
        """
        tokenizer_options = {
            "truncation": True,
            "max_length": self.max_length,
            "return_length": True,
            "return_overflowing_tokens": False,
            "padding": "max_length",
            "return_tensors": "pt",
        }
        encoded = self.tokenizer(text, **tokenizer_options)
        input_ids = encoded["input_ids"].to(self.device)
        result = self.transformer(input_ids=input_ids)
        return result.last_hidden_state, result.pooler_output

    def encode(self, text):
        """Alias for calling the module on ``text``."""
        return self(text)


@MODELS.register_module("clip")
class ClipEncoder:
    """
    Embeds text prompt into vector representations. Also handles text dropout for classifier-free guidance.

    Attributes:
        text_encoder: the wrapped ``FrozenCLIPEmbedder``.
        y_embedder: ``None`` after construction; must be assigned externally before
            :meth:`null` is called.
        model_max_length: maximum token length passed to the tokenizer.
        output_dim: hidden size read from the CLIP text model's config.
    """

    def __init__(
        self,
        from_pretrained,
        model_max_length=77,
        device="cuda",
        dtype=torch.float,
    ):
        super().__init__()
        assert from_pretrained is not None, "Please specify the path to the CLIP model"

        # Forward `device` to the embedder as well: it moves token ids to its own
        # `self.device`, which would otherwise stay at the default "cuda" even when
        # the weights are placed on another device by `.to(device, dtype)`.
        self.text_encoder = FrozenCLIPEmbedder(
            path=from_pretrained,
            device=device,
            max_length=model_max_length,
        ).to(device, dtype)
        self.y_embedder = None

        self.model_max_length = model_max_length
        self.output_dim = self.text_encoder.transformer.config.hidden_size

    def encode(self, text):
        """Return ``{"y": pooled}`` where the pooled CLIP output gets two extra axes at dim 1."""
        _, pooled_embeddings = self.text_encoder.encode(text)
        y = pooled_embeddings.unsqueeze(1).unsqueeze(1)
        return dict(y=y)

    def null(self, n):
        """Repeat ``self.y_embedder.y_embedding`` ``n`` times and insert an axis at dim 1.

        Requires ``self.y_embedder`` to have been set; it is ``None`` right after ``__init__``.
        """
        null_y = self.y_embedder.y_embedding[None].repeat(n, 1, 1)[:, None]
        return null_y

    def to(self, dtype):
        """Cast the wrapped text encoder with ``.to(dtype)`` and return ``self``."""
        self.text_encoder = self.text_encoder.to(dtype)
        return self


================================================
FILE: Open-Sora/build/lib/opensora/models/text_encoder/t5.py
================================================
# Adapted from PixArt
#
# Copyright (C) 2023  PixArt-alpha/PixArt-alpha
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# PixArt: https://github.com/PixArt-alpha/PixArt-alpha
# T5:     https://github.com/google-research/text-to-text-transfer-transformer
# --------------------------------------------------------

import html
import re

import ftfy
import torch
from transformers import AutoTokenizer, T5EncoderModel

from opensora.registry import MODELS


class T5Embedder:
    """Loads a T5 tokenizer and encoder and turns text into encoder hidden states."""

    def __init__(
        self,
        device,
        from_pretrained=None,
        *,
        cache_dir=None,
        hf_token=None,
        use_text_preprocessing=True,
        t5_model_kwargs=None,
        torch_dtype=None,
        use_offload_folder=None,
        model_max_length=120,
        local_files_only=False,
    ):
        self.device = torch.device(device)
        self.torch_dtype = torch_dtype or torch.bfloat16
        self.cache_dir = cache_dir
        self.use_text_preprocessing = use_text_preprocessing
        self.hf_token = hf_token

        # Defaults (including the device map) are only built when the caller gave no kwargs.
        if t5_model_kwargs is None:
            t5_model_kwargs = self._default_model_kwargs(use_offload_folder)

        self.tokenizer = AutoTokenizer.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        encoder = T5EncoderModel.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
            **t5_model_kwargs,
        )
        self.model = encoder.eval()
        self.model_max_length = model_max_length

    def _default_model_kwargs(self, use_offload_folder):
        """Build the default ``from_pretrained`` kwargs, with or without disk offloading."""
        model_kwargs = {
            "low_cpu_mem_usage": True,
            "torch_dtype": self.torch_dtype,
        }
        if use_offload_folder is None:
            model_kwargs["device_map"] = {
                "shared": self.device,
                "encoder": self.device,
            }
            return model_kwargs

        # Encoder blocks 0-11 go on the target device, blocks 12-23 and the tail go to disk.
        device_map = {
            "shared": self.device,
            "encoder.embed_tokens": self.device,
        }
        for block_index in range(24):
            placement = self.device if block_index < 12 else "disk"
            device_map[f"encoder.block.{block_index}"] = placement
        device_map["encoder.final_layer_norm"] = "disk"
        device_map["encoder.dropout"] = "disk"

        model_kwargs["offload_folder"] = use_offload_folder
        model_kwargs["device_map"] = device_map
        return model_kwargs

    def get_text_embeddings(self, texts):
        """Tokenize ``texts`` (padded/truncated to ``model_max_length``) and encode them.

        Returns:
            Tuple ``(last_hidden_state, attention_mask)``; the hidden state is computed
            under ``torch.no_grad()`` and detached.
        """
        encoded = self.tokenizer(
            texts,
            max_length=self.model_max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        token_ids = encoded["input_ids"].to(self.device)
        mask = encoded["attention_mask"].to(self.device)

        with torch.no_grad():
            model_output = self.model(input_ids=token_ids, attention_mask=mask)
            hidden_states = model_output["last_hidden_state"].detach()
        return hidden_states, mask


@MODELS.register_module("t5")
class T5Encoder:
    """Registry-facing wrapper around :class:`T5Embedder` that returns conditioning dicts."""

    def __init__(
        self,
        from_pretrained=None,
        model_max_length=120,
        device="cuda",
        dtype=torch.float,
        cache_dir=None,
        shardformer=False,
        local_files_only=False,
    ):
        assert from_pretrained is not None, "Please specify the path to the T5 model"

        embedder = T5Embedder(
            device=device,
            torch_dtype=dtype,
            from_pretrained=from_pretrained,
            cache_dir=cache_dir,
            model_max_length=model_max_length,
            local_files_only=local_files_only,
        )
        embedder.model.to(dtype=dtype)

        self.t5 = embedder
        self.y_embedder = None
        self.model_max_length = model_max_length
        self.output_dim = embedder.model.config.d_model
        self.dtype = dtype

        if shardformer:
            self.shardformer_t5()

    def shardformer_t5(self):
        """Replace the T5 model with a ShardFormer-optimized copy and freeze its weights."""
        from colossalai.shardformer import ShardConfig, ShardFormer

        from opensora.acceleration.shardformer.policy.t5_encoder import T5EncoderPolicy
        from opensora.utils.misc import requires_grad

        # Only JIT fusion is switched on; every parallelism option is disabled.
        config = ShardConfig(
            tensor_parallel_process_group=None,
            pipeline_stage_manager=None,
            enable_tensor_parallelism=False,
            enable_fused_normalization=False,
            enable_flash_attention=False,
            enable_jit_fused=True,
            enable_sequence_parallelism=False,
            enable_sequence_overlap=False,
        )
        sharder = ShardFormer(shard_config=config)
        optimized_model, _ = sharder.optimize(self.t5.model, policy=T5EncoderPolicy())
        self.t5.model = optimized_model.to(self.dtype)

        # ensure the weights are frozen
        requires_grad(self.t5.model, False)

    def encode(self, text):
        """Return ``{"y": embeddings[:, None], "mask": attention_mask}`` for ``text``."""
        embeddings, attention_mask = self.t5.get_text_embeddings(text)
        return {"y": embeddings[:, None], "mask": attention_mask}

    def null(self, n):
        """Repeat ``self.y_embedder.y_embedding`` ``n`` times and insert an axis at dim 1.

        Requires ``self.y_embedder`` to have been set; it is ``None`` right after ``__init__``.
        """
        repeated = self.y_embedder.y_embedding[None].repeat(n, 1, 1)
        return repeated[:, None]


def basic_clean(text):
    """Run ``ftfy.fix_text``, HTML-unescape the result twice, and strip surrounding whitespace."""
    cleaned = ftfy.fix_text(text)
    # Two passes so that doubly-escaped entities are fully decoded.
    for _ in range(2):
        cleaned = html.unescape(cleaned)
    return cleaned.strip()


# Matches a run of one or more "junk" punctuation characters: the listed symbols,
# brackets/braces/parentheses, pipe, backslash, slash and asterisk.
# Written as a single raw string so that no invalid escape sequences such as "\)"
# appear in ordinary string literals; the compiled pattern is unchanged.
BAD_PUNCT_REGEX = re.compile(
    r"[#®•©™&@·º½¾¿¡§~\)\(\]\[\}\{\|\\/\*]{1,}"
)  # noqa


def clean_caption(caption):
    """Normalise a caption through a fixed, order-dependent chain of regex clean-ups.

    The value is coerced with ``str()``, URL-unquoted, stripped and lower-cased.
    URLs, HTML markup, @-handles, the CJK ranges listed below, IP addresses,
    id-like tokens, file names and a few marketing phrases are then removed,
    while dashes, quotes, punctuation runs and whitespace are normalised.

    Args:
        caption: value to clean; converted with ``str()`` first.

    Returns:
        The cleaned caption with surrounding whitespace stripped.
    """
    import urllib.parse as ul

    from bs4 import BeautifulSoup

    caption = str(caption)
    caption = ul.unquote_plus(caption)
    caption = caption.strip().lower()
    caption = re.sub("<person>", "person", caption)
    # urls:
    caption = re.sub(
        r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    caption = re.sub(
        r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    # html: keep only the text content of any markup
    caption = BeautifulSoup(caption, features="html.parser").text

    # @<nickname>
    caption = re.sub(r"@[\w\d]+\b", "", caption)

    # Remove characters from the following Unicode blocks:
    # 31C0—31EF CJK Strokes
    # 31F0—31FF Katakana Phonetic Extensions
    # 3200—32FF Enclosed CJK Letters and Months
    # 3300—33FF CJK Compatibility
    # 3400—4DBF CJK Unified Ideographs Extension A
    # 4DC0—4DFF Yijing Hexagram Symbols
    # 4E00—9FFF CJK Unified Ideographs
    caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
    caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
    caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
    caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
    caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
    caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
    caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
    #######################################################

    # all types of dash --> "-"
    caption = re.sub(
        r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
        "-",
        caption,
    )

    # normalise quotation marks to one standard
    caption = re.sub(r"[`´«»“”¨]", '"', caption)
    caption = re.sub(r"[‘’]", "'", caption)

    # &quot;
    caption = re.sub(r"&quot;?", "", caption)
    # &amp
    caption = re.sub(r"&amp", "", caption)

    # ip addresses:
    caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

    # article ids:
    caption = re.sub(r"\d:\d\d\s+$", "", caption)

    # literal backslash-n sequences (the two characters "\" and "n")
    caption = re.sub(r"\\n", " ", caption)

    # "#123"
    caption = re.sub(r"#\d{1,3}\b", "", caption)
    # "#12345.."
    caption = re.sub(r"#\d{5,}\b", "", caption)
    # "123456.."
    caption = re.sub(r"\b\d{6,}\b", "", caption)
    # filenames:
    caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

    # collapse runs of quotes into one double quote, and runs of dots into a space
    caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
    caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""

    caption = re.sub(BAD_PUNCT_REGEX, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
    caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

    # this-is-my-cute-cat / this_is_my_cute_cat
    # Only when more than three separators are present are they all turned into spaces.
    regex2 = re.compile(r"(?:\-|\_)")
    if len(re.findall(regex2, caption)) > 3:
        caption = re.sub(regex2, " ", caption)

    caption = basic_clean(caption)

    caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
    caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
    caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

    # marketing / boilerplate phrases
    caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
    caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
    caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
    caption = re.sub(r"\bpage\s+\d+\b", "", caption)

    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

    # dimension-like tokens such as "1920x1080" (latin x, cyrillic х or ×)
    caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

    caption = re.sub(r"\b\s+\:\s+", r": ", caption)
    caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
    caption = re.sub(r"\s+", " ", caption)

    # NOTE(review): the result of this strip() is discarded, so the statement has no
    # effect and the anchored patterns below still see any leading/trailing space.
    # Left as-is because text_preprocessing() describes this as the exact cleaning
    # used at training time — confirm before changing.
    caption.strip()

    # unwrap a caption that is entirely enclosed in quotes, then trim stray edge punctuation
    caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
    caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
    caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
    caption = re.sub(r"^\.\S+$", "", caption)

    return caption.strip()


def text_preprocessing(text, use_text_preprocessing: bool = True):
    """Clean ``text`` for the text encoder.

    With ``use_text_preprocessing`` enabled, ``clean_caption`` is applied twice in a
    row; otherwise the text is only lower-cased and stripped.
    """
    if not use_text_preprocessing:
        return text.lower().strip()

    # The exact text cleaning as was in the training stage (two consecutive passes):
    for _ in range(2):
        text = clean_caption(text)
    return text


================================================
FILE: Open-Sora/build/lib/tools/caption/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/tools/caption/acceleration/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/tools/caption/acceleration/llava/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/tools/caption/acceleration/llava/policies/__init__.py
================================================
from .llama import LlavaLlamaForCausalLMPolicy
from .mistral import LlavaMistralForCausalLMPolicy


================================================
FILE: Open-Sora/build/lib/tools/caption/acceleration/llava/policies/llama.py
================================================
from typing import Dict, Union

import torch.nn as nn
from colossalai.shardformer.layer import Linear1D_Col, Linear1D_Row
from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription

__all__ = ["LlavaLlamaPolicy", "LlavaLlamaForCausalLMPolicy"]


class LlavaLlamaPolicy(Policy):
    """ShardFormer policy that shards the linear layers of each ``LlamaDecoderLayer``."""

    def config_sanity_check(self):
        pass

    def preprocess(self):
        """Return the model unchanged; the vocabulary-resize step is disabled."""
        if self.shard_config.enable_tensor_parallelism:
            # Resize embedding
            # (only bare attribute reads remain; the resize logic below is commented out)
            self.model.config.vocab_size
            self.shard_config.tensor_parallel_size

            # if vocab_size % world_size != 0:
            #     new_vocab_size = vocab_size + world_size - vocab_size % world_size
            #     self.model.resize_token_embeddings(new_vocab_size)

        return self.model

    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
        """Build the replacement policy; it is empty unless tensor parallelism is enabled."""
        from transformers.models.llama.modeling_llama import LlamaDecoderLayer

        policy = {}
        if not self.shard_config.enable_tensor_parallelism:
            return policy

        model_config = self.model.config
        tp_size = self.shard_config.tensor_parallel_size

        # Per-rank attention sizes after splitting across the tensor-parallel group.
        attention_attrs = {
            "self_attn.hidden_size": model_config.hidden_size // tp_size,
            "self_attn.num_heads": model_config.num_attention_heads // tp_size,
        }
        if getattr(model_config, "num_key_value_heads", False):
            attention_attrs["self_attn.num_key_value_heads"] = model_config.num_key_value_heads // tp_size

        # (submodule suffix, parallel linear class), in the order they are registered.
        linear_plan = (
            ("self_attn.q_proj", Linear1D_Col),
            ("self_attn.k_proj", Linear1D_Col),
            ("self_attn.v_proj", Linear1D_Col),
            ("self_attn.o_proj", Linear1D_Row),
            ("mlp.gate_proj", Linear1D_Col),
            ("mlp.up_proj", Linear1D_Col),
            ("mlp.down_proj", Linear1D_Row),
        )
        policy[LlamaDecoderLayer] = ModulePolicyDescription(
            attribute_replacement=attention_attrs,
            sub_module_replacement=[
                SubModuleReplacementDescription(suffix=suffix, target_module=target)
                for suffix, target in linear_plan
            ],
        )
        return policy

    def postprocess(self):
        return self.model


class LlavaLlamaForCausalLMPolicy(LlavaLlamaPolicy):
    """Extends the base Llama policy with a column-parallel ``lm_head``."""

    def module_policy(self):
        """Return the parent policy, plus an ``lm_head`` entry when tensor parallelism is on."""
        from transformers import LlamaForCausalLM

        policy = super().module_policy()
        if not self.shard_config.enable_tensor_parallelism:
            return policy

        # add a new item for causal lm
        lm_head_replacement = SubModuleReplacementDescription(
            suffix="lm_head", target_module=Linear1D_Col, kwargs={"gather_output": True}
        )
        policy[LlamaForCausalLM] = ModulePolicyDescription(
            sub_module_replacement=[lm_head_replacement],
        )
        return policy


================================================
FILE: Open-Sora/build/lib/tools/caption/acceleration/llava/policies/mistral.py
================================================
import warnings
from typing import Dict, Union

import torch.nn as nn
from colossalai.shardformer.layer import Linear1D_Col, Linear1D_Row, VocabParallelEmbedding1D
from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription

__all__ = ["LlavaMistralPolicy", "LlavaMistralForCausalLMPolicy"]


class LlavaMistralPolicy(Policy):
    """ShardFormer policy that shards ``MistralDecoderLayer`` linears and the token embedding."""

    def config_sanity_check(self):
        pass

    def preprocess(self):
        """Pad the vocabulary to a multiple of the tensor-parallel size when TP is enabled."""
        if self.shard_config.enable_tensor_parallelism:
            # Resize embedding
            vocab_size = self.model.config.vocab_size
            world_size = self.shard_config.tensor_parallel_size

            remainder = vocab_size % world_size
            if remainder != 0:
                self.model.resize_token_embeddings(vocab_size + world_size - remainder)

        return self.model

    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
        """Build the replacement policy; it is empty unless tensor parallelism is enabled."""
        from transformers.models.mistral.modeling_mistral import MistralDecoderLayer, MistralModel

        policy = {}

        # Sequence parallelism is switched off (with a warning) before anything else.
        if self.shard_config.enable_sequence_parallelism:
            self.shard_config.enable_sequence_parallelism = False
            warnings.warn(
                "Mistral doesn't support sequence parallelism now, will ignore the sequence parallelism flag."
            )

        if not self.shard_config.enable_tensor_parallelism:
            return policy

        model_config = self.model.config
        tp_size = self.shard_config.tensor_parallel_size

        # Per-rank attention sizes after splitting across the tensor-parallel group.
        attention_attrs = {
            "self_attn.hidden_size": model_config.hidden_size // tp_size,
            "self_attn.num_heads": model_config.num_attention_heads // tp_size,
            "self_attn.num_key_value_heads": model_config.num_key_value_heads // tp_size,
        }

        # (submodule suffix, parallel linear class), in the order they are registered.
        linear_plan = (
            ("self_attn.q_proj", Linear1D_Col),
            ("self_attn.k_proj", Linear1D_Col),
            ("self_attn.v_proj", Linear1D_Col),
            ("self_attn.o_proj", Linear1D_Row),
            ("mlp.gate_proj", Linear1D_Col),
            ("mlp.up_proj", Linear1D_Col),
            ("mlp.down_proj", Linear1D_Row),
        )
        policy[MistralDecoderLayer] = ModulePolicyDescription(
            attribute_replacement=attention_attrs,
            sub_module_replacement=[
                SubModuleReplacementDescription(suffix=suffix, target_module=target)
                for suffix, target in linear_plan
            ],
        )

        embedding_replacement = SubModuleReplacementDescription(
            suffix="embed_tokens",
            target_module=VocabParallelEmbedding1D,
        )
        self.append_or_create_submodule_replacement(
            description=embedding_replacement,
            policy=policy,
            target_key=MistralModel,
        )

        return policy

    def postprocess(self):
        return self.model


class LlavaMistralForCausalLMPolicy(LlavaMistralPolicy):
    """Extends the base Mistral policy with a column-parallel ``lm_head``."""

    def module_policy(self):
        """Return the parent policy, plus an ``lm_head`` entry when tensor parallelism is on."""
        from transformers import MistralForCausalLM

        policy = super().module_policy()
        if not self.shard_config.enable_tensor_parallelism:
            return policy

        # add a new item for causal lm
        lm_head_replacement = SubModuleReplacementDescription(
            suffix="lm_head", target_module=Linear1D_Col, kwargs={"gather_output": True}
        )
        policy[MistralForCausalLM] = ModulePolicyDescription(
            sub_module_replacement=[lm_head_replacement]
        )
        return policy


================================================
FILE: Open-Sora/build/lib/tools/caption/camera_motion/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/tools/caption/camera_motion/camera_motion.py
================================================
import os

import numpy as np
import torch

from .utils import load_video
from .visualizer import Visualizer


def transform(vector):
    """Average a sequence of 2-D vectors component-wise and return ``[mean_x, mean_y]``."""
    return [np.mean([item[axis] for item in vector]) for axis in (0, 1)]


class CameraPredict:
    """Classifies camera motion of a video from point tracks on a regular grid.

    A tracking model is loaded through ``torch.hub.load``. The first and last frame's
    tracks are reshaped to ``(grid_size, grid_size, 2)`` and the displacement of points
    on the four grid edges is turned into labels such as ``pan_left`` or ``zoom_in``.

    Attributes:
        device: device the model and video tensor are moved to.
        grid_size: side length of the tracking grid (fixed to 10).
        factor: fraction of the shorter video side used as the motion threshold.
        scale: shorter side of the last video passed to :meth:`infer` (set there).
    """

    # Direction-pair -> label tables for the (top, down) and (left, right) edge pairs.
    _TOP_DOWN_MAPPING = {
        "left_left": "pan_right",
        "right_right": "pan_left",
        "down_down": "tilt_up",
        "up_up": "tilt_down",
        "up_down": "zoom_in",
        "down_up": "zoom_out",
        "static_static": "static",
    }
    _LEFT_RIGHT_MAPPING = {
        "left_left": "pan_right",
        "right_right": "pan_left",
        "down_down": "tilt_up",
        "up_up": "tilt_down",
        "left_right": "zoom_in",
        "right_left": "zoom_out",
        "static_static": "static",
    }

    def __init__(self, device, submodules_list, factor=0.25):
        """Load the tracking model named by ``submodules_list["repo"]`` / ``["model"]``."""
        self.device = device
        self.grid_size = 10
        self.factor = factor
        try:
            self.model = torch.hub.load(submodules_list["repo"], submodules_list["model"]).to(self.device)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
            # workaround for CERTIFICATE_VERIFY_FAILED (see: https://github.com/pytorch/pytorch/issues/33288#issuecomment-954160699)
            # NOTE(review): this replaces the default HTTPS context for the whole process
            # with an unverified one; it is kept as a best-effort fallback only.
            import ssl

            ssl._create_default_https_context = ssl._create_unverified_context
            self.model = torch.hub.load(submodules_list["repo"], submodules_list["model"]).to(self.device)

    def infer(self, video_path, save_video=False, save_dir="./saved_videos"):
        """Track grid points through the video at ``video_path``.

        Also stores the shorter video side in ``self.scale``. When ``save_video`` is
        true, a visualisation is written via ``Visualizer`` into ``save_dir``.

        Returns:
            The predicted tracks of the first batch item as an integer numpy array.
        """
        # load video
        video = load_video(video_path, return_tensor=False)
        # set scale
        height, width = video.shape[1], video.shape[2]
        self.scale = min(height, width)
        video = torch.from_numpy(video).permute(0, 3, 1, 2)[None].float().to(self.device)  # B T C H W
        pred_tracks, pred_visibility = self.model(video, grid_size=self.grid_size)  # B T N 2,  B T N 1

        if save_video:
            # splitext handles extensions of any length (the old `[:-4]` assumed 3 letters)
            video_name = os.path.splitext(os.path.basename(video_path))[0]
            vis = Visualizer(save_dir=save_dir, pad_value=120, linewidth=3)
            vis.visualize(video, pred_tracks, pred_visibility, filename=video_name)

        return pred_tracks[0].long().detach().cpu().numpy()

    def transform_class(self, vector, min_reso):
        """Turn a displacement ``(x, y)`` into direction labels.

        A component counts as motion only if its magnitude exceeds
        ``min_reso * self.factor``. Returns ``["static"]`` when neither does.
        """
        scale = min_reso * self.factor
        x, y = vector
        direction = []
        if x > scale:
            direction.append("right")
        elif x < -scale:
            direction.append("left")

        if y > scale:
            direction.append("down")
        elif y < -scale:
            direction.append("up")

        return direction if direction else ["static"]

    def get_edge_point(self, track):
        """Return four points around the middle of each grid edge: (top, down, left, right)."""
        middle = self.grid_size // 2
        last = self.grid_size - 1
        span = range(middle - 2, middle + 2)
        top = [list(track[0, i, :]) for i in span]
        down = [list(track[last, i, :]) for i in span]
        left = [list(track[i, 0, :]) for i in span]
        right = [list(track[i, last, :]) for i in span]

        return top, down, left, right

    def get_edge_direction(self, track1, track2):
        """Classify the mean displacement of each edge between ``track1`` and ``track2``.

        Uses ``self.scale``, so :meth:`infer` must have run beforehand.
        """
        edge_points1 = self.get_edge_point(track1)
        edge_points2 = self.get_edge_point(track2)

        vector_results = []
        for points1, points2 in zip(edge_points1, edge_points2):
            vectors = [[end[0] - start[0], end[1] - start[1]] for start, end in zip(points1, points2)]
            vector_results.append(vectors)
        vector_results = list(map(transform, vector_results))
        class_results = [self.transform_class(vector, min_reso=self.scale) for vector in vector_results]

        return class_results

    @staticmethod
    def _classify_pairs(first, second, mapping):
        """Map every ``"<a>_<b>"`` combination found in ``mapping``; ``["None"]`` if none match."""
        keys = [f"{item_a}_{item_b}" for item_a in first for item_b in second]
        results = [mapping[key] for key in keys if key in mapping]
        return results if results else ["None"]

    def classify_top_down(self, top, down):
        """Combine the top-edge and bottom-edge direction labels into motion labels."""
        return self._classify_pairs(top, down, self._TOP_DOWN_MAPPING)

    def classify_left_right(self, left, right):
        """Combine the left-edge and right-edge direction labels into motion labels."""
        return self._classify_pairs(left, right, self._LEFT_RIGHT_MAPPING)

    def camera_classify(self, track1, track2):
        """Return the motion labels for the change from ``track1`` to ``track2``.

        ``"None"`` and ``"static"`` are dropped when other labels exist; a lone
        ``"None"`` becomes ``"Undetermined"``. Labels are returned in sorted order so
        the result does not depend on set iteration order.
        """
        top, down, left, right = self.get_edge_direction(track1, track2)

        top_results = self.classify_top_down(top, down)
        left_results = self.classify_left_right(left, right)

        results = sorted(set(top_results + left_results))
        if "None" in results and len(results) > 1:
            results.remove("None")
        if "static" in results and len(results) > 1:
            results.remove("static")
        if len(results) == 1 and results[0] == "None":  # Tom added this to deal with edge cases
            results = ["Undetermined"]
        return results

    def predict(self, video_path):
        """Run tracking on ``video_path`` and classify motion between first and last frame."""
        pred_track = self.infer(video_path)
        grid_shape = (self.grid_size, self.grid_size, 2)
        track1 = pred_track[0].reshape(grid_shape)
        track2 = pred_track[-1].reshape(grid_shape)
        return self.camera_classify(track1, track2)


def compute_camera_motion(device, submodules_dict, video_paths, factor):
    """Predict camera motion for every video in ``video_paths``.

    A single ``CameraPredict`` is built from ``device``, ``submodules_dict``
    and ``factor`` and reused for all videos.

    Returns:
        One string per input path, made of that video's predicted motion
        labels joined with ``"+"``.
    """
    predictor = CameraPredict(device, submodules_dict, factor)
    return ["+".join(predictor.predict(path)) for path in video_paths]


================================================
FILE: Open-Sora/build/lib/tools/caption/camera_motion/detect.py
================================================
# Originally developed by https://github.com/Vchitect/VBench based on https://github.com/facebookresearch/co-tracker.

import argparse
from typing import List

import pandas as pd

from .camera_motion import compute_camera_motion


def process(paths: List[str], threshold: float) -> List[str]:
    """Return one camera-motion string per path in ``paths``.

    Delegates to ``compute_camera_motion`` on the ``"cuda"`` device with the
    co-tracker ``cotracker2`` submodule spec; ``threshold`` is forwarded as
    its ``factor`` argument.
    """
    return compute_camera_motion(
        "cuda",
        {"repo": "facebookresearch/co-tracker", "model": "cotracker2"},
        paths,
        factor=threshold,
    )


def main(args):
    """Add a ``cmotion`` column to the CSV at ``args.input`` and save a copy.

    The result is written next to the input, with ``.csv`` replaced by
    ``_cmotion.csv`` in the file name.
    """
    destination = args.input.replace(".csv", "_cmotion.csv")
    table = pd.read_csv(args.input)
    table["cmotion"] = process(table["path"], args.threshold)
    table.to_csv(destination, index=False)
    print(f"Output saved to {destination}")


if __name__ == "__main__":
    # CLI entry point: positional CSV path plus an optional motion threshold.
    cli = argparse.ArgumentParser()
    cli.add_argument("input", type=str)
    cli.add_argument("--threshold", type=float, default=0.25)
    main(cli.parse_args())


================================================
FILE: Open-Sora/build/lib/tools/caption/camera_motion/utils.py
================================================
import numpy as np
import torch
from decord import VideoReader
from PIL import Image, ImageSequence


def get_frame_indices(num_frames, vlen, sample="rand", fix_start=None, input_fps=1, max_num_frames=-1):
    """Choose which frame indices to read from a video of ``vlen`` frames.

    Parameters:
    - num_frames (int): Number of indices wanted in "rand"/"middle" mode.
    - vlen (int): Number of frames in the video.
    - sample (str): ``"rand"`` picks one random frame per interval,
      ``"middle"`` picks the centre of each interval, and ``"fps<x>"``
      (for example ``"fps0.5"``) samples sequentially at ``x`` frames per second.
    - fix_start (int, optional): In "middle" mode, when given, the index is the
      interval start plus this offset instead of the interval centre.
    - input_fps (float): Frame rate of the source video; used by "fps" mode only.
    - max_num_frames (int): In "fps" mode, cap on the number of returned
      indices when positive.

    Returns:
    - list: The selected frame indices.  In "rand"/"middle" mode the list is
      padded with its last index up to ``num_frames`` entries when the video
      is shorter than requested.

    Raises:
    - ValueError: If ``sample`` is not one of the supported modes.
    """
    # The enclosing module does not import `random`, so bring it in here.
    # Without it the "rand" branch always failed and fell through to the
    # fallback below, so per-interval sampling never actually happened.
    import random

    if sample in ["rand", "middle"]:  # uniform sampling
        acc_samples = min(num_frames, vlen)
        # split the video into `acc_samples` intervals, and sample from each interval.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = [(start, intervals[idx + 1] - 1) for idx, start in enumerate(intervals[:-1])]
        if sample == "rand":
            try:
                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
            except (IndexError, ValueError):
                # An interval holding a single frame yields an empty range;
                # fall back to a sorted random subset of all frames.
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [x[0] + fix_start for x in ranges]
        elif sample == "middle":
            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
        else:
            raise NotImplementedError

        if len(frame_indices) < num_frames:  # padded with last frame
            padded_frame_indices = [frame_indices[-1]] * num_frames
            padded_frame_indices[: len(frame_indices)] = frame_indices
            frame_indices = padded_frame_indices
    elif "fps" in sample:  # fps0.5, sequentially sample frames at 0.5 fps
        output_fps = float(sample[3:])
        duration = float(vlen) / input_fps
        delta = 1 / output_fps  # gap between frames, this is also the clip length each frame represents
        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
            frame_indices = frame_indices[:max_num_frames]
    else:
        raise ValueError
    return frame_indices


def load_video(video_path, data_transform=None, num_frames=None, return_tensor=True, width=None, height=None):
    """
    Read the frames of a GIF, PNG or MP4 file, optionally subsample and transform them.

    Parameters:
    - video_path (str): Path of the file to load; the format is chosen from
      its ``.gif`` / ``.png`` / ``.mp4`` suffix.
    - data_transform (callable, optional): Applied to the frame buffer when
      given; its result is returned as-is.
    - num_frames (int, optional): When truthy, frames are subsampled with
      ``get_frame_indices(..., sample="middle")``.
    - return_tensor (bool): When no ``data_transform`` is given, convert the
      buffer with ``torch.Tensor`` and permute it to (T, C, H, W).
    - width, height (int, optional): Forwarded to ``VideoReader`` for MP4
      input when ``width`` is truthy.

    Returns:
    - The frames: the output of ``data_transform``, a (T, C, H, W) tensor, or
      the raw (T, H, W, C) uint8 NumPy buffer when ``return_tensor`` is False.
      NOTE(review): ``torch.Tensor(...)`` appears to produce a default-dtype
      (float) tensor rather than uint8 — confirm before relying on the dtype.

    Raises:
    - NotImplementedError: If the file suffix is not supported.
    """
    if video_path.endswith(".gif"):
        gif = Image.open(video_path)
        rgb_frames = [np.array(page.convert("RGB")).astype(np.uint8) for page in ImageSequence.Iterator(gif)]
        buffer = np.array(rgb_frames).astype(np.uint8)
    elif video_path.endswith(".png"):
        # A still image is treated as a one-frame video.
        still = np.array(Image.open(video_path).convert("RGB")).astype(np.uint8)
        buffer = np.array([still])
    elif video_path.endswith(".mp4"):
        import decord

        decord.bridge.set_bridge("native")
        reader_kwargs = {"num_threads": 1}
        if width:
            reader_kwargs.update(width=width, height=height)
        video_reader = VideoReader(video_path, **reader_kwargs)
        decoded = video_reader.get_batch(range(len(video_reader)))  # (T, H, W, C)
        buffer = decoded.asnumpy().astype(np.uint8)
    else:
        raise NotImplementedError

    frames = buffer
    if num_frames:
        frames = frames[get_frame_indices(num_frames, len(frames), sample="middle")]

    if data_transform:
        return data_transform(frames)
    if return_tensor:
        return torch.Tensor(frames).permute(0, 3, 1, 2)  # (T, C, H, W)
    return frames


================================================
FILE: Open-Sora/build/lib/tools/caption/camera_motion/visualizer.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the cotracker github repo. https://github.com/facebookresearch/co-tracker.
import os

import imageio
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
from matplotlib import cm
from PIL import Image, ImageDraw


def read_video_from_path(path):
    """Read every frame of the video at ``path`` into one stacked array.

    Returns ``None`` (after printing the error) when the reader cannot be
    opened; otherwise the frames stacked along a new leading axis.
    """
    try:
        reader = imageio.get_reader(path)
    except Exception as e:
        print("Error opening video file: ", e)
        return None
    return np.stack([np.array(im) for im in reader])


def draw_circle(rgb, coord, radius, color=(255, 0, 0), visible=True):
    """Draw a circle of ``radius`` centred on ``coord`` onto the PIL image ``rgb``.

    The outline always uses ``color``; the interior is filled with ``color``
    only when ``visible`` is truthy.  The image is modified in place and
    returned.
    """
    cx, cy = coord[0], coord[1]
    rgb_color = tuple(color)
    # The ellipse is described by the bounding box around the centre point.
    ImageDraw.Draw(rgb).ellipse(
        [(cx - radius, cy - radius), (cx + radius, cy + radius)],
        fill=rgb_color if visible else None,
        outline=rgb_color,
    )
    return rgb


def draw_line(rgb, coord_y, coord_x, color, linewidth):
    """Draw a line from ``coord_y`` to ``coord_x`` onto the PIL image ``rgb``.

    Both endpoints are indexable as ``(first, second)``; the image is modified
    in place and returned.
    """
    endpoints = (coord_y[0], coord_y[1], coord_x[0], coord_x[1])
    ImageDraw.Draw(rgb).line(endpoints, fill=tuple(color), width=linewidth)
    return rgb


def add_weighted(rgb, alpha, original, beta, gamma):
    """Return ``rgb * alpha + original * beta + gamma`` cast to ``uint8``."""
    blended = rgb * alpha + original * beta + gamma
    return blended.astype("uint8")


class Visualizer:
    """Overlay point tracks on a video and optionally save the rendered result.

    Args:
        save_dir: Directory that ``save_video`` writes ``.mp4`` files into.
        grayscale: Convert the video to grayscale before drawing.
        pad_value: Border width (pixels) added around every frame; track
            coordinates are shifted by the same amount.
        fps: Frame rate used when writing the video.
        mode: Colouring scheme: ``"rainbow"``, ``"cool"`` or ``"optical_flow"``.
        linewidth: Width of drawn lines; point radius is twice this value.
        show_first_frame: How many times the first frame is repeated at the
            start of the output (no repetition logic runs when <= 0).
        tracks_leave_trace: Number of past frames whose track segments are
            drawn; ``0`` disables traces and ``-1`` keeps them from frame 0.
    """

    def __init__(
        self,
        save_dir: str = "./results",
        grayscale: bool = False,
        pad_value: int = 0,
        fps: int = 10,
        mode: str = "rainbow",  # 'cool', 'optical_flow'
        linewidth: int = 2,
        show_first_frame: int = 10,
        tracks_leave_trace: int = 0,  # -1 for infinite
    ):
        self.mode = mode
        self.save_dir = save_dir
        # pyplot.get_cmap is used because matplotlib.cm.get_cmap was removed
        # in Matplotlib 3.9; both return the same Colormap object.
        # No colour map is set for "optical_flow", which colours via flow_vis.
        if mode == "rainbow":
            self.color_map = plt.get_cmap("gist_rainbow")
        elif mode == "cool":
            self.color_map = plt.get_cmap(mode)
        self.show_first_frame = show_first_frame
        self.grayscale = grayscale
        self.tracks_leave_trace = tracks_leave_trace
        self.pad_value = pad_value
        self.linewidth = linewidth
        self.fps = fps

    def visualize(
        self,
        video: torch.Tensor,  # (B,T,C,H,W)
        tracks: torch.Tensor,  # (B,T,N,2)
        visibility: torch.Tensor = None,  # (B, T, N, 1) bool
        gt_tracks: torch.Tensor = None,  # (B,T,N,2)
        segm_mask: torch.Tensor = None,  # (B,1,H,W)
        filename: str = "video",
        writer=None,  # tensorboard Summary Writer, used for visualization during training
        step: int = 0,
        query_frame: int = 0,
        save_video: bool = True,
        compensate_for_camera_motion: bool = False,
    ):
        """Pad the video, draw the tracks on it and optionally save the result.

        ``segm_mask`` is required when ``compensate_for_camera_motion`` is set;
        it is sampled at the query-frame track positions to get one value per
        track.  Returns the rendered video produced by
        ``draw_tracks_on_video``.
        """
        if compensate_for_camera_motion:
            assert segm_mask is not None
        if segm_mask is not None:
            # Reduce the mask to one entry per track, read at the track's
            # rounded (x, y) position in the query frame.
            coords = tracks[0, query_frame].round().long()
            segm_mask = segm_mask[0, query_frame][coords[:, 1], coords[:, 0]].long()

        video = F.pad(
            video,
            (self.pad_value, self.pad_value, self.pad_value, self.pad_value),
            "constant",
            255,
        )
        print("video shape after pad is: ", video.shape)
        # Keep track coordinates aligned with the padded frames.
        tracks = tracks + self.pad_value

        print(tracks)
        print("tracks shape after pad is: ", tracks.shape)

        if self.grayscale:
            transform = transforms.Grayscale()
            video = transform(video)
            video = video.repeat(1, 1, 3, 1, 1)

        res_video = self.draw_tracks_on_video(
            video=video,
            tracks=tracks,
            visibility=visibility,
            segm_mask=segm_mask,
            gt_tracks=gt_tracks,
            query_frame=query_frame,
            compensate_for_camera_motion=compensate_for_camera_motion,
        )
        if save_video:
            self.save_video(res_video, filename=filename, writer=writer, step=step)
        return res_video

    def save_video(self, video, filename, writer=None, step=0):
        """Send ``video`` to ``writer`` if given, else write ``<save_dir>/<filename>.mp4``.

        When writing to disk only the first batch element is used, and the
        first two frames and the last frame are left out of the file.
        """
        if writer is not None:
            writer.add_video(
                filename,
                video.to(torch.uint8),
                global_step=step,
                fps=self.fps,
            )
        else:
            os.makedirs(self.save_dir, exist_ok=True)
            wide_list = list(video.unbind(1))
            wide_list = [wide[0].permute(1, 2, 0).cpu().numpy() for wide in wide_list]

            # Prepare the video file path
            save_path = os.path.join(self.save_dir, f"{filename}.mp4")

            # Create a writer object
            video_writer = imageio.get_writer(save_path, fps=self.fps)

            # Write frames to the video file
            for frame in wide_list[2:-1]:
                video_writer.append_data(frame)

            video_writer.close()

            print(f"Video saved to {save_path}")

    def draw_tracks_on_video(
        self,
        video: torch.Tensor,
        tracks: torch.Tensor,
        visibility: torch.Tensor = None,
        segm_mask: torch.Tensor = None,
        gt_tracks=None,
        query_frame: int = 0,
        compensate_for_camera_motion=False,
    ):
        """Render track traces and points onto the first video of the batch.

        ``video`` must be (B, T, 3, H, W) and ``tracks`` (B, T, N, 2).  Returns
        a uint8 tensor of shape (1, T', 3, H, W), where the first frame is
        repeated ``show_first_frame`` times when that setting is positive.
        """
        B, T, C, H, W = video.shape
        _, _, N, D = tracks.shape

        assert D == 2
        assert C == 3
        video = video[0].permute(0, 2, 3, 1).byte().detach().cpu().numpy()  # S, H, W, C
        tracks = tracks[0].long().detach().cpu().numpy()  # S, N, 2
        if gt_tracks is not None:
            gt_tracks = gt_tracks[0].detach().cpu().numpy()

        res_video = []

        # process input video
        for rgb in video:
            res_video.append(rgb.copy())
        vector_colors = np.zeros((T, N, 3))

        # Choose one colour per (frame, track).
        if self.mode == "optical_flow":
            import flow_vis

            vector_colors = flow_vis.flow_to_color(tracks - tracks[query_frame][None])
        elif segm_mask is None:
            if self.mode == "rainbow":
                # colour is fixed per track, based on its y position in the query frame
                y_min, y_max = (
                    tracks[query_frame, :, 1].min(),
                    tracks[query_frame, :, 1].max(),
                )
                norm = plt.Normalize(y_min, y_max)
                for n in range(N):
                    color = self.color_map(norm(tracks[query_frame, n, 1]))
                    color = np.array(color[:3])[None] * 255
                    vector_colors[:, n] = np.repeat(color, T, axis=0)
            else:
                # color changes with time
                for t in range(T):
                    color = np.array(self.color_map(t / T)[:3])[None] * 255
                    vector_colors[t] = np.repeat(color, N, axis=0)
        else:
            if self.mode == "rainbow":
                # tracks outside the mask are drawn white
                vector_colors[:, segm_mask <= 0, :] = 255

                y_min, y_max = (
                    tracks[0, segm_mask > 0, 1].min(),
                    tracks[0, segm_mask > 0, 1].max(),
                )
                norm = plt.Normalize(y_min, y_max)
                for n in range(N):
                    if segm_mask[n] > 0:
                        color = self.color_map(norm(tracks[0, n, 1]))
                        color = np.array(color[:3])[None] * 255
                        vector_colors[:, n] = np.repeat(color, T, axis=0)

            else:
                # color changes with segm class
                segm_mask = segm_mask.cpu()
                color = np.zeros((segm_mask.shape[0], 3), dtype=np.float32)
                color[segm_mask > 0] = np.array(self.color_map(1.0)[:3]) * 255.0
                color[segm_mask <= 0] = np.array(self.color_map(0.0)[:3]) * 255.0
                vector_colors = np.repeat(color[None], T, axis=0)

        #  draw tracks
        if self.tracks_leave_trace != 0:
            for t in range(query_frame + 1, T):
                first_ind = max(0, t - self.tracks_leave_trace) if self.tracks_leave_trace >= 0 else 0
                curr_tracks = tracks[first_ind : t + 1]
                curr_colors = vector_colors[first_ind : t + 1]
                if compensate_for_camera_motion:
                    # Subtract the mean displacement of the tracks outside the
                    # mask, then keep only the tracks inside the mask.
                    diff = (tracks[first_ind : t + 1, segm_mask <= 0] - tracks[t : t + 1, segm_mask <= 0]).mean(1)[
                        :, None
                    ]

                    curr_tracks = curr_tracks - diff
                    curr_tracks = curr_tracks[:, segm_mask > 0]
                    curr_colors = curr_colors[:, segm_mask > 0]

                res_video[t] = self._draw_pred_tracks(
                    res_video[t],
                    curr_tracks,
                    curr_colors,
                )
                if gt_tracks is not None:
                    res_video[t] = self._draw_gt_tracks(res_video[t], gt_tracks[first_ind : t + 1])

        #  draw points
        for t in range(query_frame, T):
            img = Image.fromarray(np.uint8(res_video[t]))
            for i in range(N):
                coord = (tracks[t, i, 0], tracks[t, i, 1])
                is_visible = True
                if visibility is not None:
                    is_visible = visibility[0, t, i]
                # points with a zero coordinate are skipped
                if coord[0] != 0 and coord[1] != 0:
                    if not compensate_for_camera_motion or (compensate_for_camera_motion and segm_mask[i] > 0):
                        img = draw_circle(
                            img,
                            coord=coord,
                            radius=int(self.linewidth * 2),
                            color=vector_colors[t, i].astype(int),
                            visible=is_visible,
                        )
            res_video[t] = np.array(img)

        #  construct the final rgb sequence
        if self.show_first_frame > 0:
            res_video = [res_video[0]] * self.show_first_frame + res_video[1:]
        return torch.from_numpy(np.stack(res_video)).permute(0, 3, 1, 2)[None].byte()

    def _draw_pred_tracks(
        self,
        rgb: np.ndarray,  # H x W x 3
        tracks: np.ndarray,  # T x 2
        vector_colors: np.ndarray,
        alpha: float = 0.5,
    ):
        """Draw the line segments of each track's recent history onto ``rgb``.

        ``tracks`` is indexed as (step, track, xy).  The ``alpha`` argument is
        overwritten per step with ``(s / T) ** 2``, so older segments are
        blended more weakly when ``tracks_leave_trace`` is positive.
        Returns the drawn frame as a NumPy array.
        """
        T, N, _ = tracks.shape
        rgb = Image.fromarray(np.uint8(rgb))
        for s in range(T - 1):
            vector_color = vector_colors[s]
            original = rgb.copy()
            alpha = (s / T) ** 2
            for i in range(N):
                coord_y = (int(tracks[s, i, 0]), int(tracks[s, i, 1]))
                coord_x = (int(tracks[s + 1, i, 0]), int(tracks[s + 1, i, 1]))
                if coord_y[0] != 0 and coord_y[1] != 0:
                    rgb = draw_line(
                        rgb,
                        coord_y,
                        coord_x,
                        vector_color[i].astype(int),
                        self.linewidth,
                    )
            if self.tracks_leave_trace > 0:
                rgb = Image.fromarray(np.uint8(add_weighted(np.array(rgb), alpha, np.array(original), 1 - alpha, 0)))
        rgb = np.array(rgb)
        return rgb

    def _draw_gt_tracks(
        self,
        rgb: np.ndarray,  # H x W x 3,
        gt_tracks: np.ndarray,  # T x 2
    ):
        """Mark every ground-truth point with a red cross on ``rgb``.

        ``gt_tracks`` is indexed as (step, track, xy); points with a
        non-positive coordinate are skipped.  Returns the drawn frame as a
        NumPy array.
        """
        T, N, _ = gt_tracks.shape
        color = np.array((211, 0, 0))
        rgb = Image.fromarray(np.uint8(rgb))
        length = self.linewidth * 3
        for t in range(T):
            for i in range(N):
                # Use a separate name for the current point: rebinding
                # `gt_tracks` here would break indexing on the next iteration.
                point = gt_tracks[t][i]
                #  draw a red cross
                if point[0] > 0 and point[1] > 0:
                    coord_y = (int(point[0]) + length, int(point[1]) + length)
                    coord_x = (int(point[0]) - length, int(point[1]) - length)
                    rgb = draw_line(
                        rgb,
                        coord_y,
                        coord_x,
                        color,
                        self.linewidth,
                    )
                    coord_y = (int(point[0]) - length, int(point[1]) + length)
                    coord_x = (int(point[0]) + length, int(point[1]) - length)
                    rgb = draw_line(
                        rgb,
                        coord_y,
                        coord_x,
                        color,
                        self.linewidth,
                    )
        rgb = np.array(rgb)
        return rgb


================================================
FILE: Open-Sora/build/lib/tools/caption/camera_motion_detect.py
================================================
# ref: https://github.com/antiboredom/camera-motion-detector

import argparse

import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

# Register tqdm's pandas integration so that `progress_apply` (used by
# `apply` below) is available on pandas objects.
tqdm.pandas()


def apply(df, func, **kwargs):
    """Apply ``func`` over ``df``.

    Uses ``parallel_apply`` when the module flag ``pandas_has_parallel`` is
    set and ``progress_apply`` otherwise; ``kwargs`` are forwarded unchanged.
    """
    runner = df.parallel_apply if pandas_has_parallel else df.progress_apply
    return runner(func, **kwargs)


# Optional dependency: when pandarallel can be imported it is initialised with
# a progress bar and `pandas_has_parallel` is set, which makes `apply()` use
# `parallel_apply`.  Without it, `apply()` uses `progress_apply` instead.
try:
    from pandarallel import pandarallel

    pandarallel.initialize(progress_bar=True)
    pandas_has_parallel = True
except ImportError:
    pandas_has_parallel = False


def make_empty(new_w, new_h):
    """Build a coordinate grid: entry ``[y][x]`` holds the pair ``[x, y]``.

    For positive sizes the result has shape ``(new_h, new_w, 2)``.
    """
    grid = [[[x, y] for x in range(new_w)] for y in range(new_h)]
    return np.array(grid)


def get_type(mag, ang, zoom_in, tau_static=1.0, tau_zoom=(0.4, 0.6)):
    """Classify one frame's motion from flow magnitude, angle and zoom ratio.

    Checks are applied in order: magnitude below ``tau_static`` is
    ``"static"``; ``zoom_in`` below ``tau_zoom[0]`` is ``"zoom out"`` and above
    ``tau_zoom[1]`` is ``"zoom in"``; otherwise the angle (degrees) selects a
    pan or tilt label.  ``"unknown"`` is returned when no angle range matches.
    """
    if mag < tau_static:
        return "static"
    if zoom_in < tau_zoom[0]:
        return "zoom out"
    if zoom_in > tau_zoom[1]:
        return "zoom in"

    # "pan left" wraps around 0 degrees, so it is handled separately.
    if ang < 45 or ang >= 315:
        return "pan left"
    angle_sectors = (
        (45, 135, "tilt up"),
        (135, 225, "pan right"),
        (225, 315, "tilt down"),
    )
    for lower, upper, label in angle_sectors:
        if lower <= ang < upper:
            return label
    return "unknown"


def get_video_type(frame_types):
    """Reduce per-frame motion labels to a single label for the whole video.

    A label held by more than half of the frames wins outright.  Otherwise the
    video is ``"unknown"`` if any frame was static, ``"pan/tilt"`` if no frame
    zoomed, and ``"dynamic"`` in the remaining case.
    """
    tally = {}
    for label in frame_types:
        tally[label] = tally.get(label, 0) + 1

    if tally:
        dominant = max(tally, key=tally.get)
        # a strict majority can only be held by one label
        if tally[dominant] > len(frame_types) / 2:
            return dominant

    if "static" in tally:
        return "unknown"
    if "zoom in" in tally or "zoom out" in tally:
        return "dynamic"
    return "pan/tilt"


def process(path: str, frame_interval=15) -> str:
    """Classify the camera motion of the video at ``path``.

    Frames are read every ``frame_interval`` frames.  For each sampled frame
    after the first, dense optical flow is computed, classified with
    ``get_type``, and the per-frame labels are reduced with
    ``get_video_type``.

    Args:
        path: Path of the video file to open with OpenCV.
        frame_interval: Step, in frames, between sampled frames.

    Returns:
        The video-level label produced by ``get_video_type``.
    """
    cap = cv2.VideoCapture(path)
    count = 0
    prvs = None
    frame_types = []
    while cap.isOpened():
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            if count == 0:
                # First frame: remember it as the flow reference and
                # precompute each pixel's distance from the image centre.
                prvs = frame
                h, w = frame.shape
                empty = make_empty(w, h)
                empty_dists = np.sqrt(
                    np.square(empty.ravel()[::2] - (w / 2)) + np.square(empty.ravel()[1::2] - (h / 2))
                )
            else:
                # NOTE(review): `prvs` is only assigned on the first frame, so
                # flow is always measured against frame 0 rather than the
                # previously sampled frame — confirm this is intended.
                flow = cv2.calcOpticalFlowFarneback(prvs, frame, None, 0.5, 3, 15, 3, 5, 1.2, 0)
                mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1], angleInDegrees=True)
                # Despite the names, these are medians over all pixels.
                mean_mag = np.median(mag)
                mean_ang = np.median(ang)

                # Displace every pixel by its flow vector and measure how many
                # end up at least as far from the centre as they started.
                flow_coords = flow + empty
                xvals = flow_coords.ravel()[::2] - (w / 2)
                yvals = flow_coords.ravel()[1::2] - (h / 2)
                dists = np.sqrt(np.square(xvals) + np.square(yvals))
                dist_diff = dists >= empty_dists
                zoom_in_factor = np.count_nonzero(dist_diff) / len(dist_diff)
                frame_types.append(get_type(mean_mag, mean_ang, zoom_in_factor))
            # Jump ahead to the next sampled frame.
            count += frame_interval
            cap.set(cv2.CAP_PROP_POS_FRAMES, count)
        else:
            # No more frames could be read.
            cap.release()
            break
    video_type = get_video_type(frame_types)
    return video_type


def main(args):
    """Add a ``cmotion`` column to the CSV at ``args.input`` and save a copy.

    Each value of the ``path`` column is classified with ``process``; the
    result is written with ``.csv`` replaced by ``_cmotion.csv`` in the name.
    """
    destination = args.input.replace(".csv", "_cmotion.csv")
    table = pd.read_csv(args.input)
    table["cmotion"] = apply(table["path"], process)
    table.to_csv(destination, index=False)
    print(f"Output saved to {destination}")


if __name__ == "__main__":
    # CLI entry point: positional CSV path plus a switch that forces the
    # non-parallel apply path.
    cli = argparse.ArgumentParser()
    cli.add_argument("input", type=str)
    cli.add_argument("--disable-parallel", action="store_true")
    args = cli.parse_args()
    if args.disable_parallel:
        pandas_has_parallel = False
    main(args)


================================================
FILE: Open-Sora/build/lib/tools/caption/caption_gpt4.py
================================================
import argparse
import base64
import csv
import os
from io import BytesIO

import requests
import tqdm

from .utils import IMG_EXTENSIONS, PROMPTS, VID_EXTENSIONS, VideoTextDataset


def to_base64(image):
    """Save ``image`` as JPEG into memory and return the bytes base64-encoded as text."""
    with BytesIO() as jpeg_stream:
        image.save(jpeg_stream, format="JPEG")
        encoded = base64.b64encode(jpeg_stream.getvalue())
    return encoded.decode("utf-8")


def get_caption(frame, prompt, api_key):
    """Request a caption for three frames from the OpenAI chat completions API.

    Args:
        frame: Indexable of base64-encoded JPEG strings; entries 0, 1 and 2
            are sent as images.
        prompt: Text instruction sent alongside the images.
        api_key: Bearer token placed in the ``Authorization`` header.

    Returns:
        The content of the first returned choice with newlines replaced by
        spaces.
    """
    image_parts = [
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[idx]}"}} for idx in range(3)
    ]
    request_body = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}, *image_parts],
            }
        ],
        "max_tokens": 300,
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
        json=request_body,
        timeout=60,
    )
    return response.json()["choices"][0]["message"]["content"].replace("\n", " ")


def main(args):
    """Caption every sample listed in ``args.input`` and write the results to CSV.

    The output file is the input path with its extension replaced by
    ``_caption.csv`` and holds a ``video,text`` header followed by one
    ``(path, caption)`` row per sample.

    Args:
        args: Namespace with ``input`` (CSV path), ``prompt`` (key into
            ``PROMPTS``) and ``key`` (API key passed to ``get_caption``).

    Raises:
        AssertionError: If the prompt type does not match the data extension.
        ValueError: If the prompt type is neither ``"image"`` nor ``"video"``.
    """
    # ======================================================
    # 1. read video list
    # ======================================================
    dataset = VideoTextDataset(args.input)
    output_file = os.path.splitext(args.input)[0] + "_caption.csv"

    # make sure that the prompt type matches the data type
    # (checked before the output file is opened, so a failed check does not
    # truncate an existing result file)
    data_extension = "." + dataset.data["path"].iloc[0].split(".")[-1]
    prompt_type = PROMPTS[args.prompt]["type"]
    if prompt_type == "image":
        assert (
            data_extension.lower() in IMG_EXTENSIONS
        ), "The prompt is suitable for an image dataset but the data is not image."
    elif prompt_type == "video":
        assert (
            data_extension.lower() in VID_EXTENSIONS
        ), "The prompt is suitable for a video dataset but the data is not video."
    else:
        raise ValueError(f"Found invalid prompt type {prompt_type}")

    # ======================================================
    # 2. generate captions
    # ======================================================
    prompt_template = PROMPTS[args.prompt]["text"]
    # The context manager closes the file even when a request fails midway;
    # newline="" is what the csv module expects for files it writes.
    with open(output_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["video", "text"])

        for sample in tqdm.tqdm(dataset):
            prompt = prompt_template
            if "text" in args.prompt:
                prompt = prompt.format(sample["text"])
            frames = [to_base64(frame) for frame in sample["image"]]
            caption = get_caption(frames, prompt, args.key)

            writer.writerow((sample["path"], caption))


if __name__ == "__main__":
    # CLI entry point: input CSV, prompt key and API key.
    cli = argparse.ArgumentParser()
    cli.add_argument("input", type=str, help="Path to the input CSV file")
    cli.add_argument("--prompt", type=str, default="video-f3-detail-3ex")
    cli.add_argument("--key", type=str)

    main(cli.parse_args())


================================================
FILE: Open-Sora/build/lib/tools/caption/caption_llama3.py
================================================
import argparse
import csv
import os
import warnings
from datetime import timedelta

import pandas as pd
import torch
import torch.distributed as dist
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

from .utils import read_file

# NOTE(review): this runs a shell `cp` every time the module is imported.
# The script path is interpolated into the command unquoted, and the exit
# status is ignored, so a failed copy is not reported by this code.
os.system(f"cp {__file__} ~/backup/")  # optionally backup the script
# Suppress all Python warnings for the rest of the process.
warnings.filterwarnings("ignore")
# Set before the tokenizer is created; presumably this silences the
# tokenizers parallelism warning — confirm against the transformers docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from torch.distributed.elastic.multiprocessing.errors import record


class CSVTextDataset(Dataset):
    """Dataset over the rows of a CSV file that must contain a ``text`` column.

    After ``set_rank_and_world_size`` the dataset only exposes the contiguous
    slice of rows assigned to the given rank.
    """

    def __init__(self, csv_path):
        """Load ``csv_path`` and check that it has a ``text`` column."""
        self.df = pd.read_csv(csv_path)
        # assert text is in the columns
        assert "text" in self.df.columns, "text column not found in the csv file"
        # Until the dataset is sharded it covers every row, so that
        # `write_to_csv` also works without a prior sharding call.
        self.start_index = 0
        self.end_index = len(self.df)

    def __len__(self):
        """Return the number of rows currently held (the shard size once sharded)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return row ``idx`` of the current rows; raise ``IndexError`` when out of range."""
        if idx < 0 or idx >= len(self.df):
            raise IndexError
        return self.df.iloc[idx]

    def set_rank_and_world_size(self, rank, world_size):
        """Restrict the dataset to the rows belonging to ``rank``.

        Rows are split into ``world_size`` equal contiguous chunks; the last
        rank additionally takes the remainder.
        """
        self.rank = rank
        self.world_size = world_size
        self.data_per_gpu = len(self) // world_size
        self.start_index = rank * self.data_per_gpu
        self.end_index = (rank + 1) * self.data_per_gpu if rank != world_size - 1 else len(self)
        self.df = self.df.iloc[self.start_index : self.end_index]

    def write_to_csv(self, output_file, data, new_key):
        """Write this rank's rows to ``output_file`` with ``data`` as an extra column.

        Args:
            output_file: Path of the CSV file to create.
            data: Sequence with one value per row of this shard, in row order.
            new_key: Header of the appended column.

        Row labels are compared against ``start_index``/``end_index``, which
        assumes the default integer index produced by ``pd.read_csv``.
        """
        # Build a plain list: `Index + list` would be element-wise arithmetic.
        columns = list(self.df.columns) + [new_key]
        # csv.writer has no close(); the file object is what must be closed.
        with open(output_file, "w", newline="") as handle:
            writer = csv.writer(handle)
            writer.writerow(columns)
            for index, row in self.df.iterrows():
                if index < self.start_index or index >= self.end_index:
                    continue
                writer.writerow([*row, data[index - self.start_index]])


def pad_left(sequences, padding_value=0):
    """Left-pad 1-D tensors to a common length and stack them into one batch.

    Each sequence is prefixed with ``padding_value`` entries (same dtype and
    device as the sequence) until it matches the longest sequence.
    """
    target_len = max(seq.size(0) for seq in sequences)
    rows = []
    for seq in sequences:
        filler = torch.full((target_len - seq.size(0),), padding_value, dtype=seq.dtype).to(seq.device)
        rows.append(torch.cat([filler, seq], dim=0))
    return torch.stack(rows)


def _extract_bracketed(response):
    """Return the text after the first ``[`` up to the next ``]`` in ``response``.

    A missing ``[`` makes the slice start at the beginning of the string and a
    missing ``]`` makes it run to the end, so no character is silently dropped.
    """
    start = response.find("[") + 1  # 0 when "[" is absent
    end = response.find("]", start)
    if end == -1:
        end = len(response)
    return response[start:end]


@record
def main(args):
    """Extract keywords from the ``text`` column of the input file with a chat LLM.

    Each rank processes its own shard of the input and writes
    ``<output_prefix>_rank<r>_<key>.csv``. After all ranks have finished,
    rank 0 concatenates the per-rank files into ``<output_prefix>_<key>.csv``.

    Args:
        args: Parsed command-line namespace; reads ``input``, ``output_prefix``,
            ``key``, ``prompt``, ``batch_size`` and ``model_id``.
    """
    # ======================================================
    # 1. init environment
    # ======================================================
    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    # ======================================================
    # 2. Prep rank-wise dataloader
    # ======================================================
    dataframe = read_file(args.input)
    print("read data from {}".format(args.input))
    dataset = CSVTextDataset(args.input)
    dataset.set_rank_and_world_size(dist.get_rank(), dist.get_world_size())

    import os

    # optional remote debugger hook; only rank 2 waits for the attachment
    debug_address = os.getenv("DEBUG_ADDRESS")
    if debug_address is not None and dist.get_rank() == 2:
        import ptvsd

        print("waiting for debugger attachment")
        ptvsd.enable_attach(address=("localhost", int(debug_address)), redirect_output=True)
        ptvsd.wait_for_attach()

    output_file = args.output_prefix + f"_rank{dist.get_rank()}" + f"_{args.key}.csv"
    columns = list(dataframe.columns) + [args.key]

    # the ``with`` block closes the per-rank file even if generation fails midway
    with open(output_file, "w", newline="") as output_file_handle:
        writer = csv.writer(output_file_handle)
        writer.writerow(columns)

        print("the processed data saved on this rank will be saved to {}".format(output_file))

        def collate_fn(batch):
            # keep the batch as a plain list of rows
            return batch

        dataloader = torch.utils.data.DataLoader(
            dataset,
            batch_size=args.batch_size,
            collate_fn=collate_fn,
            shuffle=False,
        )

        # ======================================================
        # 3. process using llama3 and prompt
        # ======================================================
        print("Using model with the id {}".format(args.model_id))
        model_id = args.model_id
        tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map=dist.get_rank() % torch.cuda.device_count(),
        )
        dist.barrier()
        print("======== Process data using LLAMA3 ========")

        def extract_batch(texts, prompt):
            """Generate one decoded response per entry of ``texts`` using ``prompt`` as system message."""
            input_ids_list = [
                tokenizer.apply_chat_template(
                    [{"role": "system", "content": prompt}, {"role": "user", "content": text}],
                    add_generation_prompt=True,
                    return_tensors="pt",
                ).to(model.device)[0]
                for text in texts
            ]
            attention_mask_list = [
                torch.ones(input_ids.shape, dtype=torch.long, device=model.device) for input_ids in input_ids_list
            ]

            # Decoder-only generation continues from the last position of every row, so
            # the padding has to sit on the left: each prompt then ends at the same index.
            input_ids_batch = pad_left(input_ids_list, padding_value=tokenizer.eos_token_id)
            attention_mask_batch = pad_left(attention_mask_list, padding_value=0)

            terminators = [
                tokenizer.eos_token_id,
                tokenizer.convert_tokens_to_ids("<|eot_id|>"),
            ]
            outputs = model.generate(
                input_ids_batch,
                max_new_tokens=512,
                attention_mask=attention_mask_batch,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=terminators,
            )

            # with left padding the generated tokens start right after the padded prompt length
            prompt_len = input_ids_batch.shape[1]
            return [tokenizer.decode(output[prompt_len:], skip_special_tokens=True) for output in outputs]

        print("Processing starting...")
        if args.prompt == "" and args.key == "objects":
            prompt = (
                "You are a AI assistant to extract objects from user's text. "
                "For example: user: 'In this video a dog is running around. In addition, a person is laughing at the dog.', you produce a list of objects separated by ',' and wrapped by '[' and ']': '[dog, person]' "
            )
        elif args.prompt == "" and args.key == "actions":
            prompt = (
                "You are a AI assistant to extract actions from user's text. "
                "For example: user: 'In this video a dog is running around. In addition, a person is laughing at the dog.', you produce a list of actions separated by ',' and wrapped by '[' and ']': '[run, laugh]' "
            )
        else:
            prompt = args.prompt

        print("Prompt: {}".format(prompt))

        for batch in tqdm(dataloader):
            # get the text column from the batch
            texts = [row["text"] for row in batch]
            list_keywords = extract_batch(texts, prompt)

            for row, response in zip(batch, list_keywords):
                keywords = _extract_bracketed(response)
                # we empirically observe that it produces newlines when no keywords are found
                if "\n" in keywords or not keywords.strip():
                    keywords = "NONE_FOUND"
                writer.writerow([*row, keywords])

    # every rank must have finished (and closed) its file before rank 0 collates
    dist.barrier()

    if dist.get_rank() == 0:
        collated_file = args.output_prefix + f"_{args.key}.csv"
        print("All ranks are finished. Collating the processed data to {}".format(collated_file))
        import pandas as pd

        csv_files = [args.output_prefix + f"_rank{i}" + f"_{args.key}.csv" for i in range(dist.get_world_size())]
        dataframes = []
        for file in csv_files:
            df = pd.read_csv(file)
            # an empty ``key`` cell is read back as NaN; normalise it to "NONE_FOUND"
            df[args.key] = df[args.key].fillna("NONE_FOUND")
            dataframes.append(df)
        combined_df = pd.concat(dataframes, ignore_index=True)

        # Save the combined DataFrame to a new CSV file
        combined_df.to_csv(collated_file, index=False)
        print("Collated data saved to {}".format(collated_file))
    # terminate distributed env
    dist.destroy_process_group()


if __name__ == "__main__":
    # Command-line entry point for the keyword-extraction script.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-id", default="meta-llama/Meta-Llama-3-8B-Instruct")
    parser.add_argument("input", type=str, help="Path to the input CSV file")
    # main() concatenates this prefix into every output path, so it cannot be omitted
    parser.add_argument("--output_prefix", type=str, required=True, help="Path to the output CSV file")
    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--batch_size", type=int, default=32)
    # main() uses the key as the name of the new column and reads that column back when collating
    parser.add_argument("--key", type=str, required=True)
    args = parser.parse_args()

    main(args)


================================================
FILE: Open-Sora/build/lib/tools/caption/caption_llava.py
================================================
import argparse
import csv
import time
import warnings
from datetime import timedelta

import torch
import torch.distributed as dist
from colossalai.cluster import DistCoordinator, ProcessGroupMesh
from colossalai.shardformer import ShardConfig, ShardFormer
from colossalai.utils import get_current_device, set_seed
from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from llava.conversation import conv_templates
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm

from ..datasets.utils import IMG_EXTENSIONS, VID_EXTENSIONS
from .acceleration.llava.policies import LlavaLlamaForCausalLMPolicy, LlavaMistralForCausalLMPolicy
from .utils import PROMPTS, Timer, VideoTextDataset, collate_fn

# Executed once at import time, i.e. before main() loads the model.
# NOTE(review): presumably turns off PyTorch's default parameter initialisation so that
# building the model before loading pretrained weights is faster — confirm in llava.utils.
disable_torch_init()


class NoPaddingDistributedSampler(DistributedSampler):
    """Distributed sampler that hands out every index exactly once.

    Indices are dealt round-robin (``rank, rank + num_replicas, ...``). When the
    dataset size is not divisible by ``num_replicas`` the first
    ``len(dataset) % num_replicas`` ranks receive one extra sample instead of
    the dataset being padded.
    """

    def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0, drop_last=False):
        # ``shuffle`` and ``drop_last`` are accepted but not forwarded: the parent
        # is always configured with shuffle=False and drop_last=False.
        super().__init__(
            dataset=dataset, num_replicas=num_replicas, rank=rank, seed=seed, shuffle=False, drop_last=False
        )
        per_rank, leftover = divmod(len(self.dataset), self.num_replicas)
        # the leftover items go to the first ``leftover`` ranks
        self.num_samples = per_rank + 1 if self.rank < leftover else per_rank
        self.total_size = len(dataset)

    def __iter__(self):
        dataset_len = len(self.dataset)
        if not self.shuffle:
            order = list(range(dataset_len))  # type: ignore[arg-type]
        else:
            # deterministically shuffle based on epoch and seed
            generator = torch.Generator()
            generator.manual_seed(self.seed + self.epoch)
            order = torch.randperm(dataset_len, generator=generator).tolist()  # type: ignore[arg-type]

        # clip to total_size, then take every num_replicas-th index starting at this rank
        shard = order[: self.total_size][self.rank : self.total_size : self.num_replicas]
        assert len(shard) == self.num_samples
        return iter(shard)


@torch.inference_mode()
def main(args):
    """Caption every item listed in ``args.input`` with a LLaVA model.

    The world is split into ``dp_size`` data-parallel groups of ``tp_size``
    tensor-parallel ranks. The first rank of each tensor-parallel group writes
    ``<input stem>_caption_part<dp_rank>.csv`` with the columns ``path``,
    ``text`` and ``num_frames``.

    Args:
        args: Parsed command-line namespace; reads ``input``, ``model_path``,
            ``prompt``, ``resize``, ``num_frames``, ``max_tokens``, ``bs``,
            ``tp_size``, ``dp_size``, ``num_workers``, ``prefetch_factor``,
            ``flash_attention``, ``profile`` and ``profile_warmup``.
    """
    import os

    # ======================================================
    # 1. init environment
    # ======================================================
    # we set a very large timeout to avoid some processes exit early
    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
    set_seed(1024)
    coordinator = DistCoordinator()

    # prepare the dp and tp groups
    assert (
        args.dp_size * args.tp_size == coordinator.world_size
    ), f"DP size {args.dp_size} * TP size {args.tp_size} must equal to world size {coordinator.world_size}"
    mesh = ProcessGroupMesh(args.dp_size, args.tp_size)
    dp_group = mesh.get_group_along_axis(0)
    tp_group = mesh.get_group_along_axis(1)

    # ======================================================
    # 2. load model
    # ======================================================
    model_path = args.model_path
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # Pytorch non-meta copying warning fills out the console
        tokenizer, model, image_processor, context_len = load_pretrained_model(
            model_path=model_path,
            model_base=None,
            model_name=get_model_name_from_path(model_path),
            device=get_current_device(),
            torch_dtype=torch.float16,
            attn_implementation="flash_attention_2" if args.flash_attention else "eager",
        )
        dist.barrier()

    # ======================================================
    # 3. Apply system optimization
    # ======================================================
    tp_size = dist.get_world_size(tp_group)
    use_tp = tp_size > 1
    shard_config = ShardConfig(
        tensor_parallel_process_group=tp_group if use_tp else None,
        enable_tensor_parallelism=use_tp,
    )
    shard_former = ShardFormer(shard_config=shard_config)

    # check the model type
    model_name = model.__class__.__name__
    print(model_name)
    if model_name == "LlavaLlamaForCausalLM":
        model = shard_former.optimize(model, policy=LlavaLlamaForCausalLMPolicy())[0].cuda()
    elif model_name == "LlavaMistralForCausalLM":
        model = shard_former.optimize(model, policy=LlavaMistralForCausalLMPolicy())[0].cuda()
    else:
        print(f"The shardformer policy for {model_name} is not implemented, skip")
    torch.cuda.empty_cache()

    # ======================================================
    # 4. Prepare dataloader
    # ======================================================
    # prepare prompt
    query = PROMPTS[args.prompt]["text"]
    if dist.get_rank() == 0:
        print(f"Prompt: {query}")

    def build_input_ids(query_text):
        """Tokenize ``query_text`` behind ``args.num_frames`` image placeholders."""
        conv = conv_templates["chatml_direct"].copy()
        conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + query_text)
        prompt = conv.get_prompt()
        # add num_frames images
        t = prompt.split("<image>")
        prompt = t[0] + "<image>" * args.num_frames + t[1]
        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        return input_ids.unsqueeze(0)

    if "text" in args.prompt:
        # the prompt embeds per-sample text, so it has to be tokenized for every sample
        def get_text_input_ids(text):
            return build_input_ids(query.format(text))

    else:
        # the prompt is the same for every sample: tokenize it once and reuse it
        input_ids = build_input_ids(query)

        def get_text_input_ids(*_unused):
            return input_ids

    # build dataset
    def transform(imgs):
        imgs = process_images(imgs, image_processor, model.config)
        imgs = imgs.to(dtype=torch.float16)
        return imgs

    dataset = VideoTextDataset(
        args.input,
        transform=transform,
        num_frames=args.num_frames,
        get_text_input_ids=get_text_input_ids,
        resize=args.resize,
    )

    # make sure that the prompt type matches the data type
    data_extension = "." + dataset.data["path"].iloc[0].split(".")[-1]
    prompt_type = PROMPTS[args.prompt]["type"]
    if prompt_type == "image":
        assert (
            data_extension.lower() in IMG_EXTENSIONS
        ), f"The prompt is suitable for an image dataset but the data is not image. The first data is of format {data_extension}"
    elif prompt_type == "video":
        assert (
            data_extension.lower() in VID_EXTENSIONS
        ), f"The prompt is suitable for a video dataset but the data is not video. The first data is of format {data_extension}"
    else:
        raise ValueError(f"Found invalid prompt type {prompt_type}")

    total_num_videos = len(dataset)

    # build sampler
    dp_rank = dist.get_rank(dp_group)
    dp_size = dist.get_world_size(dp_group)
    sampler = NoPaddingDistributedSampler(dataset, rank=dp_rank, num_replicas=dp_size)

    # build dataloader
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.bs,
        shuffle=False,
        num_workers=args.num_workers,
        pin_memory=True,
        prefetch_factor=args.prefetch_factor,
        sampler=sampler,
        collate_fn=collate_fn,
    )

    # Derive the output path from the input stem. A plain ``.replace(".csv", ...)``
    # leaves a non-".csv" input (e.g. ".parquet") unchanged, which would make the
    # writer below truncate the input file itself.
    input_stem, _ = os.path.splitext(args.input)

    # create csv writer
    has_dp_writer = dist.get_rank(tp_group) == 0

    dp_file = None
    if has_dp_writer:
        # the dp writer takes care of the files processed on the current dp rank
        # so we use write mode
        output_file_split = input_stem + f"_caption_part{dp_rank}.csv"
        dp_file = open(output_file_split, "w", newline="")
        dp_writer = csv.writer(dp_file)
        dp_writer.writerow(["path", "text", "num_frames"])

    # ======================================================
    # 5. generate captions
    # ======================================================
    if dist.get_rank(tp_group) == 0:
        pbar = tqdm(dataloader, position=dp_rank, desc=f"Data Parallel Rank {dist.get_rank(dp_group)}")
    else:
        pbar = dataloader

    if args.profile:
        encode_time = []
        generate_time = []
        output_length = []
        total_time = []

    try:
        for i, batch in enumerate(pbar):
            # measure time
            if args.profile:
                torch.cuda.synchronize()
                start_time = time.time()

            video_files, frames, video_lengths, img_size_list, texts = batch

            # encode the batch of inputs
            with Timer() as encode_timer:
                samples = []
                for imgs, imgs_size, input_ids in zip(frames, img_size_list, texts):
                    imgs = imgs.cuda()
                    input_ids = input_ids.cuda()
                    _, _, _, _, inputs_embeds, _ = model.prepare_inputs_labels_for_multimodal(
                        input_ids, None, None, None, None, images=imgs, image_sizes=imgs_size
                    )
                    samples.append(inputs_embeds)

            # left-pad every sample to the longest one and mask out the padding
            max_len = max(sample.shape[1] for sample in samples)
            attention_mask = torch.tensor(
                [[0] * (max_len - sample.shape[1]) + [1] * sample.shape[1] for sample in samples]
            ).to(model.device)
            inputs_embeds = torch.cat(
                [
                    torch.cat(
                        [
                            torch.zeros(
                                (1, max_len - sample.shape[1], sample.shape[-1]),
                                device=model.device,
                                dtype=torch.float16,
                            ),
                            sample,
                        ],
                        dim=1,
                    )
                    for sample in samples
                ],
                dim=0,
            )

            # generate outputs
            with Timer() as generate_timer:
                # call generate() of the model's parent class, skipping the override on type(model)
                output_ids = super(type(model), model).generate(
                    inputs_embeds=inputs_embeds,
                    attention_mask=attention_mask,
                    do_sample=False,  # sampling is not deterministic and may cause TP to hang
                    max_new_tokens=args.max_tokens,
                    use_cache=True,
                )

                # skip warmup and add profiling data
                if args.profile and i >= args.profile_warmup:
                    output_length.append(output_ids.size(0) * output_ids.size(1))

                outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
                outputs = [output.replace("\n", " ").strip() for output in outputs]

            # skip warmup and add profiling data
            if args.profile and i >= args.profile_warmup:
                # measure time
                torch.cuda.synchronize()
                time_taken = time.time() - start_time

                total_time.append(time_taken)
                encode_time.append(encode_timer.time_taken)
                generate_time.append(generate_timer.time_taken)

            # save results
            if has_dp_writer:
                dp_writer.writerows(zip(video_files, outputs, video_lengths))
    finally:
        # close file writing, also when captioning fails midway
        if dp_file is not None:
            dp_file.close()

    # display profiling info
    if args.profile:
        print(output_length)
        num_samples_after_warmup = total_num_videos - args.bs * args.profile_warmup * dp_size
        print(f"throughput (samples/s): {num_samples_after_warmup / sum(total_time)}")
        print(f"average encode time per sample: {sum(encode_time) / num_samples_after_warmup}")
        print(f"average generate time per sample: {sum(generate_time) / num_samples_after_warmup}")
        print(f"average number of tokens characters per sample: {sum(output_length) / num_samples_after_warmup}")
        print(f"Max GPU allocated / GB: {torch.cuda.max_memory_allocated() / 1024**3}")
        print(f"Max GPU reserved / GB: {torch.cuda.max_memory_reserved() / 1024**3}")

    # ======================================================
    # 6. shutdown
    # ======================================================
    dist.barrier()

    # terminate distributed env
    dist.destroy_process_group()


if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them and run main().
    cli = argparse.ArgumentParser()

    # data / model selection
    cli.add_argument("input", type=str, help="Path to the input CSV file")
    cli.add_argument("--model-path", type=str, default="liuhaotian/llava-v1.6-34b")
    cli.add_argument("--prompt", type=str, default="video-f1-detail-3ex")
    cli.add_argument("--resize", type=int, default=336)
    cli.add_argument("--num-frames", type=int, default=1)
    cli.add_argument("--max-tokens", type=int, default=300)

    # speed related
    cli.add_argument("--bs", type=int, default=16)
    cli.add_argument("--tp-size", type=int, default=2)
    cli.add_argument("--dp-size", type=int, default=4)
    cli.add_argument("--num-workers", type=int, default=8)
    cli.add_argument("--prefetch-factor", type=int, default=8, help="Prefetch factor")
    cli.add_argument(
        "--flash-attention",
        action="store_true",
        help="Whether to use flash attention. You can turn on this flag for llama model and off for mistral model.",
    )

    # debug related
    cli.add_argument("--profile", action="store_true")
    cli.add_argument("--profile-warmup", type=int, default=1)

    args = cli.parse_args()
    main(args)


================================================
FILE: Open-Sora/build/lib/tools/caption/utils.py
================================================
import time

import pandas as pd
import torch
import torchvision.transforms as transforms
from torchvision.datasets.folder import pil_loader

from tools.datasets.utils import extract_frames, is_video

# Lower-case file suffixes (including the leading dot) listed as image formats.
IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
# Lower-case file suffixes (including the leading dot) listed as video formats.
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
# Captioning prompt presets, keyed by preset name. Every entry holds:
#   "text": the instruction text; some presets contain a "{}" placeholder
#           (presumably filled with per-sample text via str.format by the caller — confirm)
#   "type": "image" or "video", the kind of data the preset is meant for
PROMPTS = {
    "image": {
        "text": "Describe this image and its style to generate a succinct yet informative description. Pay attention to all objects in the image. The description should be useful for AI to re-generate the image. The description should be no more than five sentences. Remember do not exceed 5 sentences.",
        "type": "image",
    },
    "image-text": {
        "text": "Describe this image and its style in a very detailed manner. Pay attention to all objects in the image. The description should be useful for AI to re-generate the image. The description should be no more than six sentences. Some information about the image is '{}'.",
        "type": "image",
    },
    "image-3ex": {
        "text": "An image is given. Describe this image and its style to generate a succinct yet informative description. Pay attention to all objects in the image. The description should be useful for AI to re-generate the video. The description should be no more than five sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick and walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
        "type": "image",
    },
    "video": {
        "text": "Describe this video and its style in a very detailed manner. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences.",
        "type": "video",
    },
    "video-text": {
        "text": "Describe this video and its style in a very detailed manner. Some information about the image is '{}'. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences.",
        "type": "video",
    },
    "video-f1-detail-3ex": {
        "text": "A video is given by providing the middle frame. Describe this video and its style to generate a description. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
        "type": "video",
    },
    "video-f1-detail-2ex-text": {
        "text": "A video is given by providing the middle frame. Some information about the image is '{}'. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.",
        "type": "video",
    },
    "video-f3-detail-3ex": {
        "text": "A video is given by providing three frames in chronological order. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
        "type": "video",
    },
    "video-f3-detail-2ex-text": {
        "text": "A video is given by providing three frames in chronological order. Some information about the image is '{}'. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.",
        "type": "video",
    },
}


# Maps a requested frame count to a tuple with that many fractions between 0 and 1.
# VideoTextDataset looks the tuple up by ``num_frames`` and passes it as ``points``
# to extract_frames (presumably relative positions within the video — confirm there).
NUM_FRAMES_POINTS = {
    1: (0.5,),
    2: (0.25, 0.75),
    3: (0.1, 0.5, 0.9),
}


def read_file(input_path):
    """Load a table from ``input_path``, choosing the pandas reader by file suffix.

    Supported suffixes are ``.csv`` and ``.parquet``.

    Raises:
        NotImplementedError: if the path ends with neither supported suffix.
    """
    readers = (
        (".csv", pd.read_csv),
        (".parquet", pd.read_parquet),
    )
    for suffix, reader in readers:
        if input_path.endswith(suffix):
            return reader(input_path)
    raise NotImplementedError(f"Unsupported file format: {input_path}")


class VideoTextDataset(torch.utils.data.Dataset):
    """Dataset that yields sampled frames (or one image) plus optional text.

    Rows are read from the table at ``csv_path``. Every row needs a ``path``
    column; a ``text`` column is used when the table has one.

    Each item is a dict with keys ``path``, ``image``, ``length``,
    ``img_size`` and ``text``.
    """

    def __init__(self, csv_path, transform=None, num_frames=3, get_text_input_ids=None, resize=None):
        self.csv_path = csv_path
        self.transform = transform
        self.data = read_file(csv_path)
        self.points = NUM_FRAMES_POINTS[num_frames]
        self.get_text_input_ids = get_text_input_ids
        self.resize_size = resize
        if resize is None:
            self.resize = None
        else:
            self.resize = transforms.Resize(resize, transforms.InterpolationMode.BICUBIC)
        self.use_text = "text" in self.data.columns

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.getitem(index)

    def getitem(self, index):
        """Build the sample dict for row ``index``."""
        row = self.data.iloc[index]
        path = row["path"]
        if is_video(path):
            images, length = extract_frames(row["path"], points=self.points, backend="opencv", return_length=True)
        else:
            images, length = [pil_loader(path)], 1

        # Only images with a side larger than the limit are resized.
        limit = self.resize_size
        if limit is not None:
            images = [
                self.resize(img) if (img.size[0] > limit or img.size[1] > limit) else img
                for img in images
            ]
        # Sizes are recorded before the user transform is applied.
        imgs_size = [img.size for img in images]
        if self.transform is not None:
            images = self.transform(images)

        # Images stay in a list because the PyTorch dataloader does not accept PIL images.
        out = dict(path=path, image=images, length=length, img_size=imgs_size)
        tokenize = self.get_text_input_ids
        if self.use_text:
            raw_text = row["text"]
            out["text"] = raw_text if tokenize is None else tokenize(raw_text)
        else:
            out["text"] = "" if tokenize is None else tokenize()
        return out


def collate_fn(batch):
    """Regroup a list of sample dicts into one list per field.

    Returns:
        A 5-tuple ``(paths, images, lengths, img_sizes, texts)``; each element
        is a list with one entry per sample, in batch order.
    """
    fields = ("path", "image", "length", "img_size", "text")
    paths, images, lengths, img_sizes, texts = ([item[key] for item in batch] for key in fields)
    return paths, images, lengths, img_sizes, texts


class Timer:
    """Context manager that records wall-clock time spent in a ``with`` block.

    After the block exits, ``time_taken`` holds ``end_time - start_time`` in
    seconds (as reported by ``time.time()``). Exceptions are not suppressed.
    """

    def __init__(self):
        self.time_taken = self.start_time = self.end_time = 0

    def __enter__(self):
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        finished = time.time()
        self.end_time = finished
        self.time_taken = finished - self.start_time


================================================
FILE: Open-Sora/build/lib/tools/datasets/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/tools/datasets/analyze.py
================================================
import argparse
import os

import matplotlib.pyplot as plt
import pandas as pd


def read_file(input_path):
    """Read a ``.csv`` or ``.parquet`` dataset into a pandas DataFrame.

    Raises:
        NotImplementedError: for any other file suffix.
    """
    is_csv = input_path.endswith(".csv")
    is_parquet = input_path.endswith(".parquet")
    if not (is_csv or is_parquet):
        raise NotImplementedError(f"Unsupported file format: {input_path}")
    reader = pd.read_csv if is_csv else pd.read_parquet
    return reader(input_path)


def parse_args():
    """Parse the command line: a positional dataset path and ``--save-img``.

    ``--save-img`` is the directory where plots are written; it defaults to
    ``samples/infos/``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("input", type=str, help="Path to the input dataset")
    cli.add_argument("--save-img", type=str, default="samples/infos/", help="Path to save the image")
    parsed = cli.parse_args()
    return parsed


def plot_data(data, column, bins, name):
    """Save a histogram of ``data[column]`` to the image file ``name``.

    Args:
        data: DataFrame holding the column to plot.
        column: name of the column to histogram.
        bins: number of histogram bins, forwarded to ``DataFrame.hist``.
        name: output image path; its parent directory is created if missing.
    """
    plt.clf()
    data.hist(column=column, bins=bins)
    out_dir = os.path.dirname(name)
    # os.makedirs("") raises FileNotFoundError, so skip it for bare file names.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(name)
    print(f"Saved {name}")


def plot_categorical_data(data, column, name):
    """Save a bar chart of the value counts of ``data[column]`` to ``name``.

    Args:
        data: DataFrame holding the column to plot.
        column: name of the categorical column.
        name: output image path; its parent directory is created if missing.
    """
    plt.clf()
    data[column].value_counts().plot(kind="bar")
    out_dir = os.path.dirname(name)
    # os.makedirs("") raises FileNotFoundError, so skip it for bare file names.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(name)
    print(f"Saved {name}")


# Columns that main() plots when they are present in the dataset.
# An integer value is the number of histogram bins handed to plot_data;
# None means the column is drawn as a bar chart via plot_categorical_data.
COLUMNS = {
    "num_frames": 100,
    "resolution": 100,
    "text_len": 100,
    "aes": 100,
    "match": 100,
    "flow": 100,
    "cmotion": None,
}


def main(args):
    """Print summary statistics for a dataset and optionally save plots.

    Rows whose ``num_frames`` equals 1 are reported as images, every other row
    as a video. When ``args.save_img`` is truthy, one plot per column listed in
    ``COLUMNS`` (and present in the data) is written below that directory.
    """
    data = read_file(args.input)
    is_image = data["num_frames"] == 1

    # === Image Data Info ===
    if is_image.sum() > 0:
        print("=== Image Data Info ===")
        img_data = data[is_image]
        print(f"Number of images: {len(img_data)}")
        print(img_data.head())
        print(img_data.describe())
        if args.save_img:
            for column, bins in COLUMNS.items():
                # num_frames and cmotion are never plotted for still images.
                if column not in img_data.columns or column in ["num_frames", "cmotion"]:
                    continue
                target = os.path.join(args.save_img, f"image_{column}.png")
                if bins is None:
                    plot_categorical_data(img_data, column, target)
                else:
                    plot_data(img_data, column, bins, target)

    # === Video Data Info ===
    if is_image.all():
        return
    print("=== Video Data Info ===")
    video_data = data[~is_image]
    print(f"Number of videos: {len(video_data)}")
    if "num_frames" in video_data.columns:
        total_num_frames = video_data["num_frames"].sum()
        print(f"Number of frames: {total_num_frames}")
        # Duration estimate assumes every video runs at 30 frames per second.
        DEFAULT_FPS = 30
        total_hours = total_num_frames / DEFAULT_FPS / 3600
        print(f"Total hours (30 FPS): {int(total_hours)}")
    print(video_data.head())
    print(video_data.describe())
    if not args.save_img:
        return
    for column, bins in COLUMNS.items():
        if column not in video_data.columns:
            continue
        target = os.path.join(args.save_img, f"video_{column}.png")
        if bins is None:
            plot_categorical_data(video_data, column, target)
        else:
            plot_data(video_data, column, bins, target)


if __name__ == "__main__":
    # Script entry point: parse the command line, then print (and optionally
    # plot) the dataset summary.
    args = parse_args()
    main(args)


================================================
FILE: Open-Sora/build/lib/tools/datasets/convert.py
================================================
import argparse
import os
import time

import pandas as pd
from torchvision.datasets import ImageNet

# Lower-case file suffixes (leading dot included) used as the `exts` filter of
# get_filelist when collecting images and videos respectively.
IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv", ".m2ts")


def scan_recursively(root):
    """Yield an ``os.DirEntry`` for every file below ``root``.

    Sub-directories are descended into as soon as they are encountered.
    The progress message counts the immediate sub-directories of the directory
    currently being scanned, not the total across the whole tree.
    """
    num = 0
    # Use scandir as a context manager so the directory handle is closed
    # deterministically, even if the consumer stops iterating early.
    with os.scandir(root) as entries:
        for entry in entries:
            if entry.is_file():
                yield entry
            elif entry.is_dir():
                num += 1
                if num % 100 == 0:
                    print(f"Scanned {num} directories.")
                yield from scan_recursively(entry.path)


def get_filelist(file_path, exts=None):
    """Return the paths of all files below ``file_path``.

    Args:
        file_path: directory to scan recursively.
        exts: optional collection of lower-case suffixes (with leading dot);
            when given, only files whose lower-cased suffix is in it are kept.

    Returns:
        A list of file paths. The elapsed scan time is printed.
    """
    time_start = time.time()

    filelist = [
        entry.path
        for entry in scan_recursively(file_path)
        if entry.is_file() and (exts is None or os.path.splitext(entry.name)[-1].lower() in exts)
    ]

    time_end = time.time()
    print(f"Scanned {len(filelist)} files in {time_end - time_start:.2f} seconds.")
    return filelist


def split_by_capital(name):
    """Insert a space before every upper-case letter except the first character.

    Example: ``BoxingPunchingBag`` -> ``Boxing Punching Bag``.
    """
    pieces = []
    for position, char in enumerate(name):
        if position != 0 and char.isupper():
            pieces.append(" ")
        pieces.append(char)
    return "".join(pieces)


def process_imagenet(root, split):
    """Write ``imagenet_{split}.csv`` with a ``path`` and ``text`` column.

    ``text`` is the first entry of the class-name tuple that the torchvision
    ``ImageNet`` dataset reports for each sample's label. The CSV is written
    to the current working directory.
    """
    dataset = ImageNet(os.path.expanduser(root), split=split)
    samples = []
    for path, label in dataset.samples:
        samples.append((path, dataset.classes[label][0]))
    output = f"imagenet_{split}.csv"

    pd.DataFrame(samples, columns=["path", "text"]).to_csv(output, index=False)
    print(f"Saved {len(samples)} samples to {output}.")


def process_ucf101(root, split):
    """Write ``ucf101_{split}.csv`` with a ``path`` and ``text`` column.

    Every file below ``root/split`` becomes one row. Its text is the name of
    the file's parent directory with CamelCase words separated by spaces.
    The CSV is written to the current working directory.
    """
    split_dir = os.path.join(os.path.expanduser(root), split)
    video_lists = get_filelist(split_dir)
    # The parent directory name is the second-to-last "/"-separated component.
    samples = [(video, split_by_capital(video.split("/")[-2])) for video in video_lists]
    output = f"ucf101_{split}.csv"

    pd.DataFrame(samples, columns=["path", "text"]).to_csv(output, index=False)
    print(f"Saved {len(samples)} samples to {output}.")


def process_vidprom(root, info):
    """Write ``vidprom.csv`` pairing existing videos with their prompts.

    Args:
        root: directory scanned recursively for files.
        info: CSV file with ``uuid`` and ``prompt`` columns.

    A row is kept only when ``root/pika-{uuid}.mp4`` was found by the scan.
    """
    root = os.path.expanduser(root)
    video_set = set(get_filelist(root))

    # read info csv
    infos = pd.read_csv(info)

    def to_video_path(uuid):
        return os.path.join(root, f"pika-{uuid}.mp4")

    def was_scanned(candidate):
        return candidate in video_set

    abs_path = infos["uuid"].apply(to_video_path)
    is_exist = abs_path.apply(was_scanned)
    kept_paths = abs_path[is_exist]
    kept_prompts = infos["prompt"][is_exist]
    df = pd.DataFrame(dict(path=kept_paths, text=kept_prompts))
    df.to_csv("vidprom.csv", index=False)
    print(f"Saved {len(df)} samples to vidprom.csv.")


def process_general_images(root, output):
    """Write a CSV listing every image file found below ``root``.

    Args:
        root: directory to scan; ``~`` is expanded. If it does not exist the
            function returns without writing anything.
        output: path of the CSV to write, with columns ``id`` (file name
            without suffix) and ``path``.
    """
    root = os.path.expanduser(root)
    if not os.path.exists(root):
        return
    path_list = get_filelist(root, IMG_EXTENSIONS)
    fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]
    df = pd.DataFrame(dict(id=fname_list, path=path_list))

    # A bare file name such as "images.csv" has an empty dirname, and
    # os.makedirs("") raises FileNotFoundError, so only create real parents.
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output, index=False)
    print(f"Saved {len(df)} samples to {output}.")


def process_general_videos(root, output):
    """Write a CSV listing every video file found below ``root``.

    Args:
        root: directory to scan; ``~`` is expanded. If it does not exist the
            function returns without writing anything.
        output: path of the CSV to write, with columns ``path``, ``id`` (file
            name without suffix) and ``relpath`` (path relative to ``root``).
    """
    root = os.path.expanduser(root)
    if not os.path.exists(root):
        return
    path_list = get_filelist(root, VID_EXTENSIONS)
    # Remove duplicates while keeping scan order, so the CSV row order does
    # not depend on string hash randomization between runs.
    path_list = list(dict.fromkeys(path_list))
    fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]
    relpath_list = [os.path.relpath(x, root) for x in path_list]
    df = pd.DataFrame(dict(path=path_list, id=fname_list, relpath=relpath_list))

    # A bare file name such as "videos.csv" has an empty dirname, and
    # os.makedirs("") raises FileNotFoundError, so only create real parents.
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output, index=False)
    print(f"Saved {len(df)} samples to {output}.")


if __name__ == "__main__":
    # CLI entry point: choose a dataset-specific converter and run it on `root`.
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", type=str, choices=["imagenet", "ucf101", "vidprom", "image", "video"])
    parser.add_argument("root", type=str)
    parser.add_argument("--split", type=str, default="train")
    parser.add_argument("--info", type=str, default=None)
    # NOTE(review): --output is required for every dataset, but only the
    # "image" and "video" branches use it; the other converters write to fixed
    # file names in the current working directory.
    parser.add_argument("--output", type=str, default=None, required=True, help="Output path")
    args = parser.parse_args()

    if args.dataset == "imagenet":
        process_imagenet(args.root, args.split)
    elif args.dataset == "ucf101":
        process_ucf101(args.root, args.split)
    elif args.dataset == "vidprom":
        # NOTE(review): --info defaults to None and is passed straight to
        # pd.read_csv inside process_vidprom; it has to be supplied here.
        process_vidprom(args.root, args.info)
    elif args.dataset == "image":
        process_general_images(args.root, args.output)
    elif args.dataset == "video":
        process_general_videos(args.root, args.output)
    else:
        # Defensive only: argparse `choices` already rejects other values.
        raise ValueError("Invalid dataset")


================================================
FILE: Open-Sora/build/lib/tools/datasets/datautil.py
================================================
import argparse
import html
import json
import os
import random
import re
from functools import partial
from glob import glob

import cv2
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

from opensora.datasets.read_video import read_video

from .utils import IMG_EXTENSIONS

tqdm.pandas()

# pandarallel is an optional dependency. PANDA_USE_PARALLEL records whether it
# could be imported; apply() uses it to choose between parallel_apply and
# tqdm's progress_apply.
try:
    from pandarallel import pandarallel

    PANDA_USE_PARALLEL = True
except ImportError:
    PANDA_USE_PARALLEL = False


def apply(df, func, **kwargs):
    """Apply ``func`` over ``df`` and return the result.

    Uses ``parallel_apply`` when ``PANDA_USE_PARALLEL`` is set, otherwise
    ``progress_apply``. Extra keyword arguments are forwarded unchanged.
    """
    runner = df.parallel_apply if PANDA_USE_PARALLEL else df.progress_apply
    return runner(func, **kwargs)


# Columns kept by main() when --train-column is given; all others are dropped.
TRAIN_COLUMNS = ["path", "text", "num_frames", "fps", "height", "width", "aspect_ratio", "resolution", "text_len"]

# ======================================================
# --info
# ======================================================


def get_video_length(cap, method="header"):
    """Return the frame count of an opened ``cv2.VideoCapture``.

    Args:
        cap: the capture object to query.
        method: ``"header"`` reads ``CAP_PROP_FRAME_COUNT``; ``"set"`` moves
            the capture to relative position 1 and reads the resulting frame
            index (this changes the capture's position).
    """
    assert method in ["header", "set"]
    if method != "header":
        cap.set(cv2.CAP_PROP_POS_AVI_RATIO, 1)
        return int(cap.get(cv2.CAP_PROP_POS_FRAMES))
    return int(cap.get(cv2.CAP_PROP_FRAME_COUNT))


def get_info_old(path):
    """Return basic media information for ``path`` using OpenCV only.

    Returns:
        A tuple ``(num_frames, height, width, aspect_ratio, fps, hw)`` where
        ``hw`` is ``height * width`` and ``aspect_ratio`` is ``height / width``.
        Images report ``num_frames == 1`` and ``fps == nan``. On any failure
        the tuple ``(0, 0, 0, nan, nan, nan)`` is returned instead of raising.
    """
    try:
        ext = os.path.splitext(path)[1].lower()
        if ext in IMG_EXTENSIONS:
            im = cv2.imread(path)
            if im is None:
                return 0, 0, 0, np.nan, np.nan, np.nan
            height, width = im.shape[:2]
            num_frames, fps = 1, np.nan
        else:
            cap = cv2.VideoCapture(path)
            try:
                num_frames, height, width, fps = (
                    get_video_length(cap, method="header"),
                    int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
                    int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                    float(cap.get(cv2.CAP_PROP_FPS)),
                )
            finally:
                # Release the decoder handle instead of waiting for garbage collection.
                cap.release()
        hw = height * width
        aspect_ratio = height / width if width > 0 else np.nan
        return num_frames, height, width, aspect_ratio, fps, hw
    except Exception:
        # Best effort: unreadable files become an all-zero/NaN row. Unlike a
        # bare `except`, this lets KeyboardInterrupt/SystemExit propagate.
        return 0, 0, 0, np.nan, np.nan, np.nan


def get_info(path):
    """Return basic media information for an image or video at ``path``.

    Paths whose lower-cased suffix is in ``IMG_EXTENSIONS`` are handled by
    ``get_image_info``; everything else goes to ``get_video_info``.

    Returns:
        ``(num_frames, height, width, aspect_ratio, fps, hw)``, or
        ``(0, 0, 0, nan, nan, nan)`` if anything goes wrong.
    """
    try:
        ext = os.path.splitext(path)[1].lower()
        if ext in IMG_EXTENSIONS:
            return get_image_info(path)
        return get_video_info(path)
    except Exception:
        # Best effort, but let KeyboardInterrupt/SystemExit propagate.
        return 0, 0, 0, np.nan, np.nan, np.nan


def get_image_info(path, backend="pillow"):
    """Return basic information about the image at ``path``.

    Args:
        path: image file to inspect.
        backend: ``"pillow"`` (opens and converts the image to RGB) or
            ``"cv2"`` (``cv2.imread``).

    Returns:
        ``(num_frames, height, width, aspect_ratio, fps, hw)`` with
        ``num_frames == 1``, ``fps == nan``, ``aspect_ratio == height / width``
        and ``hw == height * width``. If the image cannot be read,
        ``(0, 0, 0, nan, nan, nan)`` is returned.

    Raises:
        ValueError: if ``backend`` is not one of the supported names.
    """
    if backend not in ("pillow", "cv2"):
        raise ValueError(f"Unsupported backend: {backend!r}")
    try:
        if backend == "pillow":
            with open(path, "rb") as f:
                img = Image.open(f)
                # Converting forces a full decode while the file is still open.
                img = img.convert("RGB")
            width, height = img.size
        else:
            im = cv2.imread(path)
            if im is None:
                return 0, 0, 0, np.nan, np.nan, np.nan
            height, width = im.shape[:2]
        num_frames, fps = 1, np.nan
        hw = height * width
        aspect_ratio = height / width if width > 0 else np.nan
        return num_frames, height, width, aspect_ratio, fps, hw
    except Exception:
        # Best effort: unreadable files become an all-zero/NaN row. Unlike a
        # bare `except`, this lets KeyboardInterrupt/SystemExit propagate.
        return 0, 0, 0, np.nan, np.nan, np.nan


def get_video_info(path, backend="torchvision"):
    """Return basic information about the video at ``path``.

    Args:
        path: video file to inspect.
        backend: ``"torchvision"`` (decodes via ``read_video``) or ``"cv2"``
            (reads the container properties through ``cv2.VideoCapture``).

    Returns:
        ``(num_frames, height, width, aspect_ratio, fps, hw)`` with
        ``aspect_ratio == height / width`` and ``hw == height * width``.
        If the video cannot be read, ``(0, 0, 0, nan, nan, nan)`` is returned.

    Raises:
        ValueError: if ``backend`` is not one of the supported names.
    """
    if backend not in ("torchvision", "cv2"):
        raise ValueError(f"Unsupported backend: {backend!r}")
    try:
        if backend == "torchvision":
            vframes, infos = read_video(path)
            # NOTE(review): indices 0/2/3 are used as frames/height/width, which
            # suggests a (T, C, H, W) layout -- confirm against read_video.
            num_frames, height, width = vframes.shape[0], vframes.shape[2], vframes.shape[3]
            fps = infos["video_fps"] if "video_fps" in infos else np.nan
        else:
            cap = cv2.VideoCapture(path)
            try:
                num_frames, height, width, fps = (
                    get_video_length(cap, method="header"),
                    int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
                    int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                    float(cap.get(cv2.CAP_PROP_FPS)),
                )
            finally:
                # Release the decoder handle instead of waiting for garbage collection.
                cap.release()
        hw = height * width
        aspect_ratio = height / width if width > 0 else np.nan
        return num_frames, height, width, aspect_ratio, fps, hw
    except Exception:
        # Best effort: unreadable files become an all-zero/NaN row. Unlike a
        # bare `except`, this lets KeyboardInterrupt/SystemExit propagate.
        return 0, 0, 0, np.nan, np.nan, np.nan


# ======================================================
# --refine-llm-caption
# ======================================================

# Boilerplate openings stripped from generated captions by
# remove_caption_prefix. Each entry is also tried in its lower-cased form.
LLAVA_PREFIX = [
    "The video shows",
    "The video captures",
    "The video features",
    "The video depicts",
    "The video presents",
    "The video features",
    "The video is ",
    "In the video,",
    "The image shows",
    "The image captures",
    "The image features",
    "The image depicts",
    "The image presents",
    "The image features",
    "The image is ",
    "The image portrays",
    "In the image,",
]


def remove_caption_prefix(caption):
    """Strip the first matching ``LLAVA_PREFIX`` opening from ``caption``.

    The remainder is whitespace-stripped and its first letter is capitalized.
    Captions that start with none of the prefixes are returned unchanged. A
    caption that consists only of a prefix yields the empty string.
    """
    for prefix in LLAVA_PREFIX:
        if caption.startswith(prefix) or caption.startswith(prefix.lower()):
            caption = caption[len(prefix) :].strip()
            # Guard against an empty remainder before indexing caption[0].
            if caption and caption[0].islower():
                caption = caption[0].upper() + caption[1:]
            return caption
    return caption


# ======================================================
# --merge-cmotion
# ======================================================

# Camera-motion label -> phrase appended to the caption by merge_cmotion.
CMOTION_TEXT = {
    "static": "static",
    "pan_right": "pan right",
    "pan_left": "pan left",
    "zoom_in": "zoom in",
    "zoom_out": "zoom out",
    "tilt_up": "tilt up",
    "tilt_down": "tilt down",
    # "pan/tilt": "The camera is panning.",
    # "dynamic": "The camera is moving.",
    # "unknown": None,
}
# Camera-motion label -> probability that merge_cmotion appends the phrase.
CMOTION_PROBS = {
    # hard-coded probabilities
    "static": 1.0,
    "zoom_in": 1.0,
    "zoom_out": 1.0,
    "pan_left": 1.0,
    "pan_right": 1.0,
    "tilt_up": 1.0,
    "tilt_down": 1.0,
    # "dynamic": 1.0,
    # "unknown": 0.0,
    # "pan/tilt": 1.0,
}


def merge_cmotion(caption, cmotion):
    """Append a ``Camera motion: ...`` sentence to ``caption``.

    Args:
        caption: the caption text.
        cmotion: camera-motion label, looked up in ``CMOTION_TEXT`` and
            ``CMOTION_PROBS``.

    Returns:
        The caption with the phrase appended with the label's probability.
        Labels without a phrase (unknown labels, or a ``None`` phrase) leave
        the caption unchanged instead of raising ``KeyError``.
    """
    text = CMOTION_TEXT.get(cmotion)
    # A label without a probability is never appended.
    prob = CMOTION_PROBS.get(cmotion, 0.0)
    if text is not None and random.random() < prob:
        caption = f"{caption} Camera motion: {text}."
    return caption


# ======================================================
# --lang
# ======================================================


def build_lang_detector(lang_to_detect):
    """Build a predicate that tells whether a caption is in ``lang_to_detect``.

    Only ``"en"`` is supported. The returned function takes a caption and
    returns True when the target language is among the first five entries of
    the detector's confidence values for that caption.
    """
    from lingua import Language, LanguageDetectorBuilder

    supported = dict(en=Language.ENGLISH)
    assert lang_to_detect in supported
    target = supported[lang_to_detect]
    detector = LanguageDetectorBuilder.from_all_spoken_languages().with_low_accuracy_mode().build()

    def detect_lang(caption):
        top_candidates = detector.compute_language_confidence_values(caption)[:5]
        top_languages = [candidate.language for candidate in top_candidates]
        return target in top_languages

    return detect_lang


# ======================================================
# --clean-caption
# ======================================================


def basic_clean(text):
    """Repair text with ``ftfy``, unescape HTML entities twice, and strip it."""
    import ftfy

    repaired = ftfy.fix_text(text)
    # Two rounds so that double-escaped entities (e.g. "&amp;amp;") are resolved.
    unescaped_once = html.unescape(repaired)
    unescaped_twice = html.unescape(unescaped_once)
    return unescaped_twice.strip()


# Matches a run of one or more "noisy" punctuation characters; clean_caption
# replaces each run with a single space. Written as one raw string so the
# backslashes reach the regex engine without relying on invalid string escape
# sequences such as "\)" (a SyntaxWarning on recent Python versions). The
# compiled pattern is unchanged.
BAD_PUNCT_REGEX = re.compile(r"[#®•©™&@·º½¾¿¡§~\)\(\]\[\}\{\|\\/\*]{1,}")


def clean_caption(caption):
    """Normalize a raw caption into lower-cased, de-noised text.

    The input is converted to ``str``, URL-unquoted and lower-cased, then a
    fixed sequence of regex substitutions removes URLs, HTML, @-handles, CJK
    ranges, ids, file names, marketing phrases and stray punctuation. The
    order of the substitutions matters: later patterns assume earlier ones
    have already run.

    Returns:
        The cleaned caption, stripped of leading/trailing whitespace.
    """
    import urllib.parse as ul

    from bs4 import BeautifulSoup

    caption = str(caption)
    caption = ul.unquote_plus(caption)
    caption = caption.strip().lower()
    caption = re.sub("<person>", "person", caption)
    # urls:
    caption = re.sub(
        r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    caption = re.sub(
        r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    # html:
    caption = BeautifulSoup(caption, features="html.parser").text

    # @<nickname>
    caption = re.sub(r"@[\w\d]+\b", "", caption)

    # Remove the following Unicode ranges entirely:
    # 31C0—31EF CJK Strokes
    # 31F0—31FF Katakana Phonetic Extensions
    # 3200—32FF Enclosed CJK Letters and Months
    # 3300—33FF CJK Compatibility
    # 3400—4DBF CJK Unified Ideographs Extension A
    # 4DC0—4DFF Yijing Hexagram Symbols
    # 4E00—9FFF CJK Unified Ideographs
    caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
    caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
    caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
    caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
    caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
    caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
    caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
    #######################################################

    # all types of dash --> "-"
    caption = re.sub(
        r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
        "-",
        caption,
    )

    # normalize quotation marks to a single standard
    caption = re.sub(r"[`´«»“”¨]", '"', caption)
    caption = re.sub(r"[‘’]", "'", caption)

    # &quot;
    caption = re.sub(r"&quot;?", "", caption)
    # &amp
    caption = re.sub(r"&amp", "", caption)

    # ip addresses:
    caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

    # article ids:
    caption = re.sub(r"\d:\d\d\s+$", "", caption)

    # literal backslash-n sequences (two characters), not real newlines
    caption = re.sub(r"\\n", " ", caption)

    # "#123"
    caption = re.sub(r"#\d{1,3}\b", "", caption)
    # "#12345.."
    caption = re.sub(r"#\d{5,}\b", "", caption)
    # "123456.."
    caption = re.sub(r"\b\d{6,}\b", "", caption)
    # filenames:
    caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

    # collapse repeated quotes and repeated dots
    caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
    caption = re.sub(r"[\.]{2,}", r" ", caption)  # runs of two or more dots -> space

    caption = re.sub(BAD_PUNCT_REGEX, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
    caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

    # this-is-my-cute-cat / this_is_my_cute_cat
    # (only when the caption contains more than three such separators)
    regex2 = re.compile(r"(?:\-|\_)")
    if len(re.findall(regex2, caption)) > 3:
        caption = re.sub(regex2, " ", caption)

    caption = basic_clean(caption)

    caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
    caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
    caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

    # marketing / boilerplate phrases
    caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
    caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
    caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
    caption = re.sub(r"\bpage\s+\d+\b", "", caption)

    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

    # dimensions such as "1920x1080" (latin x, cyrillic х or the × sign)
    caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

    caption = re.sub(r"\b\s+\:\s+", r": ", caption)
    caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
    caption = re.sub(r"\s+", " ", caption)

    # NOTE(review): the result of strip() is discarded, so this line is a no-op
    # and the anchored patterns below still see leading/trailing spaces.
    # Assigning the result would change the cleaned output -- confirm against
    # the training-time cleaning before changing it.
    caption.strip()

    caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
    caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
    caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
    caption = re.sub(r"^\.\S+$", "", caption)

    return caption.strip()


def text_preprocessing(text, use_text_preprocessing: bool = True):
    """Clean ``text`` for use as a training caption.

    With ``use_text_preprocessing`` the text goes through ``clean_caption``
    twice; otherwise it is only lower-cased and stripped.
    """
    if not use_text_preprocessing:
        return text.lower().strip()
    # The exact text cleaning as was in the training stage: two passes.
    first_pass = clean_caption(text)
    return clean_caption(first_pass)


# ======================================================
# load caption
# ======================================================


def load_caption(path, ext):
    """Load the caption stored in the JSON sidecar file next to ``path``.

    The sidecar is ``path`` with its final suffix replaced by ``.json`` and
    must contain a ``caption`` key.

    Args:
        path: path of the media file.
        ext: sidecar format; only ``"json"`` is handled.

    Returns:
        The caption, or ``""`` when ``ext`` is unsupported or the sidecar is
        missing, unreadable or has no ``caption`` key.
    """
    if ext not in ["json"]:
        return ""
    try:
        # splitext replaces only the final suffix; splitting on the first "."
        # breaks on paths such as "./data/v1.0/clip.mp4".
        json_path = os.path.splitext(path)[0] + ".json"
        with open(json_path, "r") as f:
            data = json.load(f)
        return data["caption"]
    except Exception:
        # Best effort by design, but let KeyboardInterrupt/SystemExit propagate.
        return ""


# ======================================================
# --score-to-text
# ======================================================

# A score is left out whenever random.random() is not above this threshold.
DROP_SCORE_PROB = 0.2


def score_to_text(data):
    """Append available quality scores to a sample's text.

    Args:
        data: mapping-like sample with a ``text`` entry and optional ``aes``
            and ``flow`` entries.

    Returns:
        The text, followed by ``[aesthetic score: X, motion score: Y]`` with
        whichever scores are present and were not randomly dropped. Without
        any kept score the text is returned unchanged.
    """
    text = data["text"]
    labelled_keys = (("aes", "aesthetic score"), ("flow", "motion score"))
    kept = []
    for key, label in labelled_keys:
        if key not in data:
            continue
        value = data[key]
        # One random draw per available score decides whether it is kept.
        if random.random() > DROP_SCORE_PROB:
            kept.append(f"{label}: {value:.1f}")
    if not kept:
        return text
    return f"{text} [{', '.join(kept)}]"


# ======================================================
# read & write
# ======================================================


def read_file(input_path):
    """Read one ``.csv`` or ``.parquet`` file into a pandas DataFrame.

    Raises:
        NotImplementedError: when the path has any other suffix.
    """
    readers = ((".csv", pd.read_csv), (".parquet", pd.read_parquet))
    for suffix, reader in readers:
        if input_path.endswith(suffix):
            return reader(input_path)
    raise NotImplementedError(f"Unsupported file format: {input_path}")


def save_file(data, output_path):
    """Write ``data`` to ``output_path`` as CSV or Parquet, without the index.

    The parent directory is created first when it does not exist yet.

    Raises:
        NotImplementedError: when the path ends in neither ``.csv`` nor
            ``.parquet`` (the parent directory may already have been created).
    """
    parent = os.path.dirname(output_path)
    if parent != "" and not os.path.exists(parent):
        os.makedirs(parent)
    if output_path.endswith(".csv"):
        return data.to_csv(output_path, index=False)
    if output_path.endswith(".parquet"):
        return data.to_parquet(output_path, index=False)
    raise NotImplementedError(f"Unsupported file format: {output_path}")


def read_data(input_paths):
    """Load and concatenate all datasets matching the given glob patterns.

    Args:
        input_paths: iterable of paths or glob patterns.

    Returns:
        ``(data, input_name)``: the concatenated DataFrame (index reset) and a
        name built from the files' base names (text before the first dot),
        joined by ``+``.

    Exits the process when no file could be loaded.
    """
    input_list = []
    for pattern in input_paths:
        input_list.extend(glob(pattern))
    print("Input files:", input_list)

    frames = []
    input_name = ""
    last_index = len(input_list) - 1
    for i, input_path in enumerate(input_list):
        if not os.path.exists(input_path):
            continue
        frame = read_file(input_path)
        frames.append(frame)
        input_name += os.path.basename(input_path).split(".")[0]
        if i != last_index:
            input_name += "+"
        print(f"Loaded {len(frame)} samples from '{input_path}'.")

    if len(frames) == 0:
        print("No samples to process. Exit.")
        exit()
    data = pd.concat(frames, ignore_index=True, sort=False)
    print(f"Total number of samples: {len(data)}")
    return data, input_name


# ======================================================
# main
# ======================================================
# To add a new method, register it in the main, parse_args, and get_output_path functions, and update the doc at /tools/datasets/README.md#documentation


def main(args):
    """Run the dataset-processing pipeline selected by the command-line flags.

    The stages run in a fixed order: read the inputs, apply set operations
    (difference / intersection), gather file information, filter, transform
    paths and captions, sort, filter again, shuffle / truncate, and finally
    write the result (optionally split into shards).
    """
    # reading data
    data, input_name = read_data(args.input)

    # make difference
    # Drop every row whose path also appears in the difference csv.
    if args.difference is not None:
        data_diff = pd.read_csv(args.difference)
        print(f"Difference csv contains {len(data_diff)} samples.")
        data = data[~data["path"].isin(data_diff["path"])]
        input_name += f"-{os.path.basename(args.difference).split('.')[0]}"
        print(f"Filtered number of samples: {len(data)}.")

    # make intersection
    # Inner-join on "path", bringing in only the columns `data` lacks.
    if args.intersection is not None:
        data_new = pd.read_csv(args.intersection)
        print(f"Intersection csv contains {len(data_new)} samples.")
        cols_to_use = data_new.columns.difference(data.columns)

        col_on = "path"
        # if 'id' in data.columns and 'id' in data_new.columns:
        #     col_on = 'id'
        cols_to_use = cols_to_use.insert(0, col_on)
        data = pd.merge(data, data_new[cols_to_use], on=col_on, how="inner")
        print(f"Intersection number of samples: {len(data)}.")

    # get output path
    output_path = get_output_path(args, input_name)

    # preparation
    # detect_lang and tokenizer are only defined when their flag is set.
    if args.lang is not None:
        detect_lang = build_lang_detector(args.lang)
    if args.count_num_token == "t5":
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained("DeepFloyd/t5-v1_1-xxl")

    # IO-related
    if args.load_caption is not None:
        assert "path" in data.columns
        data["text"] = apply(data["path"], load_caption, ext=args.load_caption)
    if args.info:
        info = apply(data["path"], get_info)
        # get_info returns (num_frames, height, width, aspect_ratio, fps, hw);
        # hw is stored in the "resolution" column.
        (
            data["num_frames"],
            data["height"],
            data["width"],
            data["aspect_ratio"],
            data["fps"],
            data["resolution"],
        ) = zip(*info)
    if args.video_info:
        info = apply(data["path"], get_video_info)
        (
            data["num_frames"],
            data["height"],
            data["width"],
            data["aspect_ratio"],
            data["fps"],
            data["resolution"],
        ) = zip(*info)
    if args.ext:
        # Keep only rows whose file exists on disk.
        assert "path" in data.columns
        data = data[apply(data["path"], os.path.exists)]

    # filtering
    if args.remove_url:
        assert "text" in data.columns
        data = data[~data["text"].str.contains(r"(?P<url>https?://[^\s]+)", regex=True)]
    if args.lang is not None:
        assert "text" in data.columns
        data = data[data["text"].progress_apply(detect_lang)]  # cannot parallelize
    if args.remove_empty_path:
        assert "path" in data.columns
        data = data[data["path"].str.len() > 0]
        data = data[~data["path"].isna()]
    if args.remove_empty_caption:
        assert "text" in data.columns
        data = data[data["text"].str.len() > 0]
        data = data[~data["text"].isna()]
    if args.remove_path_duplication:
        assert "path" in data.columns
        data = data.drop_duplicates(subset=["path"])
    if args.path_subset:
        data = data[data["path"].str.contains(args.path_subset)]

    # processing
    if args.relpath is not None:
        data["path"] = apply(data["path"], lambda x: os.path.relpath(x, args.relpath))
    if args.abspath is not None:
        data["path"] = apply(data["path"], lambda x: os.path.join(args.abspath, x))
    if args.path_to_id:
        data["id"] = apply(data["path"], lambda x: os.path.splitext(os.path.basename(x))[0])
    if args.merge_cmotion:
        data["text"] = apply(data, lambda x: merge_cmotion(x["text"], x["cmotion"]), axis=1)
    if args.refine_llm_caption:
        assert "text" in data.columns
        data["text"] = apply(data["text"], remove_caption_prefix)
    if args.append_text is not None:
        assert "text" in data.columns
        data["text"] = data["text"] + args.append_text
    if args.score_to_text:
        data["text"] = apply(data, score_to_text, axis=1)
    if args.clean_caption:
        assert "text" in data.columns
        data["text"] = apply(
            data["text"],
            partial(text_preprocessing, use_text_preprocessing=True),
        )
    if args.count_num_token is not None:
        # NOTE(review): `tokenizer` is only bound above when the value is "t5";
        # any other non-None value would raise NameError here -- confirm that
        # the argument parser restricts the choices.
        assert "text" in data.columns
        data["text_len"] = apply(data["text"], lambda x: len(tokenizer(x)["input_ids"]))
    if args.update_text is not None:
        # Overwrite "text" for paths that also appear in the update csv.
        data_new = pd.read_csv(args.update_text)
        num_updated = data.path.isin(data_new.path).sum()
        print(f"Number of updated samples: {num_updated}.")
        data = data.set_index("path")
        data_new = data_new[["path", "text"]].set_index("path")
        data.update(data_new)
        data = data.reset_index()

    # sort
    if args.sort is not None:
        data = data.sort_values(by=args.sort, ascending=False)
    if args.sort_ascending is not None:
        data = data.sort_values(by=args.sort_ascending, ascending=True)

    # filtering
    if args.filesize:
        # File size in MiB (bytes / 1024 / 1024).
        assert "path" in data.columns
        data["filesize"] = apply(data["path"], lambda x: os.stat(x).st_size / 1024 / 1024)
    if args.fsmax is not None:
        # Requires a "filesize" column: from --filesize or already in the input.
        assert "filesize" in data.columns
        data = data[data["filesize"] <= args.fsmax]
    if args.remove_empty_caption:
        # Repeated after the processing stage, which may have emptied captions.
        assert "text" in data.columns
        data = data[data["text"].str.len() > 0]
        data = data[~data["text"].isna()]
    if args.fmin is not None:
        assert "num_frames" in data.columns
        data = data[data["num_frames"] >= args.fmin]
    if args.fmax is not None:
        assert "num_frames" in data.columns
        data = data[data["num_frames"] <= args.fmax]
    if args.fpsmax is not None:
        # Rows with NaN fps are kept.
        assert "fps" in data.columns
        data = data[(data["fps"] <= args.fpsmax) | np.isnan(data["fps"])]
    if args.hwmax is not None:
        # Derive resolution from height * width when the column is missing.
        if "resolution" not in data.columns:
            height = data["height"]
            width = data["width"]
            data["resolution"] = height * width
        data = data[data["resolution"] <= args.hwmax]
    if args.aesmin is not None:
        assert "aes" in data.columns
        data = data[data["aes"] >= args.aesmin]
    if args.matchmin is not None:
        assert "match" in data.columns
        data = data[data["match"] >= args.matchmin]
    if args.flowmin is not None:
        assert "flow" in data.columns
        data = data[data["flow"] >= args.flowmin]
    if args.remove_text_duplication:
        data = data.drop_duplicates(subset=["text"], keep="first")
    if args.img_only:
        data = data[data["path"].str.lower().str.endswith(IMG_EXTENSIONS)]
    if args.vid_only:
        # Anything without an image suffix counts as a video.
        data = data[~data["path"].str.lower().str.endswith(IMG_EXTENSIONS)]

    # process data
    if args.shuffle:
        data = data.sample(frac=1).reset_index(drop=True)  # shuffle
    if args.head is not None:
        data = data.head(args.head)

    # train columns
    if args.train_column:
        all_columns = data.columns
        columns_to_drop = all_columns.difference(TRAIN_COLUMNS)
        data = data.drop(columns=columns_to_drop)

    print(f"Filtered number of samples: {len(data)}.")

    # shard data
    if args.shard is not None:
        sharded_data = np.array_split(data, args.shard)
        for i in range(args.shard):
            # Insert "_{i}" before the final suffix: out.csv -> out_0.csv, ...
            output_path_part = output_path.split(".")
            output_path_s = ".".join(output_path_part[:-1]) + f"_{i}." + output_path_part[-1]
            save_file(sharded_data[i], output_path_s)
            print(f"Saved {len(sharded_data[i])} samples to {output_path_s}.")
    else:
        save_file(data, output_path)
        print(f"Saved {len(data)} samples to {output_path}.")


def parse_args():
    """Declare every command-line option of the dataset utility and parse ``sys.argv``.

    Returns:
        argparse.Namespace: the parsed options.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument  # short alias; every option below is registered on `parser`

    # general
    add("input", type=str, nargs="+", help="path to the input dataset")
    add("--output", type=str, default=None, help="output path")
    add("--format", type=str, default="csv", help="output format", choices=["csv", "parquet"])
    add("--disable-parallel", action="store_true", help="disable parallel processing")
    add("--num-workers", type=int, default=None, help="number of workers")
    add("--seed", type=int, default=42, help="random seed")

    # special case
    add("--shard", type=int, default=None, help="shard the dataset")
    add("--sort", type=str, default=None, help="sort by column")
    add("--sort-ascending", type=str, default=None, help="sort by column (ascending order)")
    add("--difference", type=str, default=None, help="get difference from the dataset")
    add("--intersection", type=str, default=None, help="keep the paths in csv from the dataset and merge columns")
    add("--train-column", action="store_true", help="only keep the train column")

    # IO-related
    add("--info", action="store_true", help="get the basic information of each video and image")
    add("--video-info", action="store_true", help="get the basic information of each video")
    add("--ext", action="store_true", help="check if the file exists")
    add("--load-caption", type=str, default=None, choices=["json", "txt"], help="load the caption from json or txt")

    # path processing
    add("--relpath", type=str, default=None, help="modify the path to relative path by root given")
    add("--abspath", type=str, default=None, help="modify the path to absolute path by root given")
    add("--path-to-id", action="store_true", help="add id based on path")
    add("--path-subset", type=str, default=None, help="extract a subset data containing the given `path-subset` value")
    # empty paths are caused by transform, which cannot read the path
    add("--remove-empty-path", action="store_true", help="remove rows with empty path")

    # caption filtering
    add("--remove-empty-caption", action="store_true", help="remove rows with empty caption")
    add("--remove-url", action="store_true", help="remove rows with url in caption")
    add("--lang", type=str, default=None, help="remove rows with other language")
    add("--remove-path-duplication", action="store_true", help="remove rows with duplicated path")
    add("--remove-text-duplication", action="store_true", help="remove rows with duplicated caption")

    # caption processing
    add("--refine-llm-caption", action="store_true", help="modify the caption generated by LLM")
    add("--clean-caption", action="store_true", help="modify the caption according to T5 pipeline to suit training")
    add("--merge-cmotion", action="store_true", help="merge the camera motion to the caption")
    add("--count-num-token", type=str, choices=["t5"], default=None, help="Count the number of tokens in the caption")
    add("--append-text", type=str, default=None, help="append text to the caption")
    add("--score-to-text", action="store_true", help="convert score to text")
    add("--update-text", type=str, default=None, help="update the text with the given text")

    # score filtering
    add("--filesize", action="store_true", help="get the filesize of each video and image in MB")
    add("--fsmax", type=int, default=None, help="filter the dataset by maximum filesize")
    add("--fmin", type=int, default=None, help="filter the dataset by minimum number of frames")
    add("--fmax", type=int, default=None, help="filter the dataset by maximum number of frames")
    add("--hwmax", type=int, default=None, help="filter the dataset by maximum resolution")
    add("--aesmin", type=float, default=None, help="filter the dataset by minimum aes score")
    add("--matchmin", type=float, default=None, help="filter the dataset by minimum match score")
    add("--flowmin", type=float, default=None, help="filter the dataset by minimum flow score")
    add("--fpsmax", type=float, default=None, help="filter the dataset by maximum fps")
    add("--img-only", action="store_true", help="only keep the image data")
    add("--vid-only", action="store_true", help="only keep the video data")

    # data processing
    add("--shuffle", default=False, action="store_true", help="shuffle the dataset")
    add("--head", type=int, default=None, help="return the first n rows of data")

    return parser.parse_args()


def get_output_path(args, input_name):
    """Return the output file path for a run.

    An explicit ``args.output`` wins. Otherwise the name is ``input_name``
    followed by one suffix per enabled option (in a fixed order), placed in the
    directory of the first input file and given the ``args.format`` extension.
    """
    if args.output is not None:
        return args.output
    dir_path = os.path.dirname(args.input[0])
    suffixes = []

    # sort: the two sort options are mutually exclusive
    if args.sort is not None:
        assert args.sort_ascending is None
        suffixes.append("_sort")
    if args.sort_ascending is not None:
        assert args.sort is None
        suffixes.append("_sort")

    # (enabled, suffix) pairs; the order here is the order of suffixes in the file name
    tagged_options = (
        # IO-related
        # for IO-related, the function must be wrapped in try-except
        (args.info, "_info"),
        (args.video_info, "_vinfo"),
        (args.ext, "_ext"),
        (args.load_caption, f"_load{args.load_caption}"),
        # path processing
        (args.relpath is not None, "_relpath"),
        (args.abspath is not None, "_abspath"),
        (args.remove_empty_path, "_noemptypath"),
        # caption filtering
        (args.remove_empty_caption, "_noempty"),
        (args.remove_url, "_nourl"),
        (args.lang is not None, f"_{args.lang}"),
        (args.remove_path_duplication, "_noduppath"),
        (args.remove_text_duplication, "_noduptext"),
        (args.path_subset, "_subset"),
        # caption processing
        (args.refine_llm_caption, "_llm"),
        (args.clean_caption, "_clean"),
        (args.merge_cmotion, "_cmcaption"),
        (args.count_num_token, "_ntoken"),
        (args.append_text is not None, "_appendtext"),
        (args.score_to_text, "_score2text"),
        (args.update_text is not None, "_update"),
        # score filtering
        (args.filesize, "_filesize"),
        (args.fsmax is not None, f"_fsmax{args.fsmax}"),
        (args.fmin is not None, f"_fmin{args.fmin}"),
        (args.fmax is not None, f"_fmax{args.fmax}"),
        (args.fpsmax is not None, f"_fpsmax{args.fpsmax}"),
        (args.hwmax is not None, f"_hwmax{args.hwmax}"),
        (args.aesmin is not None, f"_aesmin{args.aesmin}"),
        (args.matchmin is not None, f"_matchmin{args.matchmin}"),
        (args.flowmin is not None, f"_flowmin{args.flowmin}"),
        (args.img_only, "_img"),
        (args.vid_only, "_vid"),
    )
    suffixes.extend(tag for enabled, tag in tagged_options if enabled)

    # processing (the seed is only read when shuffling is requested)
    if args.shuffle:
        suffixes.append(f"_shuffled_seed{args.seed}")
    if args.head is not None:
        suffixes.append(f"_first_{args.head}_data")

    name = input_name + "".join(suffixes)
    return os.path.join(dir_path, f"{name}.{args.format}")


if __name__ == "__main__":
    args = parse_args()

    # --disable-parallel overrides the module-level switch
    if args.disable_parallel:
        PANDA_USE_PARALLEL = False
    if PANDA_USE_PARALLEL:
        init_kwargs = {"progress_bar": True}
        # only pass nb_workers when the user asked for a specific worker count
        if args.num_workers is not None:
            init_kwargs["nb_workers"] = args.num_workers
        pandarallel.initialize(**init_kwargs)

    # seed both RNGs before running the pipeline
    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)

    main(args)


================================================
FILE: Open-Sora/build/lib/tools/datasets/filter_panda10m.py
================================================
# TODO: remove this file before releasing

import argparse
import html
import os
import re

import pandas as pd
from tqdm import tqdm

# Enable the tqdm integration for pandas (`progress_apply`, used by `apply` below).
tqdm.pandas()

# pandarallel is optional: `pandas_has_parallel` records whether it could be imported
# (and initialised), and `apply` picks `parallel_apply` or `progress_apply` accordingly.
try:
    from pandarallel import pandarallel

    pandarallel.initialize(progress_bar=True)
    pandas_has_parallel = True
except ImportError:
    pandas_has_parallel = False


def apply(df, func, **kwargs):
    """Run ``func`` over ``df`` with ``parallel_apply`` when pandarallel is available, else ``progress_apply``.

    Extra keyword arguments are forwarded unchanged to the chosen apply method.
    """
    runner = df.parallel_apply if pandas_has_parallel else df.progress_apply
    return runner(func, **kwargs)


def basic_clean(text):
    """Fix the text with ftfy, unescape HTML entities twice, and strip surrounding whitespace."""
    import ftfy

    fixed = ftfy.fix_text(text)
    # two passes, so doubly-escaped entities are unescaped as well
    unescaped = html.unescape(html.unescape(fixed))
    return unescaped.strip()


# One or more consecutive "bad" punctuation / symbol characters; `clean_caption`
# replaces each such run with a single space.
# Written as one raw string: the previous concatenation of non-raw pieces such as
# "\)" and "\|" relied on invalid string escapes (a warning on recent Python
# versions). The resulting pattern text is unchanged.
BAD_PUNCT_REGEX = re.compile(r"[#®•©™&@·º½¾¿¡§~\)\(\]\[\}\{\|\\/\*]{1,}")  # noqa


def clean_caption(caption):
    """Normalize a caption through a fixed, order-dependent sequence of clean-up steps.

    The input is converted to ``str``, URL-unquoted, stripped and lower-cased; then
    URLs, HTML markup, ``@nick`` mentions, several CJK code-point ranges, ids,
    file names, "bad" punctuation runs and a few marketing phrases are removed or
    replaced, dashes and quotes are unified, and whitespace is collapsed.

    Args:
        caption: the raw caption (any object; it is passed through ``str``).

    Returns:
        str: the cleaned caption with surrounding whitespace stripped.
    """
    import urllib.parse as ul

    from bs4 import BeautifulSoup

    caption = str(caption)
    caption = ul.unquote_plus(caption)
    caption = caption.strip().lower()
    caption = re.sub("<person>", "person", caption)
    # urls:
    caption = re.sub(
        r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    caption = re.sub(
        r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    # html: keep only the text content of any markup
    caption = BeautifulSoup(caption, features="html.parser").text

    # @<nickname>
    caption = re.sub(r"@[\w\d]+\b", "", caption)

    # Remove characters in the following Unicode blocks:
    # 31C0—31EF CJK Strokes
    # 31F0—31FF Katakana Phonetic Extensions
    # 3200—32FF Enclosed CJK Letters and Months
    # 3300—33FF CJK Compatibility
    # 3400—4DBF CJK Unified Ideographs Extension A
    # 4DC0—4DFF Yijing Hexagram Symbols
    # 4E00—9FFF CJK Unified Ideographs
    caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
    caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
    caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
    caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
    caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
    caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
    caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
    #######################################################

    # all types of dash --> "-"
    caption = re.sub(
        r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
        "-",
        caption,
    )

    # normalize quotes to a single standard
    caption = re.sub(r"[`´«»“”¨]", '"', caption)
    caption = re.sub(r"[‘’]", "'", caption)

    # &quot;
    caption = re.sub(r"&quot;?", "", caption)
    # &amp
    caption = re.sub(r"&amp", "", caption)

    # ip addresses:
    caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

    # article ids:
    caption = re.sub(r"\d:\d\d\s+$", "", caption)

    # literal backslash-n sequences ("\n" written out as two characters)
    caption = re.sub(r"\\n", " ", caption)

    # "#123"
    caption = re.sub(r"#\d{1,3}\b", "", caption)
    # "#12345.."
    caption = re.sub(r"#\d{5,}\b", "", caption)
    # "123456.."
    caption = re.sub(r"\b\d{6,}\b", "", caption)
    # filenames:
    caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

    # collapse repeated quotes and runs of dots
    caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
    caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""

    caption = re.sub(BAD_PUNCT_REGEX, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
    caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

    # this-is-my-cute-cat / this_is_my_cute_cat
    # (dashes/underscores become spaces only when there are more than three of them)
    regex2 = re.compile(r"(?:\-|\_)")
    if len(re.findall(regex2, caption)) > 3:
        caption = re.sub(regex2, " ", caption)

    caption = basic_clean(caption)

    caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
    caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
    caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

    caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
    caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
    caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
    caption = re.sub(r"\bpage\s+\d+\b", "", caption)

    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

    # dimensions such as "1920x1080" (latin x, cyrillic х or the × sign)
    caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

    caption = re.sub(r"\b\s+\:\s+", r": ", caption)
    caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
    caption = re.sub(r"\s+", " ", caption)

    # NOTE(review): the result of this strip() is discarded, so the anchored
    # substitutions below still see any leading/trailing space. It looks like
    # `caption = caption.strip()` was intended — confirm before changing, since
    # that would alter the cleaned text for some captions.
    caption.strip()

    # drop wrapping quotes and stray leading/trailing punctuation
    caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
    caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
    caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
    caption = re.sub(r"^\.\S+$", "", caption)

    return caption.strip()


def get_10m_set():
    """Collect the set of cleaned captions from the Panda-70M 10M training metadata.

    Reads the hard-coded metadata CSV, parses each row's ``caption`` cell (a
    stringified Python list), cleans every caption with ``clean_caption`` and
    gathers all cleaned captions into one set.

    Returns:
        set: every cleaned caption found in the 10M metadata.
    """
    # Local import, like the other function-scope imports in this file.
    # literal_eval only accepts Python literals, so a crafted CSV cell cannot run
    # arbitrary code the way the previous eval() call could.
    import ast

    meta_path_10m = "/mnt/hdd/data/Panda-70M/raw/meta/train/panda70m_training_10m.csv"
    meta_10m = pd.read_csv(meta_path_10m)

    def process_single_caption(row):
        text_list = ast.literal_eval(row["caption"])
        clean_list = [clean_caption(x) for x in text_list]
        # returned as a string and parsed back below, as before
        return str(clean_list)

    ret = apply(meta_10m, process_single_caption, axis=1)
    print("==> text processed.")

    # grow the set directly instead of first concatenating one huge list
    text_set = set()
    for x in ret:
        text_set.update(ast.literal_eval(x))

    print(f"==> Loaded meta_10m from '{meta_path_10m}'")
    return text_set


def filter_panda10m_text(meta_path, text_set):
    """Keep only the rows of a metadata CSV whose ``text`` value is in ``text_set``.

    The filtered table is written next to the input file as
    ``<name>_filter-10m<ext>``.

    Args:
        meta_path (str): path to the metadata CSV; it must have a ``text`` column.
        text_set (set): the captions to keep.
    """
    meta = pd.read_csv(meta_path)
    # Vectorised membership test: the same mask as evaluating
    # `row["text"] in text_set` for each row, without a Python-level (or
    # multi-process) apply that has to visit every row and ship the set around.
    keep = meta["text"].isin(text_set)

    meta = meta[keep]
    wo_ext, ext = os.path.splitext(meta_path)
    out_path = f"{wo_ext}_filter-10m{ext}"
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) saved to '{out_path}'.")


def filter_panda10m_timestamp(meta_path):
    """Keep only the rows of a metadata CSV whose (video id, timestamp) is in the 10M metadata.

    The video id of a row is the file name of its ``path`` up to the last
    underscore. A row is kept when that id is a ``videoID`` of the hard-coded 10M
    metadata CSV and the row's ``timestamp`` equals ``str(tuple(...))`` of one of
    that video's timestamps. The result is written next to the input file as
    ``<name>_filter-10m<ext>``.

    Args:
        meta_path (str): path to the metadata CSV; it must have ``path`` and
            ``timestamp`` columns.
    """
    # literal_eval only accepts Python literals, so a crafted CSV cell cannot run
    # arbitrary code the way the previous eval() call could.
    import ast

    meta_path_10m = "/mnt/hdd/data/Panda-70M/raw/meta/train/panda70m_training_10m.csv"
    meta_10m = pd.read_csv(meta_path_10m)

    # video id -> set of stringified timestamp tuples (set for O(1) membership tests)
    id2t = {}
    columns = zip(meta_10m["videoID"], meta_10m["timestamp"])
    for video_id, raw_timestamp in tqdm(columns, total=len(meta_10m)):
        id2t[video_id] = {str(tuple(x)) for x in ast.literal_eval(raw_timestamp)}

    print(f"==> Loaded meta_10m from '{meta_path_10m}'")

    def process_single_row(row):
        path = row["path"]
        t = row["timestamp"]
        fname = os.path.basename(path)
        video_id = fname[: fname.rindex("_")]
        if video_id not in id2t:
            return False
        return t in id2t[video_id]

    meta = pd.read_csv(meta_path)
    ret = apply(meta, process_single_row, axis=1)

    meta = meta[ret]
    wo_ext, ext = os.path.splitext(meta_path)
    out_path = f"{wo_ext}_filter-10m{ext}"
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) saved to '{out_path}'.")


def parse_args():
    """Parse the command-line options: ``--meta_path`` (one or more paths) and ``--num_workers``."""
    parser = argparse.ArgumentParser()
    option_specs = (
        ("--meta_path", {"type": str, "nargs": "+"}),
        ("--num_workers", {"default": 5, "type": int}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()

    # build the reference caption set once, then filter every metadata file against it
    text_set = get_10m_set()
    for meta_path in args.meta_path:
        filter_panda10m_text(meta_path, text_set)


================================================
FILE: Open-Sora/build/lib/tools/datasets/split.py
================================================
import argparse
from typing import List

import pandas as pd
from mmengine.config import Config

from opensora.datasets.bucket import Bucket


def split_by_bucket(
    bucket: Bucket,
    input_files: List[str],
    output_path: str,
    limit: int,
    frame_interval: int,
):
    """Select at most ``limit`` rows per bucket from the input CSVs and save them.

    Rows are read file by file, in order. Each row is assigned a bucket with
    ``bucket.get_bucket_id(num_frames, height, width, frame_interval)``; rows
    without a bucket, or whose bucket already holds ``limit`` rows, are skipped.
    Reading stops as soon as ``len(bucket) * limit`` rows have been selected.

    Args:
        bucket: bucket object providing ``ar_criteria``, ``get_bucket_id`` and ``len()``.
        input_files: CSV paths with ``num_frames``, ``height`` and ``width``
            columns (the files are assumed to share the same columns).
        output_path: where the selected rows are written as CSV.
        limit: maximum number of rows per bucket.
        frame_interval: forwarded to ``bucket.get_bucket_id``.
    """
    print(f"Split {len(input_files)} files into {len(bucket)} buckets")
    total_limit = len(bucket) * limit
    # one counter per (hw_id, t_id, ar_id) bucket
    bucket_cnt = {
        (hw_id, t_id, ar_id): 0
        for hw_id, d in bucket.ar_criteria.items()
        for t_id, v in d.items()
        for ar_id in v
    }

    # Rows are collected in a list and turned into a DataFrame once at the end;
    # concatenating a DataFrame per selected row is quadratic in the row count.
    columns = None
    selected_rows = []
    for path in input_files:
        df = pd.read_csv(path)
        if columns is None:
            columns = df.columns
        for _, row in df.iterrows():
            t, h, w = row["num_frames"], row["height"], row["width"]
            bucket_id = bucket.get_bucket_id(t, h, w, frame_interval)
            if bucket_id is None:
                continue
            if bucket_cnt[bucket_id] < limit:
                bucket_cnt[bucket_id] += 1
                selected_rows.append(row)
                if len(selected_rows) >= total_limit:
                    break
        if len(selected_rows) >= total_limit:
            break

    if selected_rows:
        output_df = pd.DataFrame(selected_rows).reset_index(drop=True)
    else:
        # nothing selected (or no input files at all): still write a header-only CSV
        output_df = pd.DataFrame(columns=columns)

    assert len(output_df) <= total_limit
    if len(output_df) == total_limit:
        print(f"All buckets are full ({total_limit} samples)")
    else:
        print(f"Only {len(output_df)} files are used")
    output_df.to_csv(output_path, index=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str, nargs="+")
    parser.add_argument("-o", "--output", required=True)
    parser.add_argument("-c", "--config", required=True)
    parser.add_argument("-l", "--limit", default=200, type=int)
    args = parser.parse_args()
    assert args.limit > 0

    cfg = Config.fromfile(args.config)
    bucket_config = cfg.bucket_config
    # rewrite bucket_config: every positive first element of a (p, bs) entry becomes 1.0
    for per_frames in bucket_config.values():
        for frames, (p, bs) in per_frames.items():
            per_frames[frames] = (1.0 if p > 0.0 else p, bs)

    bucket = Bucket(bucket_config)
    split_by_bucket(bucket, args.input, args.output, args.limit, cfg.dataset.frame_interval)


================================================
FILE: Open-Sora/build/lib/tools/datasets/transform.py
================================================
import argparse
import os
import random

import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

from .utils import IMG_EXTENSIONS, extract_frames

# Enable the tqdm integration for pandas (`progress_apply`, used by `apply` below).
tqdm.pandas()

# pandarallel is optional: `pandas_has_parallel` records whether it could be imported
# (and initialised), and `apply` picks `parallel_apply` or `progress_apply` accordingly.
try:
    from pandarallel import pandarallel

    pandarallel.initialize(progress_bar=True)
    pandas_has_parallel = True
except ImportError:
    pandas_has_parallel = False


def apply(df, func, **kwargs):
    """Run ``func`` over ``df`` with ``parallel_apply`` when pandarallel is enabled, else ``progress_apply``.

    Extra keyword arguments are forwarded unchanged to the chosen apply method.
    """
    runner = df.parallel_apply if pandas_has_parallel else df.progress_apply
    return runner(func, **kwargs)


def get_new_path(path, input_dir, output):
    """Map ``path`` (taken relative to ``input_dir``) under ``output`` and create its parent directory.

    Returns:
        str: the mirrored path below ``output``.
    """
    relative = os.path.relpath(path, input_dir)
    path_new = os.path.join(output, relative)
    parent_dir = os.path.dirname(path_new)
    os.makedirs(parent_dir, exist_ok=True)
    return path_new


def resize(path, length, input_dir, output):
    """Copy an image to its mirrored output location, downscaling it when both sides exceed ``length``.

    When ``min(h, w) > length`` the longer side is set to ``length`` and the other
    side is scaled proportionally; otherwise the image is written unchanged.

    Returns:
        str: the new path, or ``""`` when the image could not be read.
    """
    path_new = get_new_path(path, input_dir, output)
    ext = os.path.splitext(path)[1].lower()
    assert ext in IMG_EXTENSIONS

    img = cv2.imread(path)
    if img is None:
        # unreadable image: signal it with an empty path
        return ""

    h, w = img.shape[:2]
    if min(h, w) > length:
        if h > w:
            new_h = length
            new_w = int(w * new_h / h)
        else:
            new_w = length
            new_h = int(h * new_w / w)
        img = cv2.resize(img, (new_w, new_h))
    cv2.imwrite(path_new, img)
    return path_new


def rand_crop(path, input_dir, output):
    """Write one randomly chosen quadrant of an image to its mirrored output location.

    Returns:
        str: the new path, or ``""`` when the image could not be read.
    """
    ext = os.path.splitext(path)[1].lower()
    path_new = get_new_path(path, input_dir, output)
    assert ext in IMG_EXTENSIONS

    img = cv2.imread(path)
    if img is None:
        # unreadable image: signal it with an empty path
        return ""

    # first two entries of the shape are the sizes of the two axes being sliced
    rows, cols, _ = img.shape
    half_rows, half_cols = rows // 2, cols // 2
    pos = random.randint(0, 3)
    # pos 0/2 take the first half of axis 0, pos 0/1 take the first half of axis 1
    row_slice = slice(None, half_rows) if pos in (0, 2) else slice(half_rows, None)
    col_slice = slice(None, half_cols) if pos in (0, 1) else slice(half_cols, None)
    img_cropped = img[row_slice, col_slice]
    cv2.imwrite(path_new, img_cropped)
    return path_new


def main(args):
    """Apply the selected transform to every row of the input CSV and save a new CSV.

    Depending on ``args.method`` the ``path`` column is replaced by the path of
    the random crop, the resized image, or the extracted video frame; the result
    is written next to the input CSV with a method-specific suffix.

    Args:
        args: parsed command-line options (see ``parse_args``).

    Raises:
        ValueError: if ``vid_frame_extract`` is requested without ``--points`` or
            ``--points_index``, or if ``args.method`` is not a known method.
    """
    data = pd.read_csv(args.input)
    if args.method == "img_rand_crop":
        data["path"] = apply(data["path"], lambda x: rand_crop(x, args.input_dir, args.output))
        output_csv = args.input.replace(".csv", "_rand_crop.csv")
    elif args.method == "img_resize":
        data["path"] = apply(data["path"], lambda x: resize(x, args.length, args.input_dir, args.output))
        output_csv = args.input.replace(".csv", f"_resized{args.length}.csv")
    elif args.method == "vid_frame_extract":
        points = args.points if args.points is not None else args.points_index
        if points is None:
            raise ValueError("vid_frame_extract requires --points or --points_index")
        num_points = len(points)
        # One copy of every row per requested point. The repeat count used to be
        # hard-coded to 3, which only lined up with the `i::num_points` stride
        # below when exactly three points were given.
        data = pd.DataFrame(np.repeat(data.values, num_points, axis=0), columns=data.columns)
        data["point"] = np.nan
        for i, point in enumerate(points):
            if isinstance(point, int):
                # integer points are used as-is
                data.loc[i::num_points, "point"] = point
            else:
                # other points are multiplied by the row's num_frames
                data.loc[i::num_points, "point"] = data.loc[i::num_points, "num_frames"] * point
        # NOTE(review): this passes (path, input_dir, output, point) positionally —
        # confirm it matches the signature of the imported `extract_frames`.
        data["path"] = apply(data, lambda x: extract_frames(x["path"], args.input_dir, args.output, x["point"]), axis=1)
        output_csv = args.input.replace(".csv", "_vid_frame_extract.csv")
    else:
        raise ValueError(f"Unknown method: {args.method}")

    data.to_csv(output_csv, index=False)
    print(f"Saved to {output_csv}")


def parse_args():
    """Parse the command-line options of the transform script.

    Positional arguments are the method, the input CSV, the input directory and
    the output directory; the remaining options tune the individual methods.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    # positional arguments
    add("method", type=str, choices=["img_resize", "img_rand_crop", "vid_frame_extract"])
    add("input", type=str)
    add("input_dir", type=str)
    add("output", type=str)

    # options
    add("--disable-parallel", action="store_true")
    add("--length", type=int, default=2160)
    add("--seed", type=int, default=42, help="seed for random")
    add("--points", nargs="+", type=float, default=None)
    add("--points_index", nargs="+", type=int, default=None)

    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()

    # --disable-parallel forces the tqdm `progress_apply` path in `apply`
    if args.disable_parallel:
        pandas_has_parallel = False

    # seed the RNG used by the random crop
    random.seed(args.seed)
    main(args)


================================================
FILE: Open-Sora/build/lib/tools/datasets/utils.py
================================================
import os

import cv2
import numpy as np
from PIL import Image

# Lower-case file extensions (leading dot included) recognised as images / videos.
IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")


def is_video(filename):
    """Return True when the extension of ``filename`` (case-insensitive) is in ``VID_EXTENSIONS``."""
    _, ext = os.path.splitext(filename)
    return ext.lower() in VID_EXTENSIONS


def extract_frames(
    video_path,
    frame_inds=None,
    points=None,
    backend="opencv",
    return_length=False,
    num_frames=None,
):
    """
    Extract selected frames of a video as PIL images.

    Args:
        video_path (str): path to video
        frame_inds (List[int]): indices of frames to extract
        points (List[float]): values within [0, 1); multiply #frames to get frame indices
        backend (str): decoder to use, one of "av", "opencv" or "decord"
        return_length (bool): if True, also return the total number of frames
        num_frames (int): total number of frames to assume; when None it is read from the video
    Return:
        List[PIL.Image], or (List[PIL.Image], int) when ``return_length`` is True

    At most one of ``frame_inds`` and ``points`` may be given. Indices at or
    beyond the total number of frames are clamped to the last frame. The video
    handle opened by the "av" and "opencv" backends is released before returning,
    also when decoding raises.
    """
    assert backend in ["av", "opencv", "decord"]
    assert (frame_inds is None) or (points is None)

    if backend == "av":
        import av

        container = av.open(video_path)
        try:
            if num_frames is not None:
                total_frames = num_frames
            else:
                total_frames = container.streams.video[0].frames

            if points is not None:
                frame_inds = [int(p * total_frames) for p in points]

            frames = []
            for idx in frame_inds:
                if idx >= total_frames:
                    idx = total_frames - 1
                # seek to the frame's timestamp, then decode the next frame from there
                target_timestamp = int(idx * av.time_base / container.streams.video[0].average_rate)
                container.seek(target_timestamp)
                frame = next(container.decode(video=0)).to_image()
                frames.append(frame)
        finally:
            # the container was previously left open
            container.close()

        if return_length:
            return frames, total_frames
        return frames

    elif backend == "decord":
        import decord

        container = decord.VideoReader(video_path, num_threads=1)
        if num_frames is not None:
            total_frames = num_frames
        else:
            total_frames = len(container)

        if points is not None:
            frame_inds = [int(p * total_frames) for p in points]

        frame_inds = np.array(frame_inds).astype(np.int32)
        frame_inds[frame_inds >= total_frames] = total_frames - 1
        frames = container.get_batch(frame_inds).asnumpy()  # [N, H, W, C]
        frames = [Image.fromarray(x) for x in frames]

        if return_length:
            return frames, total_frames
        return frames

    elif backend == "opencv":
        cap = cv2.VideoCapture(video_path)
        try:
            if num_frames is not None:
                total_frames = num_frames
            else:
                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            if points is not None:
                frame_inds = [int(p * total_frames) for p in points]

            frames = []
            for idx in frame_inds:
                if idx >= total_frames:
                    idx = total_frames - 1

                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)

                # HACK: sometimes OpenCV fails to read frames, return a black frame instead
                try:
                    ret, frame = cap.read()
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame = Image.fromarray(frame)
                except Exception as e:
                    print(f"[Warning] Error reading frame {idx} from {video_path}: {e}")
                    # First, try to read the first frame
                    try:
                        print(f"[Warning] Try reading first frame.")
                        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
                        ret, frame = cap.read()
                        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        frame = Image.fromarray(frame)
                    # If that fails, return a black frame
                    except Exception as e:
                        print(f"[Warning] Error in reading first frame from {video_path}: {e}")
                        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                        frame = Image.new("RGB", (width, height), (0, 0, 0))

                # HACK: if height or width is 0, return a black frame instead
                if frame.height == 0 or frame.width == 0:
                    height = width = 256
                    frame = Image.new("RGB", (width, height), (0, 0, 0))

                frames.append(frame)
        finally:
            # the capture was previously never released
            cap.release()

        if return_length:
            return frames, total_frames
        return frames
    else:
        raise ValueError


================================================
FILE: Open-Sora/build/lib/tools/frame_interpolation/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/tools/frame_interpolation/interpolation.py
================================================
# this script is modified from https://github.com/MCG-NKU/AMT/blob/main/demos/demo_2x.py
import argparse
import os
import os.path as osp

import cv2
import numpy as np
import torch

from opensora.utils.ckpt_utils import download_model

from .networks.amt_g import Model
from .utils.utils import InputPadder, img2tensor, tensor2img

# Hugging Face endpoint: taken from the HF_ENDPOINT environment variable when set,
# otherwise the public hub URL.
hf_endpoint = os.environ.get("HF_ENDPOINT")
if hf_endpoint is None:
    hf_endpoint = "https://huggingface.co"
# File extensions accepted as video input by `get_input_video_from_path`.
VID_EXT = [".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".webm"]
# Keyword arguments passed to the interpolation `Model` constructor in `load_model`.
network_cfg = {
    "params": {
        "corr_radius": 3,
        "corr_lvls": 4,
        "num_flows": 5,
    },
}
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


def init():
    """
    Return the anchor values used to compute the input scale.

    returns:
        (anchor_resolution, anchor_memory, anchor_memory_bias, vram_avail)
    """
    if device != "cuda":
        # Do not resize in cpu mode
        return 8192 * 8192, 1, 0, 1

    anchor_resolution = 1024 * 512
    anchor_memory = 1500 * 1024**2
    anchor_memory_bias = 2500 * 1024**2
    vram_avail = torch.cuda.get_device_properties(device).total_memory
    print("VRAM available: {:.1f} MB".format(vram_avail / 1024**2))
    return anchor_resolution, anchor_memory, anchor_memory_bias, vram_avail


def get_input_video_from_path(input_path):
    """
    Load every frame of the input video as a padded tensor on `device`.

    params:
        input_path: str, the path of the input video.
    returns:
        inputs: list, the list of the input frames.
        scale: float, the scale of the input frames.
        padder: InputPadder, the padder to pad the input frames.
    raises:
        TypeError: if the extension of input_path is not in VID_EXT.
    """

    anchor_resolution, anchor_memory, anchor_memory_bias, vram_avail = init()

    if osp.splitext(input_path)[-1].lower() not in VID_EXT:
        raise TypeError("Input should be a video.")

    vcap = cv2.VideoCapture(input_path)
    try:
        inputs = []
        w = int(vcap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(vcap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # scale derived from the frame area and the available memory, capped at 1
        scale = anchor_resolution / (h * w) * np.sqrt((vram_avail - anchor_memory_bias) / anchor_memory)
        scale = 1 if scale > 1 else scale
        scale = 1 / np.floor(1 / np.sqrt(scale) * 16) * 16
        if scale < 1:
            print(f"Due to the limited VRAM, the video will be scaled by {scale:.2f}")
        padding = int(16 / scale)
        padder = InputPadder((h, w), padding)
        while True:
            ret, frame = vcap.read()
            if ret is False:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_t = img2tensor(frame).to(device)
            frame_t = padder.pad(frame_t)
            inputs.append(frame_t)
    finally:
        # the capture was previously never released
        vcap.release()
    print(f"Loading the [video] from {input_path}, the number of frames [{len(inputs)}]")

    return inputs, scale, padder


def load_model(ckpt):
    """
    Build the frame interpolation model from `network_cfg`, load the weights
    stored under ckpt["state_dict"], move it to `device` and switch it to eval mode.
    """
    model = Model(**network_cfg.get("params", {}))
    state_dict = ckpt["state_dict"]
    model.load_state_dict(state_dict)
    model = model.to(device)
    model.eval()
    return model


def interpolater(model, inputs, scale, padder, iters=1):
    """
    interpolating with the interpolation model.

    Each iteration inserts one predicted frame between every pair of
    neighbouring frames. Predicted frames and the second frame of each pair
    are moved to the CPU; the very first frame is kept as it was passed in.

    params:
        model: nn.Module, the frame interpolation model.
        inputs: list, the list of the input frames.
        scale: float, the scale of the input frames.
        padder: the padder whose ``unpad`` is applied to the final frames.
        iters: int, the number of iterations of interpolation. The final frames model generating is 2 ** iters * (m - 1) + 1 and m is input frames.
    returns:
        outputs: list, the list of the output frames.
    """

    print("Start frame interpolation:")
    # the middle frame is always requested, i.e. t = 0.5
    embt = torch.tensor(1 / 2).float().view(1, 1, 1, 1).to(device)

    # With iters <= 0 the loop body never runs; fall back to the untouched
    # input frames instead of failing on an unbound ``outputs``.
    outputs = inputs
    for i in range(iters):
        print(f"Iter {i+1}. input_frames={len(inputs)} output_frames={2*len(inputs)-1}")
        outputs = [inputs[0]]
        for in_0, in_1 in zip(inputs[:-1], inputs[1:]):
            in_0 = in_0.to(device)
            in_1 = in_1.to(device)
            with torch.no_grad():
                imgt_pred = model(in_0, in_1, embt, scale_factor=scale, eval=True)["imgt_pred"]
            outputs += [imgt_pred.cpu(), in_1.cpu()]
        # the frames produced by this pass are the input of the next pass
        inputs = outputs

    outputs = padder.unpad(*outputs)
    return outputs


def write(outputs, input_path, output_path, fps=30):
    """
    write results to the output_path.

    The frames are encoded with the "mp4v" fourcc into
    ``<output_path>/fps<fps>_<input file stem>.mp4``.

    params:
        outputs: list, the frames to write; the frame size is taken from
            the last two dimensions of the first frame.
        input_path: str, path of the source video, only used for naming.
        output_path: str, the directory to write into (created if missing).
        fps: int, the frame rate of the written video.
    """

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_path, exist_ok=True)

    # shape[2:] is reversed because VideoWriter receives the size in the opposite order
    size = outputs[0].shape[2:][::-1]

    _, file_name_with_extension = os.path.split(input_path)
    file_name, _ = os.path.splitext(file_name_with_extension)

    save_video_path = f"{output_path}/fps{fps}_{file_name}.mp4"
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(save_video_path, fourcc, fps, size)

    try:
        for imgt_pred in outputs:
            frame = tensor2img(imgt_pred)
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            writer.write(frame)
    finally:
        # release even when a frame conversion fails so the file handle is closed
        writer.release()

    print(f"Demo video is saved to [{save_video_path}]")


def process(model, image_path, output_path, fps, iters):
    """
    Run the whole pipeline for one video: load the frames, interpolate
    them ``iters`` times with ``model`` and write the result to output_path.
    """
    frames, scale, padder = get_input_video_from_path(image_path)
    interpolated = interpolater(model, frames, scale, padder, iters)
    write(interpolated, image_path, output_path, fps)


def parse_args():
    """
    Parse the command line and post-process the parsed values.

    After parsing:
      * ``fps`` is multiplied by ``2 ** niters``;
      * ``input`` and ``ckpt`` have ``~`` expanded;
      * ``folder`` is overwritten: it is True whenever the extension of
        ``input`` is not in VID_EXT, regardless of the ``--folder`` flag;
      * ``ckpt`` is replaced by the return value of ``download_model``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Input video.")
    parser.add_argument(
        "--ckpt",
        type=str,
        default="./pretrained_models/amt-g.pth",
        help="The pretrained model.",
    )
    parser.add_argument(
        "--niters",
        type=int,
        default=1,
        help="Iter of Interpolation. The number of frames will be double after per iter.",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default="samples",
        help="Output path.",
    )
    parser.add_argument(
        "--fps",
        type=int,
        default=8,
        help="Frames rate of the output video.",
    )
    parser.add_argument(
        "--folder",
        action="store_true",
        help="If the input is a folder, set this flag.",
    )
    args = parser.parse_args()

    # the requested fps is scaled by the frame multiplier 2 ** niters
    source_fps = args.fps
    args.fps = source_fps * 2**args.niters
    print(f"Interpolation will turn {source_fps}fps video to {args.fps}fps video.")

    args.input = os.path.expanduser(args.input)
    args.ckpt = os.path.expanduser(args.ckpt)

    # anything without a known video extension is handled as a folder
    input_ext = osp.splitext(args.input)[-1].lower()
    args.folder = input_ext not in VID_EXT

    ckpt_url = hf_endpoint + "/lalala125/AMT/resolve/main/amt-g.pth"
    args.ckpt = download_model(local_path=args.ckpt, url=ckpt_url)
    return args


if __name__ == "__main__":
    # Script entry point: parse the arguments, build the model once and run
    # the pipeline on a single video or on every video of a folder.
    args = parse_args()
    ckpt_path, input_path, output_path = args.ckpt, args.input, args.output_path
    iters, fps = int(args.niters), int(args.fps)

    model = load_model(ckpt_path)

    if not args.folder:
        process(model, input_path, output_path, fps, iters)
    else:
        for file in os.listdir(input_path):
            # ignore directory entries that do not carry a video extension
            if osp.splitext(file)[-1].lower() not in VID_EXT:
                continue
            vid_path = os.path.join(input_path, file)
            process(model, vid_path, output_path, fps, iters)

    print("Interpolation is done.")
    print(f"Output path: {output_path}")


================================================
FILE: Open-Sora/build/lib/tools/frame_interpolation/networks/__init__.py
================================================
from .amt_g import Model


================================================
FILE: Open-Sora/build/lib/tools/frame_interpolation/networks/amt_g.py
================================================
import torch
import torch.nn as nn

from .blocks.feat_enc import LargeEncoder
from .blocks.ifrnet import Encoder, InitDecoder, IntermediateDecoder, resize
from .blocks.multi_flow import MultiFlowDecoder, multi_flow_combine
from .blocks.raft import BasicUpdateBlock, BidirCorrBlock, coords_grid


class Model(nn.Module):
    """
    Frame interpolation network.

    A pyramid encoder feeds four decoders that run coarse to fine. Between
    decoders, the intermediate flows and features are refined by update
    blocks that look up a bidirectional correlation volume built from a
    separate feature encoder. The last decoder predicts ``num_flows`` flow
    pairs, masks and residuals that are merged by ``comb_block``.
    """

    # NOTE: the list default for ``channels`` is only read, never mutated.
    def __init__(self, corr_radius=3, corr_lvls=4, num_flows=5, channels=[84, 96, 112, 128], skip_channels=84):
        super(Model, self).__init__()
        self.radius = corr_radius
        self.corr_levels = corr_lvls
        self.num_flows = num_flows

        self.feat_encoder = LargeEncoder(output_dim=128, norm_fn="instance", dropout=0.0)
        self.encoder = Encoder(channels, large=True)
        self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels)
        self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels)
        self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels)
        self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows)

        # update blocks fed with the correlation lookup resolution
        self.update4 = self._get_updateblock(112, None)
        self.update3_low = self._get_updateblock(96, 2.0)
        self.update2_low = self._get_updateblock(84, 4.0)

        # update blocks fed with the correlation resized to the decoder resolution
        self.update3_high = self._get_updateblock(96, None)
        self.update2_high = self._get_updateblock(84, None)

        # merges the num_flows warped RGB candidates into one RGB residual
        self.comb_block = nn.Sequential(
            nn.Conv2d(3 * self.num_flows, 6 * self.num_flows, 7, 1, 3),
            nn.PReLU(6 * self.num_flows),
            nn.Conv2d(6 * self.num_flows, 3, 7, 1, 3),
        )

    def _get_updateblock(self, cdim, scale_factor=None):
        """Create a BasicUpdateBlock for ``cdim`` feature channels with the shared hyper-parameters."""
        return BasicUpdateBlock(
            cdim=cdim,
            hidden_dim=192,
            flow_dim=64,
            corr_dim=256,
            corr_dim2=192,
            fc_dim=188,
            scale_factor=scale_factor,
            corr_levels=self.corr_levels,
            radius=self.radius,
        )

    def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1):
        """
        Look up the correlation volume at the positions implied by the flows.

        The flows are optionally downsampled by ``downsample`` first (values
        are scaled by the same factor). Returns the concatenated correlation
        features and the concatenated (possibly downsampled) flows.
        """
        # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0
        # based on linear assumption
        t1_scale = 1.0 / embt
        t0_scale = 1.0 / (1.0 - embt)
        if downsample != 1:
            inv = 1 / downsample
            flow0 = inv * resize(flow0, scale_factor=inv)
            flow1 = inv * resize(flow1, scale_factor=inv)

        corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale)
        corr = torch.cat([corr0, corr1], dim=1)
        flow = torch.cat([flow0, flow1], dim=1)
        return corr, flow

    def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs):
        """
        Predict the frame between ``img0`` and ``img1`` at time ``embt``.

        params:
            img0, img1: the two input frames.
            embt: the time embedding passed to the first decoder and used to
                rescale the flows for the correlation lookup.
            scale_factor: the network runs on inputs resized by this factor;
                flows, mask and residual are resized back before merging.
            eval: when True only the predicted image is returned.
        returns:
            dict with "imgt_pred"; when ``eval`` is False also "flow0_pred",
            "flow1_pred" and "ft_pred" holding the intermediate predictions.
        """
        # remove the joint mean of both frames; multi_flow_combine adds it back
        mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True)
        img0 = img0 - mean_
        img1 = img1 - mean_
        img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0
        img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1
        b, _, h, w = img0_.shape
        coord = coords_grid(b, h // 8, w // 8, img0.device)

        fmap0, fmap1 = self.feat_encoder([img0_, img1_])  # [1, 128, H//8, W//8]
        corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels)

        # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4]
        # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16]
        f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
        f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)

        ######################################### the 4th decoder #########################################
        up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt)
        corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord, up_flow0_4, up_flow1_4, embt, downsample=1)

        # residue update with lookup corr
        delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4)
        delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1)
        up_flow0_4 = up_flow0_4 + delta_flow0_4
        up_flow1_4 = up_flow1_4 + delta_flow1_4
        ft_3_ = ft_3_ + delta_ft_3_

        ######################################### the 3rd decoder #########################################
        up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
        corr_3, flow_3 = self._corr_scale_lookup(corr_fn, coord, up_flow0_3, up_flow1_3, embt, downsample=2)

        # residue update with lookup corr
        delta_ft_2_, delta_flow_3 = self.update3_low(ft_2_, flow_3, corr_3)
        delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1)
        up_flow0_3 = up_flow0_3 + delta_flow0_3
        up_flow1_3 = up_flow1_3 + delta_flow1_3
        ft_2_ = ft_2_ + delta_ft_2_

        # residue update with lookup corr (hr)
        corr_3 = resize(corr_3, scale_factor=2.0)
        up_flow_3 = torch.cat([up_flow0_3, up_flow1_3], dim=1)
        delta_ft_2_, delta_up_flow_3 = self.update3_high(ft_2_, up_flow_3, corr_3)
        ft_2_ += delta_ft_2_
        up_flow0_3 += delta_up_flow_3[:, 0:2]
        up_flow1_3 += delta_up_flow_3[:, 2:4]

        ######################################### the 2nd decoder #########################################
        up_flow0_2, up_flow1_2, ft_1_ = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
        corr_2, flow_2 = self._corr_scale_lookup(corr_fn, coord, up_flow0_2, up_flow1_2, embt, downsample=4)

        # residue update with lookup corr
        delta_ft_1_, delta_flow_2 = self.update2_low(ft_1_, flow_2, corr_2)
        delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1)
        up_flow0_2 = up_flow0_2 + delta_flow0_2
        up_flow1_2 = up_flow1_2 + delta_flow1_2
        ft_1_ = ft_1_ + delta_ft_1_

        # residue update with lookup corr (hr)
        corr_2 = resize(corr_2, scale_factor=4.0)
        up_flow_2 = torch.cat([up_flow0_2, up_flow1_2], dim=1)
        delta_ft_1_, delta_up_flow_2 = self.update2_high(ft_1_, up_flow_2, corr_2)
        ft_1_ += delta_ft_1_
        up_flow0_2 += delta_up_flow_2[:, 0:2]
        up_flow1_2 += delta_up_flow_2[:, 2:4]

        ######################################### the 1st decoder #########################################
        up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)

        if scale_factor != 1.0:
            # bring the predictions back to the resolution of the original inputs
            up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0 / scale_factor)) * (1.0 / scale_factor)
            up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0 / scale_factor)) * (1.0 / scale_factor)
            mask = resize(mask, scale_factor=(1.0 / scale_factor))
            img_res = resize(img_res, scale_factor=(1.0 / scale_factor))

        # Merge multiple predictions
        imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, up_flow1_1, mask, img_res, mean_)
        imgt_pred = torch.clamp(imgt_pred, 0, 1)

        if eval:
            return {
                "imgt_pred": imgt_pred,
            }
        else:
            # Use the flows' own spatial size: after the scale_factor resize
            # above it no longer equals (h, w) of the resized input, and
            # reshaping with (h, w) would fail.
            flow_h, flow_w = up_flow0_1.shape[-2:]
            up_flow0_1 = up_flow0_1.reshape(b, self.num_flows, 2, flow_h, flow_w)
            up_flow1_1 = up_flow1_1.reshape(b, self.num_flows, 2, flow_h, flow_w)
            return {
                "imgt_pred": imgt_pred,
                "flow0_pred": [up_flow0_1, up_flow0_2, up_flow0_3, up_flow0_4],
                "flow1_pred": [up_flow1_1, up_flow1_2, up_flow1_3, up_flow1_4],
                "ft_pred": [ft_1_, ft_2_, ft_3_],
            }


================================================
FILE: Open-Sora/build/lib/tools/frame_interpolation/networks/blocks/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/tools/frame_interpolation/networks/blocks/feat_enc.py
================================================
import torch
import torch.nn as nn


class BottleneckBlock(nn.Module):
    """
    Residual bottleneck: 1x1 conv to ``planes // 4`` channels, 3x3 conv
    (optionally strided), 1x1 conv back to ``planes`` channels, each followed
    by a norm layer and ReLU. When ``stride != 1`` the skip path is a strided
    1x1 conv followed by its own norm layer.

    params:
        in_planes: number of input channels.
        planes: number of output channels.
        norm_fn: one of "group", "batch", "instance", "none".
        stride: stride of the 3x3 conv and of the skip-path conv.
    raises:
        ValueError: if norm_fn is not one of the supported names.
    """

    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
        super(BottleneckBlock, self).__init__()

        # Fail early: an unknown name used to build a block without any norm
        # layer and only crashed later with an AttributeError.
        if norm_fn not in ("group", "batch", "instance", "none"):
            raise ValueError(f"Unsupported norm_fn: {norm_fn!r}")

        self.conv1 = nn.Conv2d(in_planes, planes // 4, kernel_size=1, padding=0)
        self.conv2 = nn.Conv2d(planes // 4, planes // 4, kernel_size=3, padding=1, stride=stride)
        self.conv3 = nn.Conv2d(planes // 4, planes, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8

        def make_norm(num_channels):
            # one factory instead of four copies of the same branch
            if norm_fn == "group":
                return nn.GroupNorm(num_groups=num_groups, num_channels=num_channels)
            if norm_fn == "batch":
                return nn.BatchNorm2d(num_channels)
            if norm_fn == "instance":
                return nn.InstanceNorm2d(num_channels)
            return nn.Sequential()

        self.norm1 = make_norm(planes // 4)
        self.norm2 = make_norm(planes // 4)
        self.norm3 = make_norm(planes)

        if stride == 1:
            self.downsample = None
        else:
            # norm4 stays registered under its own name as well as inside downsample
            self.norm4 = make_norm(planes)
            self.downsample = nn.Sequential(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4)

    def forward(self, x):
        """Return relu(skip(x) + bottleneck(x))."""
        y = x
        y = self.relu(self.norm1(self.conv1(y)))
        y = self.relu(self.norm2(self.conv2(y)))
        y = self.relu(self.norm3(self.conv3(y)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x + y)


class ResidualBlock(nn.Module):
    """
    Residual block of two 3x3 convs, each followed by a norm layer and ReLU.
    When ``stride != 1`` the skip path is a strided 1x1 conv followed by its
    own norm layer.

    params:
        in_planes: number of input channels.
        planes: number of output channels.
        norm_fn: one of "group", "batch", "instance", "none".
        stride: stride of the first conv and of the skip-path conv.
    raises:
        ValueError: if norm_fn is not one of the supported names.
    """

    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
        super(ResidualBlock, self).__init__()

        # Fail early: an unknown name used to build a block without any norm
        # layer and only crashed later with an AttributeError.
        if norm_fn not in ("group", "batch", "instance", "none"):
            raise ValueError(f"Unsupported norm_fn: {norm_fn!r}")

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8

        def make_norm():
            # every norm layer of this block works on ``planes`` channels
            if norm_fn == "group":
                return nn.GroupNorm(num_groups=num_groups, num_channels=planes)
            if norm_fn == "batch":
                return nn.BatchNorm2d(planes)
            if norm_fn == "instance":
                return nn.InstanceNorm2d(planes)
            return nn.Sequential()

        self.norm1 = make_norm()
        self.norm2 = make_norm()

        if stride == 1:
            self.downsample = None
        else:
            # norm3 stays registered under its own name as well as inside downsample
            self.norm3 = make_norm()
            self.downsample = nn.Sequential(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)

    def forward(self, x):
        """Return relu(skip(x) + residual(x))."""
        y = x
        y = self.relu(self.norm1(self.conv1(y)))
        y = self.relu(self.norm2(self.conv2(y)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x + y)


class SmallEncoder(nn.Module):
    """
    Feature encoder built from a strided 7x7 stem, three stages of two
    BottleneckBlocks (32, 64 and 96 channels, the last two strided) and a
    1x1 output conv to ``output_dim`` channels.

    params:
        output_dim: number of output channels.
        norm_fn: one of "group", "batch", "instance", "none".
        dropout: Dropout2d probability; no dropout layer is created when it is not > 0.
    raises:
        ValueError: if norm_fn is not one of the supported names.
    """

    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
        super(SmallEncoder, self).__init__()
        # Fail early instead of building an encoder without ``norm1``.
        if norm_fn not in ("group", "batch", "instance", "none"):
            raise ValueError(f"Unsupported norm_fn: {norm_fn!r}")
        self.norm_fn = norm_fn

        if self.norm_fn == "group":
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32)

        elif self.norm_fn == "batch":
            self.norm1 = nn.BatchNorm2d(32)

        elif self.norm_fn == "instance":
            self.norm1 = nn.InstanceNorm2d(32)

        elif self.norm_fn == "none":
            self.norm1 = nn.Sequential()

        self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        # in_planes tracks the channel count fed to the next stage
        self.in_planes = 32
        self.layer1 = self._make_layer(32, stride=1)
        self.layer2 = self._make_layer(64, stride=2)
        self.layer3 = self._make_layer(96, stride=2)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)

        self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Build a stage of two BottleneckBlocks and record ``dim`` as the new channel count."""
        layer1 = BottleneckBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = BottleneckBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x):
        """
        Encode a tensor, or a list/tuple of tensors.

        A list/tuple is concatenated along the batch dimension, encoded in
        one pass and split back into one output per input, using each
        input's own batch size.
        """
        # if input is list, combine batch dimension
        is_list = isinstance(x, (tuple, list))
        if is_list:
            batch_sizes = [item.shape[0] for item in x]
            x = torch.cat(x, dim=0)

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, batch_sizes, dim=0)

        return x


class BasicEncoder(nn.Module):
    """
    Feature encoder built from a strided 7x7 stem, three stages of two
    ResidualBlocks (64, 72 and 128 channels, the last two strided) and a
    1x1 output conv to ``output_dim`` channels.

    params:
        output_dim: number of output channels.
        norm_fn: one of "group", "batch", "instance", "none".
        dropout: Dropout2d probability; no dropout layer is created when it is not > 0.
    raises:
        ValueError: if norm_fn is not one of the supported names.
    """

    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
        super(BasicEncoder, self).__init__()
        # Fail early instead of building an encoder without ``norm1``.
        if norm_fn not in ("group", "batch", "instance", "none"):
            raise ValueError(f"Unsupported norm_fn: {norm_fn!r}")
        self.norm_fn = norm_fn

        if self.norm_fn == "group":
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)

        elif self.norm_fn == "batch":
            self.norm1 = nn.BatchNorm2d(64)

        elif self.norm_fn == "instance":
            self.norm1 = nn.InstanceNorm2d(64)

        elif self.norm_fn == "none":
            self.norm1 = nn.Sequential()

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        # in_planes tracks the channel count fed to the next stage
        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(72, stride=2)
        self.layer3 = self._make_layer(128, stride=2)

        # output convolution
        self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Build a stage of two ResidualBlocks and record ``dim`` as the new channel count."""
        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x):
        """
        Encode a tensor, or a list/tuple of tensors.

        A list/tuple is concatenated along the batch dimension, encoded in
        one pass and split back into one output per input, using each
        input's own batch size.
        """
        # if input is list, combine batch dimension
        is_list = isinstance(x, (tuple, list))
        if is_list:
            batch_sizes = [item.shape[0] for item in x]
            x = torch.cat(x, dim=0)

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, batch_sizes, dim=0)

        return x


class LargeEncoder(nn.Module):
    """
    Feature encoder built from a strided 7x7 stem, four stages of two
    ResidualBlocks (64, 112, 160 and 160 channels; the second and third are
    strided) and a 1x1 output conv to ``output_dim`` channels.

    params:
        output_dim: number of output channels.
        norm_fn: one of "group", "batch", "instance", "none".
        dropout: Dropout2d probability; no dropout layer is created when it is not > 0.
    raises:
        ValueError: if norm_fn is not one of the supported names.
    """

    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
        super(LargeEncoder, self).__init__()
        # Fail early instead of building an encoder without ``norm1``.
        if norm_fn not in ("group", "batch", "instance", "none"):
            raise ValueError(f"Unsupported norm_fn: {norm_fn!r}")
        self.norm_fn = norm_fn

        if self.norm_fn == "group":
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)

        elif self.norm_fn == "batch":
            self.norm1 = nn.BatchNorm2d(64)

        elif self.norm_fn == "instance":
            self.norm1 = nn.InstanceNorm2d(64)

        elif self.norm_fn == "none":
            self.norm1 = nn.Sequential()

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        # in_planes tracks the channel count fed to the next stage
        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(112, stride=2)
        self.layer3 = self._make_layer(160, stride=2)
        self.layer3_2 = self._make_layer(160, stride=1)

        # output convolution
        self.conv2 = nn.Conv2d(self.in_planes, output_dim, kernel_size=1)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Build a stage of two ResidualBlocks and record ``dim`` as the new channel count."""
        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x):
        """
        Encode a tensor, or a list/tuple of tensors.

        A list/tuple is concatenated along the batch dimension, encoded in
        one pass and split back into one output per input, using each
        input's own batch size.
        """
        # if input is list, combine batch dimension
        is_list = isinstance(x, (tuple, list))
        if is_list:
            batch_sizes = [item.shape[0] for item in x]
            x = torch.cat(x, dim=0)

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer3_2(x)

        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, batch_sizes, dim=0)

        return x


================================================
FILE: Open-Sora/build/lib/tools/frame_interpolation/networks/blocks/ifrnet.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from tools.frame_interpolation.utils.flow_utils import warp


def resize(x, scale_factor):
    """Rescale ``x`` by ``scale_factor`` with bilinear interpolation (align_corners=False)."""
    interp_options = dict(scale_factor=scale_factor, mode="bilinear", align_corners=False)
    return F.interpolate(x, **interp_options)


def convrelu(in_channels, out_channels, kernel_size=3, stride=1, padding=1, dilation=1, groups=1, bias=True):
    """Return a Conv2d followed by a per-channel PReLU, wrapped in an nn.Sequential."""
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=groups,
        bias=bias,
    )
    activation = nn.PReLU(out_channels)
    return nn.Sequential(conv, activation)


class ResBlock(nn.Module):
    """
    Residual block in which the last ``side_channels`` channels get an extra
    conv + PReLU between the full-width convolutions.
    """

    def __init__(self, in_channels, side_channels, bias=True):
        super(ResBlock, self).__init__()
        self.side_channels = side_channels

        def conv_prelu(channels):
            # 3x3 conv that keeps the channel count, followed by PReLU
            return nn.Sequential(
                nn.Conv2d(channels, channels, kernel_size=3, stride=1, padding=1, bias=bias), nn.PReLU(channels)
            )

        self.conv1 = conv_prelu(in_channels)
        self.conv2 = conv_prelu(side_channels)
        self.conv3 = conv_prelu(in_channels)
        self.conv4 = conv_prelu(side_channels)
        self.conv5 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=bias)
        self.prelu = nn.PReLU(in_channels)

    def _refine_side(self, feat, side_conv):
        """Apply ``side_conv`` to the trailing side channels and re-attach them to the rest."""
        kept = feat[:, : -self.side_channels, ...]
        side = side_conv(feat[:, -self.side_channels :, :, :])
        return torch.cat([kept, side], 1)

    def forward(self, x):
        hidden = self.conv1(x)
        hidden = self.conv3(self._refine_side(hidden, self.conv2))
        hidden = self.conv5(self._refine_side(hidden, self.conv4))
        return self.prelu(x + hidden)


class Encoder(nn.Module):
    """
    Feature pyramid: one stage per entry of ``channels``, each made of a
    stride-2 conv + PReLU followed by a stride-1 conv + PReLU. The stages
    are registered as ``pyramid1``, ``pyramid2``, ...
    """

    def __init__(self, channels, large=False):
        super(Encoder, self).__init__()
        self.channels = channels
        in_ch = 3
        for level, out_ch in enumerate(channels, 1):
            # only the first stage of the large variant uses a 7x7 kernel
            if large and level == 1:
                kernel, pad = 7, 3
            else:
                kernel, pad = 3, 1
            stage = nn.Sequential(convrelu(in_ch, out_ch, kernel, 2, pad), convrelu(out_ch, out_ch, 3, 1, 1))
            self.register_module(f"pyramid{level}", stage)
            in_ch = out_ch

    def forward(self, in_x):
        """Return the list of the outputs of every stage, first stage first."""
        features = []
        current = in_x
        for level in range(1, len(self.channels) + 1):
            current = getattr(self, f"pyramid{level}")(current)
            features.append(current)
        return features


class InitDecoder(nn.Module):
    """
    First decoder: from both feature maps and the time embedding it
    produces two 2-channel flows and an ``out_ch``-channel feature map.
    """

    def __init__(self, in_ch, out_ch, skip_ch) -> None:
        super().__init__()
        width = in_ch * 2
        self.convblock = nn.Sequential(
            convrelu(width + 1, width),
            ResBlock(width, skip_ch),
            nn.ConvTranspose2d(width, out_ch + 4, 4, 2, 1, bias=True),
        )

    def forward(self, f0, f1, embt):
        # tile the time embedding over the spatial size of the features
        height, width = f0.shape[2:]
        time_map = embt.repeat(1, 1, height, width)
        decoded = self.convblock(torch.cat([f0, f1, time_map], 1))
        # the first four channels are the two flows, the rest are features
        flows = decoded[:, :4, ...]
        ft_ = decoded[:, 4:, ...]
        flow0, flow1 = torch.chunk(flows, 2, 1)
        return flow0, flow1, ft_


class IntermediateDecoder(nn.Module):
    """
    Middle decoder: warps both feature maps with the incoming flows and
    predicts flow residuals plus an ``out_ch``-channel feature map. The
    returned flows are the residuals added to the incoming flows resized
    by 2 and multiplied by 2.
    """

    def __init__(self, in_ch, out_ch, skip_ch) -> None:
        super().__init__()
        width = in_ch * 3
        self.convblock = nn.Sequential(
            convrelu(width + 4, width),
            ResBlock(width, skip_ch),
            nn.ConvTranspose2d(width, out_ch + 4, 4, 2, 1, bias=True),
        )

    def forward(self, ft_, f0, f1, flow0_in, flow1_in):
        warped0 = warp(f0, flow0_in)
        warped1 = warp(f1, flow1_in)
        decoded = self.convblock(torch.cat([ft_, warped0, warped1, flow0_in, flow1_in], 1))
        # the first four channels are the flow residuals, the rest are features
        residual0, residual1 = torch.chunk(decoded[:, :4, ...], 2, 1)
        ft_ = decoded[:, 4:, ...]
        flow0 = residual0 + 2.0 * resize(flow0_in, scale_factor=2.0)
        flow1 = residual1 + 2.0 * resize(flow1_in, scale_factor=2.0)
        return flow0, flow1, ft_


================================================
FILE: Open-Sora/build/lib/tools/frame_interpolation/networks/blocks/multi_flow.py
================================================
import torch
import torch.nn as nn

from tools.frame_interpolation.utils.flow_utils import warp

from .ifrnet import ResBlock, convrelu, resize


def multi_flow_combine(comb_block, img0, img1, flow0, flow1, mask=None, img_res=None, mean=None):
    """
    A parallel implementation of multiple flow field warping.

    Both images are warped once per flow pair, blended with the mask, and
    the candidates are merged as their mean plus ``comb_block`` applied to
    the stacked candidates.

    comb_block: An nn.Sequential object.
    img shape: [b, c, h, w]
    flow shape: [b, 2*num_flows, h, w]
    mask (opt):
        If 'mask' is None, the function conducts a simple average.
    img_res (opt):
        If 'img_res' is None, the function adds zero instead.
    mean (opt):
        If 'mean' is None, the function adds zero instead.
    """
    b, c, h, w = flow0.shape
    num_flows = c // 2
    # fold the flow index into the batch dimension so one warp call handles all flows
    flow0 = flow0.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)
    flow1 = flow1.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)

    # A missing mask means a plain average of both warps, i.e. a constant
    # weight of 0.5 (it used to stay None and break the blend below).
    mask = mask.reshape(b, num_flows, 1, h, w).reshape(-1, 1, h, w) if mask is not None else 0.5
    img_res = img_res.reshape(b, num_flows, 3, h, w).reshape(-1, 3, h, w) if img_res is not None else 0
    img0 = torch.stack([img0] * num_flows, 1).reshape(-1, 3, h, w)
    img1 = torch.stack([img1] * num_flows, 1).reshape(-1, 3, h, w)
    mean = torch.stack([mean] * num_flows, 1).reshape(-1, 1, 1, 1) if mean is not None else 0

    img0_warp = warp(img0, flow0)
    img1_warp = warp(img1, flow1)
    img_warps = mask * img0_warp + (1 - mask) * img1_warp + mean + img_res
    img_warps = img_warps.reshape(b, num_flows, 3, h, w)
    imgt_pred = img_warps.mean(1) + comb_block(img_warps.view(b, -1, h, w))
    return imgt_pred


class MultiFlowDecoder(nn.Module):
    """
    Last decoder: predicts, for each of ``num_flows`` hypotheses, two flow
    residuals, a sigmoid blending mask and a 3-channel image residual.
    """

    def __init__(self, in_ch, skip_ch, num_flows=3):
        super(MultiFlowDecoder, self).__init__()
        self.num_flows = num_flows
        width = in_ch * 3
        self.convblock = nn.Sequential(
            convrelu(width + 4, width),
            ResBlock(width, skip_ch),
            nn.ConvTranspose2d(width, 8 * num_flows, 4, 2, 1, bias=True),
        )

    def forward(self, ft_, f0, f1, flow0, flow1):
        n = self.num_flows
        warped0 = warp(f0, flow0)
        warped1 = warp(f1, flow1)
        decoder_in = torch.cat([ft_, warped0, warped1, flow0, flow1], 1)
        decoded = self.convblock(decoder_in)

        # channel layout of the 8 * n outputs: 2n + 2n flow residuals, n masks, 3n image residuals
        delta_flow0, delta_flow1, mask, img_res = torch.split(decoded, [2 * n, 2 * n, n, 3 * n], 1)
        mask = torch.sigmoid(mask)

        # the incoming flow is resized by 2, multiplied by 2 and shared by all n hypotheses
        flow0 = delta_flow0 + 2.0 * resize(flow0, scale_factor=2.0).repeat(1, n, 1, 1)
        flow1 = delta_flow1 + 2.0 * resize(flow1, scale_factor=2.0).repeat(1, n, 1, 1)

        return flow0, flow1, mask, img_res


================================================
FILE: Open-Sora/build/lib/tools/frame_interpolation/networks/blocks/raft.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F


def resize(x, scale_factor):
    """Rescale ``x`` by ``scale_factor`` with bilinear interpolation (align_corners=False)."""
    interp_options = dict(scale_factor=scale_factor, mode="bilinear", align_corners=False)
    return F.interpolate(x, **interp_options)


def bilinear_sampler(img, coords, mask=False):
    """
    Sample ``img`` with grid_sample at positions given in pixel coordinates.

    The last dimension of ``coords`` holds (x, y); both are mapped to the
    [-1, 1] range before sampling with align_corners=True. With ``mask``
    set, a float mask of the positions strictly inside (-1, 1) on both axes
    is returned as well.
    """
    height, width = img.shape[-2:]
    x_pix, y_pix = coords.split([1, 1], dim=-1)
    x_norm = 2 * x_pix / (width - 1) - 1
    y_norm = 2 * y_pix / (height - 1) - 1

    sampled = F.grid_sample(img, torch.cat([x_norm, y_norm], dim=-1), align_corners=True)
    if not mask:
        return sampled

    inside = (x_norm > -1) & (y_norm > -1) & (x_norm < 1) & (y_norm < 1)
    return sampled, inside.float()


def coords_grid(batch, ht, wd, device):
    """
    Return a float tensor of shape (batch, 2, ht, wd) holding, for every
    position, its x index in channel 0 and its y index in channel 1.
    """
    rows = torch.arange(ht, device=device)
    cols = torch.arange(wd, device=device)
    ys, xs = torch.meshgrid(rows, cols, indexing="ij")
    grid = torch.stack((xs, ys), dim=0).float()
    return grid.unsqueeze(0).repeat(batch, 1, 1, 1)


class SmallUpdateBlock(nn.Module):
    """Convolutional update block that predicts a feature residual and a 4-channel flow residual.

    Correlation features and the 4-channel flow are encoded separately, fused by ``self.conv``,
    concatenated with the raw flow and ``net`` and passed through ``self.gru`` (a plain
    conv -> LeakyReLU -> conv stack). Its output feeds ``feat_head`` (``cdim`` channels) and
    ``flow_head`` (4 channels). When ``scale_factor`` is not ``None``, ``net`` is resized by
    ``1 / scale_factor`` on entry and both outputs are resized back by ``scale_factor``; the
    flow residual is additionally multiplied by ``scale_factor``.
    """

    def __init__(self, cdim, hidden_dim, flow_dim, corr_dim, fc_dim, corr_levels=4, radius=3, scale_factor=None):
        super().__init__()
        self.scale_factor = scale_factor
        # One (2r + 1) x (2r + 1) window per correlation level; convc1 takes twice that many channels.
        window = 2 * radius + 1
        cor_planes = corr_levels * window**2

        def two_conv_stack(in_ch, out_ch):
            # 3x3 conv -> LeakyReLU -> 3x3 conv with ``hidden_dim`` channels in between.
            return nn.Sequential(
                nn.Conv2d(in_ch, hidden_dim, 3, padding=1),
                nn.LeakyReLU(negative_slope=0.1, inplace=True),
                nn.Conv2d(hidden_dim, out_ch, 3, padding=1),
            )

        # Construction order is kept so parameter initialisation order is unchanged.
        self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
        self.convf1 = nn.Conv2d(4, flow_dim * 2, 7, padding=3)
        self.convf2 = nn.Conv2d(flow_dim * 2, flow_dim, 3, padding=1)
        self.conv = nn.Conv2d(corr_dim + flow_dim, fc_dim, 3, padding=1)

        self.gru = two_conv_stack(fc_dim + 4 + cdim, hidden_dim)
        self.feat_head = two_conv_stack(hidden_dim, cdim)
        self.flow_head = two_conv_stack(hidden_dim, 4)

        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)

    def forward(self, net, flow, corr):
        """Return ``(delta_net, delta_flow)`` computed from ``net``, ``flow`` and ``corr``."""
        rescale = self.scale_factor is not None
        if rescale:
            net = resize(net, 1 / self.scale_factor)

        corr_feat = self.lrelu(self.convc1(corr))
        flow_feat = self.lrelu(self.convf1(flow))
        flow_feat = self.lrelu(self.convf2(flow_feat))
        fused = self.lrelu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))

        hidden = self.gru(torch.cat([fused, flow, net], dim=1))
        delta_net = self.feat_head(hidden)
        delta_flow = self.flow_head(hidden)

        if rescale:
            delta_net = resize(delta_net, scale_factor=self.scale_factor)
            delta_flow = self.scale_factor * resize(delta_flow, scale_factor=self.scale_factor)

        return delta_net, delta_flow


class BasicUpdateBlock(nn.Module):
    """Convolutional update block with a two-layer correlation encoder.

    Works like a conv encoder/fusion stack followed by two heads: correlation features go
    through ``convc1`` and ``convc2``, the 4-channel flow through ``convf1`` and ``convf2``;
    the fused features are concatenated with the raw flow and ``net`` and passed through
    ``self.gru`` (a plain conv -> LeakyReLU -> conv stack). ``feat_head`` emits ``cdim``
    channels and ``flow_head`` emits ``4 * out_num`` channels. When ``scale_factor`` is not
    ``None``, ``net`` is resized by ``1 / scale_factor`` on entry and both outputs are resized
    back by ``scale_factor``; the flow residual is additionally multiplied by ``scale_factor``.
    """

    def __init__(
        self,
        cdim,
        hidden_dim,
        flow_dim,
        corr_dim,
        corr_dim2,
        fc_dim,
        corr_levels=4,
        radius=3,
        scale_factor=None,
        out_num=1,
    ):
        super().__init__()
        self.scale_factor = scale_factor
        # One (2r + 1) x (2r + 1) window per correlation level; convc1 takes twice that many channels.
        window = 2 * radius + 1
        cor_planes = corr_levels * window**2

        def two_conv_stack(in_ch, out_ch):
            # 3x3 conv -> LeakyReLU -> 3x3 conv with ``hidden_dim`` channels in between.
            return nn.Sequential(
                nn.Conv2d(in_ch, hidden_dim, 3, padding=1),
                nn.LeakyReLU(negative_slope=0.1, inplace=True),
                nn.Conv2d(hidden_dim, out_ch, 3, padding=1),
            )

        # Construction order is kept so parameter initialisation order is unchanged.
        self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
        self.convc2 = nn.Conv2d(corr_dim, corr_dim2, 3, padding=1)
        self.convf1 = nn.Conv2d(4, flow_dim * 2, 7, padding=3)
        self.convf2 = nn.Conv2d(flow_dim * 2, flow_dim, 3, padding=1)
        self.conv = nn.Conv2d(flow_dim + corr_dim2, fc_dim, 3, padding=1)

        self.gru = two_conv_stack(fc_dim + 4 + cdim, hidden_dim)
        self.feat_head = two_conv_stack(hidden_dim, cdim)
        self.flow_head = two_conv_stack(hidden_dim, 4 * out_num)

        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)

    def forward(self, net, flow, corr):
        """Return ``(delta_net, delta_flow)`` computed from ``net``, ``flow`` and ``corr``."""
        rescale = self.scale_factor is not None
        if rescale:
            net = resize(net, 1 / self.scale_factor)

        corr_feat = self.lrelu(self.convc1(corr))
        corr_feat = self.lrelu(self.convc2(corr_feat))
        flow_feat = self.lrelu(self.convf1(flow))
        flow_feat = self.lrelu(self.convf2(flow_feat))
        fused = self.lrelu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))

        hidden = self.gru(torch.cat([fused, flow, net], dim=1))
        delta_net = self.feat_head(hidden)
        delta_flow = self.flow_head(hidden)

        if rescale:
            delta_net = resize(delta_net, scale_factor=self.scale_factor)
            delta_flow = self.scale_factor * resize(delta_flow, scale_factor=self.scale_factor)
        return delta_net, delta_flow


class BidirCorrBlock:
    """Correlation pyramid between two feature maps, kept in both directions.

    ``corr_pyramid`` holds the all-pairs correlation of ``fmap1`` against ``fmap2`` and
    ``corr_pyramid_T`` the transposed volume; each further level is a 2x2 average pooling of
    the previous one. Calling the block samples a ``(2r + 1) x (2r + 1)`` window around the
    given coordinates at every level.
    """

    def __init__(self, fmap1, fmap2, num_levels=4, radius=4):
        self.num_levels = num_levels
        self.radius = radius

        volume = BidirCorrBlock.corr(fmap1, fmap2)
        batch, h1, w1, dim, h2, w2 = volume.shape
        volume_T = volume.clone().permute(0, 4, 5, 3, 1, 2)

        # Fold the query positions into the batch dimension so each query owns one 2-D map.
        level = volume.reshape(batch * h1 * w1, dim, h2, w2)
        level_T = volume_T.reshape(batch * h2 * w2, dim, h1, w1)

        self.corr_pyramid = [level]
        self.corr_pyramid_T = [level_T]
        for _ in range(1, self.num_levels):
            level = F.avg_pool2d(level, 2, stride=2)
            level_T = F.avg_pool2d(level_T, 2, stride=2)
            self.corr_pyramid.append(level)
            self.corr_pyramid_T.append(level_T)

    def __call__(self, coords0, coords1):
        """Look up both pyramids around ``coords0`` / ``coords1`` (channels-first, 2 channels).

        Returns:
            tuple: two contiguous float tensors with the sampled windows of all levels
            concatenated along dim 1.
        """
        r = self.radius
        coords0 = coords0.permute(0, 2, 3, 1)
        coords1 = coords1.permute(0, 2, 3, 1)
        assert coords0.shape == coords1.shape, f"coords0 shape: [{coords0.shape}] is not equal to [{coords1.shape}]"
        batch, h1, w1, _ = coords0.shape
        num_queries = batch * h1 * w1
        side = 2 * r + 1

        # The sampling window is identical at every level, so build it once.
        offsets = torch.linspace(-r, r, side, device=coords0.device)
        window = torch.stack(torch.meshgrid(offsets, offsets, indexing="ij"), dim=-1)
        window = window.view(1, side, side, 2)

        flat0 = coords0.reshape(num_queries, 1, 1, 2)
        flat1 = coords1.reshape(num_queries, 1, 1, 2)

        sampled = []
        sampled_T = []
        for lvl in range(self.num_levels):
            # Coordinates shrink by 2 per level to follow the pooled resolution.
            scale = 2**lvl
            picked = bilinear_sampler(self.corr_pyramid[lvl], flat0 / scale + window)
            picked_T = bilinear_sampler(self.corr_pyramid_T[lvl], flat1 / scale + window)
            sampled.append(picked.view(batch, h1, w1, -1))
            sampled_T.append(picked_T.view(batch, h1, w1, -1))

        out = torch.cat(sampled, dim=-1).permute(0, 3, 1, 2).contiguous().float()
        out_T = torch.cat(sampled_T, dim=-1).permute(0, 3, 1, 2).contiguous().float()
        return out, out_T

    @staticmethod
    def corr(fmap1, fmap2):
        """Return the all-pairs dot-product volume ``(B, H, W, 1, H, W)`` scaled by ``1 / sqrt(C)``."""
        batch, dim, ht, wd = fmap1.shape
        flat1 = fmap1.view(batch, dim, ht * wd)
        flat2 = fmap2.view(batch, dim, ht * wd)

        volume = torch.matmul(flat1.transpose(1, 2), flat2).view(batch, ht, wd, 1, ht, wd)
        return volume / torch.sqrt(torch.tensor(dim).float())


================================================
FILE: Open-Sora/build/lib/tools/frame_interpolation/utils/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/tools/frame_interpolation/utils/dist_utils.py
================================================
import os

import torch


def get_world_size():
    """Find OMPI world size without calling mpi functions.

    Checks ``PMI_SIZE`` first, then ``OMPI_COMM_WORLD_SIZE``; an empty value counts as 1.
    Falls back to ``torch.cuda.device_count()`` when neither variable is set.

    :rtype: int
    """
    for env_key in ("PMI_SIZE", "OMPI_COMM_WORLD_SIZE"):
        raw = os.environ.get(env_key)
        if raw is not None:
            return int(raw or 1)
    return torch.cuda.device_count()


def get_global_rank():
    """Find OMPI world rank without calling mpi functions.

    Checks ``PMI_RANK`` first, then ``OMPI_COMM_WORLD_RANK``; an empty value or no variable
    at all yields 0.

    :rtype: int
    """
    for env_key in ("PMI_RANK", "OMPI_COMM_WORLD_RANK"):
        raw = os.environ.get(env_key)
        if raw is not None:
            return int(raw or 0)
    return 0


def get_local_rank():
    """Find OMPI local rank without calling mpi functions.

    Checks ``MPI_LOCALRANKID`` first, then ``OMPI_COMM_WORLD_LOCAL_RANK``; an empty value or
    no variable at all yields 0.

    :rtype: int
    """
    for env_key in ("MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK"):
        raw = os.environ.get(env_key)
        if raw is not None:
            return int(raw or 0)
    return 0


def get_master_ip():
    """Return the master node address taken from the Azure Batch environment variables.

    ``AZ_BATCH_MASTER_NODE`` wins and is cut at the first ``:``; otherwise
    ``AZ_BATCHAI_MPI_MASTER_NODE`` is returned unchanged; otherwise ``"127.0.0.1"``.
    """
    batch_node = os.environ.get("AZ_BATCH_MASTER_NODE")
    if batch_node is not None:
        return batch_node.split(":")[0]

    mpi_node = os.environ.get("AZ_BATCHAI_MPI_MASTER_NODE")
    if mpi_node is not None:
        return mpi_node

    return "127.0.0.1"


================================================
FILE: Open-Sora/build/lib/tools/frame_interpolation/utils/flow_utils.py
================================================
import numpy as np
import torch
import torch.nn.functional as F
from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


def warp(img, flow):
    """Sample ``img`` at the positions displaced by ``flow`` (channel 0 = x, channel 1 = y, in pixels).

    A regular ``[-1, 1]`` grid is built from the spatial size of ``flow``, the flow is converted
    to the same normalised units and added, and ``F.grid_sample`` does a bilinear lookup with
    border padding.
    """
    batch, _, height, width = flow.shape
    base_x = torch.linspace(-1.0, 1.0, width).view(1, 1, 1, width).expand(batch, -1, height, -1)
    base_y = torch.linspace(-1.0, 1.0, height).view(1, 1, height, 1).expand(batch, -1, -1, width)
    base_grid = torch.cat([base_x, base_y], 1).to(img)

    # Half the pixel extent corresponds to one unit of the normalised grid.
    norm_dx = flow[:, 0:1, :, :] / ((width - 1.0) / 2.0)
    norm_dy = flow[:, 1:2, :, :] / ((height - 1.0) / 2.0)
    sample_grid = (base_grid + torch.cat([norm_dx, norm_dy], 1)).permute(0, 2, 3, 1)

    return F.grid_sample(input=img, grid=sample_grid, mode="bilinear", padding_mode="border", align_corners=True)


def make_colorwheel():
    """
    Generates a color wheel for optical flow visualization as presented in:
        Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
        URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf
    Code follows the original C++ source code of Daniel Scharstein.
    Code follows the Matlab source code of Deqing Sun.

    The wheel is built from six consecutive segments (RY, YG, GC, CB, BM, MR). Within each
    segment one channel is held at 255 while another ramps linearly up from 0 or down from 255.

    Returns:
        np.ndarray: Color wheel
    """
    # (segment length, channel held at 255, ramping channel, ramp goes down?)
    segments = (
        (15, 0, 1, False),  # RY
        (6, 1, 0, True),  # YG
        (4, 1, 2, False),  # GC
        (11, 2, 1, True),  # CB
        (13, 2, 0, False),  # BM
        (6, 0, 2, True),  # MR
    )

    ncols = sum(length for length, _, _, _ in segments)
    colorwheel = np.zeros((ncols, 3))

    start = 0
    for length, full_ch, ramp_ch, descending in segments:
        ramp = np.floor(255 * np.arange(0, length) / length)
        rows = slice(start, start + length)
        colorwheel[rows, full_ch] = 255
        colorwheel[rows, ramp_ch] = 255 - ramp if descending else ramp
        start += length
    return colorwheel


def flow_uv_to_colors(u, v, convert_to_bgr=False):
    """
    Applies the flow color wheel to (possibly clipped) flow components u and v.
    According to the C++ source code of Daniel Scharstein
    According to the Matlab source code of Deqing Sun

    The flow angle selects a (linearly interpolated) wheel colour; for magnitudes up to 1 the
    colour is blended towards white as the magnitude shrinks, larger magnitudes are dimmed
    by a factor of 0.75.

    Args:
        u (np.ndarray): Input horizontal flow of shape [H,W]
        v (np.ndarray): Input vertical flow of shape [H,W]
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    colorwheel = make_colorwheel()  # shape [55x3]
    ncols = colorwheel.shape[0]
    flow_image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8)

    rad = np.sqrt(np.square(u) + np.square(v))
    angle = np.arctan2(-v, -u) / np.pi
    # Fractional position on the wheel, and the two neighbouring wheel entries.
    fk = (angle + 1) / 2 * (ncols - 1)
    k0 = np.floor(fk).astype(np.int32)
    k1 = k0 + 1
    k1[k1 == ncols] = 0  # wrap around the wheel
    frac = fk - k0
    in_range = rad <= 1

    for ch, wheel_channel in enumerate(colorwheel.T):
        lower = wheel_channel[k0] / 255.0
        upper = wheel_channel[k1] / 255.0
        col = (1 - frac) * lower + frac * upper
        col[in_range] = 1 - rad[in_range] * (1 - col[in_range])
        col[~in_range] = col[~in_range] * 0.75  # out of range
        # Note the 2-ch => BGR instead of RGB
        out_ch = 2 - ch if convert_to_bgr else ch
        flow_image[:, :, out_ch] = np.floor(255 * col)
    return flow_image


def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False):
    """
    Converts a two-channel flow field into a color image.

    The flow is normalised by its largest magnitude (plus a small epsilon) before being mapped
    to colors by ``flow_uv_to_colors``.

    Args:
        flow_uv (np.ndarray): Flow UV image of shape [H,W,2]
        clip_flow (float, optional): Clip each flow component to ``[-clip_flow, clip_flow]``.
            Defaults to None (no clipping).
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    assert flow_uv.ndim == 3, "input flow must have three dimensions"
    assert flow_uv.shape[2] == 2, "input flow must have shape [H,W,2]"
    if clip_flow is not None:
        # Clip symmetrically: a lower bound of 0 would erase every negative (leftward or
        # upward) component instead of limiting the flow magnitude.
        flow_uv = np.clip(flow_uv, -clip_flow, clip_flow)
    u = flow_uv[:, :, 0]
    v = flow_uv[:, :, 1]
    rad_max = np.max(np.sqrt(np.square(u) + np.square(v)))
    epsilon = 1e-5  # avoids division by zero for an all-zero flow
    u = u / (rad_max + epsilon)
    v = v / (rad_max + epsilon)
    return flow_uv_to_colors(u, v, convert_to_bgr)


================================================
FILE: Open-Sora/build/lib/tools/frame_interpolation/utils/utils.py
================================================
import random
import re
import sys

import numpy as np
import torch
import torch.nn.functional as F
from imageio import imread, imwrite
from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


class AverageMeter:
    """Keeps the latest value, the weighted sum, the sample count and the running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero all statistics."""
        self.val = self.avg = self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as seen ``n`` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


class AverageMeterGroups:
    """A collection of ``AverageMeter`` objects addressed by name and created on first use."""

    def __init__(self) -> None:
        self.meter_dict = {}

    def update(self, dict, n=1):
        """Feed every ``name -> value`` pair of ``dict`` into its meter with weight ``n``."""
        for name, val in dict.items():
            meter = self.meter_dict.get(name)
            if meter is None:
                meter = AverageMeter()
                self.meter_dict[name] = meter
            meter.update(val, n)

    def reset(self, name=None):
        """Reset the meter called ``name``, or every meter when ``name`` is ``None``."""
        if name is None:
            targets = list(self.meter_dict.values())
        else:
            found = self.meter_dict.get(name)
            targets = [] if found is None else [found]
        for meter in targets:
            meter.reset()

    def avg(self, name):
        """Return the running average of meter ``name``, or ``None`` if it does not exist."""
        meter = self.meter_dict.get(name)
        return None if meter is None else meter.avg


class InputPadder:
    """Pads images such that dimensions are divisible by divisor"""

    def __init__(self, dims, divisor=16):
        self.ht, self.wd = dims[-2:]
        # Amount missing to reach the next multiple of ``divisor`` (0 when already aligned).
        extra_ht = -self.ht % divisor
        extra_wd = -self.wd % divisor
        left, top = extra_wd // 2, extra_ht // 2
        # Order follows F.pad: [left, right, top, bottom]; the odd pixel goes right/bottom.
        self._pad = [left, extra_wd - left, top, extra_ht - top]

    def pad(self, *inputs):
        """Replicate-pad each input; a single input returns a tensor, several return a list."""
        padded = [F.pad(x, self._pad, mode="replicate") for x in inputs]
        return padded[0] if len(inputs) == 1 else padded

    def unpad(self, *inputs):
        """Crop the padding off each input; a single input returns a tensor, several return a list."""
        cropped = [self._unpad(x) for x in inputs]
        return cropped[0] if len(inputs) == 1 else cropped

    def _unpad(self, x):
        left, right, top, bottom = self._pad
        ht, wd = x.shape[-2:]
        return x[..., top : ht - bottom, left : wd - right]


def img2tensor(img):
    """Convert an ``H x W x C`` image array into a ``1 x C x H x W`` tensor divided by 255.

    Only the first three channels are kept when the image has more than three.
    """
    rgb = img[:, :, :3] if img.shape[-1] > 3 else img
    chw = torch.tensor(rgb).permute(2, 0, 1)
    return chw.unsqueeze(0) / 255.0


def tensor2img(img_t):
    """Convert a ``1 x C x H x W`` tensor into an ``H x W x C`` uint8 array (x255, clipped to 0..255)."""
    scaled = (img_t * 255.0).detach()
    hwc = scaled.squeeze(0).permute(1, 2, 0)
    return hwc.cpu().numpy().clip(0, 255).astype(np.uint8)


def seed_all(seed):
    """Seed Python's ``random``, NumPy, and torch (CPU and all CUDA devices) with ``seed``."""
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for seeder in seeders:
        seeder(seed)


def read(file):
    """Load ``file`` with the reader matching its extension.

    ``.float3`` -> ``readFloat``; ``.flo`` -> ``readFlow``; ``.ppm``/``.pgm``/``.png``/``.jpg``
    -> ``readImage``; ``.pfm`` -> the data part of ``readPFM``.

    Raises:
        Exception: if the extension is not one of the above.
    """
    if file.endswith(".float3"):
        return readFloat(file)
    if file.endswith(".flo"):
        return readFlow(file)
    if file.endswith((".ppm", ".pgm", ".png", ".jpg")):
        return readImage(file)
    if file.endswith(".pfm"):
        return readPFM(file)[0]
    raise Exception("don't know how to read %s" % file)


def write(file, data):
    """Store ``data`` in ``file`` with the writer matching its extension.

    ``.float3`` -> ``writeFloat``; ``.flo`` -> ``writeFlow``; ``.ppm``/``.pgm``/``.png``/``.jpg``
    -> ``writeImage``; ``.pfm`` -> ``writePFM``.

    Raises:
        Exception: if the extension is not one of the above.
    """
    if file.endswith(".float3"):
        return writeFloat(file, data)
    if file.endswith(".flo"):
        return writeFlow(file, data)
    if file.endswith((".ppm", ".pgm", ".png", ".jpg")):
        return writeImage(file, data)
    if file.endswith(".pfm"):
        return writePFM(file, data)
    raise Exception("don't know how to write %s" % file)


def readPFM(file):
    """Read a PFM image from the path ``file``.

    The header consists of three text lines: ``PF`` (3 channels) or ``Pf`` (1 channel), the
    ``width height`` pair, and a scale whose sign encodes the byte order (negative means
    little-endian). The float payload follows and is stored bottom row first, so it is flipped
    vertically before being returned.

    Returns:
        tuple: ``(data, scale)`` with ``data`` of shape ``(height, width, 3)`` or
        ``(height, width)`` and ``scale`` as a non-negative float.

    Raises:
        Exception: if the magic line is not ``PF``/``Pf`` or the dimension line is malformed.
    """
    # The context manager guarantees the handle is closed, including on malformed headers.
    with open(file, "rb") as fh:
        header = fh.readline().rstrip().decode("ascii")
        if header == "PF":
            color = True
        elif header == "Pf":
            color = False
        else:
            raise Exception("Not a PFM file.")

        dim_match = re.match(r"^(\d+)\s(\d+)\s$", fh.readline().decode("ascii"))
        if not dim_match:
            raise Exception("Malformed PFM header.")
        width, height = map(int, dim_match.groups())

        scale = float(fh.readline().decode("ascii").rstrip())
        if scale < 0:
            endian = "<"
            scale = -scale
        else:
            endian = ">"

        data = np.fromfile(fh, endian + "f")

    shape = (height, width, 3) if color else (height, width)
    data = np.flipud(np.reshape(data, shape))
    return data, scale


def writePFM(file, image, scale=1):
    """Write a float32 ``image`` to the path ``file`` in PFM format.

    ``H x W x 3`` images are written with the ``PF`` magic, ``H x W`` and ``H x W x 1`` images
    with ``Pf``. Rows are stored bottom first, and the scale is negated when the data is
    little-endian, as the PFM header expects.

    Raises:
        Exception: if ``image`` is not float32 or has an unsupported shape.
    """
    # Validate before opening so that an invalid image does not truncate an existing file.
    if image.dtype.name != "float32":
        raise Exception("Image dtype must be float32.")

    image = np.flipud(image)

    if len(image.shape) == 3 and image.shape[2] == 3:
        color = True
    elif len(image.shape) == 2 or (len(image.shape) == 3 and image.shape[2] == 1):
        color = False
    else:
        raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")

    endian = image.dtype.byteorder
    if endian == "<" or (endian == "=" and sys.byteorder == "little"):
        scale = -scale

    with open(file, "wb") as fh:
        # Both magic strings must be bytes: the handle is binary.
        fh.write(b"PF\n" if color else b"Pf\n")
        fh.write(b"%d %d\n" % (image.shape[1], image.shape[0]))
        fh.write(b"%f\n" % scale)
        image.tofile(fh)


def readFlow(name):
    """Read an optical-flow field from ``name``.

    ``.pfm``/``.PFM`` files are read through ``readPFM`` and reduced to their first two channels.
    Any other file is parsed as a ``.flo`` file: the 4-byte tag ``PIEH``, int32 width, int32
    height, then ``height * width * 2`` float32 values.

    Returns:
        np.ndarray: float32 array of shape ``(height, width, 2)``.

    Raises:
        Exception: if the ``.flo`` tag is not ``PIEH``.
    """
    if name.endswith((".pfm", ".PFM")):
        return readPFM(name)[0][:, :, 0:2]

    # The context manager closes the handle even when the header check fails.
    with open(name, "rb") as f:
        header = f.read(4)
        if header.decode("utf-8") != "PIEH":
            raise Exception("Flow file header does not contain PIEH")

        width = int(np.fromfile(f, np.int32, 1).squeeze())
        height = int(np.fromfile(f, np.int32, 1).squeeze())

        flow = np.fromfile(f, np.float32, width * height * 2).reshape((height, width, 2))

    return flow.astype(np.float32)


def readImage(name):
    """Read an image from ``name``.

    ``.pfm``/``.PFM`` files go through ``readPFM`` (3-D data is cut to its first three
    channels); everything else is handed to ``imread``.
    """
    if not (name.endswith(".pfm") or name.endswith(".PFM")):
        return imread(name)

    data = readPFM(name)[0]
    return data[:, :, 0:3] if len(data.shape) == 3 else data


def writeImage(name, data):
    """Write ``data`` to ``name``: ``writePFM`` (scale 1) for ``.pfm``/``.PFM``, else ``imwrite``."""
    is_pfm = name.endswith(".pfm") or name.endswith(".PFM")
    if is_pfm:
        return writePFM(name, data, 1)
    return imwrite(name, data)


def writeFlow(name, flow):
    """Write ``flow`` to ``name`` in ``.flo`` format.

    Layout: the 4-byte tag ``PIEH``, int32 width (``flow.shape[1]``), int32 height
    (``flow.shape[0]``), then the flow values as float32.
    """
    # The context manager flushes and closes the handle instead of leaving that to garbage collection.
    with open(name, "wb") as f:
        f.write("PIEH".encode("utf-8"))
        np.array([flow.shape[1], flow.shape[0]], dtype=np.int32).tofile(f)
        flow.astype(np.float32).tofile(f)


def readFloat(name):
    """Read an array from a ``float`` file.

    The header is the line ``float``, the number of dimensions, then one size per line. Sizes
    are listed fastest-varying first, so they are reversed before reshaping the float32
    payload. Arrays with more than two dimensions are additionally transposed with
    ``(2, 1, 0)`` followed by ``(1, 0, 2)``.

    Raises:
        Exception: if the first line is not ``float``.
    """
    # The context manager closes the handle even when the header check fails.
    with open(name, "rb") as f:
        if (f.readline().decode("utf-8")) != "float\n":
            raise Exception("float file %s did not contain <float> keyword" % name)

        dim = int(f.readline())

        dims = []
        count = 1
        for _ in range(dim):
            size = int(f.readline())
            dims.append(size)
            count *= size

        dims.reverse()
        data = np.fromfile(f, np.float32, count).reshape(dims)

    if dim > 2:
        data = np.transpose(data, (2, 1, 0))
        data = np.transpose(data, (1, 0, 2))

    return data


def writeFloat(name, data):
    """Write ``data`` (1 to 3 dimensions) to ``name`` as a ``float`` file.

    The header is the line ``float``, the number of dimensions, then one size per line
    (``shape[1]`` before ``shape[0]`` for 2-D and 3-D data). The payload is float32; 1-D and
    2-D data are written as-is, 3-D data is transposed with ``(2, 0, 1)`` first.

    Raises:
        Exception: if ``data`` has more than three dimensions.
    """
    dim = len(data.shape)
    # Validate before opening so that invalid input does not create or truncate the file.
    if dim > 3:
        raise Exception("bad float file dimension: %d" % dim)

    with open(name, "wb") as f:
        f.write(("float\n").encode("ascii"))
        f.write(("%d\n" % dim).encode("ascii"))

        if dim == 1:
            f.write(("%d\n" % data.shape[0]).encode("ascii"))
        else:
            f.write(("%d\n" % data.shape[1]).encode("ascii"))
            f.write(("%d\n" % data.shape[0]).encode("ascii"))
            for axis in range(2, dim):
                f.write(("%d\n" % data.shape[axis]).encode("ascii"))

        data = data.astype(np.float32)
        if dim <= 2:
            # 1-D data has no third axis to transpose, so it is written directly like 2-D data.
            data.tofile(f)
        else:
            np.transpose(data, (2, 0, 1)).tofile(f)


def check_dim_and_resize(tensor_list):
    """Make all tensors share the trailing (dim 2 onward) size of the first one.

    When every tensor already has the same trailing size the input list is returned
    unchanged. Otherwise a message is printed and a new list is returned in which every
    tensor has been bilinearly resized to the first tensor's size.
    """
    trailing_shapes = [t.shape[2:] for t in tensor_list]
    if len(set(trailing_shapes)) <= 1:
        return tensor_list

    desired_shape = trailing_shapes[0]
    print(f"Inconsistent size of input video frames. All frames will be resized to {desired_shape}")

    target_size = tuple(desired_shape)
    return [torch.nn.functional.interpolate(t, size=target_size, mode="bilinear") for t in tensor_list]


================================================
FILE: Open-Sora/build/lib/tools/scene_cut/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/tools/scene_cut/convert_id_to_path.py
================================================
import argparse
import json
import os
from functools import partial

import cv2
import numpy as np
import pandas as pd
from mmengine.logging import print_log
from moviepy.editor import VideoFileClip
from pandarallel import pandarallel
from tqdm import tqdm

tqdm.pandas()


def is_intact_video(video_path, mode="moviepy", verbose=False, logger=None):
    """Check whether the video at ``video_path`` exists and can be opened.

    Args:
        video_path (str): Path of the video file.
        mode (str): ``"moviepy"`` opens the file with ``VideoFileClip``; ``"cv2"`` opens it with
            ``cv2.VideoCapture`` and checks ``isOpened()``.
        verbose (bool): Log the outcome through ``print_log``.
        logger: Forwarded to ``print_log``.

    Returns:
        bool: ``True`` if the file could be opened, ``False`` otherwise (including a missing file).

    Raises:
        ValueError: if the file exists and ``mode`` is neither ``"moviepy"`` nor ``"cv2"``.
    """
    if not os.path.exists(video_path):
        if verbose:
            print_log(f"Could not find '{video_path}'", logger=logger)
        return False

    if mode == "moviepy":
        try:
            clip = VideoFileClip(video_path)
            # Release the reader right away; only the ability to open the file matters here.
            clip.close()
        except Exception as e:
            if verbose:
                print_log(f"Error: {e}", logger=logger)
                print_log(f"The video file '{video_path}' is not intact.", logger=logger)
            return False
        if verbose:
            print_log(f"The video file '{video_path}' is intact.", logger=logger)
        return True
    elif mode == "cv2":
        try:
            cap = cv2.VideoCapture(video_path)
            opened = cap.isOpened()
            cap.release()
        except Exception as e:
            if verbose:
                print_log(f"Error: {e}", logger=logger)
                print_log(f"The video file '{video_path}' is not intact.", logger=logger)
            return False
        if opened:
            if verbose:
                print_log(f"The video file '{video_path}' is intact.", logger=logger)
            return True
        # A capture that fails to open used to fall through and return None implicitly.
        if verbose:
            print_log(f"The video file '{video_path}' is not intact.", logger=logger)
        return False
    else:
        raise ValueError(f"Unsupported mode: {mode!r}")


def has_downloaded_success(json_path):
    """Return ``True`` only if ``json_path`` exists and its JSON content has ``"success": true``.

    A missing file, unreadable or invalid JSON, a missing key, or a ``success`` value that is
    not the boolean ``True`` all yield ``False``.
    """
    if not os.path.exists(json_path):
        return False

    try:
        with open(json_path, "r") as f:
            data = json.load(f)
        succeeded = "success" in data and isinstance(data["success"], bool) and data["success"]
    except Exception:
        return False

    return True if succeeded else False


def parse_args():
    """Parse the command line: ``meta_path`` plus ``--folder_path`` (required), ``--mode``, ``--num_workers``."""
    parser = argparse.ArgumentParser()
    argument_specs = (
        (("meta_path",), {"type": str}),
        (("--folder_path",), {"type": str, "required": True}),
        (("--mode",), {"type": str, "default": None}),
        (("--num_workers",), {"type": int, "default": None, "help": "#workers for pandarallel"}),
    )
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)

    return parser.parse_args()


def main():
    """Attach a video path and an ``intact`` flag to every row of the meta CSV.

    Each row's ``id`` is turned into ``<folder_path>/<id>.mp4``. Depending on ``--mode`` the row
    is marked intact if the video can be opened (``.mp4``), if ``<id>.json`` reports a successful
    download (``.json``), or unconditionally (no mode). Two files are written next to the
    input: ``*_path_intact.csv`` with all rows, and ``*_path-filtered.csv`` with only the intact
    rows and without the ``intact`` column.
    """
    args = parse_args()

    meta_path = args.meta_path
    folder_path = args.folder_path
    mode = args.mode

    def is_intact(row, mode=None):
        # Returns (intact, video_path) for one row of the meta table.
        video_id = row["id"]
        video_path = os.path.join(folder_path, f"{video_id}.mp4")

        if mode == ".mp4":
            return bool(is_intact_video(video_path)), video_path
        elif mode == ".json":
            json_path = os.path.join(folder_path, f"{video_id}.json")
            return bool(has_downloaded_success(json_path)), video_path
        elif mode is None:
            return True, video_path
        else:
            raise ValueError

    meta_dirpath = os.path.dirname(meta_path)
    wo_ext, _ = os.path.splitext(os.path.basename(meta_path))

    if args.num_workers is not None:
        pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers)
    else:
        pandarallel.initialize(progress_bar=True)
    is_intact_partial = partial(is_intact, mode=mode)

    meta = pd.read_csv(meta_path)
    ret = meta.parallel_apply(is_intact_partial, axis=1)
    intact, paths = list(zip(*ret))

    meta["intact"] = intact
    meta["path"] = paths
    out_path = os.path.join(meta_dirpath, f"{wo_ext}_path_intact.csv")
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) with intact info saved to '{out_path}'")

    # Drop the column on a new frame: an in-place drop on the filtered slice triggers
    # pandas' SettingWithCopyWarning.
    meta_format = meta[np.array(intact)].drop("intact", axis=1)
    out_path = os.path.join(meta_dirpath, f"{wo_ext}_path-filtered.csv")
    meta_format.to_csv(out_path, index=False)
    print(f"New meta (shape={meta_format.shape}) with format info saved to '{out_path}'")

if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/build/lib/tools/scene_cut/cut.py
================================================
import cv2  # isort:skip

import argparse
import os
import subprocess
from functools import partial

import pandas as pd
from imageio_ffmpeg import get_ffmpeg_exe
from pandarallel import pandarallel
from scenedetect import FrameTimecode
from tqdm import tqdm

tqdm.pandas()


def print_log(s, logger=None):
    """Send ``s`` to ``logger.info`` when a logger is given, otherwise ``print`` it."""
    emit = print if logger is None else logger.info
    emit(s)


def process_single_row(row, args):
    """Cut the video referenced by one meta row into clips.

    The optional ``timestamp`` column must hold a bracketed list of ``(start, end)`` pairs; each
    pair becomes a ``FrameTimecode`` scene. Rows without that column are processed as a single
    whole-video scene.

    Args:
        row: A meta row providing ``path`` and optionally ``timestamp``, ``relpath``,
            ``height`` and ``width``.
        args: Parsed command-line options (see ``parse_args``).

    Returns:
        bool: ``False`` if the timestamp is missing its brackets or cannot be parsed. ``True``
        otherwise; with ``args.drop_invalid_timestamps`` set, the function only validates the
        timestamp and returns without cutting anything.
    """
    video_path = row["path"]

    logger = None

    try:
        if "timestamp" in row:
            timestamp = row["timestamp"]
            if not (timestamp.startswith("[") and timestamp.endswith("]")):
                return False
            # SECURITY NOTE: eval() executes arbitrary expressions taken from the CSV. Only run
            # this on trusted meta files; consider ast.literal_eval if the format allows it.
            scene_list = eval(timestamp)
            scene_list = [(FrameTimecode(s, fps=100), FrameTimecode(t, fps=100)) for s, t in scene_list]
        else:
            scene_list = [None]
    except Exception:
        # Without a scene list there is nothing to cut. This used to fall through when
        # drop_invalid_timestamps was off and then fail on the unbound ``scene_list``.
        return False

    if args.drop_invalid_timestamps:
        return True

    if "relpath" in row:
        save_dir = os.path.dirname(os.path.join(args.save_dir, row["relpath"]))
        os.makedirs(save_dir, exist_ok=True)
    else:
        save_dir = args.save_dir

    # Never upscale: skip resizing when the video's shorter side is already small enough.
    shorter_size = args.shorter_size
    if (shorter_size is not None) and ("height" in row) and ("width" in row):
        min_size = min(row["height"], row["width"])
        if min_size <= shorter_size:
            shorter_size = None

    split_video(
        video_path,
        scene_list,
        save_dir=save_dir,
        min_seconds=args.min_seconds,
        max_seconds=args.max_seconds,
        target_fps=args.target_fps,
        shorter_size=shorter_size,
        logger=logger,
    )
    return True

def split_video(
    video_path,
    scene_list,
    save_dir,
    min_seconds=2,
    max_seconds=15,
    target_fps=30,
    shorter_size=None,
    verbose=False,
    logger=None,
):
    """
    scenes shorter than min_seconds will be ignored;
    scenes longer than max_seconds will be cut to save the beginning max_seconds.
    Currently, the saved file name pattern is f'{fname}_scene-{idx}'.mp4
    A ``None`` entry in scene_list means "the whole video" (no -ss / -t is passed to ffmpeg).
    Clips whose output file already exists are skipped.

    Args:
        video_path (str): source video.
        scene_list (List[Tuple[FrameTimecode, FrameTimecode] | None]): each element is (s, t): start and end of a scene.
        save_dir (str): directory receiving the clips.
        min_seconds (float | None)
        max_seconds (float | None)
        target_fps (int | None)
        shorter_size (int | None)
        verbose (bool): log every saved clip.
        logger: forwarded to print_log.

    Returns:
        List[str]: paths of the clips written by this call.
    """
    FFMPEG_PATH = get_ffmpeg_exe()

    # The output name stem only depends on the source video, so compute it once.
    fname_wo_ext = os.path.splitext(os.path.basename(video_path))[0]

    save_path_list = []
    for idx, scene in enumerate(scene_list):
        if scene is not None:
            s, t = scene  # FrameTimecode
            if min_seconds is not None and (t - s).get_seconds() < min_seconds:
                continue

            duration = t - s
            if max_seconds is not None:
                max_duration = FrameTimecode(max_seconds, fps=s.framerate)
                duration = min(max_duration, duration)

        # TODO: fname pattern
        save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4")
        if os.path.exists(save_path):
            continue

        cmd = [FFMPEG_PATH, "-nostdin", "-y"]

        # clip to cut
        # Note: -ss after -i is very slow; put -ss before -i !!!
        if scene is None:
            cmd += ["-i", video_path]
        else:
            cmd += ["-ss", str(s.get_seconds()), "-i", video_path, "-t", str(duration.get_seconds())]

        # target fps
        if target_fps is not None:
            cmd += ["-r", f"{target_fps}"]

        # Resize so that the shorter side equals shorter_size; -2 keeps the aspect ratio
        # while forcing an even size for the other side.
        if shorter_size is not None:
            cmd += ["-vf", f"scale='if(gt(iw,ih),-2,{shorter_size})':'if(gt(iw,ih),{shorter_size},-2)'"]

        cmd += ["-map", "0:v", save_path]
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if proc.returncode != 0:
            # Report the failure instead of silently recording a clip that was not produced.
            print_log(f"ffmpeg failed (exit code {proc.returncode}) for '{save_path}'", logger=logger)
            continue

        # Record the clip that was written, not the source video.
        save_path_list.append(save_path)
        if verbose:
            print_log(f"Video clip saved to '{save_path}'", logger=logger)

    return save_path_list


def parse_args():
    """Parse the command line of the scene-cutting script and return the resulting namespace."""
    parser = argparse.ArgumentParser()
    argument_specs = (
        (("meta_path",), {"type": str}),
        (("--save_dir",), {"type": str}),
        (
            ("--min_seconds",),
            {"type": float, "default": None, "help": "if not None, clip shorter than min_seconds is ignored"},
        ),
        (
            ("--max_seconds",),
            {"type": float, "default": None, "help": "if not None, clip longer than max_seconds is truncated"},
        ),
        (("--target_fps",), {"type": int, "default": None, "help": "target fps of clips"}),
        (
            ("--shorter_size",),
            {"type": int, "default": None, "help": "resize the shorter size by keeping ratio; will not do upscale"},
        ),
        (("--num_workers",), {"type": int, "default": None, "help": "#workers for pandarallel"}),
        (("--disable_parallel",), {"action": "store_true", "help": "disable parallel processing"}),
        (("--drop_invalid_timestamps",), {"action": "store_true", "help": "drop rows with invalid timestamps"}),
    )
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)

    return parser.parse_args()


def main():
    """Cut every row of the meta CSV into clips, optionally writing a cleaned CSV.

    Reads the CSV at ``args.meta_path`` and applies ``process_single_row`` to
    each row (through pandarallel unless ``--disable_parallel`` is given).
    With ``--drop_invalid_timestamps`` the rows for which the per-row result is
    falsy are dropped and the remainder is saved next to the input as
    ``*correct_timestamp.csv``.

    Raises:
        ValueError: if ``--drop_invalid_timestamps`` is set and the meta path
            does not end with ``timestamp.csv``.
    """
    timestamp_suffix = "timestamp.csv"

    args = parse_args()
    meta_path = args.meta_path
    if not os.path.exists(meta_path):
        print(f"Meta file '{meta_path}' not found. Exit.")
        exit()

    # Validate the naming convention *before* doing any (expensive) cutting.
    # This used to be an assert placed after processing: it wasted all the
    # work on failure and, when asserts are stripped (-O), let the input CSV
    # be overwritten because the name substitution became a no-op.
    if args.drop_invalid_timestamps and not meta_path.endswith(timestamp_suffix):
        raise ValueError("Only support *timestamp.csv")

    # create save_dir
    os.makedirs(args.save_dir, exist_ok=True)

    # initialize pandarallel
    if not args.disable_parallel:
        if args.num_workers is not None:
            pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers)
        else:
            pandarallel.initialize(progress_bar=True)
    process_single_row_partial = partial(process_single_row, args=args)

    # process
    meta = pd.read_csv(meta_path)
    if not args.disable_parallel:
        results = meta.parallel_apply(process_single_row_partial, axis=1)
    else:
        results = meta.apply(process_single_row_partial, axis=1)

    if args.drop_invalid_timestamps:
        meta = meta[results]
        # Only rewrite the trailing suffix; a plain str.replace would also
        # touch an earlier "timestamp.csv" occurring in a directory name.
        corrected_path = meta_path[: -len(timestamp_suffix)] + "correct_timestamp.csv"
        meta.to_csv(corrected_path, index=False)
        print(f"Corrected timestamp file saved to '{corrected_path}'")
if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/build/lib/tools/scene_cut/scene_detect.py
================================================
import argparse
import os

import numpy as np
import pandas as pd
from pandarallel import pandarallel
from scenedetect import AdaptiveDetector, detect
from tqdm import tqdm

tqdm.pandas()


def process_single_row(row):
    """Run scene detection on the video referenced by ``row["path"]``.

    Returns:
        ``(True, str(timestamps))`` where ``timestamps`` is a list of
        ``(start_timecode, end_timecode)`` pairs, or ``(False, "")`` when
        detection raised; the error is printed, not propagated.
    """
    video_path = row["path"]
    scene_detector = AdaptiveDetector(adaptive_threshold=3.0)

    try:
        scenes = detect(video_path, scene_detector, start_in_scene=True)
        timecodes = []
        for begin, end in scenes:
            timecodes.append((begin.get_timecode(), end.get_timecode()))
        outcome = (True, str(timecodes))
    except Exception as e:
        # Best effort: a broken video must not abort the whole batch.
        print(f"Video '{video_path}' with error {e}")
        outcome = (False, "")
    return outcome


def parse_args():
    """Parse the scene-detection command line (meta CSV path and worker count)."""
    cli = argparse.ArgumentParser()
    cli.add_argument("meta_path", type=str)
    cli.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel")
    return cli.parse_args()


def main():
    """Detect scenes for every video in the meta CSV and save ``*_timestamp`` CSV.

    Rows whose detection failed are dropped; the remaining rows gain a
    ``timestamp`` column holding the stringified scene list.
    """
    args = parse_args()
    meta_path = args.meta_path
    if not os.path.exists(meta_path):
        print(f"Meta file '{meta_path}' not found. Exit.")
        exit()

    # Only forward nb_workers when the user asked for a specific count.
    init_options = {"progress_bar": True}
    if args.num_workers is not None:
        init_options["nb_workers"] = args.num_workers
    pandarallel.initialize(**init_options)

    meta = pd.read_csv(meta_path)
    ret = meta.parallel_apply(process_single_row, axis=1)

    # Each per-row result is a (success, timestamp_string) pair.
    succ, timestamps = zip(*ret)
    meta["timestamp"] = timestamps
    keep_mask = np.array(succ)
    meta = meta[keep_mask]

    stem, suffix = os.path.splitext(meta_path)
    out_path = stem + "_timestamp" + suffix
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) with timestamp saved to '{out_path}'.")


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/build/lib/tools/scoring/aesthetic/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/tools/scoring/aesthetic/inference.py
================================================
# adapted from https://github.com/christophschuhmann/improved-aesthetic-predictor/blob/main/simple_inference.py
import cv2  # isort:skip

import argparse
import gc
import os
from datetime import timedelta

import clip
import numpy as np
import pandas as pd
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch.utils.data import DataLoader, DistributedSampler
from torchvision.datasets.folder import pil_loader
from tqdm import tqdm

from tools.datasets.utils import extract_frames, is_video

NUM_FRAMES_POINTS = {
    1: (0.5,),
    2: (0.25, 0.5),
    3: (0.1, 0.5, 0.9),
}


def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
    """Write gathered per-rank scores into ``meta[column]`` and keep scored rows only.

    Args:
        gathered_list: one ``(indices, scores)`` pair per rank.
        meta: frame indexed by the sample indices; ``column`` is written in place.
        column: name of the score column.

    Returns:
        The rows of ``meta`` whose index received a score, ordered by index.
    """
    per_rank_indices = [entry[0] for entry in gathered_list]
    per_rank_scores = [entry[1] for entry in gathered_list]

    # Interleave the ranks element-wise (rank0[0], rank1[0], ..., rank0[1], ...).
    flat_indices = np.array([idx for group in zip(*per_rank_indices) for idx in group])
    flat_scores = np.array([score for group in zip(*per_rank_scores) for score in group])

    # An index may appear more than once; keep the first occurrence of each.
    unique_indices, first_positions = np.unique(flat_indices, return_index=True)
    meta.loc[unique_indices, column] = flat_scores[first_positions]

    # Rows that never received a score are dropped from the result.
    return meta.loc[unique_indices]


class VideoTextDataset(torch.utils.data.Dataset):
    """Dataset yielding transformed frames (or a single image) per meta CSV row."""

    def __init__(self, meta_path, transform=None, num_frames=3):
        self.meta_path = meta_path
        self.meta = pd.read_csv(meta_path)
        self.transform = transform
        # Relative positions at which frames are sampled.
        self.points = NUM_FRAMES_POINTS[num_frames]

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, index):
        """Return ``{"index": index, "images": stacked_frames}`` for row ``index``."""
        row = self.meta.iloc[index]
        path = row["path"]

        if is_video(path):
            # Forward the frame count from the CSV when the column is present.
            frame_count = row["num_frames"] if "num_frames" in row else None
            frames = extract_frames(row["path"], points=self.points, backend="opencv", num_frames=frame_count)
        else:
            frames = [pil_loader(path)]

        stacked = torch.stack([self.transform(frame) for frame in frames])
        return dict(index=index, images=stacked)


class MLP(nn.Module):
    """Stack of linear layers with dropout mapping ``input_size`` features to one value."""

    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size
        # Layer order (and therefore the Sequential indices) is part of the
        # checkpoint format: state-dict keys are "layers.<idx>.*".
        stack = [
            nn.Linear(self.input_size, 1024),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.Dropout(0.1),
            nn.Linear(64, 16),
            nn.Linear(16, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to ``x``."""
        return self.layers(x)


class AestheticScorer(nn.Module):
    """CLIP ViT-L/14 image encoder followed by an MLP head."""

    def __init__(self, input_size, device):
        super().__init__()
        self.mlp = MLP(input_size)
        clip_model, clip_preprocess = clip.load("ViT-L/14", device=device)
        self.clip = clip_model
        self.preprocess = clip_preprocess

        self.eval()
        self.to(device)

    def forward(self, x):
        """Encode ``x`` with CLIP, L2-normalise the features and score them with the MLP."""
        embedding = self.clip.encode_image(x)
        unit_embedding = F.normalize(embedding, p=2, dim=-1).float()
        return self.mlp(unit_embedding)


def parse_args():
    """Parse the aesthetic-scoring command line."""
    cli = argparse.ArgumentParser()
    cli.add_argument("meta_path", type=str, help="Path to the input CSV file")

    # Integer options: (flag, default, help text).
    integer_options = (
        ("--bs", 1024, "Batch size"),
        ("--num_workers", 16, "Number of workers"),
        ("--prefetch_factor", 3, "Prefetch factor"),
        ("--num_frames", 3, "Number of frames to extract"),
    )
    for flag, default_value, description in integer_options:
        cli.add_argument(flag, type=int, default=default_value, help=description)

    cli.add_argument("--skip_if_existing", action="store_true")
    return cli.parse_args()


def main():
    """Score every row of the meta CSV with the aesthetic model across ranks.

    Each rank scores its shard of the dataset, writes a per-rank CSV under a
    ``parts`` directory next to the output, then all ranks exchange their
    results and rank 0 writes the merged ``*_aes`` CSV.
    """
    args = parse_args()

    meta_path = args.meta_path
    if not os.path.exists(meta_path):
        print(f"Meta file '{meta_path}' not found. Exit.")
        exit()

    # Output path is "<input stem>_aes<input extension>".
    wo_ext, ext = os.path.splitext(meta_path)
    out_path = f"{wo_ext}_aes{ext}"
    if args.skip_if_existing and os.path.exists(out_path):
        print(f"Output meta file '{out_path}' already exists. Exit.")
        exit()

    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
    # Map the global rank onto one of the locally visible CUDA devices.
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    # build model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AestheticScorer(768, device)
    model.mlp.load_state_dict(torch.load("pretrained_models/aesthetic.pth", map_location=device))
    preprocess = model.preprocess

    # build dataset
    # NOTE(review): args.prefetch_factor is parsed but never passed to the
    # DataLoader below, so the flag currently has no effect.
    dataset = VideoTextDataset(args.meta_path, transform=preprocess, num_frames=args.num_frames)
    dataloader = DataLoader(
        dataset,
        batch_size=args.bs,
        num_workers=args.num_workers,
        sampler=DistributedSampler(
            dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=False,
            drop_last=False,
        ),
    )

    # compute aesthetic scores
    indices_list = []
    scores_list = []
    model.eval()
    # Only rank 0 shows the progress bar.
    for batch in tqdm(dataloader, disable=dist.get_rank() != 0):
        indices = batch["index"]
        images = batch["images"].to(device, non_blocking=True)

        # Fold the per-sample frame axis into the batch axis for the model.
        B = images.shape[0]
        images = rearrange(images, "B N C H W -> (B N) C H W")

        # compute score
        with torch.no_grad():
            scores = model(images)

        # Restore the (sample, frame) layout and average over the frames.
        scores = rearrange(scores, "(B N) 1 -> B N", B=B)
        scores = scores.mean(dim=1)
        scores_np = scores.to(torch.float32).cpu().numpy()

        indices_list.extend(indices.tolist())
        scores_list.extend(scores_np.tolist())

    # save local results
    meta_local = merge_scores([(indices_list, scores_list)], dataset.meta, column="aes")
    save_dir_local = os.path.join(os.path.dirname(out_path), "parts")
    os.makedirs(save_dir_local, exist_ok=True)
    # NOTE(review): the per-rank suffix is inserted by replacing ".csv"; for an
    # input with another extension every rank would target the same file name.
    out_path_local = os.path.join(
        save_dir_local, os.path.basename(out_path).replace(".csv", f"_part_{dist.get_rank()}.csv")
    )
    meta_local.to_csv(out_path_local, index=False)

    # wait for all ranks to finish data processing
    dist.barrier()

    torch.cuda.empty_cache()
    gc.collect()
    # Exchange every rank's (indices, scores) so rank 0 can write the merged CSV.
    gathered_list = [None] * dist.get_world_size()
    dist.all_gather_object(gathered_list, (indices_list, scores_list))
    if dist.get_rank() == 0:
        meta_new = merge_scores(gathered_list, dataset.meta, column="aes")
        meta_new.to_csv(out_path, index=False)
        print(f"New meta with aesthetic scores saved to '{out_path}'.")


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/build/lib/tools/scoring/matching/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/tools/scoring/matching/inference.py
================================================
import argparse
import os

import clip
import colossalai
import numpy as np
import pandas as pd
import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch.utils.data import DataLoader, DistributedSampler
from torchvision.datasets.folder import pil_loader
from tqdm import tqdm

from tools.datasets.utils import extract_frames, is_video


def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
    """Write gathered per-rank scores into ``meta[column]`` in place.

    Args:
        gathered_list: one ``(indices, scores)`` pair per rank.
        meta: frame indexed by the sample indices.
        column: name of the score column.

    Returns:
        The same ``meta`` object, with ``column`` filled for the scored rows.
    """
    per_rank_indices = [entry[0] for entry in gathered_list]
    per_rank_scores = [entry[1] for entry in gathered_list]

    # Interleave the ranks element-wise (rank0[0], rank1[0], ..., rank0[1], ...).
    flat_indices = np.array([idx for group in zip(*per_rank_indices) for idx in group])
    flat_scores = np.array([score for group in zip(*per_rank_scores) for score in group])

    # An index may appear more than once; keep the first occurrence of each.
    unique_indices, first_positions = np.unique(flat_indices, return_index=True)
    meta.loc[unique_indices, column] = flat_scores[first_positions]
    return meta


class VideoTextDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(image, tokenized_text, index)`` per meta CSV row."""

    def __init__(self, meta_path, transform):
        self.meta_path = meta_path
        self.meta = pd.read_csv(meta_path)
        self.transform = transform

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, index):
        """Load the image (or the frame at relative position 0.5 of a video) and its text."""
        row = self.meta.iloc[index]
        path = row["path"]

        if not is_video(path):
            picture = pil_loader(path)
        else:
            picture = extract_frames(path, points=[0.5], backend="opencv")[0]
        picture = self.transform(picture)

        # Tokenize with truncation and drop the size-1 dimensions of the result.
        tokens = clip.tokenize(row["text"], truncate=True).squeeze()

        return picture, tokens, index


def parse_args():
    """Parse the text-image matching command line."""
    cli = argparse.ArgumentParser()
    cli.add_argument("meta_path", type=str, help="Path to the input CSV file")
    for flag, default_value, description in (
        ("--bs", 16, "Batch size"),
        ("--num_workers", 16, "Number of workers"),
    ):
        cli.add_argument(flag, type=int, default=default_value, help=description)
    cli.add_argument("--skip_if_existing", action="store_true")
    return cli.parse_args()


def main():
    """Compute a CLIP image-text matching score for every row of the meta CSV.

    Every rank scores its shard; the per-rank results are exchanged and rank 0
    writes the merged ``*_match`` CSV.
    """
    args = parse_args()

    meta_path = args.meta_path
    if not os.path.exists(meta_path):
        print(f"Meta file '{meta_path}' not found. Exit.")
        exit()

    stem, suffix = os.path.splitext(meta_path)
    out_path = stem + "_match" + suffix
    if args.skip_if_existing and os.path.exists(out_path):
        print(f"Output meta file '{out_path}' already exists. Exit.")
        exit()

    colossalai.launch_from_torch({})

    # Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, preprocess = clip.load("ViT-L/14", device=device)
    logit_scale = model.logit_scale.exp().item()

    # Data: each rank reads a fixed, unshuffled shard.
    dataset = VideoTextDataset(meta_path=meta_path, transform=preprocess)
    shard_sampler = DistributedSampler(
        dataset,
        num_replicas=dist.get_world_size(),
        rank=dist.get_rank(),
        shuffle=False,
        drop_last=False,
    )
    dataloader = DataLoader(
        dataset,
        batch_size=args.bs,
        num_workers=args.num_workers,
        sampler=shard_sampler,
    )

    # Scoring loop; only rank 0 shows the progress bar.
    indices_list, scores_list = [], []
    model.eval()
    for imgs, text, indices in tqdm(dataloader, disable=dist.get_rank() != 0):
        imgs = imgs.to(device)
        text = text.to(device)

        with torch.no_grad():
            feat_img = model.encode_image(imgs)
            feat_text = model.encode_text(text)

        # Scaled cosine similarity between each image and its own caption.
        feat_img = F.normalize(feat_img, dim=1)
        feat_text = F.normalize(feat_text, dim=1)
        pairwise = (feat_img * feat_text).sum(dim=1)
        clip_scores = (logit_scale * pairwise).cpu().tolist()

        indices_list.extend(indices)
        scores_list.extend(clip_scores)

    # Exchange results between ranks; rank 0 merges and writes them.
    gathered_list = [None] * dist.get_world_size()
    dist.all_gather_object(gathered_list, (indices_list, scores_list))
    if dist.get_rank() == 0:
        meta_new = merge_scores(gathered_list, dataset.meta, column="match")
        meta_new.to_csv(out_path, index=False)
        print(f"New meta with matching scores saved to '{out_path}'.")


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/build/lib/vbench/__init__.py
================================================
import os

from .utils import get_prompt_from_filename, init_submodules, save_json, load_json
import importlib
from itertools import chain
from pathlib import Path

class VBench(object):
    """Builds the per-run evaluation metadata and dispatches dimension evaluators."""

    def __init__(self, device, full_info_dir, output_path):
        self.device = device                        # cuda or cpu
        self.full_info_dir = full_info_dir          # full json file that VBench originally provides
        self.output_path = output_path              # output directory to save VBench results
        # exist_ok=True replaces the former exists()-then-makedirs sequence,
        # which could fail when two processes created the directory at once.
        os.makedirs(self.output_path, exist_ok=True)

    def build_full_dimension_list(self, ):
        """Return the names of all evaluation dimensions."""
        return ["subject_consistency", "background_consistency", "aesthetic_quality", "imaging_quality", "object_class", "multiple_objects", "color", "spatial_relationship", "scene", "temporal_style", 'overall_consistency', "human_action", "temporal_flickering", "motion_smoothness", "dynamic_degree", "appearance_style"]

    def check_dimension_requires_extra_info(self, dimension_list):
        """Assert that none of the requested dimensions is unsupported for custom input."""
        dim_custom_not_supported = set(dimension_list) & set([
            'background_consistency', 'object_class', 'multiple_objects', 'scene', 'appearance_style', 'color', 'spatial_relationship'
        ])
        assert len(dim_custom_not_supported) == 0, f"dimensions : {dim_custom_not_supported} not supported for custom input"

    def build_full_info_json(self, videos_path, name, dimension_list, prompt_list=[], special_str='', verbose=False, mode='vbench_standard', **kwargs):
        """Write ``<name>_full_info.json`` describing which videos belong to which prompt.

        Args:
            videos_path: a video file or a directory of videos.
            name: prefix of the JSON file written into ``self.output_path``.
            dimension_list: dimensions to evaluate.
            prompt_list: optional prompts; in directory mode it is indexed by
                file name (``prompt_list[path]``).
            special_str: infix inserted between prompt and index in the
                expected file names of the standard mode.
            verbose: print every video found in the standard mode.
            mode: ``'custom_input'``, ``'vbench_category'`` or anything else
                for the standard benchmark layout.
            **kwargs: ``category`` is read in ``'vbench_category'`` mode.

        Returns:
            Path of the JSON file that was written.
        """
        cur_full_info_list=[] # to save the prompt and video path info for the current dimensions
        if mode=='custom_input':
            self.check_dimension_requires_extra_info(dimension_list)
            if os.path.isfile(videos_path):
                cur_full_info_list = [{"prompt_en": get_prompt_from_filename(videos_path), "dimension": dimension_list, "video_list": [videos_path]}]
                if len(prompt_list) == 1:
                    cur_full_info_list[0]["prompt_en"] = prompt_list[0]
            else:
                video_names = os.listdir(videos_path)

                cur_full_info_list = []

                for filename in video_names:
                    postfix = Path(os.path.join(videos_path, filename)).suffix
                    if postfix.lower() not in ['.mp4', '.gif', '.jpg', '.png']:
                        continue
                    cur_full_info_list.append({
                        "prompt_en": get_prompt_from_filename(filename),
                        "dimension": dimension_list,
                        "video_list": [os.path.join(videos_path, filename)]
                    })

                if len(prompt_list) > 0:
                    # Re-key the prompts by the full path of each video.
                    prompt_list = {os.path.join(videos_path, path): prompt_list[path] for path in prompt_list}
                    # The message must be an f-string; it used to print the
                    # placeholders literally instead of the two counts.
                    assert len(prompt_list) >= len(cur_full_info_list), f"""
                        Number of prompts should match with number of videos.\n
                        Got len(prompt_list)={len(prompt_list)}, len(cur_full_info_list)={len(cur_full_info_list)}\n
                        To read the prompt from filename, delete --prompt_file and --prompt_list
                        """

                    all_video_path = [os.path.abspath(file) for file in list(chain.from_iterable(vid["video_list"] for vid in cur_full_info_list))]
                    backslash = "\n"
                    assert len(set(all_video_path) - set([os.path.abspath(path_key) for path_key in prompt_list])) == 0, f"""
                    The prompts for the following videos are not found in the prompt file: \n
                    {backslash.join(set(all_video_path) - set([os.path.abspath(path_key) for path_key in prompt_list]))}
                    """

                    video_map = {}
                    for prompt_key in prompt_list:
                        video_map[os.path.abspath(prompt_key)] = prompt_list[prompt_key]

                    for video_info in cur_full_info_list:
                        video_info["prompt_en"] = video_map[os.path.abspath(video_info["video_list"][0])]

        elif mode=='vbench_category':
            self.check_dimension_requires_extra_info(dimension_list)
            CUR_DIR = os.path.dirname(os.path.abspath(__file__))
            # NOTE(review): the supported categories are listed from a path
            # relative to the working directory, while the prompt file below is
            # opened relative to this module - confirm which one is intended.
            category_supported = [ Path(category).stem for category in os.listdir(f'prompts/prompts_per_category') ]# TODO: probably need refactoring again
            # NOTE(review): without a 'category' kwarg, `category` becomes the
            # whole list and the membership assertion below cannot pass.
            if 'category' not in kwargs:
                category = category_supported
            else:
                category = kwargs['category']

            assert category is not None, "Please specify the category to be evaluated with --category"
            assert category in category_supported, f'''
            The following category is not supported, {category}.
            '''

            video_names = os.listdir(videos_path)
            postfix = Path(video_names[0]).suffix

            with open(f'{CUR_DIR}/prompts_per_category/{category}.txt', 'r') as f:
                video_prompts = [line.strip() for line in f.readlines()]

            for prompt in video_prompts:
                video_list = []
                for filename in video_names:
                    if (not Path(filename).stem.startswith(prompt)):
                        continue
                    postfix = Path(os.path.join(videos_path, filename)).suffix
                    if postfix.lower() not in ['.mp4', '.gif', '.jpg', '.png']:
                        continue
                    video_list.append(os.path.join(videos_path, filename))

                cur_full_info_list.append({
                    "prompt_en": prompt,
                    "dimension": dimension_list,
                    "video_list": video_list
                })

        else:
            full_info_list = load_json(self.full_info_dir)
            video_names = os.listdir(videos_path)
            # The extension of the first listed file is assumed for all videos.
            postfix = Path(video_names[0]).suffix
            for prompt_dict in full_info_list:
                # if the prompt belongs to any dimension we want to evaluate
                if set(dimension_list) & set(prompt_dict["dimension"]):
                    prompt = prompt_dict['prompt_en']
                    prompt_dict['video_list'] = []
                    for i in range(5): # video index for the same prompt
                        intended_video_name = f'{prompt}{special_str}-{str(i)}{postfix}'
                        if intended_video_name in video_names: # if the video exists
                            intended_video_path = os.path.join(videos_path, intended_video_name)
                            prompt_dict['video_list'].append(intended_video_path)
                            if verbose:
                                print(f'Successfully found video: {intended_video_name}')
                        else:
                            print(f'WARNING!!! This required video is not found! Missing benchmark videos can lead to unfair evaluation result. The missing video is: {intended_video_name}')
                    cur_full_info_list.append(prompt_dict)

        cur_full_info_path = os.path.join(self.output_path, name+'_full_info.json')
        save_json(cur_full_info_list, cur_full_info_path)
        print(f'Evaluation meta data saved to {cur_full_info_path}')
        return cur_full_info_path

    def evaluate(self, videos_path, name, prompt_list=[], dimension_list=None, local=False, read_frame=False, mode='vbench_standard', **kwargs):
        """Run every requested dimension and save ``<name>_eval_results.json``.

        For each dimension the module ``vbench.<dimension>`` is imported and
        its ``compute_<dimension>`` function is called with the metadata JSON,
        the device and that dimension's submodule configuration.

        Raises:
            NotImplementedError: if the module or function for a dimension
                cannot be loaded (the original error is chained).
        """
        results_dict = {}
        if dimension_list is None:
            dimension_list = self.build_full_dimension_list()
        submodules_dict = init_submodules(dimension_list, local=local, read_frame=read_frame)

        cur_full_info_path = self.build_full_info_json(videos_path, name, dimension_list, prompt_list, mode=mode, **kwargs)

        for dimension in dimension_list:
            try:
                dimension_module = importlib.import_module(f'vbench.{dimension}')
                evaluate_func = getattr(dimension_module, f'compute_{dimension}')
            except Exception as e:
                # Chain the cause so the real import/attribute error stays visible.
                raise NotImplementedError(f'UnImplemented dimension {dimension}!, {e}') from e
            submodules_list = submodules_dict[dimension]
            print(f'cur_full_info_path: {cur_full_info_path}') # TODO: to delete
            results = evaluate_func(cur_full_info_path, self.device, submodules_list, **kwargs)
            results_dict[dimension] = results
        output_name = os.path.join(self.output_path, name+'_eval_results.json')
        save_json(results_dict, output_name)
        print(f'Evaluation results saved to {output_name}')


================================================
FILE: Open-Sora/build/lib/vbench/aesthetic_quality.py
================================================
import os
import clip
import torch
import torch.nn as nn
import torch.nn.functional as F
import subprocess
from urllib.request import urlretrieve
from vbench.utils import load_video, load_dimension_info, clip_transform
from tqdm import tqdm


def get_aesthetic_model(cache_folder):
    """Load the linear aesthetic head, downloading its weights when missing.

    Args:
        cache_folder: directory holding (or receiving)
            ``sa_0_4_vit_l_14_linear.pth``.

    Returns:
        An ``nn.Linear(768, 1)`` in eval mode with the weights loaded.

    Raises:
        FileNotFoundError: from ``torch.load`` if the weights could not be
            obtained.
    """
    path_to_model = cache_folder + "/sa_0_4_vit_l_14_linear.pth"
    if not os.path.exists(path_to_model):
        os.makedirs(cache_folder, exist_ok=True)
        url_model = (
            "https://github.com/LAION-AI/aesthetic-predictor/blob/main/sa_0_4_vit_l_14_linear.pth?raw=true"
        )
        # download aesthetic predictor
        try:
            print(f'trying urlretrieve to download {url_model} to {path_to_model}')
            urlretrieve(url_model, path_to_model)
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C is not swallowed.
            print(f'unable to download {url_model} to {path_to_model} using urlretrieve, trying wget')
            # -O pins the output file name. With -P wget derives the name from
            # the URL ("...pth?raw=true"), which never matches path_to_model.
            wget_command = ['wget', url_model, '-O', path_to_model]
            completed = subprocess.run(wget_command)
            if completed.returncode != 0 and os.path.exists(path_to_model):
                # Do not leave a truncated file that later runs would trust.
                os.remove(path_to_model)
    m = nn.Linear(768, 1)
    # Load onto the CPU regardless of where the checkpoint was saved; callers
    # move the returned module to their device themselves.
    s = torch.load(path_to_model, map_location="cpu")
    m.load_state_dict(s)
    m.eval()
    return m


def laion_aesthetic(aesthetic_model, clip_model, video_list, device):
    """Score every video with the aesthetic head on top of CLIP image features.

    For each video the per-frame scores are divided by 10 and averaged.

    Args:
        aesthetic_model: module mapping normalised CLIP features to a score.
        clip_model: model exposing ``encode_image``.
        video_list: paths of the videos to score.
        device: device the frames are moved to.

    Returns:
        ``(mean_over_videos, video_results)`` where ``video_results`` holds one
        ``{'video_path', 'video_results'}`` dict per video.

    Raises:
        ZeroDivisionError: if ``video_list`` is empty.
    """
    aesthetic_model.eval()
    clip_model.eval()
    aesthetic_avg = 0.0
    num = 0
    video_results = []
    # The transform does not depend on the video, so build it once.
    image_transform = clip_transform(224)
    for video_path in tqdm(video_list):
        images = load_video(video_path)
        images = image_transform(images)
        images = images.to(device)
        # Pure inference: without no_grad the autograd graph of the whole CLIP
        # forward pass would be kept alive for every video.
        with torch.no_grad():
            image_feats = clip_model.encode_image(images).to(torch.float32)
            image_feats = F.normalize(image_feats, dim=-1, p=2)
            aesthetic_scores = aesthetic_model(image_feats).squeeze()
            normalized_aesthetic_scores = aesthetic_scores/10
            cur_avg = torch.mean(normalized_aesthetic_scores, dim=0, keepdim=True)
        video_score = cur_avg.item()
        aesthetic_avg += video_score
        num += 1
        video_results.append({'video_path': video_path, 'video_results': video_score})
    aesthetic_avg /= num
    return aesthetic_avg, video_results


def compute_aesthetic_quality(json_dir, device, submodules_list, **kwargs):
    """Entry point of the ``aesthetic_quality`` dimension.

    ``submodules_list[0]`` is handed to ``clip.load`` and ``submodules_list[1]``
    to ``get_aesthetic_model``. Returns ``(overall_score, per_video_results)``.
    """
    vit_path, aes_path = submodules_list[0], submodules_list[1]

    aesthetic_model = get_aesthetic_model(aes_path).to(device)
    clip_model, _preprocess = clip.load(vit_path, device=device)

    video_list, _ = load_dimension_info(json_dir, dimension='aesthetic_quality', lang='en')
    overall, per_video = laion_aesthetic(aesthetic_model, clip_model, video_list, device)
    return overall, per_video


================================================
FILE: Open-Sora/build/lib/vbench/appearance_style.py
================================================
import os
import json
import numpy as np
from tqdm import tqdm

import torch
import clip
from PIL import Image
from vbench.utils import load_video, load_dimension_info, clip_transform, read_frames_decord_by_fps, clip_transform_Image

def get_text_features(model, input_text, tokenizer, text_feature_dict={}):
    """Return the L2-normalised text features of ``input_text``, with caching.

    ``text_feature_dict`` is the cache; its default is a single dict shared by
    all calls, so results persist across calls. ``tokenizer`` is accepted but
    not used.
    """
    if input_text not in text_feature_dict:
        prompt = f"{input_text}"
        with torch.no_grad():
            encoded = model.encode_text(prompt).float()
            encoded /= encoded.norm(dim=-1, keepdim=True)
            text_feature_dict[input_text] = encoded
    return text_feature_dict[input_text]

def get_vid_features(model, input_frames):
    """Return the L2-normalised ``model.encode_vision`` features of ``input_frames``."""
    with torch.no_grad():
        features = model.encode_vision(input_frames, test=True).float()
        features /= features.norm(dim=-1, keepdim=True)
    return features

def get_predict_label(clip_feature, text_feats_tensor, top=5):
    """Return ``(top_probs, top_labels)`` of the ``top`` most likely text labels.

    Similarities are scaled by 100 and turned into probabilities with a softmax
    over the labels; the top-k selection is done on CPU.
    """
    scaled_logits = 100.0 * clip_feature @ text_feats_tensor.T
    label_probs = scaled_logits.softmax(dim=-1)
    best = label_probs.cpu().topk(top, dim=-1)
    return best[0], best[1]

def appearance_style(clip_model, video_dict, device, sample="rand"):
    """Measure how well each frame matches the style prompt of its entry.

    Args:
        clip_model: CLIP model called as ``clip_model(image, text)``.
        video_dict: entries with ``'auxiliary_info'['appearance_style']`` and
            ``'video_list'``.
        device: device the frames and tokens are moved to.
        sample: accepted but not used.

    Returns:
        ``(mean_similarity_over_all_frames, video_results)``; each result holds
        the video path, its mean similarity and the per-frame similarities.

    Raises:
        ValueError: if an entry has no ``'auxiliary_info'``.
        ZeroDivisionError: if no frame was scored at all.
    """
    sim = 0.0
    cnt = 0
    video_results = []
    image_transform = clip_transform_Image(224)
    for info in tqdm(video_dict):
        if 'auxiliary_info' not in info:
            # Raising a bare string is itself a TypeError ("exceptions must
            # derive from BaseException"), which hid this message.
            raise ValueError("Auxiliary info is not in json, please check your json.")
        query = info['auxiliary_info']['appearance_style']
        text = clip.tokenize([query]).to(device)
        video_list = info['video_list']
        for video_path in video_list:
            cur_video = []
            with torch.no_grad():
                video_arrays = load_video(video_path, return_tensor=False)
                images = [Image.fromarray(i) for i in video_arrays]
                for image in images:
                    image = image_transform(image)
                    image = image.to(device)
                    logits_per_image, logits_per_text = clip_model(image.unsqueeze(0), text)
                    # Logits are divided by 100 before being accumulated.
                    cur_sim = float(logits_per_text[0][0].cpu())
                    cur_sim = cur_sim / 100
                    cur_video.append(cur_sim)
                    sim += cur_sim
                    cnt +=1
                video_sim = np.mean(cur_video)
                video_results.append({'video_path': video_path, 'video_results': video_sim, 'frame_results':cur_video})
    sim_per_frame = sim / cnt
    return sim_per_frame, video_results

def compute_appearance_style(json_dir, device, submodules_list, **kwargs):
    """Entry point of the ``appearance_style`` dimension.

    ``submodules_list`` is expanded as keyword arguments of ``clip.load``.
    Returns ``(overall_score, per_video_results)``.
    """
    clip_model, _preprocess = clip.load(device=device, **submodules_list)
    _, video_dict = load_dimension_info(json_dir, dimension='appearance_style', lang='en')
    overall, per_video = appearance_style(clip_model, video_dict, device)
    return overall, per_video


================================================
FILE: Open-Sora/build/lib/vbench/background_consistency.py
================================================
import os
import json
import logging
import numpy as np
import clip
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
from vbench.utils import load_video, load_dimension_info, clip_transform
from tqdm import tqdm


def background_consistency(clip_model, preprocess, video_list, device, read_frame):
    """Score how consistent the CLIP features of a video's frames are.

    For every frame after the first, the cosine similarities to the previous
    frame and to the first frame (both clamped at 0) are averaged.

    Args:
        clip_model: model exposing ``encode_image``.
        preprocess: transform applied to frames read from disk
            (``read_frame=True``).
        video_list: paths of the videos to score.
        device: device the frames are moved to.
        read_frame: read pre-extracted frames from a directory derived from the
            video path instead of decoding the video.

    Returns:
        ``(mean_over_all_frame_pairs, video_results)``.

    Raises:
        ZeroDivisionError: if a video has a single frame, or if no frame pair
            was scored at all.
    """
    sim = 0.0
    cnt = 0
    video_results = []
    image_transform = clip_transform(224)
    for video_path in tqdm(video_list):
        video_sim = 0.0
        if read_frame:
            # Frame directory: drop the 4-character extension, swap "videos"
            # for "frames" and replace spaces with underscores.
            video_path = video_path[:-4].replace('videos', 'frames').replace(' ', '_')
            tmp_paths = [os.path.join(video_path, f) for f in sorted(os.listdir(video_path))]
            images = []
            for tmp_path in tmp_paths:
                images.append(preprocess(Image.open(tmp_path)))
            images = torch.stack(images)
        else:
            images = load_video(video_path)
            images = image_transform(images)
        images = images.to(device)
        # Pure inference: avoid keeping the autograd graph of the encoder.
        with torch.no_grad():
            image_features = clip_model.encode_image(images)
            image_features = F.normalize(image_features, dim=-1, p=2)
        for i in range(len(image_features)):
            image_feature = image_features[i].unsqueeze(0)
            if i == 0:
                first_image_feature = image_feature
            else:
                sim_pre = max(0.0, F.cosine_similarity(former_image_feature, image_feature).item())
                sim_fir = max(0.0, F.cosine_similarity(first_image_feature, image_feature).item())
                cur_sim = (sim_pre + sim_fir) / 2
                video_sim += cur_sim
                cnt += 1
            former_image_feature = image_feature
        sim_per_image = video_sim / (len(image_features) - 1)
        sim += video_sim
        video_results.append({'video_path': video_path, 'video_results': sim_per_image})
    # The former unused `sim / (len(video_list) - 1)` was removed: it raised
    # ZeroDivisionError whenever exactly one video was evaluated.
    sim_per_frame = sim / cnt
    return sim_per_frame, video_results


def compute_background_consistency(json_dir, device, submodules_list, **kwargs):
    """Load CLIP and score the 'background_consistency' dimension.

    ``submodules_list[0]`` is handed to ``clip.load`` and ``submodules_list[1]``
    is forwarded as the ``read_frame`` flag. Returns the
    ``(all_results, video_results)`` pair produced by ``background_consistency``.
    """
    vit_path = submodules_list[0]
    read_frame = submodules_list[1]
    clip_model, preprocess = clip.load(vit_path, device=device)
    video_list, _unused = load_dimension_info(
        json_dir, dimension='background_consistency', lang='en'
    )
    overall, per_video = background_consistency(
        clip_model, preprocess, video_list, device, read_frame
    )
    return overall, per_video


================================================
FILE: Open-Sora/build/lib/vbench/cli/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/cli/evaluate.py
================================================
import torch
import os
from vbench import VBench
from datetime import datetime
import argparse
import json

CUR_DIR = os.path.dirname(os.path.abspath(__file__))


def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` turns every non-empty string (including "False") into
    ``True``; this converter interprets the usual spellings explicitly.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def register_subparsers(subparser):
    """Register the ``evaluate`` sub-command and its options on ``subparser``.

    Args:
        subparser: The object returned by ``ArgumentParser.add_subparsers()``.

    The created parser dispatches to ``evaluate`` through its ``func`` default.
    """
    parser = subparser.add_parser('evaluate', formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "--output_path",
        type=str,
        default='./evaluation_results/',
        help="output path to save the evaluation results",
    )
    parser.add_argument(
        "--full_json_dir",
        type=str,
        default=f'{CUR_DIR}/../VBench_full_info.json',
        help="path to save the json file that contains the prompt and dimension information",
    )
    parser.add_argument(
        "--videos_path",
        type=str,
        required=True,
        help="folder that contains the sampled videos",
    )
    parser.add_argument(
        "--dimension",
        nargs='+',
        required=True,
        help="list of evaluation dimensions, usage: --dimension <dim_1> <dim_2>",
    )
    parser.add_argument(
        "--load_ckpt_from_local",
        # Explicit converter so that "--load_ckpt_from_local False" is False.
        type=_str2bool,
        required=False,
        help="whether load checkpoints from local default paths (assuming you have downloaded the checkpoints locally",
    )
    parser.add_argument(
        "--read_frame",
        # Explicit converter so that "--read_frame False" is False.
        type=_str2bool,
        required=False,
        help="whether directly read frames, or directly read videos",
    )
    parser.add_argument(
        "--mode",
        choices=['custom_input', 'vbench_standard', 'vbench_category'],
        default='vbench_standard',
        help="""This flags determine the mode of evaluations, choose one of the following:
        1. "custom_input": receive input prompt from either --prompt/--prompt_file flags or the filename
        2. "vbench_standard": evaluate on standard prompt suite of VBench
        3. "vbench_category": evaluate on specific category
        """,
    )
    parser.add_argument(
        "--custom_input",
        action="store_true",
        required=False,
        help="(deprecated) use --mode=\"custom_input\" instead",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="",
        help="""Specify the input prompt
        If not specified, filenames will be used as input prompts
        * Mutually exclusive to --prompt_file.
        ** This option must be used with --custom_input flag
        """
    )
    parser.add_argument(
        "--prompt_file",
        type=str,
        required=False,
        help="""Specify the path of the file that contains prompt lists
        If not specified, filenames will be used as input prompts
        * Mutually exclusive to --prompt.
        ** This option must be used with --custom_input flag
        """
    )
    parser.add_argument(
        "--category",
        type=str,
        required=False,
        help="""This is for mode=='vbench_category'
        The category to evaluate on, usage: --category=animal.
        """,
    )

    ## for dimension specific params ###
    parser.add_argument(
        "--imaging_quality_preprocessing_mode",
        type=str,
        required=False,
        default='longer',
        help="""This is for setting preprocessing in imaging_quality
        1. 'shorter': if the shorter side is more than 512, the image is resized so that the shorter side is 512.
        2. 'longer': if the longer side is more than 512, the image is resized so that the longer side is 512.
        3. 'shorter_centercrop': if the shorter side is more than 512, the image is resized so that the shorter side is 512. 
        Then the center 512 x 512 after resized is used for evaluation.
        4. 'None': no preprocessing
        """,
    )
    parser.set_defaults(func=evaluate)

def evaluate(args):
    """Run a VBench evaluation from the parsed ``evaluate`` sub-command arguments.

    Builds a ``VBench`` instance on the CUDA device, validates the prompt
    options, and calls ``VBench.evaluate`` with a timestamped result name.

    Raises:
        AssertionError: If the deprecated ``--custom_input`` flag is set, or
            the prompt file does not contain a JSON object.
        Exception: If ``--prompt`` and ``--prompt_file`` are combined, or an
            external prompt is given without ``--mode=custom_input``.
    """
    print(f'args: {args}')

    my_VBench = VBench(torch.device("cuda"), args.full_json_dir, args.output_path)

    print('start evaluation')

    timestamp = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')

    assert args.custom_input == False, "(Deprecated) use --mode=custom_input instead"

    has_prompt_file = args.prompt_file is not None
    has_inline_prompt = args.prompt != ""
    if has_prompt_file and has_inline_prompt:
        raise Exception("--prompt_file and --prompt cannot be used together")
    if (has_prompt_file or has_inline_prompt) and args.mode != 'custom_input':
        raise Exception("must set --mode=custom_input for using external prompt")

    # An empty list tells VBench to derive the prompts from the file names.
    prompt = []
    if args.prompt_file:
        with open(args.prompt_file, 'r') as prompt_fp:
            prompt = json.load(prompt_fp)
        assert type(prompt) is dict, "Invalid prompt file format. The correct format is {\"video_path\": prompt, ... }"
    elif has_inline_prompt:
        prompt = [args.prompt]

    extra_kwargs = {}
    if args.category != "":
        extra_kwargs['category'] = args.category
    extra_kwargs['imaging_quality_preprocessing_mode'] = args.imaging_quality_preprocessing_mode

    my_VBench.evaluate(
        videos_path=args.videos_path,
        name=f'results_{timestamp}',
        prompt_list=prompt,
        dimension_list=args.dimension,
        local=args.load_ckpt_from_local,
        read_frame=args.read_frame,
        mode=args.mode,
        **extra_kwargs
    )
    print('done')


================================================
FILE: Open-Sora/build/lib/vbench/cli/static_filter.py
================================================
import os
import cv2
import glob
import numpy as np
import torch
from tqdm import tqdm
from pathlib import Path
import json
import shutil

import logging
# Configure logging at import time: INFO level, records formatted as
# "<time> - <level> - <message>".
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)

from vbench.utils import CACHE_DIR, get_prompt_from_filename, load_json
from vbench.third_party.RAFT.core.raft import RAFT
from vbench.third_party.RAFT.core.utils_core.utils import InputPadder


# Directory containing this module; static_filter() uses it to locate
# VBench_full_info.json one level up.
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
# Device string handed to StaticFilter by static_filter().
DEVICE = 'cuda'


class StaticFilter:
    """Decide whether a video is static from RAFT optical flow between frames.

    Args:
        args: Namespace-like object; ``args.model`` is the checkpoint path and
            the whole object is passed to the ``RAFT`` constructor.
        device: Device the model and the decoded frames are placed on.
    """

    def __init__(self, args, device):
        self.args = args
        self.device = device
        self.load_model()

    def load_model(self):
        """Build RAFT, load the checkpoint and switch the model to eval mode."""
        # The state dict is loaded through a DataParallel wrapper and the
        # wrapper is dropped right afterwards.
        self.model = torch.nn.DataParallel(RAFT(self.args))
        # Load onto CPU first so the checkpoint does not depend on the device
        # it was saved from; the model is moved to ``self.device`` below.
        self.model.load_state_dict(torch.load(self.args.model, map_location='cpu'))

        self.model = self.model.module
        self.model.to(self.device)
        self.model.eval()

    def get_score(self, img, flo):
        """Return the mean flow magnitude of the top 2% largest-motion pixels.

        Args:
            img: Unused; kept so the signature stays unchanged.
            flo: Flow tensor; ``flo[0]`` is read as (2, H, W).
        """
        flo = flo[0].permute(1, 2, 0).cpu().numpy()

        u = flo[:, :, 0]
        v = flo[:, :, 1]
        rad = np.sqrt(np.square(u) + np.square(v))

        h, w = rad.shape
        rad_flat = rad.flatten()
        cut_index = int(h * w * 0.02)

        # Sorting the negated values ascending orders magnitudes descending.
        max_rad = np.mean(abs(np.sort(-rad_flat))[:cut_index])

        return max_rad

    def check_static(self, score_list):
        """Return True when the scores stay within the thresholds in ``self.params``.

        All but the last two scores may exceed ``thres`` at most ``count_num``
        times; the last two scores are checked against ``thres*count_num*2``.
        """
        thres = self.params["thres"]
        count_num = self.params["count_num"]
        count = 0
        for score in score_list[:-2]:
            if score > thres:
                count += 1
            if count > count_num:
                return False
        for score in score_list[-2:]:
            if score > thres * count_num * 2:
                return False
        return True

    def set_params(self, frame, count):
        """Scale the thresholds by the frame's shorter side and the frame count."""
        scale = min(list(frame.shape)[-2:])
        self.params = {"thres": 3.0 * (scale / 256.0), "count_num": round(2 * (count / 16.0))}

    def infer(self, path):
        """Return True if the video at ``path`` is judged static."""
        with torch.no_grad():
            frames = self.get_frames(path)
            self.set_params(frame=frames[0], count=len(frames))
            static_score = []
            # Consecutive pairs first, then (first, last) and (last, first);
            # check_static() treats those final two scores separately.
            firsts = frames[:-1] + [frames[0], frames[-1]]
            seconds = frames[1:] + [frames[-1], frames[0]]
            for image1, image2 in zip(firsts, seconds):
                padder = InputPadder(image1.shape)
                image1, image2 = padder.pad(image1, image2)
                _, flow_up = self.model(image1, image2, iters=20, test_mode=True)
                max_rad = self.get_score(image1, flow_up)
                static_score.append(max_rad)
            whether_static = self.check_static(static_score)
            return whether_static

    def get_frames(self, video_path):
        """Decode every frame of ``video_path`` into a (1, 3, H, W) float RGB tensor.

        Raises:
            AssertionError: If no frame could be read.
        """
        frame_list = []
        video = cv2.VideoCapture(video_path)
        while video.isOpened():
            success, frame = video.read()
            if success:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # convert to rgb
                frame = torch.from_numpy(frame.astype(np.uint8)).permute(2, 0, 1).float()
                # Use the instance's device rather than the module-level
                # constant so frames always sit next to the model.
                frame = frame[None].to(self.device)
                frame_list.append(frame)
            else:
                break
        video.release()
        assert frame_list != []
        return frame_list

def check_and_move(args, filter_results, target_path=None):
    """Copy every video recorded as static into one output directory.

    Args:
        args: Parsed arguments; ``result_path`` and ``filter_scope`` are read.
        filter_results: Mapping of prompt to a dict with ``static_count`` and
            ``static_path`` entries.
        target_path: Destination directory; defaults to
            ``<args.result_path>/filtered_videos``.
    """
    if target_path is None:
        target_path = os.path.join(args.result_path, "filtered_videos")
    os.makedirs(target_path, exist_ok=True)

    for prompt, entry in filter_results.items():
        if entry["static_count"] < 5 and args.filter_scope == 'temporal_flickering':
            logger.warning(f"Prompt: '{prompt}' has fewer than 5 filter results.")
        for index, source in enumerate(entry["static_path"]):
            destination = os.path.join(target_path, f"{prompt}-{index}.mp4")
            shutil.copy(source, destination)

    logger.info(f"All filtered videos are saved in the '{target_path}' path")

def static_filter(args):
    """Filter the ``*.mp4`` files in ``args.videos_path`` and save the static ones.

    The prompts to consider come from ``args.filter_scope``:
    ``'temporal_flickering'`` (prompts of that VBench dimension, at most five
    static videos kept per prompt), ``'all'`` (every video found), or the path
    to a JSON file listing file names.

    The per-prompt results are written as JSON to
    ``<args.result_path>/<args.store_name>`` and the static videos are copied
    by ``check_and_move``.

    Raises:
        AssertionError: If ``filter_scope`` is neither a known keyword nor an
            existing ``.json`` file.
    """
    # Named differently from this function so the local does not shadow it.
    filter_model = StaticFilter(args, device=DEVICE)
    prompt_dict = {}
    paths = sorted(glob.glob(os.path.join(args.videos_path, "*.mp4")))

    if args.filter_scope=='temporal_flickering':
        full_prompt_list = load_json(f"{CUR_DIR}/../VBench_full_info.json")
        for prompt in full_prompt_list:
            if 'temporal_flickering' in prompt['dimension']:
                prompt_dict[prompt['prompt_en']] = {"static_count":0, "static_path":[]}

    elif args.filter_scope=='all':
        for prompt in paths:
            prompt = get_prompt_from_filename(prompt)
            prompt_dict[prompt] = {"static_count":0, "static_path":[]}

    else:
        assert os.path.isfile(args.filter_scope) and Path(args.filter_scope).suffix.lower() == '.json', f"""
        --filter_scope flag is not correctly set, set to 'all' to filter all videos in the --videos_path directory, 
        or provide the correct path to the JSON file
        """
        full_prompt_list = load_json(args.filter_scope)
        for prompt in full_prompt_list:
            prompt = get_prompt_from_filename(prompt)
            prompt_dict[prompt] = {"static_count":0, "static_path":[]}

    for path in tqdm(paths):
        name = get_prompt_from_filename(path)
        # The dict holds exactly the selected prompts, so its O(1) lookup
        # replaces the former scan over a parallel list.
        entry = prompt_dict.get(name)
        if entry is None:
            continue
        # The cap of five static videos only applies to temporal_flickering.
        if entry["static_count"] < 5 or args.filter_scope != 'temporal_flickering':
            if filter_model.infer(path):
                entry["static_count"] += 1
                entry["static_path"].append(path)

    os.makedirs(args.result_path, exist_ok=True)
    info_file = os.path.join(args.result_path, args.store_name)
    # Close the file deterministically so the JSON is flushed before use.
    with open(info_file, "w") as info_fp:
        json.dump(prompt_dict, info_fp)
    logger.info(f"Filtered results info is saved in the '{info_file}' file")
    check_and_move(args, prompt_dict)

def register_subparsers(subparser):
    """Register the ``static_filter`` sub-command and its options on ``subparser``.

    The options are declared in a table and added in order; the created parser
    dispatches to ``static_filter`` through its ``func`` default.
    """
    parser = subparser.add_parser('static_filter')
    option_table = [
        ('--model', dict(type=str, default=f"{CACHE_DIR}/raft_model/models/raft-things.pth", help="restore checkpoint")),
        ('--videos_path', dict(default="", required=True, help="video path for filtering")),
        ('--result_path', dict(type=str, default="./filter_results", help='result save path')),
        ('--store_name', dict(type=str, default="filtered_static_video.json", help='result file name')),
        ('--small', dict(action='store_true', help='use small model')),
        ('--mixed_precision', dict(action='store_true', help='use mixed precision')),
        ('--alternate_corr', dict(action='store_true', help='use efficent correlation implementation')),
        ('--filter_scope', dict(default='temporal_flickering', help='''For specifying the scope for filtering videos
        1. 'temporal_flickering' (default): filter videos based on matches with temporal_flickering dimension of VBench.
        2. 'all': filter all video in the current directory.
        3. '$filename': if a filepath to a JSON file is provided, only the filename exists in JSON file will be filtered.
                >       usage: --filter_scope example.json
    ''')),
    ]
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    parser.set_defaults(func=static_filter)


================================================
FILE: Open-Sora/build/lib/vbench/cli/vbench.py
================================================
import argparse
import importlib
import subprocess

# Sub-command names; main() imports vbench.cli.<name> for each entry and calls
# its register_subparsers().
vbench_cmd = ['evaluate', 'static_filter']

def main():
    """Entry point of the ``vbench`` command: build the parser and dispatch.

    Each module named in ``vbench_cmd`` registers its own sub-command; when no
    sub-command is selected the ``func`` default falls back to ``help``.
    """
    parser = argparse.ArgumentParser(
        prog="vbench",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    subparsers = parser.add_subparsers(title='vbench subcommands')

    for command_name in vbench_cmd:
        command_module = importlib.import_module(f'vbench.cli.{command_name}')
        command_module.register_subparsers(subparsers)

    parser.set_defaults(func=help)
    parsed = parser.parse_args()
    parsed.func(parsed)

def help(args):
    """Show the top-level usage by running ``vbench -h`` in a subprocess.

    ``args`` is accepted for the ``func(args)`` calling convention and is not
    used. Raises ``subprocess.CalledProcessError`` on a non-zero exit status.
    """
    command = ['vbench', '-h']
    subprocess.run(command, check=True)


================================================
FILE: Open-Sora/build/lib/vbench/color.py
================================================
import os
import json

import torch
import numpy as np
from tqdm import tqdm
from vbench.utils import load_video, load_dimension_info, read_frames_decord_by_fps
from vbench.third_party.grit_model import DenseCaptioning

import logging
# Configure logging at import time: INFO level, records formatted as
# "<time> - <logger name> - <level> - <message>".
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger used by compute_color().
logger = logging.getLogger(__name__)

def get_dect_from_grit(model, image_arrays):
    """Run the dense-captioning model on every frame and collect its detections.

    Args:
        model: Object exposing ``run_caption_tensor(frame)``.
        image_arrays: Frames as a list or ``np.ndarray``; any other type is
            converted with ``.numpy()`` first.

    Returns:
        One list per frame holding ``[caption, label]`` pairs taken from the
        first element of the model output, or ``[['', '']]`` when that element
        is empty.
    """
    if not (type(image_arrays) is list or type(image_arrays) is np.ndarray):
        image_arrays = image_arrays.numpy()

    pred = []
    with torch.no_grad():
        for frame in image_arrays:
            detections = model.run_caption_tensor(frame)[0]
            if len(detections) < 1:
                frame_pred = [['', '']]
            else:
                frame_pred = [[det[0], det[2][0]] for det in detections]
            pred.append(frame_pred)
    return pred

def check_generate(color_key, object_key, predictions):
    """Count the frames where the object is detected with a colour, and with the right one.

    Args:
        color_key: Colour word expected in the caption.
        object_key: Label a detection must equal to be considered.
        predictions: Per-frame lists of ``[caption, label]`` pairs.

    Returns:
        ``(cur_object, cur_object_color)``: frames where a matching detection's
        caption mentions any known colour, and frames where a matching
        detection's caption contains ``color_key``.
    """
    known_colors = ("white", "red", "pink", "blue", "silver", "purple",
                    "orange", "green", "gray", "yellow", "black", "grey")
    frames_with_object = 0
    frames_with_color = 0
    for frame_pred in predictions:
        # Captions of the detections whose label is the requested object.
        captions = [pred[0] for pred in frame_pred if object_key == pred[1]]
        if any(color_key in caption for caption in captions):
            frames_with_color += 1
        if any(name in caption for caption in captions for name in known_colors):
            frames_with_object += 1
    return frames_with_object, frames_with_color

def color(model, video_dict, device):
    """Compute the colour-accuracy score over all prompts and their videos.

    For each video, the score is the fraction of object-with-colour frames in
    which the caption contains the expected colour. Videos where the object is
    never detected with a colour are left out.

    Args:
        model: Dense-captioning model passed to ``get_dect_from_grit``.
        video_dict: Iterable of prompt infos with ``prompt``, ``video_list``
            and ``auxiliary_info['color']`` entries.
        device: Unused here; kept for a uniform signature.

    Returns:
        ``(success_rate, video_results)``; ``success_rate`` is ``0.0`` when no
        video contributed a score.

    Raises:
        ValueError: If a prompt info has no ``auxiliary_info`` entry.
    """
    success_frame_count_all, video_count = 0, 0
    video_results = []
    for info in tqdm(video_dict):
        if 'auxiliary_info' not in info:
            # Raising a plain string is itself a TypeError in Python 3, which
            # would hide this message; raise a real exception instead.
            raise ValueError("Auxiliary info is not in json, please check your json.")
        color_info = info['auxiliary_info']['color']
        object_info = info['prompt']
        # Strip the articles and the colour word to keep the object name.
        object_info = object_info.replace('a ','').replace('an ','').replace(color_info,'').strip()
        for video_path in info['video_list']:
            video_arrays = load_video(video_path, num_frames=16, return_tensor=False)
            cur_video_pred = get_dect_from_grit(model, video_arrays)
            cur_object, cur_object_color = check_generate(color_info, object_info, cur_video_pred)
            if cur_object > 0:
                cur_success_frame_rate = cur_object_color / cur_object
                success_frame_count_all += cur_success_frame_rate
                video_count += 1
                video_results.append({'video_path': video_path, 'video_results': cur_success_frame_rate})
    # No video with a detected object means nothing succeeded; avoid 0/0.
    success_rate = success_frame_count_all / video_count if video_count else 0.0
    return success_rate, video_results
        

def compute_color(json_dir, device, submodules_dict, **kwargs):
    """Initialise the dense-captioning model and score the 'color' dimension.

    ``submodules_dict`` is expanded into ``DenseCaptioning.initialize_model``.
    Returns the ``(all_results, video_results)`` pair produced by ``color``.
    """
    captioner = DenseCaptioning(device)
    captioner.initialize_model(**submodules_dict)
    logger.info("Initialize detection model success")
    _unused, prompt_dict_ls = load_dimension_info(json_dir, dimension='color', lang='en')
    overall, per_video = color(captioner, prompt_dict_ls, device)
    return overall, per_video


================================================
FILE: Open-Sora/build/lib/vbench/dynamic_degree.py
================================================
import argparse
import os
import cv2
import glob
import numpy as np
import torch
from tqdm import tqdm
from easydict import EasyDict as edict

from vbench.utils import load_dimension_info

from vbench.third_party.RAFT.core.raft import RAFT
from vbench.third_party.RAFT.core.utils_core.utils import InputPadder

class DynamicDegree:
    """Decide whether a video contains motion using RAFT optical flow.

    Args:
        args: Namespace-like object; ``args.model`` is the checkpoint path and
            the whole object is passed to the ``RAFT`` constructor.
        device: Device the model and the frames are placed on.
    """

    def __init__(self, args, device):
        self.args = args
        self.device = device
        self.load_model()

    def load_model(self):
        """Build RAFT, load the checkpoint and switch the model to eval mode."""
        # The state dict is loaded through a DataParallel wrapper and the
        # wrapper is dropped right afterwards.
        self.model = torch.nn.DataParallel(RAFT(self.args))
        # Load onto CPU first so the checkpoint does not depend on the device
        # it was saved from; the model is moved to ``self.device`` below.
        self.model.load_state_dict(torch.load(self.args.model, map_location='cpu'))

        self.model = self.model.module
        self.model.to(self.device)
        self.model.eval()

    def get_score(self, img, flo):
        """Return the mean flow magnitude of the top 5% largest-motion pixels.

        Args:
            img: Unused; kept so the signature stays unchanged.
            flo: Flow tensor; ``flo[0]`` is read as (2, H, W).

        Returns:
            The score as a Python float.
        """
        flo = flo[0].permute(1, 2, 0).cpu().numpy()

        u = flo[:, :, 0]
        v = flo[:, :, 1]
        rad = np.sqrt(np.square(u) + np.square(v))

        h, w = rad.shape
        rad_flat = rad.flatten()
        cut_index = int(h * w * 0.05)

        # Sorting the negated values ascending orders magnitudes descending.
        max_rad = np.mean(abs(np.sort(-rad_flat))[:cut_index])

        return max_rad.item()

    def set_params(self, frame, count):
        """Scale the thresholds by the frame's shorter side and the frame count."""
        scale = min(list(frame.shape)[-2:])
        self.params = {"thres": 6.0 * (scale / 256.0), "count_num": round(4 * (count / 16.0))}

    def infer(self, video_path):
        """Return True if the video (``.mp4`` file or image folder) is judged dynamic.

        Raises:
            NotImplementedError: If ``video_path`` is neither an ``.mp4`` path
                nor a directory.
        """
        with torch.no_grad():
            if video_path.endswith('.mp4'):
                frames = self.get_frames(video_path)
            elif os.path.isdir(video_path):
                frames = self.get_frames_from_img_folder(video_path)
            else:
                raise NotImplementedError
            self.set_params(frame=frames[0], count=len(frames))
            static_score = []
            for image1, image2 in zip(frames[:-1], frames[1:]):
                padder = InputPadder(image1.shape)
                image1, image2 = padder.pad(image1, image2)
                _, flow_up = self.model(image1, image2, iters=20, test_mode=True)
                max_rad = self.get_score(image1, flow_up)
                static_score.append(max_rad)
            whether_move = self.check_move(static_score)
            return whether_move

    def check_move(self, score_list):
        """Return True once ``count_num`` scores have exceeded ``thres``."""
        thres = self.params["thres"]
        count_num = self.params["count_num"]
        count = 0
        for score in score_list:
            if score > thres:
                count += 1
            if count >= count_num:
                return True
        return False

    def _to_tensor(self, frame):
        """Convert a BGR image array into a (1, 3, H, W) float RGB tensor on ``self.device``."""
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # convert to rgb
        frame = torch.from_numpy(frame.astype(np.uint8)).permute(2, 0, 1).float()
        return frame[None].to(self.device)

    def get_frames(self, video_path):
        """Decode ``video_path`` and keep every ``round(fps/8)``-th frame.

        Raises:
            AssertionError: If no frame could be read.
        """
        frame_list = []
        video = cv2.VideoCapture(video_path)
        fps = video.get(cv2.CAP_PROP_FPS)  # get fps
        # round(fps/8) is 0 below 4 fps, which used to make range() raise
        # ValueError; never sample with a step smaller than one frame.
        interval = max(1, round(fps / 8))
        index = 0
        while video.isOpened():
            success, frame = video.read()
            if not success:
                break
            # Only the sampled frames are converted and moved to the device,
            # instead of every decoded frame.
            if index % interval == 0:
                frame_list.append(self._to_tensor(frame))
            index += 1
        video.release()
        assert frame_list != []
        return frame_list

    def extract_frame(self, frame_list, interval=1):
        """Return every ``interval``-th element of ``frame_list``, starting at index 0.

        An interval below 1 is treated as 1.
        """
        step = max(1, int(interval))
        return [frame_list[i] for i in range(0, len(frame_list), step)]

    def get_frames_from_img_folder(self, img_folder):
        """Load every image in ``img_folder`` (sorted by path) as a frame tensor.

        Raises:
            AssertionError: If the folder holds no file with a listed extension.
        """
        exts = ['jpg', 'png', 'jpeg', 'bmp', 'tif',
                'tiff', 'JPG', 'PNG', 'JPEG', 'BMP',
                'TIF', 'TIFF']
        imgs = sorted([p for p in glob.glob(os.path.join(img_folder, "*")) if os.path.splitext(p)[1][1:] in exts])
        frame_list = [self._to_tensor(cv2.imread(img, cv2.IMREAD_COLOR)) for img in imgs]
        assert frame_list != []
        return frame_list


def dynamic_degree(dynamic, video_list):
    """Run ``dynamic.infer`` on every video and average the outcomes.

    Returns:
        ``(avg_score, video_results)``: the ``np.mean`` of the per-video
        results, and a list of ``{'video_path', 'video_results'}`` dicts.
    """
    video_results = []
    for video_path in tqdm(video_list):
        moved = dynamic.infer(video_path)
        video_results.append({'video_path': video_path, 'video_results': moved})
    avg_score = np.mean([entry['video_results'] for entry in video_results])
    return avg_score, video_results


def compute_dynamic_degree(json_dir, device, submodules_list, **kwargs):
    """Build the RAFT-based detector and score the 'dynamic_degree' dimension.

    ``submodules_list["model"]`` is the checkpoint path. Returns the
    ``(all_results, video_results)`` pair produced by ``dynamic_degree``.
    """
    raft_args = edict({
        "model": submodules_list["model"],
        "small": False,
        "mixed_precision": False,
        "alternate_corr": False,
    })
    detector = DynamicDegree(raft_args, device)
    video_list, _unused = load_dimension_info(json_dir, dimension='dynamic_degree', lang='en')
    overall, per_video = dynamic_degree(detector, video_list)
    return overall, per_video


================================================
FILE: Open-Sora/build/lib/vbench/human_action.py
================================================
import os
import json
import numpy as np
import clip
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
from vbench.utils import load_video, load_dimension_info
from vbench.third_party.umt.datasets.video_transforms import (
    Compose, Resize, CenterCrop, Normalize,
    create_random_augment, random_short_side_scale_jitter, 
    random_crop, random_resized_crop_with_shift, random_resized_crop,
    horizontal_flip, random_short_side_scale_jitter, uniform_crop, 
)
from vbench.third_party.umt.datasets.volume_transforms import ClipToTensor
from timm.models import create_model
from vbench.third_party.umt.models.modeling_finetune import vit_large_patch16_224
from tqdm import tqdm

def build_dict():
    """Read the Kinetics-400 category file into a ``{number: category}`` dict.

    Each line of ``third_party/umt/kinetics_400_categories.txt`` (located next
    to this module) is split on a tab into a category and a number; the
    category is stored lower-cased under the number string.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    path = f'{module_dir}/third_party/umt/kinetics_400_categories.txt'
    with open(path, 'r') as f:
        stripped_lines = [raw.strip() for raw in f.readlines()]
    results = {}
    for entry in stripped_lines:
        category, number = entry.split('\t')
        results[number] = category.lower()
    return results


def human_action(umt_path, video_list, device):
    """Classify each video with the UMT model and compare with the action in its file name.

    A video counts as correct when one of the top-5 classes whose sigmoid
    score (rounded to 4 decimals) is at least 0.85 equals the label parsed
    from the file name.

    Args:
        umt_path: Path of the checkpoint loaded with ``torch.load``.
        video_list: Iterable of video paths.
        device: Device used for the model and the input clips.

    Returns:
        ``(acc, video_results)``: the fraction of correct videos and a list of
        ``{'video_path', 'video_results'}`` dicts holding a bool per video.
    """
    state_dict = torch.load(umt_path, map_location='cpu')
    model_options = dict(
        pretrained=False,
        num_classes=400,
        all_frames=16,
        tubelet_size=1,
        use_learnable_pos_emb=False,
        fc_drop_rate=0.,
        drop_rate=0.,
        drop_path_rate=0.2,
        attn_drop_rate=0.,
        drop_block_rate=None,
        use_checkpoint=False,
        checkpoint_num=16,
        use_mean_pooling=True,
        init_scale=0.001,
    )
    model = create_model("vit_large_patch16_224", **model_options)
    data_transform = Compose([
        Resize(256, interpolation='bilinear'),
        CenterCrop(size=(224, 224)),
        ClipToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    model = model.to(device)
    model.load_state_dict(state_dict, strict=False)
    model.eval()
    cat_dict = build_dict()

    total = 0
    correct = 0
    video_results = []
    for video_path in tqdm(video_list):
        # Parse the expected action from the file name: lower-case it, keep
        # the part before the first '-', drop anything up to "person is ",
        # then keep the part before the first '_'.
        file_name = video_path.split('/')[-1].lower()
        expected_label = file_name.split('-')[0].split("person is ")[-1].split('_')[0]
        total += 1

        clip_batch = load_video(video_path, data_transform, num_frames=16)
        clip_batch = clip_batch.unsqueeze(0).to(device)
        with torch.no_grad():
            scores = torch.sigmoid(model(clip_batch))
            top_scores, top_ids = torch.topk(scores, 5, dim=1)
        top_ids = top_ids.squeeze().tolist()
        top_scores = [round(value, 4) for value in top_scores.squeeze().tolist()]

        confident = [
            cat_dict[str(top_ids[rank])]
            for rank in range(5)
            if top_scores[rank] >= 0.85
        ]
        matched = expected_label in confident
        if matched:
            correct += 1
        video_results.append({'video_path': video_path, 'video_results': matched})

    acc = correct / total
    return acc, video_results


def compute_human_action(json_dir, device, submodules_list, **kwargs):
    """Score the 'human_action' dimension with the checkpoint at ``submodules_list[0]``.

    Returns the ``(all_results, video_results)`` pair produced by ``human_action``.
    """
    checkpoint_path = submodules_list[0]
    video_list, _unused = load_dimension_info(json_dir, dimension='human_action', lang='en')
    overall, per_video = human_action(checkpoint_path, video_list, device)
    return overall, per_video


================================================
FILE: Open-Sora/build/lib/vbench/imaging_quality.py
================================================
import torch
from tqdm import tqdm
from torchvision import transforms
from pyiqa.archs.musiq_arch import MUSIQ
from vbench.utils import load_video, load_dimension_info

def transform(images, preprocess_mode='shorter'):
    """Optionally downscale a frame batch to a 512-pixel side and divide it by 255.

    Args:
        images: Tensor whose ``size()`` is ``(_, _, h, w)``.
        preprocess_mode: A mode starting with ``'shorter'`` resizes when the
            shorter side exceeds 512 (``'shorter_centercrop'`` then also
            center-crops to 512); ``'longer'`` resizes when the longer side
            exceeds 512; ``'None'`` skips resizing.

    Returns:
        The (possibly resized) images divided by 255.

    Raises:
        ValueError: For any other ``preprocess_mode``.
    """
    if preprocess_mode == 'None':
        return images / 255.

    by_shorter_side = preprocess_mode.startswith('shorter')
    if not by_shorter_side and preprocess_mode != 'longer':
        raise ValueError("Please recheck imaging_quality_mode")

    _, _, h, w = images.size()
    reference_side = min(h, w) if by_shorter_side else max(h, w)
    if reference_side > 512:
        scale = 512. / reference_side
        new_size = (int(scale * h), int(scale * w))
        images = transforms.Resize(size=new_size)(images)
        if preprocess_mode == 'shorter_centercrop':
            images = transforms.CenterCrop(512)(images)
    return images / 255.

def technical_quality(model, video_list, device, **kwargs):
    """Score every frame of every video with ``model`` and average the results.

    Args:
        model: Callable returning a score for a single-frame batch.
        video_list: Iterable of video paths.
        device: Device each frame is moved to before scoring.
        **kwargs: Must contain ``imaging_quality_preprocessing_mode``.

    Returns:
        ``(average_score, video_results)``: the mean of the per-video means
        divided by 100, and a list of ``{'video_path', 'video_results'}`` dicts.
    """
    preprocess_mode = kwargs['imaging_quality_preprocessing_mode']
    video_results = []
    for video_path in tqdm(video_list):
        frames = transform(load_video(video_path), preprocess_mode)
        frame_score_sum = 0.
        for frame in frames:
            frame_score_sum += float(model(frame.unsqueeze(0).to(device)))
        video_results.append({
            'video_path': video_path,
            'video_results': frame_score_sum / len(frames),
        })
    per_video_scores = [entry['video_results'] for entry in video_results]
    average_score = sum(per_video_scores) / len(video_results)
    return average_score / 100., video_results


def compute_imaging_quality(json_dir, device, submodules_list, **kwargs):
    """Load MUSIQ and score the 'imaging_quality' dimension.

    ``submodules_list['model_path']`` is the pretrained weights path and
    ``kwargs`` are forwarded to ``technical_quality``. Returns the
    ``(all_results, video_results)`` pair it produces.
    """
    model = MUSIQ(pretrained_model_path=submodules_list['model_path'])
    model.to(device)
    # NOTE(review): this only sets the flag on the top-level module object;
    # confirm whether a recursive eval() is intended here.
    model.training = False

    video_list, _unused = load_dimension_info(json_dir, dimension='imaging_quality', lang='en')
    overall, per_video = technical_quality(model, video_list, device, **kwargs)
    return overall, per_video


================================================
FILE: Open-Sora/build/lib/vbench/motion_smoothness.py
================================================
import os
import cv2
import glob
import torch
import numpy as np
from tqdm import tqdm
from omegaconf import OmegaConf

from vbench.utils import load_dimension_info

from vbench.third_party.amt.utils.utils import (
    img2tensor, tensor2img,
    check_dim_and_resize
    )
from vbench.third_party.amt.utils.build_utils import build_from_cfg
from vbench.third_party.amt.utils.utils import InputPadder


class FrameProcess:
    """Helpers that read RGB frames from a video file or an image folder."""

    def __init__(self):
        pass

    def get_frames(self, video_path):
        """Decode every frame of ``video_path`` into an RGB array.

        Raises:
            AssertionError: If no frame could be read.
        """
        capture = cv2.VideoCapture(video_path)
        frame_list = []
        while capture.isOpened():
            success, frame = capture.read()
            if not success:
                break
            # OpenCV decodes to BGR; convert to RGB.
            frame_list.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        capture.release()
        assert frame_list != []
        return frame_list

    def get_frames_from_img_folder(self, img_folder):
        """Read every image in ``img_folder`` (sorted by path) as an RGB array.

        Raises:
            AssertionError: If the folder holds no file with a listed extension.
        """
        exts = ['jpg', 'png', 'jpeg', 'bmp', 'tif',
                'tiff', 'JPG', 'PNG', 'JPEG', 'BMP',
                'TIF', 'TIFF']
        candidates = glob.glob(os.path.join(img_folder, "*"))
        imgs = sorted(p for p in candidates if os.path.splitext(p)[1][1:] in exts)
        frame_list = [
            cv2.cvtColor(cv2.imread(img, cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB)
            for img in imgs
        ]
        assert frame_list != []
        return frame_list

    def extract_frame(self, frame_list, start_from=0):
        """Return every second element of ``frame_list``, beginning at ``start_from``."""
        return [frame_list[i] for i in range(start_from, len(frame_list), 2)]


class MotionSmoothness:
    """Score motion smoothness by re-synthesising dropped frames with an interpolation model.

    Every second frame of a clip is kept, the model predicts the frame between
    each kept pair, and the predictions are compared with the original frames
    that were dropped. The score is ``(255 - mean absolute difference) / 255``.
    """

    def __init__(self, config, ckpt, device):
        """
        Args:
            config: path of the config file loaded with OmegaConf; its
                ``network`` node is handed to ``build_from_cfg``.
            ckpt: path of the checkpoint file; it must contain a
                ``'state_dict'`` entry.
            device: device the model and the input frames are moved to.
        """
        self.device = device
        self.config = config
        self.ckpt = ckpt
        self.niters = 1
        self.initialization()
        self.load_model()

    
    def load_model(self):
        """Build the network described by the config and restore its weights."""
        cfg_path = self.config
        ckpt_path = self.ckpt
        network_cfg = OmegaConf.load(cfg_path).network
        network_name = network_cfg.name
        print(f'Loading [{network_name}] from [{ckpt_path}]...')
        self.model = build_from_cfg(network_cfg)
        # Deserialize onto the CPU first so that a checkpoint saved from a GPU
        # can still be restored on a machine without CUDA; the model is moved
        # to the requested device right below.
        ckpt = torch.load(ckpt_path, map_location='cpu')
        self.model.load_state_dict(ckpt['state_dict'])
        self.model = self.model.to(self.device)
        self.model.eval()


    def initialization(self):
        """Set the memory anchors used to derive the down-scaling factor, plus shared helpers."""
        # NOTE(review): only a device equal to the exact string 'cuda' selects
        # the VRAM-aware branch; other spellings (e.g. 'cuda:0') use the
        # "no resize" settings of the else-branch. Left unchanged on purpose,
        # because switching branches can change the scale and hence the score.
        if self.device == 'cuda':
            self.anchor_resolution = 1024 * 512
            self.anchor_memory = 1500 * 1024**2
            self.anchor_memory_bias = 2500 * 1024**2
            self.vram_avail = torch.cuda.get_device_properties(self.device).total_memory
            print("VRAM available: {:.1f} MB".format(self.vram_avail / 1024 ** 2))
        else:
            # Do not resize in cpu mode
            self.anchor_resolution = 8192*8192
            self.anchor_memory = 1
            self.anchor_memory_bias = 0
            self.vram_avail = 1

        # Interpolation time step 1/2, i.e. the frame halfway between the two inputs.
        self.embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(self.device)
        self.fp = FrameProcess()


    def motion_score(self, video_path):
        """Return the smoothness score of one clip.

        Args:
            video_path: path ending in ``.mp4`` or a directory of images.

        Returns:
            ``(255 - vfi_score) / 255`` where ``vfi_score`` is the mean absolute
            difference between interpolated and original frames.

        Raises:
            NotImplementedError: if ``video_path`` is neither an ``.mp4`` path
                nor a directory.
            AssertionError: if fewer than two frames remain after subsampling.
        """
        iters = int(self.niters)
        # get inputs
        if video_path.endswith('.mp4'):
            frames = self.fp.get_frames(video_path)
        elif os.path.isdir(video_path):
            frames = self.fp.get_frames_from_img_folder(video_path)
        else:
            raise NotImplementedError
        # Keep the even-indexed frames; the odd ones are what the model must reproduce.
        frame_list = self.fp.extract_frame(frames, start_from=0)
        inputs = [img2tensor(frame).to(self.device) for frame in frame_list]
        assert len(inputs) > 1, f"The number of input should be more than one (current {len(inputs)})"
        inputs = check_dim_and_resize(inputs)
        h, w = inputs[0].shape[-2:]
        # Down-scaling factor derived from frame area and the memory anchors; capped at 1.
        scale = self.anchor_resolution / (h * w) * np.sqrt((self.vram_avail - self.anchor_memory_bias) / self.anchor_memory)
        scale = 1 if scale > 1 else scale
        scale = 1 / np.floor(1 / np.sqrt(scale) * 16) * 16
        if scale < 1:
            print(f"Due to the limited VRAM, the video will be scaled by {scale:.2f}")
        padding = int(16 / scale)
        padder = InputPadder(inputs[0].shape, padding)
        inputs = padder.pad(*inputs)

        # -----------------------  Interpolater ----------------------- 
        for i in range(iters):
            # Each pass interleaves a predicted frame between every adjacent pair.
            outputs = [inputs[0]]
            for in_0, in_1 in zip(inputs[:-1], inputs[1:]):
                in_0 = in_0.to(self.device)
                in_1 = in_1.to(self.device)
                with torch.no_grad():
                    imgt_pred = self.model(in_0, in_1, self.embt, scale_factor=scale, eval=True)['imgt_pred']
                outputs += [imgt_pred.cpu(), in_1.cpu()]
            inputs = outputs

        # -----------------------  cal_vfi_score ----------------------- 
        outputs = padder.unpad(*outputs)
        outputs = [tensor2img(out) for out in outputs]
        vfi_score = self.vfi_score(frames, outputs)
        norm = (255.0 - vfi_score)/255.0
        return norm


    def vfi_score(self, ori_frames, interpolate_frames):
        """Mean of ``get_diff`` over the odd-indexed original/interpolated frame pairs."""
        ori = self.fp.extract_frame(ori_frames, start_from=1)
        interpolate = self.fp.extract_frame(interpolate_frames, start_from=1)
        scores = []
        for i in range(len(interpolate)):
            scores.append(self.get_diff(ori[i], interpolate[i]))
        return np.mean(np.array(scores))


    def get_diff(self, img1, img2):
        """Mean absolute per-element difference of two images (via ``cv2.absdiff``)."""
        img = cv2.absdiff(img1, img2)
        return np.mean(img)


def motion_smoothness(motion, video_list):
    """Score each video with ``motion.motion_score`` and average the scores.

    Returns:
        A pair ``(mean score, per-video results)`` where every result is a
        dict with the keys ``'video_path'`` and ``'video_results'``.
    """
    video_results = []
    for video_path in tqdm(video_list):
        video_results.append({
            'video_path': video_path,
            'video_results': motion.motion_score(video_path),
        })
    scores = [entry['video_results'] for entry in video_results]
    return np.mean(scores), video_results


def compute_motion_smoothness(json_dir, device, submodules_list, **kwargs):
    """Build the scorer from ``submodules_list`` and evaluate the 'motion_smoothness' dimension.

    ``submodules_list`` must provide ``"config"`` (e.g. pretrained/amt_model/AMT-S.yaml)
    and ``"ckpt"`` (e.g. pretrained/amt_model/amt-s.pth).
    """
    scorer = MotionSmoothness(submodules_list["config"], submodules_list["ckpt"], device)
    video_list, _unused = load_dimension_info(json_dir, dimension='motion_smoothness', lang='en')
    return motion_smoothness(scorer, video_list)


================================================
FILE: Open-Sora/build/lib/vbench/multiple_objects.py
================================================
import os
import json

import torch
import numpy as np
from tqdm import tqdm
from vbench.utils import load_video, load_dimension_info
from vbench.third_party.grit_model import DenseCaptioning

import logging
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def get_dect_from_grit(model, image_arrays):
    """Run ``model.run_caption_tensor`` on each frame and collect one set per frame.

    For a frame whose first result entry is non-empty, the set is built from
    element ``[0][0][2]`` of the model output; otherwise an empty set is stored.
    Anything that is not a ``list`` is converted with ``.numpy()`` first.
    """
    if type(image_arrays) is not list:
        image_arrays = image_arrays.numpy()
    detections = []
    with torch.no_grad():
        for frame in image_arrays:
            output = model.run_caption_tensor(frame)
            found = set(output[0][0][2]) if len(output[0]) > 0 else set([])
            detections.append(found)
    return detections

def check_generate(key_info, predictions):
    """Count the predictions that contain both objects named in ``key_info``.

    ``key_info`` has the form ``"<object a> and <object b>"``; both parts are
    stripped of surrounding whitespace before the membership test.
    """
    first, second = (part.strip() for part in key_info.split(' and '))
    return sum(1 for pred in predictions if first in pred and second in pred)

def multiple_objects(model, video_dict, device):
    """Measure how often both prompted objects are detected in the generated frames.

    Args:
        model: detector passed through to ``get_dect_from_grit``.
        video_dict: iterable of dicts with ``'auxiliary_info'`` (holding an
            ``'object'`` entry) and ``'video_list'``.
        device: accepted for interface symmetry; not used in this function.

    Returns:
        ``(success_rate, video_results)``: the fraction of successful frames
        over all frames of all videos, and one
        ``{'video_path', 'video_results'}`` dict per video holding that
        video's own success rate.

    Raises:
        ValueError: if an entry has no ``'auxiliary_info'`` key.
    """
    success_frame_count, frame_count = 0,0
    video_results = []
    for info in tqdm(video_dict):
        if 'auxiliary_info' not in info:
            # `raise "<str>"` is itself a TypeError in Python 3 and hides the
            # message, so raise a real exception that carries it.
            raise ValueError("Auxiliary info is not in json, please check your json.")
        object_info = info['auxiliary_info']['object']
        for video_path in info['video_list']:
            video_tensor = load_video(video_path, num_frames=16)
            # presumably (T, C, H, W) -> (T, H, W, C) for the detector; confirm against load_video
            cur_video_pred = get_dect_from_grit(model, video_tensor.permute(0,2,3,1))
            cur_success_frame_count = check_generate(object_info, cur_video_pred)
            cur_success_frame_rate = cur_success_frame_count/len(cur_video_pred)
            success_frame_count += cur_success_frame_count
            frame_count += len(cur_video_pred)
            video_results.append({'video_path': video_path, 'video_results': cur_success_frame_rate})
    success_rate = success_frame_count / frame_count
    return success_rate, video_results
        

def compute_multiple_objects(json_dir, device, submodules_dict, **kwargs):
    """Initialise the dense-captioning detector and evaluate the 'multiple_objects' dimension."""
    detector = DenseCaptioning(device)
    detector.initialize_model_det(**submodules_dict)
    logger.info("Initialize detection model success")
    _unused, prompt_dict_ls = load_dimension_info(json_dir, dimension='multiple_objects', lang='en')
    return multiple_objects(detector, prompt_dict_ls, device)


================================================
FILE: Open-Sora/build/lib/vbench/object_class.py
================================================
import os
import json

import torch
import numpy as np
from tqdm import tqdm
from vbench.utils import load_video, load_dimension_info
from vbench.third_party.grit_model import DenseCaptioning

import logging
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def get_dect_from_grit(model, image_arrays):
    """Run ``model.run_caption_tensor`` on each frame and collect one set per frame.

    The set is built from element ``[0][0][2]`` of the model output. If
    producing it fails with an ordinary exception (for instance because the
    output is empty), an empty set is recorded for that frame instead.
    Anything that is not a ``list`` is converted with ``.numpy()`` first.
    """
    pred = []
    if type(image_arrays) is not list:
        image_arrays = image_arrays.numpy()
    with torch.no_grad():
        for frame in image_arrays:
            try:
                pred.append(set(model.run_caption_tensor(frame)[0][0][2]))
            # Keep the best-effort behaviour, but do not swallow
            # KeyboardInterrupt / SystemExit the way a bare `except:` would.
            except Exception:
                pred.append(set())
    return pred

def check_generate(key_info, predictions):
    """Return how many entries of ``predictions`` contain ``key_info``."""
    return sum(1 for pred in predictions if key_info in pred)

def object_class(model, video_dict, device):
    """Measure how often the prompted object is detected in the generated frames.

    Args:
        model: detector passed through to ``get_dect_from_grit``.
        video_dict: iterable of dicts with ``'auxiliary_info'`` (holding an
            ``'object'`` entry) and ``'video_list'``.
        device: accepted for interface symmetry; not used in this function.

    Returns:
        ``(success_rate, video_results)``: the fraction of successful frames
        over all frames of all videos, and one
        ``{'video_path', 'video_results'}`` dict per video holding that
        video's own success rate.

    Raises:
        ValueError: if an entry has no ``'auxiliary_info'`` key.
    """
    success_frame_count, frame_count = 0,0
    video_results = []
    for info in tqdm(video_dict):
        if 'auxiliary_info' not in info:
            # `raise "<str>"` is itself a TypeError in Python 3 and hides the
            # message, so raise a real exception that carries it.
            raise ValueError("Auxiliary info is not in json, please check your json.")
        object_info = info['auxiliary_info']['object']
        for video_path in info['video_list']:
            video_tensor = load_video(video_path, num_frames=16)
            # presumably (T, C, H, W) -> (T, H, W, C) for the detector; confirm against load_video
            cur_video_pred = get_dect_from_grit(model, video_tensor.permute(0,2,3,1))
            cur_success_frame_count = check_generate(object_info, cur_video_pred)
            cur_success_frame_rate = cur_success_frame_count/len(cur_video_pred)
            success_frame_count += cur_success_frame_count
            frame_count += len(cur_video_pred)
            video_results.append({'video_path': video_path, 'video_results': cur_success_frame_rate})
    success_rate = success_frame_count / frame_count
    return success_rate, video_results
        

def compute_object_class(json_dir, device, submodules_dict, **kwargs):
    """Initialise the dense-captioning detector and evaluate the 'object_class' dimension."""
    detector = DenseCaptioning(device)
    detector.initialize_model_det(**submodules_dict)
    logger.info("Initialize detection model success")
    _unused, prompt_dict_ls = load_dimension_info(json_dir, dimension='object_class', lang='en')
    return object_class(detector, prompt_dict_ls, device)


================================================
FILE: Open-Sora/build/lib/vbench/overall_consistency.py
================================================
import os
import json
import numpy as np

import torch
import clip
from tqdm import tqdm
from vbench.utils import load_video, load_dimension_info, clip_transform, read_frames_decord_by_fps, CACHE_DIR
from vbench.third_party.ViCLIP.viclip import ViCLIP
from vbench.third_party.ViCLIP.simple_tokenizer import SimpleTokenizer

def get_text_features(model, input_text, tokenizer, text_feature_dict={}):
    """Return the L2-normalised text embedding of ``input_text``, cached by text.

    The embedding comes from ``model.encode_text``. ``tokenizer`` is not used
    here. ``text_feature_dict`` is the cache; when omitted, the single default
    dict is shared by every call.
    """
    if input_text not in text_feature_dict:
        with torch.no_grad():
            embedding = model.encode_text(f"{input_text}").float()
            # normalise in place along the last dimension
            embedding.div_(embedding.norm(dim=-1, keepdim=True))
        text_feature_dict[input_text] = embedding
    return text_feature_dict[input_text]

def get_vid_features(model, input_frames):
    """Return ``model.encode_vision(input_frames, test=True)`` L2-normalised along the last dim."""
    with torch.no_grad():
        features = model.encode_vision(input_frames, test=True).float()
        features.div_(features.norm(dim=-1, keepdim=True))
    return features

def get_predict_label(clip_feature, text_feats_tensor, top=5):
    """Return the ``top`` highest softmax probabilities and their label indices.

    The logits are ``100.0 * clip_feature @ text_feats_tensor.T``; the softmax
    is taken over the last dimension and the result is moved to the CPU.
    """
    logits = 100.0 * clip_feature @ text_feats_tensor.T
    label_probs = logits.softmax(dim=-1).cpu()
    top_probs, top_labels = label_probs.topk(top, dim=-1)
    return top_probs, top_labels

def overall_consistency(clip_model, video_dict, tokenizer, device, sample="middle"):
    """Score text/video agreement as the dot product of normalised video and text features.

    Args:
        clip_model: model passed to ``get_vid_features`` / ``get_text_features``.
        video_dict: iterable of dicts with ``'prompt'`` and ``'video_list'``.
        tokenizer: forwarded to ``get_text_features``.
        device: device the transformed frames are moved to.
        sample: forwarded to ``read_frames_decord_by_fps`` as ``sample``.

    Returns:
        ``(avg_score, video_results)``: the mean score over all videos and one
        ``{'video_path', 'video_results'}`` dict per video.
    """
    sim = []
    video_results = []
    image_transform = clip_transform(224)
    for info in tqdm(video_dict):
        query = info['prompt']
        # The former `clip.tokenize([query])` call was removed: its result was
        # never used, and it could fail on prompts longer than CLIP's context.
        video_list = info['video_list']
        for video_path in video_list:
            with torch.no_grad():
                images = read_frames_decord_by_fps(video_path, num_frames=8, sample=sample)
                images = image_transform(images)
                images = images.to(device)
                clip_feat = get_vid_features(clip_model,images.unsqueeze(0))
                text_feat = get_text_features(clip_model, query, tokenizer)
                logit_per_text =  clip_feat @ text_feat.T
                score_per_video =  float(logit_per_text[0][0].cpu())
                sim.append(score_per_video)
                video_results.append({'video_path': video_path, 'video_results': score_per_video})
    avg_score = np.mean(sim)
    return avg_score, video_results

def compute_overall_consistency(json_dir, device, submodules_list, **kwargs):
    """Build ViCLIP with its tokenizer and evaluate the 'overall_consistency' dimension."""
    vocab_path = os.path.join(CACHE_DIR, "ViCLIP/bpe_simple_vocab_16e6.txt.gz")
    tokenizer = SimpleTokenizer(vocab_path)
    viclip = ViCLIP(tokenizer= tokenizer, **submodules_list).to(device)
    _unused, video_dict = load_dimension_info(json_dir, dimension='overall_consistency', lang='en')
    return overall_consistency(viclip, video_dict, tokenizer, device)


================================================
FILE: Open-Sora/build/lib/vbench/scene.py
================================================
import os
import json

import torch
import numpy as np
from tqdm import tqdm
from vbench.utils import load_video, load_dimension_info, tag2text_transform
from vbench.third_party.tag2Text.tag2text import tag2text_caption

import logging
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def get_caption(model, image_arrays):
    """Return the caption part of ``model.generate``; the tag prediction is discarded."""
    generated = model.generate(image_arrays, tag_input = None, return_tag_predict = True)
    caption, _tag_predict = generated
    return caption

def check_generate(key_info, predictions):
    """Count the predictions that contain every space-separated word of ``key_info['scene']``."""
    words = key_info['scene'].split(' ')
    return sum(1 for pred in predictions if all(word in pred for word in words))

def scene(model, video_dict, device):
    """Measure how often the prompted scene words appear in the per-frame captions.

    Args:
        model: captioning model passed to ``get_caption``.
        video_dict: iterable of dicts with ``'auxiliary_info'`` (holding a
            ``'scene'`` entry) and ``'video_list'``.
        device: device each transformed frame is moved to.

    Returns:
        ``(success_rate, video_results)``: the fraction of successful frames
        over all frames of all videos, and one
        ``{'video_path', 'video_results'}`` dict per video holding that
        video's own success rate.

    Raises:
        ValueError: if an entry has no ``'auxiliary_info'`` key.
    """
    success_frame_count, frame_count = 0,0
    video_results = []
    transform = tag2text_transform(384)
    for info in tqdm(video_dict):
        if 'auxiliary_info' not in info:
            # `raise "<str>"` is itself a TypeError in Python 3 and hides the
            # message, so raise a real exception that carries it.
            raise ValueError("Auxiliary info is not in json, please check your json.")
        scene_info = info['auxiliary_info']['scene']
        for video_path in info['video_list']:
            video_array = load_video(video_path, num_frames=16, return_tensor=False, width=384, height=384)
            # Transform each frame separately, then stack them into one batch.
            video_tensor_list = []
            for i in video_array:
                video_tensor_list.append(transform(i).to(device).unsqueeze(0))
            video_tensor = torch.cat(video_tensor_list)
            cur_video_pred = get_caption(model, video_tensor)
            cur_success_frame_count = check_generate(scene_info, cur_video_pred)
            cur_success_frame_rate = cur_success_frame_count/len(cur_video_pred)
            success_frame_count += cur_success_frame_count
            frame_count += len(cur_video_pred)
            video_results.append({'video_path': video_path, 'video_results': cur_success_frame_rate})
    success_rate = success_frame_count / frame_count
    return success_rate, video_results
        

def compute_scene(json_dir, device, submodules_dict, **kwargs):
    """Build the captioning model in eval mode on ``device`` and evaluate the 'scene' dimension."""
    caption_model = tag2text_caption(**submodules_dict)
    caption_model.eval()
    caption_model = caption_model.to(device)
    logger.info("Initialize caption model success")
    _unused, prompt_dict_ls = load_dimension_info(json_dir, dimension='scene', lang='en')
    return scene(caption_model, prompt_dict_ls, device)


================================================
FILE: Open-Sora/build/lib/vbench/spatial_relationship.py
================================================
import os
import json

import torch
import numpy as np
from tqdm import tqdm
from vbench.utils import load_video, load_dimension_info
from vbench.third_party.grit_model import DenseCaptioning

import logging
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def get_position_score(locality, obj1,obj2, iou_threshold=0.1):
    """Score how well two boxes are laid out along the axis implied by ``locality``.

    Args:
        locality: relationship phrase. It selects the horizontal axis when it
            is a substring of 'on the right of' / 'on the left of', and the
            vertical axis when it is a substring of 'on the bottom of' /
            'on the top of'.
        obj1, obj2: boxes given as ``[x0, y0, x1, y1]``.
        iou_threshold: overlap below which a correctly oriented pair gets
            full credit.

    Returns:
        ``1`` if the centre offset is dominated by the selected axis and the
        IoU is below the threshold, ``iou_threshold / iou`` if the axis is
        right but the boxes overlap more than that, and ``0`` otherwise
        (including an unrecognised ``locality``).

    NOTE: only the magnitude of the centre offset is used, so 'left' and
    'right' (and 'top' and 'bottom') are scored identically.
    """
    ax0, ay0, ax1, ay1 = obj1[0], obj1[1], obj1[2], obj1[3]
    bx0, by0, bx1, by1 = obj2[0], obj2[1], obj2[2], obj2[3]

    # Offsets between the two box centres.
    x_distance = (bx0 + bx1) / 2 - (ax0 + ax1) / 2
    y_distance = (by0 + by1) / 2 - (ay0 + ay1) / 2

    # Intersection over union.
    x_overlap = max(0, min(ax1, bx1) - max(ax0, bx0))
    y_overlap = max(0, min(ay1, by1) - max(ay0, by0))
    intersection = x_overlap * y_overlap
    union = (ax1 - ax0) * (ay1 - ay0) + (bx1 - bx0) * (by1 - by0) - intersection
    # Two zero-area boxes give union == 0; treat that as "no overlap" instead
    # of dividing by zero.
    iou = intersection / union if union != 0 else 0.0

    if locality in 'on the right of' or locality in 'on the left of':
        axis_dominant = abs(x_distance) > abs(y_distance)
    elif locality in 'on the bottom of' or locality in 'on the top of':
        axis_dominant = abs(y_distance) > abs(x_distance)
    else:
        return 0

    if not axis_dominant:
        return 0
    if iou < iou_threshold:
        return 1
    return iou_threshold/iou

def get_dect_from_grit(model, image_arrays):
    """Run ``model.run_caption_tensor`` on each frame and keep the first two fields of each result.

    Returns one list per frame; each entry is ``[info[0], info[1]]`` for every
    item in the first element of the model output. ``check_generate`` below
    treats these as a label and a box location.
    Anything that is not a ``list`` is converted with ``.numpy()`` first.
    """
    if type(image_arrays) is not list:
        image_arrays = image_arrays.numpy()
    per_frame = []
    with torch.no_grad():
        for frame in image_arrays:
            output = model.run_caption_tensor(frame)
            if len(output[0]) > 0:
                per_frame.append([[info[0], info[1]] for info in output[0]])
            else:
                per_frame.append([])
    return per_frame

def check_generate(key_info, predictions):
    """Return, for each frame, the best relationship score among the target objects.

    Args:
        key_info: dict with ``'object_a'``, ``'object_b'`` and ``'relationship'``.
        predictions: one list per frame of ``[label, box]`` entries.

    Returns:
        A list with one score per frame: the maximum of
        ``get_position_score`` over every pair of detections whose label
        equals either target object, or ``0`` when fewer than two match.
    """
    key_a = key_info['object_a']
    key_b = key_info['object_b']
    relation = key_info['relationship']
    frame_score =[]
    for frame_pred in predictions:
        # filter the target object
        frame_obj_locats = [item[1] for item in frame_pred
                            if (key_a == item[0]) or (key_b == item[0])]
        # Score each pair once, after the filter has finished. Re-scoring the
        # partial list for every detection yields the same maximum, only slower.
        cur_score = [0]
        for c_obj1 in range(len(frame_obj_locats)-1):
            for c_obj2 in range(c_obj1+1 ,len(frame_obj_locats)):
                score_obj1_obj2 = get_position_score(relation, frame_obj_locats[c_obj1], frame_obj_locats[c_obj2])
                cur_score.append(score_obj1_obj2)
        frame_score.append(max(cur_score))
    return frame_score

def spatial_relationship(model, video_dict, device):
    """Average the per-frame spatial-relationship scores over all videos.

    Args:
        model: detector passed through to ``get_dect_from_grit``.
        video_dict: iterable of dicts with ``'auxiliary_info'`` (holding a
            ``'spatial_relationship'`` entry) and ``'video_list'``.
        device: accepted for interface symmetry; not used in this function.

    Returns:
        ``(success_rate, video_results)``: the mean frame score over all
        frames of all videos, and one dict per video with ``'video_path'``,
        ``'video_results'`` (mean frame score) and ``'frame_results'``.

    Raises:
        ValueError: if an entry has no ``'auxiliary_info'`` key.
    """
    video_results = []
    frame_score_overall = []
    for info in tqdm(video_dict):
        if 'auxiliary_info' not in info:
            # `raise "<str>"` is itself a TypeError in Python 3 and hides the
            # message, so raise a real exception that carries it.
            raise ValueError("Auxiliary info is not in json, please check your json.")
        object_info = info['auxiliary_info']['spatial_relationship']
        for video_path in info['video_list']:
            video_tensor = load_video(video_path, num_frames=16)
            # presumably (T, C, H, W) -> (T, H, W, C) for the detector; confirm against load_video
            cur_video_pred = get_dect_from_grit(model, video_tensor.permute(0,2,3,1))
            cur_video_frame_score = check_generate(object_info, cur_video_pred)
            cur_success_frame_rate = np.mean(cur_video_frame_score)
            frame_score_overall.extend(cur_video_frame_score)
            video_results.append({'video_path': video_path, 'video_results': cur_success_frame_rate, 'frame_results':cur_video_frame_score})
    success_rate = np.mean(frame_score_overall)
    return success_rate, video_results
        

def compute_spatial_relationship(json_dir, device, submodules_dict, **kwargs):
    """Initialise the dense-captioning detector and evaluate the 'spatial_relationship' dimension."""
    detector = DenseCaptioning(device)
    detector.initialize_model_det(**submodules_dict)
    logger.info("Initialize detection model success")
    _unused, prompt_dict_ls = load_dimension_info(json_dir, dimension='spatial_relationship', lang='en')
    return spatial_relationship(detector, prompt_dict_ls, device)


================================================
FILE: Open-Sora/build/lib/vbench/subject_consistency.py
================================================
import io
import os
import cv2
import json
import numpy as np
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms

from vbench.utils import load_video, load_dimension_info, dino_transform, dino_transform_Image
import logging
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def subject_consistency(model, video_list, device, read_frame):
    """Score how consistent the frame features stay within each video.

    For every frame after the first, the cosine similarity to the previous
    frame and to the first frame (each clamped at 0) is averaged; these
    values are accumulated per video and over all videos.

    Args:
        model: feature extractor called on one frame at a time.
        video_list: paths of the videos to score.
        device: device each frame is moved to.
        read_frame: if true, frames are read as image files from a directory
            derived from the video path ('videos' -> 'frames', spaces ->
            underscores, last four characters dropped); otherwise the video
            is decoded with ``load_video``.

    Returns:
        ``(sim_per_frame, video_results)``: the accumulated similarity divided
        by the number of compared frames, and one
        ``{'video_path', 'video_results'}`` dict per video whose value is that
        video's accumulated (summed) similarity.

    Raises:
        ZeroDivisionError: if no frame beyond the first was compared at all.
    """
    sim = 0.0
    cnt = 0
    video_results = []
    if read_frame:
        image_transform = dino_transform_Image(224)
    else:
        image_transform = dino_transform(224)
    for video_path in tqdm(video_list):
        video_sim = 0.0
        if read_frame:
            video_path = video_path[:-4].replace('videos', 'frames').replace(' ', '_')
            tmp_paths = [os.path.join(video_path, f) for f in sorted(os.listdir(video_path))]
            images = []
            for tmp_path in tmp_paths:
                images.append(image_transform(Image.open(tmp_path)))
        else:
            images = load_video(video_path)
            images = image_transform(images)
        for i in range(len(images)):
            with torch.no_grad():
                image = images[i].unsqueeze(0)
                image = image.to(device)
                image_features = model(image)
                image_features = F.normalize(image_features, dim=-1, p=2)
                if i == 0:
                    first_image_features = image_features
                else:
                    sim_pre = max(0.0, F.cosine_similarity(former_image_features, image_features).item())
                    sim_fir = max(0.0, F.cosine_similarity(first_image_features, image_features).item())
                    cur_sim = (sim_pre + sim_fir) / 2
                    video_sim += cur_sim
                    cnt += 1
            former_image_features = image_features
        sim += video_sim
        video_results.append({'video_path': video_path, 'video_results': video_sim})
    # The unused `sim / (len(video_list) - 1)` was dropped: it divided by zero
    # whenever exactly one video was evaluated.
    sim_per_frame = sim / cnt
    return sim_per_frame, video_results


def compute_subject_consistency(json_dir, device, submodules_list, **kwargs):
    """Load the model through ``torch.hub`` and evaluate the 'subject_consistency' dimension.

    All entries of ``submodules_list`` are forwarded to ``torch.hub.load``;
    its ``'read_frame'`` entry additionally selects how frames are read.
    """
    backbone = torch.hub.load(**submodules_list).to(device)
    use_frames = submodules_list['read_frame']
    logger.info("Initialize DINO success")
    video_list, _unused = load_dimension_info(json_dir, dimension='subject_consistency', lang='en')
    return subject_consistency(backbone, video_list, device, use_frames)


================================================
FILE: Open-Sora/build/lib/vbench/temporal_flickering.py
================================================
import numpy as np
from tqdm import tqdm
import cv2
from vbench.utils import load_dimension_info


def get_frames(video_path):
    """Decode every frame of ``video_path`` with OpenCV, in the order read.

    Raises:
        AssertionError: if no frame could be read.
    """
    capture = cv2.VideoCapture(video_path)
    frames = []
    while capture.isOpened():
        success, frame = capture.read()
        if not success:
            break
        frames.append(frame)
    capture.release()
    assert frames != []
    return frames


def mae_seq(frames):
    """Return an array with ``calculate_mae`` of every pair of consecutive frames."""
    consecutive = zip(frames[:-1], frames[1:])
    return np.array([calculate_mae(prev, nxt) for prev, nxt in consecutive])


def calculate_mae(img1, img2):
    """Compute the mean absolute error (MAE) between two images.

    The images are compared as float32. If their shapes differ, a message is
    printed and ``None`` is returned.
    """
    if img1.shape == img2.shape:
        first = np.array(img1, dtype=np.float32)
        second = np.array(img2, dtype=np.float32)
        return np.mean(cv2.absdiff(first, second))
    print("Images don't have the same shape.")
    return


def cal_score(video_path):
    """Return ``(255 - mean consecutive-frame MAE) / 255`` for a video.

    Please ensure the video is static.
    """
    mean_mae = np.mean(mae_seq(get_frames(video_path))).item()
    return (255.0 - mean_mae)/255.0


def temporal_flickering(video_list):
    """Score every video with ``cal_score`` and average the scores.

    Videos for which ``cal_score`` raises ``AssertionError`` are skipped.

    Returns:
        ``(mean score, per-video results)`` where each result is a dict with
        the keys ``'video_path'`` and ``'video_results'``.
    """
    video_results = []
    for video_path in tqdm(video_list):
        try:
            score = cal_score(video_path)
        except AssertionError:
            continue
        video_results.append({'video_path': video_path, 'video_results': score})
    scores = [entry['video_results'] for entry in video_results]
    return np.mean(scores), video_results


def compute_temporal_flickering(json_dir, device, submodules_list, **kwargs):
    """Evaluate the 'temporal_flickering' dimension; ``device`` and ``submodules_list`` are not used."""
    video_list, _unused = load_dimension_info(json_dir, dimension='temporal_flickering', lang='en')
    return temporal_flickering(video_list)


================================================
FILE: Open-Sora/build/lib/vbench/temporal_style.py
================================================
import os
import json
import numpy as np

import torch
import clip
from tqdm import tqdm
from vbench.utils import load_video, load_dimension_info, clip_transform, read_frames_decord_by_fps, CACHE_DIR
from vbench.third_party.ViCLIP.viclip import ViCLIP
from vbench.third_party.ViCLIP.simple_tokenizer import SimpleTokenizer

def get_text_features(model, input_text, tokenizer, text_feature_dict={}):
    """Return the L2-normalised text embedding of ``input_text``, cached by text.

    The embedding comes from ``model.encode_text``. ``tokenizer`` is not used
    here. ``text_feature_dict`` is the cache; when omitted, the single default
    dict is shared by every call.
    """
    if input_text not in text_feature_dict:
        with torch.no_grad():
            embedding = model.encode_text(f"{input_text}").float()
            # normalise in place along the last dimension
            embedding.div_(embedding.norm(dim=-1, keepdim=True))
        text_feature_dict[input_text] = embedding
    return text_feature_dict[input_text]

def get_vid_features(model, input_frames):
    """Return ``model.encode_vision(input_frames, test=True)`` L2-normalised along the last dim."""
    with torch.no_grad():
        features = model.encode_vision(input_frames, test=True).float()
        features.div_(features.norm(dim=-1, keepdim=True))
    return features

def get_predict_label(clip_feature, text_feats_tensor, top=5):
    """Return the ``top`` highest softmax probabilities and their label indices.

    The logits are ``100.0 * clip_feature @ text_feats_tensor.T``; the softmax
    is taken over the last dimension and the result is moved to the CPU.
    """
    logits = 100.0 * clip_feature @ text_feats_tensor.T
    label_probs = logits.softmax(dim=-1).cpu()
    top_probs, top_labels = label_probs.topk(top, dim=-1)
    return top_probs, top_labels

def temporal_style(clip_model, video_dict, tokenizer, device, sample="middle"):
    """Score text/video agreement as the dot product of normalised video and text features.

    Args:
        clip_model: model passed to ``get_vid_features`` / ``get_text_features``.
        video_dict: iterable of dicts with ``'prompt'`` and ``'video_list'``.
        tokenizer: forwarded to ``get_text_features``.
        device: device the transformed frames are moved to.
        sample: forwarded to ``read_frames_decord_by_fps`` as ``sample``.

    Returns:
        ``(avg_score, video_results)``: the mean score over all videos and one
        ``{'video_path', 'video_results'}`` dict per video.
    """
    sim = []
    video_results = []
    image_transform = clip_transform(224)
    for info in tqdm(video_dict):
        query = info['prompt']
        # The former `clip.tokenize([query])` call was removed: its result was
        # never used, and it could fail on prompts longer than CLIP's context.
        video_list = info['video_list']
        for video_path in video_list:
            with torch.no_grad():
                images = read_frames_decord_by_fps(video_path, num_frames=8, sample=sample)
                images = image_transform(images)
                images = images.to(device)
                clip_feat = get_vid_features(clip_model,images.unsqueeze(0))
                text_feat = get_text_features(clip_model, query, tokenizer)
                logit_per_text =  clip_feat @ text_feat.T
                score_per_video =  float(logit_per_text[0][0].cpu())
                sim.append(score_per_video)
                video_results.append({'video_path': video_path, 'video_results': score_per_video})
    avg_score = np.mean(sim)
    return avg_score, video_results

def compute_temporal_style(json_dir, device, submodules_list, **kwargs):
    """Build ViCLIP with its tokenizer and evaluate the 'temporal_style' dimension."""
    vocab_path = os.path.join(CACHE_DIR, "ViCLIP/bpe_simple_vocab_16e6.txt.gz")
    tokenizer = SimpleTokenizer(vocab_path)
    viclip = ViCLIP(tokenizer= tokenizer, **submodules_list).to(device)
    _unused, video_dict = load_dimension_info(json_dir, dimension='temporal_style', lang='en')
    return temporal_style(viclip, video_dict, tokenizer, device)


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/0.txt
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/core/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/core/corr.py
================================================
import torch
import torch.nn.functional as F
from .utils_core.utils import bilinear_sampler, coords_grid

try:
    import alt_cuda_corr
except:
    # alt_cuda_corr is not compiled
    pass


class CorrBlock:
    """All-pairs correlation volume with an average-pooled pyramid and windowed lookup."""

    def __init__(self, fmap1, fmap2, num_levels=4, radius=4):
        """Build the correlation volume of the two feature maps and pool it ``num_levels - 1`` times."""
        self.num_levels = num_levels
        self.radius = radius

        # all pairs correlation, flattened so that every source pixel is one batch item
        volume = CorrBlock.corr(fmap1, fmap2)
        batch, h1, w1, dim, h2, w2 = volume.shape
        level = volume.reshape(batch*h1*w1, dim, h2, w2)

        self.corr_pyramid = [level]
        for _ in range(self.num_levels-1):
            level = F.avg_pool2d(level, 2, stride=2)
            self.corr_pyramid.append(level)

    def __call__(self, coords):
        """Sample a ``(2r+1) x (2r+1)`` window around ``coords`` at every pyramid level.

        The samples of all levels are concatenated along the channel dimension
        and returned as a contiguous float tensor.
        """
        r = self.radius
        coords = coords.permute(0, 2, 3, 1)
        batch, h1, w1, _ = coords.shape

        # The window offsets are the same at every level, so build them once.
        dx = torch.linspace(-r, r, 2*r+1, device=coords.device)
        dy = torch.linspace(-r, r, 2*r+1, device=coords.device)
        delta = torch.stack(torch.meshgrid(dy, dx), axis=-1)
        delta_lvl = delta.view(1, 2*r+1, 2*r+1, 2)

        out_pyramid = []
        for i in range(self.num_levels):
            # coordinates are divided by 2**i to address the pooled level
            centroid_lvl = coords.reshape(batch*h1*w1, 1, 1, 2) / 2**i
            sampled = bilinear_sampler(self.corr_pyramid[i], centroid_lvl + delta_lvl)
            out_pyramid.append(sampled.view(batch, h1, w1, -1))

        stacked = torch.cat(out_pyramid, dim=-1)
        return stacked.permute(0, 3, 1, 2).contiguous().float()

    @staticmethod
    def corr(fmap1, fmap2):
        """Return the dot product of every pixel pair, scaled by ``1 / sqrt(dim)``.

        The result has shape ``(batch, ht, wd, 1, ht, wd)``.
        """
        batch, dim, ht, wd = fmap1.shape
        flat1 = fmap1.view(batch, dim, ht*wd)
        flat2 = fmap2.view(batch, dim, ht*wd)

        volume = torch.matmul(flat1.transpose(1,2), flat2)
        volume = volume.view(batch, ht, wd, 1, ht, wd)
        return volume  / torch.sqrt(torch.tensor(dim).float())


class AlternateCorrBlock:
    """Correlation lookup that calls the ``alt_cuda_corr`` extension instead of storing a full volume."""

    def __init__(self, fmap1, fmap2, num_levels=4, radius=4):
        """Store the feature-map pair followed by ``num_levels`` average-pooled copies of it."""
        self.num_levels = num_levels
        self.radius = radius

        levels = [(fmap1, fmap2)]
        for _ in range(self.num_levels):
            fmap1 = F.avg_pool2d(fmap1, 2, stride=2)
            fmap2 = F.avg_pool2d(fmap2, 2, stride=2)
            levels.append((fmap1, fmap2))
        self.pyramid = levels

    def __call__(self, coords):
        """Return the correlation features around ``coords``, scaled by ``1 / sqrt(dim)``.

        At every level the first feature map of level 0 is correlated with the
        second feature map of level ``i``.
        """
        coords = coords.permute(0, 2, 3, 1)
        B, H, W, _ = coords.shape
        dim = self.pyramid[0][0].shape[1]
        r = self.radius

        corr_list = []
        for i in range(self.num_levels):
            fmap1_i = self.pyramid[0][0].permute(0, 2, 3, 1).contiguous()
            fmap2_i = self.pyramid[i][1].permute(0, 2, 3, 1).contiguous()

            coords_i = (coords / 2**i).reshape(B, 1, H, W, 2).contiguous()
            corr, = alt_cuda_corr.forward(fmap1_i, fmap2_i, coords_i, r)
            corr_list.append(corr.squeeze(1))

        stacked = torch.stack(corr_list, dim=1).reshape(B, -1, H, W)
        return stacked / torch.sqrt(torch.tensor(dim).float())


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/core/datasets.py
================================================
# Data loading based on https://github.com/NVIDIA/flownet2-pytorch

import numpy as np
import torch
import torch.utils.data as data
import torch.nn.functional as F

import os
import math
import random
from glob import glob
import os.path as osp

from utils_core import frame_utils
from utils_core.augmentor import FlowAugmentor, SparseFlowAugmentor


class FlowDataset(data.Dataset):
    """Base dataset of image pairs with optical-flow ground truth.

    Subclasses fill ``image_list`` (pairs of image paths), ``flow_list``
    (flow file paths) and, for test splits, ``extra_info``.
    """

    def __init__(self, aug_params=None, sparse=False):
        self.sparse = sparse
        self.augmentor = None
        if aug_params is not None:
            augmentor_cls = SparseFlowAugmentor if sparse else FlowAugmentor
            self.augmentor = augmentor_cls(**aug_params)

        self.is_test = False
        self.init_seed = False
        self.flow_list = []
        self.image_list = []
        self.extra_info = []

    @staticmethod
    def _to_chw_float(array):
        # HWC numpy array -> CHW float tensor
        return torch.from_numpy(array).permute(2, 0, 1).float()

    def __getitem__(self, index):
        """Return one sample.

        In test mode: ``(img1, img2, extra_info[index])``.
        Otherwise: ``(img1, img2, flow, valid)`` with ``valid`` as a float mask.
        """
        if self.is_test:
            first_path, second_path = self.image_list[index][0], self.image_list[index][1]
            frame1 = frame_utils.read_gen(first_path)
            frame2 = frame_utils.read_gen(second_path)
            frame1 = np.array(frame1).astype(np.uint8)[..., :3]
            frame2 = np.array(frame2).astype(np.uint8)[..., :3]
            return self._to_chw_float(frame1), self._to_chw_float(frame2), self.extra_info[index]

        # Seed the RNGs once per data-loader worker, using the worker id.
        if not self.init_seed:
            worker = torch.utils.data.get_worker_info()
            if worker is not None:
                torch.manual_seed(worker.id)
                np.random.seed(worker.id)
                random.seed(worker.id)
                self.init_seed = True

        index %= len(self.image_list)
        if self.sparse:
            flow, valid = frame_utils.readFlowKITTI(self.flow_list[index])
        else:
            flow, valid = frame_utils.read_gen(self.flow_list[index]), None

        img1 = frame_utils.read_gen(self.image_list[index][0])
        img2 = frame_utils.read_gen(self.image_list[index][1])

        flow = np.array(flow).astype(np.float32)
        img1 = np.array(img1).astype(np.uint8)
        img2 = np.array(img2).astype(np.uint8)

        if img1.ndim == 2:
            # grayscale input: replicate the single channel three times
            img1 = np.tile(img1[..., None], (1, 1, 3))
            img2 = np.tile(img2[..., None], (1, 1, 3))
        else:
            img1 = img1[..., :3]
            img2 = img2[..., :3]

        if self.augmentor is not None:
            if self.sparse:
                img1, img2, flow, valid = self.augmentor(img1, img2, flow, valid)
            else:
                img1, img2, flow = self.augmentor(img1, img2, flow)

        img1 = self._to_chw_float(img1)
        img2 = self._to_chw_float(img2)
        flow = self._to_chw_float(flow)

        if valid is None:
            # no explicit mask: accept pixels whose flow components are below 1000
            valid = (flow[0].abs() < 1000) & (flow[1].abs() < 1000)
        else:
            valid = torch.from_numpy(valid)

        return img1, img2, flow, valid.float()

    def __rmul__(self, v):
        """Repeat the sample lists ``v`` times in place (``v * dataset``) and return self."""
        self.flow_list = v * self.flow_list
        self.image_list = v * self.image_list
        return self

    def __len__(self):
        return len(self.image_list)
        

class MpiSintel(FlowDataset):
    """MPI Sintel image pairs, collected scene by scene from ``root/split/dstype``."""

    def __init__(self, aug_params=None, split='training', root='datasets/Sintel', dstype='clean'):
        super().__init__(aug_params)
        image_root = osp.join(root, split, dstype)
        flow_root = osp.join(root, split, 'flow')

        is_test_split = split == 'test'
        if is_test_split:
            self.is_test = True

        for scene in os.listdir(image_root):
            frames = sorted(glob(osp.join(image_root, scene, '*.png')))
            # consecutive frames form one sample
            for frame_id, (current, following) in enumerate(zip(frames[:-1], frames[1:])):
                self.image_list.append([current, following])
                self.extra_info.append((scene, frame_id))  # scene and frame_id

            if not is_test_split:
                self.flow_list.extend(sorted(glob(osp.join(flow_root, scene, '*.flo'))))


class FlyingChairs(FlowDataset):
    """FlyingChairs image pairs selected through ``chairs_split.txt``.

    ``split`` may be 'training' (or its alias 'train', which is the default
    value of the parameter) or 'validation'. Any other value selects nothing.
    ``chairs_split.txt`` is read from the current working directory and holds
    one label per flow file: 1 marks a training sample, 2 a validation sample.
    """

    def __init__(self, aug_params=None, split='train', root='datasets/FlyingChairs_release/data'):
        super(FlyingChairs, self).__init__(aug_params)

        images = sorted(glob(osp.join(root, '*.ppm')))
        flows = sorted(glob(osp.join(root, '*.flo')))
        # every flow file must come with exactly two images
        assert (len(images)//2 == len(flows))

        # The default 'train' previously matched neither branch and produced
        # an empty dataset; it is now treated as 'training'.
        wanted_label = {'train': 1, 'training': 1, 'validation': 2}.get(split)

        split_list = np.loadtxt('chairs_split.txt', dtype=np.int32)
        for i in range(len(flows)):
            if wanted_label is not None and split_list[i] == wanted_label:
                self.flow_list += [ flows[i] ]
                self.image_list += [ [images[2*i], images[2*i+1]] ]


class FlyingThings3D(FlowDataset):
    """FlyingThings3D TRAIN samples for the left camera, in both time directions."""

    def __init__(self, aug_params=None, root='datasets/FlyingThings3D', dstype='frames_cleanpass'):
        super().__init__(aug_params)

        for cam in ['left']:
            for direction in ['into_future', 'into_past']:
                scene_dirs = sorted(glob(osp.join(root, dstype, 'TRAIN/*/*')))
                image_dirs = sorted(osp.join(scene, cam) for scene in scene_dirs)

                flow_scene_dirs = sorted(glob(osp.join(root, 'optical_flow/TRAIN/*/*')))
                flow_dirs = sorted(osp.join(scene, direction, cam) for scene in flow_scene_dirs)

                for image_dir, flow_dir in zip(image_dirs, flow_dirs):
                    images = sorted(glob(osp.join(image_dir, '*.png')))
                    flows = sorted(glob(osp.join(flow_dir, '*.pfm')))
                    for i in range(len(flows) - 1):
                        if direction == 'into_future':
                            # frame i -> frame i+1, flow stored with frame i
                            self.image_list.append([images[i], images[i + 1]])
                            self.flow_list.append(flows[i])
                        elif direction == 'into_past':
                            # frame i+1 -> frame i, flow stored with frame i+1
                            self.image_list.append([images[i + 1], images[i]])
                            self.flow_list.append(flows[i + 1])
      

class KITTI(FlowDataset):
    """KITTI image pairs (``*_10.png`` / ``*_11.png``) with sparse flow.

    The 'testing' split switches the dataset into test mode; the 'training'
    split additionally collects the ``flow_occ/*_10.png`` ground-truth files.
    ``extra_info`` holds the file name of the first image of each pair.
    """

    def __init__(self, aug_params=None, split='training', root='datasets/KITTI'):
        super(KITTI, self).__init__(aug_params, sparse=True)
        if split == 'testing':
            self.is_test = True

        root = osp.join(root, split)
        images1 = sorted(glob(osp.join(root, 'image_2/*_10.png')))
        images2 = sorted(glob(osp.join(root, 'image_2/*_11.png')))

        for img1, img2 in zip(images1, images2):
            # basename() handles the platform's path separator, unlike split('/')
            frame_id = osp.basename(img1)
            self.extra_info += [ [frame_id] ]
            self.image_list += [ [img1, img2] ]

        if split == 'training':
            self.flow_list = sorted(glob(osp.join(root, 'flow_occ/*_10.png')))


class HD1K(FlowDataset):
    """HD1K samples with sparse flow, gathered sequence by sequence."""

    def __init__(self, aug_params=None, root='datasets/HD1k'):
        super().__init__(aug_params, sparse=True)

        seq_ix = 0
        while True:
            flow_pattern = os.path.join(root, 'hd1k_flow_gt', 'flow_occ/%06d_*.png' % seq_ix)
            image_pattern = os.path.join(root, 'hd1k_input', 'image_2/%06d_*.png' % seq_ix)
            flows = sorted(glob(flow_pattern))
            images = sorted(glob(image_pattern))

            # the first sequence index without flow files ends the scan
            if not flows:
                break

            for i in range(len(flows) - 1):
                self.flow_list.append(flows[i])
                self.image_list.append([images[i], images[i + 1]])

            seq_ix += 1


def fetch_dataloader(args, TRAIN_DS='C+T+K+S+H'):
    """Create the data loader for the corresponding training set.

    Args:
        args: object providing ``stage`` ('chairs', 'things', 'sintel' or
            'kitti'), ``image_size`` (used as augmentation crop size) and
            ``batch_size``.
        TRAIN_DS: dataset mix for the 'sintel' stage, either 'C+T+K+S+H' or
            'C+T+K/S'. Ignored by the other stages.

    Returns:
        A shuffling ``DataLoader`` over the selected training dataset.

    Raises:
        ValueError: if ``args.stage`` is unknown, or ``TRAIN_DS`` is unknown
            in the 'sintel' stage. Previously these cases failed later with an
            unbound ``train_dataset`` variable.
    """

    if args.stage == 'chairs':
        aug_params = {'crop_size': args.image_size, 'min_scale': -0.1, 'max_scale': 1.0, 'do_flip': True}
        train_dataset = FlyingChairs(aug_params, split='training')

    elif args.stage == 'things':
        aug_params = {'crop_size': args.image_size, 'min_scale': -0.4, 'max_scale': 0.8, 'do_flip': True}
        clean_dataset = FlyingThings3D(aug_params, dstype='frames_cleanpass')
        final_dataset = FlyingThings3D(aug_params, dstype='frames_finalpass')
        train_dataset = clean_dataset + final_dataset

    elif args.stage == 'sintel':
        # fail before any dataset is scanned if the mix is not recognised
        if TRAIN_DS not in ('C+T+K+S+H', 'C+T+K/S'):
            raise ValueError('Unsupported TRAIN_DS for sintel stage: %r' % (TRAIN_DS,))

        aug_params = {'crop_size': args.image_size, 'min_scale': -0.2, 'max_scale': 0.6, 'do_flip': True}
        things = FlyingThings3D(aug_params, dstype='frames_cleanpass')
        sintel_clean = MpiSintel(aug_params, split='training', dstype='clean')
        sintel_final = MpiSintel(aug_params, split='training', dstype='final')

        if TRAIN_DS == 'C+T+K+S+H':
            kitti = KITTI({'crop_size': args.image_size, 'min_scale': -0.3, 'max_scale': 0.5, 'do_flip': True})
            hd1k = HD1K({'crop_size': args.image_size, 'min_scale': -0.5, 'max_scale': 0.2, 'do_flip': True})
            # integer factors repeat a dataset's samples to balance the mix
            train_dataset = 100*sintel_clean + 100*sintel_final + 200*kitti + 5*hd1k + things

        else:  # 'C+T+K/S'
            train_dataset = 100*sintel_clean + 100*sintel_final + things

    elif args.stage == 'kitti':
        aug_params = {'crop_size': args.image_size, 'min_scale': -0.2, 'max_scale': 0.4, 'do_flip': False}
        train_dataset = KITTI(aug_params, split='training')

    else:
        raise ValueError('Unsupported training stage: %r' % (args.stage,))

    train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size,
        pin_memory=False, shuffle=True, num_workers=4, drop_last=True)

    print('Training with %d image pairs' % len(train_dataset))
    return train_loader


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/core/extractor.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F


class ResidualBlock(nn.Module):
    """Two 3x3 convolutions with a skip connection.

    ``norm_fn`` selects the normalisation ('group', 'batch', 'instance' or
    'none'). When ``stride`` is not 1 the skip path becomes a strided 1x1
    convolution followed by its own normalisation layer.
    """

    def __init__(self, in_planes, planes, norm_fn='group', stride=1):
        super().__init__()

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8
        strided = stride != 1

        norm_factories = {
            'group': lambda: nn.GroupNorm(num_groups=num_groups, num_channels=planes),
            'batch': lambda: nn.BatchNorm2d(planes),
            'instance': lambda: nn.InstanceNorm2d(planes),
            'none': lambda: nn.Sequential(),
        }
        make_norm = norm_factories.get(norm_fn)
        if make_norm is not None:
            self.norm1 = make_norm()
            self.norm2 = make_norm()
            if strided:
                self.norm3 = make_norm()

        if strided:
            projection = nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride)
            self.downsample = nn.Sequential(projection, self.norm3)
        else:
            self.downsample = None

    def forward(self, x):
        branch = self.relu(self.norm1(self.conv1(x)))
        branch = self.relu(self.norm2(self.conv2(branch)))

        shortcut = x if self.downsample is None else self.downsample(x)
        return self.relu(shortcut + branch)


class BottleneckBlock(nn.Module):
    """1x1 -> 3x3 -> 1x1 bottleneck with a skip connection.

    The two inner convolutions work on ``planes // 4`` channels. ``norm_fn``
    selects the normalisation ('group', 'batch', 'instance' or 'none'). When
    ``stride`` is not 1 the skip path becomes a strided 1x1 convolution
    followed by its own normalisation layer.
    """

    def __init__(self, in_planes, planes, norm_fn='group', stride=1):
        super().__init__()

        inner = planes // 4
        self.conv1 = nn.Conv2d(in_planes, inner, kernel_size=1, padding=0)
        self.conv2 = nn.Conv2d(inner, inner, kernel_size=3, padding=1, stride=stride)
        self.conv3 = nn.Conv2d(inner, planes, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8
        strided = stride != 1

        norm_factories = {
            'group': lambda channels: nn.GroupNorm(num_groups=num_groups, num_channels=channels),
            'batch': lambda channels: nn.BatchNorm2d(channels),
            'instance': lambda channels: nn.InstanceNorm2d(channels),
            'none': lambda channels: nn.Sequential(),
        }
        make_norm = norm_factories.get(norm_fn)
        if make_norm is not None:
            self.norm1 = make_norm(inner)
            self.norm2 = make_norm(inner)
            self.norm3 = make_norm(planes)
            if strided:
                self.norm4 = make_norm(planes)

        if strided:
            projection = nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride)
            self.downsample = nn.Sequential(projection, self.norm4)
        else:
            self.downsample = None

    def forward(self, x):
        branch = self.relu(self.norm1(self.conv1(x)))
        branch = self.relu(self.norm2(self.conv2(branch)))
        branch = self.relu(self.norm3(self.conv3(branch)))

        shortcut = x if self.downsample is None else self.downsample(x)
        return self.relu(shortcut + branch)

class BasicEncoder(nn.Module):
    """Convolutional encoder built from ``ResidualBlock`` stages.

    A strided 7x7 stem is followed by three two-block stages (64, 96 and 128
    channels; the last two use stride 2) and a 1x1 output convolution to
    ``output_dim`` channels.
    """

    def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0):
        super().__init__()
        self.norm_fn = norm_fn

        stem_norms = {
            'group': lambda: nn.GroupNorm(num_groups=8, num_channels=64),
            'batch': lambda: nn.BatchNorm2d(64),
            'instance': lambda: nn.InstanceNorm2d(64),
            'none': lambda: nn.Sequential(),
        }
        make_stem_norm = stem_norms.get(self.norm_fn)
        if make_stem_norm is not None:
            self.norm1 = make_stem_norm()

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        # _make_layer reads and then updates self.in_planes
        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(96, stride=2)
        self.layer3 = self._make_layer(128, stride=2)

        # output convolution
        self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)

        self._init_weights()

    def _init_weights(self):
        # Kaiming-normal for convolutions; unit weight / zero bias for norm layers
        norm_types = (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                continue
            if isinstance(module, norm_types):
                if module.weight is not None:
                    nn.init.constant_(module.weight, 1)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

    def _make_layer(self, dim, stride=1):
        first = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        second = ResidualBlock(dim, dim, self.norm_fn, stride=1)

        self.in_planes = dim
        return nn.Sequential(first, second)

    def forward(self, x):
        """Encode a tensor, or a list/tuple of two tensors processed as one batch."""
        paired = isinstance(x, (tuple, list))
        if paired:
            # concatenate along the batch dimension and split again at the end
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.relu1(self.norm1(self.conv1(x)))
        x = self.layer3(self.layer2(self.layer1(x)))
        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if paired:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)

        return x


class SmallEncoder(nn.Module):
    """Lighter convolutional encoder built from ``BottleneckBlock`` stages.

    A strided 7x7 stem is followed by three two-block stages (32, 64 and 96
    channels; the last two use stride 2) and a 1x1 output convolution to
    ``output_dim`` channels.
    """

    def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0):
        super().__init__()
        self.norm_fn = norm_fn

        stem_norms = {
            'group': lambda: nn.GroupNorm(num_groups=8, num_channels=32),
            'batch': lambda: nn.BatchNorm2d(32),
            'instance': lambda: nn.InstanceNorm2d(32),
            'none': lambda: nn.Sequential(),
        }
        make_stem_norm = stem_norms.get(self.norm_fn)
        if make_stem_norm is not None:
            self.norm1 = make_stem_norm()

        self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        # _make_layer reads and then updates self.in_planes
        self.in_planes = 32
        self.layer1 = self._make_layer(32, stride=1)
        self.layer2 = self._make_layer(64, stride=2)
        self.layer3 = self._make_layer(96, stride=2)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)

        # output convolution
        self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1)

        self._init_weights()

    def _init_weights(self):
        # Kaiming-normal for convolutions; unit weight / zero bias for norm layers
        norm_types = (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                continue
            if isinstance(module, norm_types):
                if module.weight is not None:
                    nn.init.constant_(module.weight, 1)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

    def _make_layer(self, dim, stride=1):
        first = BottleneckBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        second = BottleneckBlock(dim, dim, self.norm_fn, stride=1)

        self.in_planes = dim
        return nn.Sequential(first, second)

    def forward(self, x):
        """Encode a tensor, or a list/tuple of two tensors processed as one batch."""
        paired = isinstance(x, (tuple, list))
        if paired:
            # concatenate along the batch dimension and split again at the end
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.relu1(self.norm1(self.conv1(x)))
        x = self.layer3(self.layer2(self.layer1(x)))
        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if paired:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)

        return x


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/core/raft.py
================================================
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from .update import BasicUpdateBlock, SmallUpdateBlock
from .extractor import BasicEncoder, SmallEncoder
from .corr import CorrBlock, AlternateCorrBlock
from .utils_core.utils import bilinear_sampler, coords_grid, upflow8

try:
    autocast = torch.cuda.amp.autocast
except AttributeError:
    # Only a missing attribute means "this PyTorch has no autocast"; a bare
    # except would also hide unrelated failures such as KeyboardInterrupt.
    # dummy autocast for PyTorch < 1.6
    class autocast:
        """No-op context manager accepting the ``enabled`` argument used by callers."""

        def __init__(self, enabled):
            pass

        def __enter__(self):
            pass

        def __exit__(self, *args):
            pass


class RAFT(nn.Module):
    """Iterative optical-flow estimator.

    ``args`` must provide ``small`` and support ``in`` tests and attribute
    assignment. The constructor writes ``corr_levels`` and ``corr_radius``
    onto it and fills in defaults for ``dropout``, ``alternate_corr`` and
    ``mixed_precision`` when they are absent.
    """

    def __init__(self, args):
        super(RAFT, self).__init__()
        self.args = args

        if args.small:
            self.hidden_dim = hdim = 96
            self.context_dim = cdim = 64
            args.corr_levels = 4
            args.corr_radius = 3

        else:
            self.hidden_dim = hdim = 128
            self.context_dim = cdim = 128
            args.corr_levels = 4
            args.corr_radius = 4

        if 'dropout' not in self.args:
            self.args.dropout = 0

        if 'alternate_corr' not in self.args:
            self.args.alternate_corr = False

        # forward() reads this flag unconditionally, so default it like the
        # other optional settings instead of failing on first use.
        if 'mixed_precision' not in self.args:
            self.args.mixed_precision = False

        # feature network, context network, and update block
        if args.small:
            self.fnet = SmallEncoder(output_dim=128, norm_fn='instance', dropout=args.dropout)
            self.cnet = SmallEncoder(output_dim=hdim+cdim, norm_fn='none', dropout=args.dropout)
            self.update_block = SmallUpdateBlock(self.args, hidden_dim=hdim)

        else:
            self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', dropout=args.dropout)
            self.cnet = BasicEncoder(output_dim=hdim+cdim, norm_fn='batch', dropout=args.dropout)
            self.update_block = BasicUpdateBlock(self.args, hidden_dim=hdim)

    def freeze_bn(self):
        """Put every ``BatchNorm2d`` submodule into eval mode."""
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()

    def initialize_flow(self, img):
        """ Flow is represented as difference between two coordinate grids flow = coords1 - coords0"""
        N, C, H, W = img.shape
        # both grids are built at 1/8 of the image resolution
        coords0 = coords_grid(N, H//8, W//8, device=img.device)
        coords1 = coords_grid(N, H//8, W//8, device=img.device)

        # optical flow computed as difference: flow = coords1 - coords0
        return coords0, coords1

    def upsample_flow(self, flow, mask):
        """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination

        ``flow`` is (N, 2, H, W) and ``mask`` must be viewable as
        (N, 1, 9, 8, 8, H, W). The mask is soft-maxed over the 9 neighbours of
        each coarse pixel and used to weight the 3x3 neighbourhood of
        ``8 * flow``. Returns a (N, 2, 8*H, 8*W) tensor.
        """
        N, _, H, W = flow.shape
        mask = mask.view(N, 1, 9, 8, 8, H, W)
        mask = torch.softmax(mask, dim=2)

        # flow values are scaled by 8 to match the 8x larger output grid
        up_flow = F.unfold(8 * flow, [3,3], padding=1)
        up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)

        up_flow = torch.sum(mask * up_flow, dim=2)
        # (N, 2, 8, 8, H, W) -> (N, 2, H, 8, W, 8) so the reshape interleaves
        # each 8x8 sub-grid with its coarse pixel
        up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)
        return up_flow.reshape(N, 2, 8*H, 8*W)


    def forward(self, image1, image2, iters=12, flow_init=None, upsample=True, test_mode=False):
        """ Estimate optical flow between pair of frames

        Inputs are rescaled from [0, 255] to [-1, 1]. ``flow_init``, when
        given, is added to the initial target coordinates. ``upsample`` is
        accepted but not used by this method.

        Returns the list of upsampled flow predictions (one per iteration),
        or, when ``test_mode`` is true, the tuple
        ``(low-resolution flow, last upsampled flow)``.
        """

        image1 = 2 * (image1 / 255.0) - 1.0
        image2 = 2 * (image2 / 255.0) - 1.0

        image1 = image1.contiguous()
        image2 = image2.contiguous()

        hdim = self.hidden_dim
        cdim = self.context_dim

        # run the feature network
        with autocast(enabled=self.args.mixed_precision):
            fmap1, fmap2 = self.fnet([image1, image2])

        # the correlation volume is always built from float32 features
        fmap1 = fmap1.float()
        fmap2 = fmap2.float()
        if self.args.alternate_corr:
            corr_fn = AlternateCorrBlock(fmap1, fmap2, radius=self.args.corr_radius)
        else:
            corr_fn = CorrBlock(fmap1, fmap2, radius=self.args.corr_radius)

        # run the context network
        with autocast(enabled=self.args.mixed_precision):
            cnet = self.cnet(image1)
            # first hdim channels seed the hidden state, the rest are context input
            net, inp = torch.split(cnet, [hdim, cdim], dim=1)
            net = torch.tanh(net)
            inp = torch.relu(inp)

        coords0, coords1 = self.initialize_flow(image1)

        if flow_init is not None:
            coords1 = coords1 + flow_init

        flow_predictions = []
        for itr in range(iters):
            # gradients do not flow through the coordinates of earlier iterations
            coords1 = coords1.detach()
            corr = corr_fn(coords1) # index correlation volume

            flow = coords1 - coords0
            with autocast(enabled=self.args.mixed_precision):
                net, up_mask, delta_flow = self.update_block(net, inp, corr, flow)

            # F(t+1) = F(t) + \Delta(t)
            coords1 = coords1 + delta_flow

            # upsample predictions
            if up_mask is None:
                flow_up = upflow8(coords1 - coords0)
            else:
                flow_up = self.upsample_flow(coords1 - coords0, up_mask)

            flow_predictions.append(flow_up)

        if test_mode:
            return coords1 - coords0, flow_up

        return flow_predictions


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/core/update.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F


class FlowHead(nn.Module):
    """Two 3x3 convolutions mapping ``input_dim`` channels to a 2-channel output."""

    def __init__(self, input_dim=128, hidden_dim=256):
        super().__init__()
        self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1)
        self.conv2 = nn.Conv2d(hidden_dim, 2, 3, padding=1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        hidden = self.conv1(x)
        hidden = self.relu(hidden)
        return self.conv2(hidden)

class ConvGRU(nn.Module):
    """Gated recurrent unit whose gates are 3x3 convolutions."""

    def __init__(self, hidden_dim=128, input_dim=192+128):
        super().__init__()
        gate_in = hidden_dim + input_dim
        self.convz = nn.Conv2d(gate_in, hidden_dim, 3, padding=1)
        self.convr = nn.Conv2d(gate_in, hidden_dim, 3, padding=1)
        self.convq = nn.Conv2d(gate_in, hidden_dim, 3, padding=1)

    def forward(self, h, x):
        """Return the updated hidden state for hidden ``h`` and input ``x``."""
        stacked = torch.cat([h, x], dim=1)

        update_gate = torch.sigmoid(self.convz(stacked))
        reset_gate = torch.sigmoid(self.convr(stacked))
        candidate = torch.tanh(self.convq(torch.cat([reset_gate * h, x], dim=1)))

        return (1 - update_gate) * h + update_gate * candidate

class SepConvGRU(nn.Module):
    """Convolutional GRU applied in two passes: 1x5 kernels, then 5x1 kernels."""

    def __init__(self, hidden_dim=128, input_dim=192+128):
        super().__init__()
        gate_in = hidden_dim + input_dim

        # first pass: kernels spanning 5 columns
        self.convz1 = nn.Conv2d(gate_in, hidden_dim, (1,5), padding=(0,2))
        self.convr1 = nn.Conv2d(gate_in, hidden_dim, (1,5), padding=(0,2))
        self.convq1 = nn.Conv2d(gate_in, hidden_dim, (1,5), padding=(0,2))

        # second pass: kernels spanning 5 rows
        self.convz2 = nn.Conv2d(gate_in, hidden_dim, (5,1), padding=(2,0))
        self.convr2 = nn.Conv2d(gate_in, hidden_dim, (5,1), padding=(2,0))
        self.convq2 = nn.Conv2d(gate_in, hidden_dim, (5,1), padding=(2,0))

    @staticmethod
    def _gated_update(h, x, conv_z, conv_r, conv_q):
        # one GRU step using the given gate convolutions
        stacked = torch.cat([h, x], dim=1)
        update_gate = torch.sigmoid(conv_z(stacked))
        reset_gate = torch.sigmoid(conv_r(stacked))
        candidate = torch.tanh(conv_q(torch.cat([reset_gate * h, x], dim=1)))
        return (1 - update_gate) * h + update_gate * candidate

    def forward(self, h, x):
        """Return the hidden state after the horizontal and the vertical pass."""
        # horizontal
        h = self._gated_update(h, x, self.convz1, self.convr1, self.convq1)
        # vertical
        h = self._gated_update(h, x, self.convz2, self.convr2, self.convq2)
        return h

class SmallMotionEncoder(nn.Module):
    """Fuse correlation features and the current flow into motion features.

    The number of correlation input channels is
    ``args.corr_levels * (2 * args.corr_radius + 1) ** 2``. The output has
    82 channels: 80 fused channels followed by the 2 raw flow channels.
    """

    def __init__(self, args):
        super().__init__()
        window = 2 * args.corr_radius + 1
        cor_planes = args.corr_levels * window**2
        self.convc1 = nn.Conv2d(cor_planes, 96, 1, padding=0)
        self.convf1 = nn.Conv2d(2, 64, 7, padding=3)
        self.convf2 = nn.Conv2d(64, 32, 3, padding=1)
        self.conv = nn.Conv2d(128, 80, 3, padding=1)

    def forward(self, flow, corr):
        corr_feat = F.relu(self.convc1(corr))
        flow_feat = F.relu(self.convf2(F.relu(self.convf1(flow))))

        fused = F.relu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))
        # keep the raw flow alongside the fused features
        return torch.cat([fused, flow], dim=1)

class BasicMotionEncoder(nn.Module):
    """Fuse correlation features and the current flow into motion features.

    The number of correlation input channels is
    ``args.corr_levels * (2 * args.corr_radius + 1) ** 2``. The output has
    128 channels: 126 fused channels followed by the 2 raw flow channels.
    """

    def __init__(self, args):
        super().__init__()
        window = 2 * args.corr_radius + 1
        cor_planes = args.corr_levels * window**2
        self.convc1 = nn.Conv2d(cor_planes, 256, 1, padding=0)
        self.convc2 = nn.Conv2d(256, 192, 3, padding=1)
        self.convf1 = nn.Conv2d(2, 128, 7, padding=3)
        self.convf2 = nn.Conv2d(128, 64, 3, padding=1)
        self.conv = nn.Conv2d(64+192, 128-2, 3, padding=1)

    def forward(self, flow, corr):
        corr_feat = F.relu(self.convc2(F.relu(self.convc1(corr))))
        flow_feat = F.relu(self.convf2(F.relu(self.convf1(flow))))

        fused = F.relu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))
        # keep the raw flow alongside the fused features
        return torch.cat([fused, flow], dim=1)

class SmallUpdateBlock(nn.Module):
    """Update step of the small model: motion encoder, ConvGRU and flow head.

    ``forward`` returns ``(net, None, delta_flow)``; this block produces no
    upsampling mask.
    """

    def __init__(self, args, hidden_dim=96):
        super().__init__()
        self.encoder = SmallMotionEncoder(args)
        self.gru = ConvGRU(hidden_dim=hidden_dim, input_dim=82+64)
        self.flow_head = FlowHead(hidden_dim, hidden_dim=128)

    def forward(self, net, inp, corr, flow):
        motion = self.encoder(flow, corr)
        gru_input = torch.cat([inp, motion], dim=1)

        net = self.gru(net, gru_input)
        return net, None, self.flow_head(net)

class BasicUpdateBlock(nn.Module):
    """Update step of the full model: motion encoder, SepConvGRU, flow head and mask head.

    ``forward`` returns ``(net, mask, delta_flow)``. The ``input_dim`` and
    ``upsample`` parameters are accepted but not used.
    """

    def __init__(self, args, hidden_dim=128, input_dim=128):
        super().__init__()
        self.args = args
        self.encoder = BasicMotionEncoder(args)
        self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=128+hidden_dim)
        self.flow_head = FlowHead(hidden_dim, hidden_dim=256)

        # mask head: 64*9 output channels
        mask_layers = [
            nn.Conv2d(128, 256, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 64*9, 1, padding=0),
        ]
        self.mask = nn.Sequential(*mask_layers)

    def forward(self, net, inp, corr, flow, upsample=True):
        motion = self.encoder(flow, corr)
        gru_input = torch.cat([inp, motion], dim=1)

        net = self.gru(net, gru_input)
        delta_flow = self.flow_head(net)

        # scale mask to balance gradients
        mask = .25 * self.mask(net)
        return net, mask, delta_flow


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/core/utils_core/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/core/utils_core/augmentor.py
================================================
import numpy as np
import random
import math
from PIL import Image

import cv2
cv2.setNumThreads(0)
cv2.ocl.setUseOpenCL(False)

import torch
from torchvision.transforms import ColorJitter
import torch.nn.functional as F


class FlowAugmentor:
    """Photometric, occlusion and spatial augmentation for dense-flow samples.

    Calling the instance applies, in order, colour jitter, random erasing on
    the second image, and random rescale / flip / crop to ``crop_size``
    (height, width). All randomness comes from ``np.random`` plus whatever
    the ``ColorJitter`` transform uses internally.
    """

    def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=True):

        # spatial augmentation params
        # min_scale / max_scale are exponents: the scale is 2 ** uniform(min, max)
        self.crop_size = crop_size
        self.min_scale = min_scale
        self.max_scale = max_scale
        self.spatial_aug_prob = 0.8
        self.stretch_prob = 0.8
        self.max_stretch = 0.2

        # flip augmentation params
        self.do_flip = do_flip
        self.h_flip_prob = 0.5
        self.v_flip_prob = 0.1

        # photometric augmentation params
        self.photo_aug = ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.5/3.14)
        self.asymmetric_color_aug_prob = 0.2
        self.eraser_aug_prob = 0.5

    def color_transform(self, img1, img2):
        """ Photometric augmentation

        With probability ``asymmetric_color_aug_prob`` each image is jittered
        by its own call; otherwise both are stacked along the height axis and
        jittered in a single call. Returns the two uint8 images.
        """

        # asymmetric
        if np.random.rand() < self.asymmetric_color_aug_prob:
            img1 = np.array(self.photo_aug(Image.fromarray(img1)), dtype=np.uint8)
            img2 = np.array(self.photo_aug(Image.fromarray(img2)), dtype=np.uint8)

        # symmetric
        else:
            image_stack = np.concatenate([img1, img2], axis=0)
            image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8)
            img1, img2 = np.split(image_stack, 2, axis=0)

        return img1, img2

    def eraser_transform(self, img1, img2, bounds=(50, 100)):
        """ Occlusion augmentation

        With probability ``eraser_aug_prob`` paints one or two rectangles of
        ``img2`` (modified in place) with its mean colour. Rectangle sides are
        drawn from ``[bounds[0], bounds[1])``. ``img1`` is returned unchanged.
        """

        ht, wd = img1.shape[:2]
        if np.random.rand() < self.eraser_aug_prob:
            mean_color = np.mean(img2.reshape(-1, 3), axis=0)
            for _ in range(np.random.randint(1, 3)):
                x0 = np.random.randint(0, wd)
                y0 = np.random.randint(0, ht)
                dx = np.random.randint(bounds[0], bounds[1])
                dy = np.random.randint(bounds[0], bounds[1])
                # slicing clips rectangles that extend past the image border
                img2[y0:y0+dy, x0:x0+dx, :] = mean_color

        return img1, img2

    def spatial_transform(self, img1, img2, flow):
        """Randomly rescale, flip and crop the sample to ``crop_size``.

        Flow vectors are multiplied by the scale factors when resized and
        negated along the flipped axis. Raises ``ValueError`` (from
        ``np.random.randint``) if the image ends up smaller than the crop.
        """
        # randomly sample scale
        ht, wd = img1.shape[:2]
        # smallest scale that still leaves crop_size + 8 pixels on each axis
        min_scale = np.maximum(
            (self.crop_size[0] + 8) / float(ht),
            (self.crop_size[1] + 8) / float(wd))

        scale = 2 ** np.random.uniform(self.min_scale, self.max_scale)
        scale_x = scale
        scale_y = scale
        if np.random.rand() < self.stretch_prob:
            scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch)
            scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch)

        scale_x = np.clip(scale_x, min_scale, None)
        scale_y = np.clip(scale_y, min_scale, None)

        if np.random.rand() < self.spatial_aug_prob:
            # rescale the images
            img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            flow = cv2.resize(flow, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            flow = flow * [scale_x, scale_y]

        if self.do_flip:
            if np.random.rand() < self.h_flip_prob: # h-flip
                img1 = img1[:, ::-1]
                img2 = img2[:, ::-1]
                flow = flow[:, ::-1] * [-1.0, 1.0]

            if np.random.rand() < self.v_flip_prob: # v-flip
                img1 = img1[::-1, :]
                img2 = img2[::-1, :]
                flow = flow[::-1, :] * [1.0, -1.0]

        # randint's upper bound is exclusive: the +1 makes the last valid
        # offset reachable and allows images that already equal the crop size
        # (which previously raised "low >= high").
        y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0] + 1)
        x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1] + 1)

        img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]

        return img1, img2, flow

    def __call__(self, img1, img2, flow):
        """Apply all augmentations and return contiguous ``(img1, img2, flow)`` arrays."""
        img1, img2 = self.color_transform(img1, img2)
        img1, img2 = self.eraser_transform(img1, img2)
        img1, img2, flow = self.spatial_transform(img1, img2, flow)

        # flips produce negative-stride views; make them contiguous
        img1 = np.ascontiguousarray(img1)
        img2 = np.ascontiguousarray(img2)
        flow = np.ascontiguousarray(flow)

        return img1, img2, flow

class SparseFlowAugmentor:
    """Photometric and spatial augmentation for image pairs with sparse flow.

    The flow field is accompanied by a validity mask, so rescaling scatters
    the valid flow vectors onto the new grid (``resize_sparse_flow_map``)
    instead of interpolating them.
    """

    def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=False):
        """Store the augmentation hyper-parameters.

        Args:
            crop_size: ``(height, width)`` of the crops that are returned.
            min_scale: lower bound of the sampled log2 scale factor.
            max_scale: upper bound of the sampled log2 scale factor.
            do_flip: whether random horizontal flipping is enabled.
        """
        # spatial augmentation params
        self.crop_size = crop_size
        self.min_scale = min_scale
        self.max_scale = max_scale
        self.spatial_aug_prob = 0.8
        self.stretch_prob = 0.8
        self.max_stretch = 0.2

        # flip augmentation params
        self.do_flip = do_flip
        self.h_flip_prob = 0.5
        self.v_flip_prob = 0.1

        # photometric augmentation params
        self.photo_aug = ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.3/3.14)
        self.asymmetric_color_aug_prob = 0.2
        self.eraser_aug_prob = 0.5

    def color_transform(self, img1, img2):
        """Apply one shared colour jitter to both images.

        The images are stacked along the height axis so that a single
        ``photo_aug`` call perturbs them identically, then split again.
        """
        image_stack = np.concatenate([img1, img2], axis=0)
        image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8)
        img1, img2 = np.split(image_stack, 2, axis=0)
        return img1, img2

    def eraser_transform(self, img1, img2):
        """With probability ``eraser_aug_prob`` paint 1-2 rectangles of ``img2``
        with its mean colour. ``img2`` is modified in place; ``img1`` is untouched.
        """
        ht, wd = img1.shape[:2]
        if np.random.rand() < self.eraser_aug_prob:
            mean_color = np.mean(img2.reshape(-1, 3), axis=0)
            for _ in range(np.random.randint(1, 3)):
                x0 = np.random.randint(0, wd)
                y0 = np.random.randint(0, ht)
                dx = np.random.randint(50, 100)
                dy = np.random.randint(50, 100)
                # slices that run past the border are truncated by numpy
                img2[y0:y0+dy, x0:x0+dx, :] = mean_color

        return img1, img2

    def resize_sparse_flow_map(self, flow, valid, fx=1.0, fy=1.0):
        """Rescale a sparse flow map by scattering its valid vectors.

        Args:
            flow: array of shape [H, W, 2].
            valid: array of shape [H, W]; entries ``>= 1`` mark valid flow.
            fx: horizontal scale factor.
            fy: vertical scale factor.

        Returns:
            ``(flow_img, valid_img)`` of shapes
            ``[round(H*fy), round(W*fx), 2]`` (float32) and
            ``[round(H*fy), round(W*fx)]`` (int32). Target pixels that
            receive no vector stay zero / invalid.
        """
        ht, wd = flow.shape[:2]
        coords = np.meshgrid(np.arange(wd), np.arange(ht))
        coords = np.stack(coords, axis=-1)

        coords = coords.reshape(-1, 2).astype(np.float32)
        flow = flow.reshape(-1, 2).astype(np.float32)
        valid = valid.reshape(-1).astype(np.float32)

        coords0 = coords[valid>=1]
        flow0 = flow[valid>=1]

        ht1 = int(round(ht * fy))
        wd1 = int(round(wd * fx))

        # both the positions and the vectors scale with the image
        coords1 = coords0 * [fx, fy]
        flow1 = flow0 * [fx, fy]

        xx = np.round(coords1[:,0]).astype(np.int32)
        yy = np.round(coords1[:,1]).astype(np.int32)

        # Keep every target that lands inside the new grid. Index 0 is a
        # legal row/column, so the lower bound must be inclusive; a strict
        # ``> 0`` would silently discard all vectors on the top/left border.
        v = (xx >= 0) & (xx < wd1) & (yy >= 0) & (yy < ht1)
        xx = xx[v]
        yy = yy[v]
        flow1 = flow1[v]

        flow_img = np.zeros([ht1, wd1, 2], dtype=np.float32)
        valid_img = np.zeros([ht1, wd1], dtype=np.int32)

        flow_img[yy, xx] = flow1
        valid_img[yy, xx] = 1

        return flow_img, valid_img

    def spatial_transform(self, img1, img2, flow, valid):
        """Randomly rescale, flip and crop the sample to ``crop_size``.

        Returns:
            ``(img1, img2, flow, valid)`` cropped to ``crop_size``.
        """
        # randomly sample scale

        ht, wd = img1.shape[:2]
        # smallest scale that still leaves room for a crop
        min_scale = np.maximum(
            (self.crop_size[0] + 1) / float(ht),
            (self.crop_size[1] + 1) / float(wd))

        scale = 2 ** np.random.uniform(self.min_scale, self.max_scale)
        scale_x = np.clip(scale, min_scale, None)
        scale_y = np.clip(scale, min_scale, None)

        if np.random.rand() < self.spatial_aug_prob:
            # rescale the images
            img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            flow, valid = self.resize_sparse_flow_map(flow, valid, fx=scale_x, fy=scale_y)

        if self.do_flip:
            # use the configured probability rather than a literal 0.5
            if np.random.rand() < self.h_flip_prob: # h-flip
                img1 = img1[:, ::-1]
                img2 = img2[:, ::-1]
                # mirroring negates the horizontal flow component
                flow = flow[:, ::-1] * [-1.0, 1.0]
                valid = valid[:, ::-1]

        margin_y = 20
        margin_x = 50

        # Sample the crop origin from a range widened by the margins and clip
        # it back afterwards, which makes border-aligned crops more likely.
        # NOTE(review): the y range only extends past the bottom edge while
        # the x range extends on both sides - confirm this is intended.
        y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0] + margin_y)
        x0 = np.random.randint(-margin_x, img1.shape[1] - self.crop_size[1] + margin_x)

        y0 = np.clip(y0, 0, img1.shape[0] - self.crop_size[0])
        x0 = np.clip(x0, 0, img1.shape[1] - self.crop_size[1])

        img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        valid = valid[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        return img1, img2, flow, valid

    def __call__(self, img1, img2, flow, valid):
        """Apply colour, eraser and spatial augmentation; return contiguous arrays."""
        img1, img2 = self.color_transform(img1, img2)
        img1, img2 = self.eraser_transform(img1, img2)
        img1, img2, flow, valid = self.spatial_transform(img1, img2, flow, valid)

        img1 = np.ascontiguousarray(img1)
        img2 = np.ascontiguousarray(img2)
        flow = np.ascontiguousarray(flow)
        valid = np.ascontiguousarray(valid)

        return img1, img2, flow, valid


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/core/utils_core/flow_viz.py
================================================
# Flow visualization code used from https://github.com/tomrunia/OpticalFlow_Visualization


# MIT License
#
# Copyright (c) 2018 Tom Runia
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to conditions.
#
# Author: Tom Runia
# Date Created: 2018-08-03

import numpy as np

def make_colorwheel():
    """
    Build the optical-flow colour wheel described in:
        Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
        URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf

    The wheel is a sequence of six hue transitions (R->Y, Y->G, G->C, C->B,
    B->M, M->R). Within each transition one channel is held at 255 while a
    second channel ramps up or down; the third stays at zero.

    Returns:
        np.ndarray: Color wheel of shape [55, 3] with values in [0, 255]
    """
    # (segment length, channel held at 255, ramping channel, ramp goes down?)
    segments = (
        (15, 0, 1, False),  # RY
        (6, 1, 0, True),    # YG
        (4, 1, 2, False),   # GC
        (11, 2, 1, True),   # CB
        (13, 2, 0, False),  # BM
        (6, 0, 2, True),    # MR
    )

    total = sum(length for length, _, _, _ in segments)
    wheel = np.zeros((total, 3))

    start = 0
    for length, held, ramped, descending in segments:
        stop = start + length
        ramp = np.floor(255*np.arange(0, length)/length)
        wheel[start:stop, held] = 255
        wheel[start:stop, ramped] = 255 - ramp if descending else ramp
        start = stop
    return wheel


def flow_uv_to_colors(u, v, convert_to_bgr=False):
    """
    Map flow components onto the colour wheel: the flow direction selects the
    hue (linearly interpolated between neighbouring wheel entries) and the
    flow magnitude controls the saturation.

    Follows the C++ source code of Daniel Scharstein and the Matlab source
    code of Deqing Sun.

    Args:
        u (np.ndarray): Input horizontal flow of shape [H,W]
        v (np.ndarray): Input vertical flow of shape [H,W]
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.

    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    height, width = u.shape[0], u.shape[1]
    image = np.zeros((height, width, 3), np.uint8)
    wheel = make_colorwheel()  # shape [55x3]
    n_hues = wheel.shape[0]

    magnitude = np.sqrt(np.square(u) + np.square(v))
    angle = np.arctan2(-v, -u)/np.pi
    # fractional position on the wheel, in [0, n_hues - 1]
    position = (angle+1) / 2*(n_hues-1)
    lower = np.floor(position).astype(np.int32)
    # neighbour above, wrapping round to the first wheel entry
    upper = np.where(lower + 1 == n_hues, 0, lower + 1)
    weight = position - lower
    in_range = (magnitude <= 1)

    for channel in range(wheel.shape[1]):
        column = wheel[:, channel]
        low_col = column[lower] / 255.0
        high_col = column[upper] / 255.0
        blended = (1-weight)*low_col + weight*high_col
        # small flow fades towards white; flow beyond unit magnitude is dimmed
        shaded = np.where(in_range, 1 - magnitude * (1-blended), blended * 0.75)
        # Note the 2-channel => BGR instead of RGB
        target = 2-channel if convert_to_bgr else channel
        image[:, :, target] = np.floor(255 * shaded)
    return image


def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False):
    """
    Convert a flow field of shape [H,W,2] into a colour visualization.

    The flow is normalized by its largest magnitude before being mapped to
    colours, so the output encodes direction and relative magnitude.

    Args:
        flow_uv (np.ndarray): Flow UV image of shape [H,W,2]
        clip_flow (float, optional): Non-negative bound; each flow component is
            clipped to [-clip_flow, clip_flow]. Defaults to None (no clipping).
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.

    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    assert flow_uv.ndim == 3, 'input flow must have three dimensions'
    assert flow_uv.shape[2] == 2, 'input flow must have shape [H,W,2]'
    if clip_flow is not None:
        # Flow components are signed. Clipping to [0, clip_flow] would zero
        # every leftward/upward component and lose the direction, so clip
        # symmetrically around zero instead.
        flow_uv = np.clip(flow_uv, -clip_flow, clip_flow)
    u = flow_uv[:,:,0]
    v = flow_uv[:,:,1]
    rad = np.sqrt(np.square(u) + np.square(v))
    rad_max = np.max(rad)
    # epsilon keeps the division finite for an all-zero flow field
    epsilon = 1e-5
    u = u / (rad_max + epsilon)
    v = v / (rad_max + epsilon)
    return flow_uv_to_colors(u, v, convert_to_bgr)

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/core/utils_core/frame_utils.py
================================================
import numpy as np
from PIL import Image
from os.path import *
import re

import cv2
# Run OpenCV sequentially and without OpenCL.
# NOTE(review): presumably to avoid clashes with multi-process data loading - confirm.
cv2.setNumThreads(0)
cv2.ocl.setUseOpenCL(False)

# Magic number that starts every .flo file: readFlow() checks for this value
# and writeFlow() writes it as the first four bytes.
TAG_CHAR = np.array([202021.25], np.float32)

def readFlow(fn):
    """Read a .flo file in Middlebury format.

    Args:
        fn: path of the file to read.

    Returns:
        np.ndarray of shape [h, w, 2] (float32), or ``None`` (after printing a
        message) when the file does not start with the .flo magic number.

    Raises:
        ValueError: if the file ends before the width/height header.
    """
    # Code adapted from:
    # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy

    # WARNING: this will work on little-endian architectures (eg Intel x86) only!
    with open(fn, 'rb') as f:
        magic = np.fromfile(f, np.float32, count=1)
        # An empty/short file yields an empty array; comparing that array
        # directly in an ``if`` is ambiguous, so test the size first and then
        # compare the scalar.
        if magic.size != 1 or magic[0] != 202021.25:
            print('Magic number incorrect. Invalid .flo file')
            return None

        dims = np.fromfile(f, np.int32, count=2)
        if dims.size != 2:
            raise ValueError('Truncated .flo header: %s' % fn)
        # plain Python ints; converting 1-element arrays with int() is deprecated
        w, h = int(dims[0]), int(dims[1])

        data = np.fromfile(f, np.float32, count=2*w*h)
        # Reshape data into 3D array (rows, columns, bands)
        # NOTE: np.resize (kept from the original) repeats the data if the
        # file holds fewer values than the header announces.
        return np.resize(data, (h, w, 2))

def readPFM(file):
    """Read a PFM (portable float map) image.

    Args:
        file: path of the PFM file.

    Returns:
        np.ndarray of shape [height, width, 3] for a colour file (``PF``) or
        [height, width] for a grayscale file (``Pf``), flipped vertically
        relative to the order in which the rows are stored.

    Raises:
        Exception: if the type line or the dimension line is not valid PFM.
    """
    # the context manager closes the handle even when the header is malformed
    with open(file, 'rb') as stream:
        header = stream.readline().rstrip()
        if header == b'PF':
            color = True
        elif header == b'Pf':
            color = False
        else:
            raise Exception('Not a PFM file.')

        dim_match = re.match(rb'^(\d+)\s(\d+)\s$', stream.readline())
        if dim_match:
            width, height = map(int, dim_match.groups())
        else:
            raise Exception('Malformed PFM header.')

        # the sign of the scale line encodes the byte order of the samples
        scale = float(stream.readline().rstrip())
        endian = '<' if scale < 0 else '>'

        data = np.fromfile(stream, endian + 'f')

    shape = (height, width, 3) if color else (height, width)

    data = np.reshape(data, shape)
    data = np.flipud(data)
    return data

def writeFlow(filename,uv,v=None):
    """ Write optical flow to file in Middlebury .flo format.

    If v is None, uv is assumed to contain both u and v channels,
    stacked in depth. Otherwise uv holds the u channel and v the v channel.
    Original code by Deqing Sun, adapted from Daniel Scharstein.

    Args:
        filename: destination path.
        uv: array of shape [H, W, 2], or the u channel of shape [H, W].
        v: optional v channel of shape [H, W].
    """
    nBands = 2

    if v is None:
        assert(uv.ndim == 3)
        assert(uv.shape[2] == 2)
        u = uv[:,:,0]
        v = uv[:,:,1]
    else:
        u = uv

    assert(u.shape == v.shape)
    height,width = u.shape

    # arrange into matrix form: u and v interleaved along each row.
    # Built before the file is opened so invalid input never leaves a
    # half-written file behind.
    tmp = np.zeros((height, width*nBands))
    tmp[:,np.arange(width)*2] = u
    tmp[:,np.arange(width)*2 + 1] = v

    # the context manager closes the handle even if a write fails
    with open(filename,'wb') as f:
        # write the header
        f.write(TAG_CHAR)
        np.array(width).astype(np.int32).tofile(f)
        np.array(height).astype(np.int32).tofile(f)
        tmp.astype(np.float32).tofile(f)


def readFlowKITTI(filename):
    """Decode a KITTI flow PNG into ``(flow, valid)``.

    The image is loaded with its original bit depth and its channel order is
    reversed; the first two channels are mapped to flow via
    ``(value - 2**15) / 64`` and the third is returned as the validity map.
    """
    raw = cv2.imread(filename, cv2.IMREAD_ANYDEPTH|cv2.IMREAD_COLOR)
    channels = raw[:,:,::-1].astype(np.float32)
    valid = channels[:, :, 2]
    flow = (channels[:, :, :2] - 2**15) / 64.0
    return flow, valid

def readDispKITTI(filename):
    """Load a KITTI disparity PNG and express it as a flow field.

    Disparity is the stored value divided by 256. It becomes the negated
    horizontal flow component, the vertical component is zero, and pixels
    with positive disparity are marked valid.
    """
    disparity = cv2.imread(filename, cv2.IMREAD_ANYDEPTH) / 256.0
    vertical = np.zeros_like(disparity)
    flow = np.stack((-disparity, vertical), axis=-1)
    return flow, disparity > 0.0


def writeFlowKITTI(filename, uv):
    """Encode a flow field as a 16-bit KITTI flow PNG.

    Flow is stored as ``64 * flow + 2**15`` and a third channel of ones is
    appended; the channel order is reversed before writing.
    """
    height, width = uv.shape[0], uv.shape[1]
    encoded = 64.0 * uv + 2**15
    ones = np.ones([height, width, 1])
    packed = np.concatenate([encoded, ones], axis=-1).astype(np.uint16)
    cv2.imwrite(filename, packed[..., ::-1])
    

def read_gen(file_name, pil=False):
    """Load a file with the reader selected by its extension.

    Images are opened with PIL, ``.bin``/``.raw`` go through ``np.load``,
    ``.flo`` and ``.pfm`` are returned as float32 arrays (a 3-D PFM loses its
    last channel). Any other extension yields an empty list. ``pil`` is
    accepted but not used.
    """
    suffix = splitext(file_name)[-1]

    if suffix in ('.png', '.jpeg', '.ppm', '.jpg'):
        return Image.open(file_name)

    if suffix in ('.bin', '.raw'):
        return np.load(file_name)

    if suffix == '.flo':
        return readFlow(file_name).astype(np.float32)

    if suffix == '.pfm':
        pfm = readPFM(file_name).astype(np.float32)
        return pfm if pfm.ndim == 2 else pfm[:, :, :-1]

    return []

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/RAFT/core/utils_core/utils.py
================================================
import torch
import torch.nn.functional as F
import numpy as np
from scipy import interpolate


class InputPadder:
    """ Pads images such that dimensions are divisible by 8 """

    def __init__(self, dims, mode='sintel'):
        """Compute the padding for inputs whose last two dims are (ht, wd).

        In 'sintel' mode the padding is split between both sides of each
        axis; in any other mode the width is padded on both sides and the
        whole height padding is placed at the bottom.
        """
        self.ht, self.wd = dims[-2:]
        # amount missing to reach the next multiple of 8 (0 if already aligned)
        extra_ht = (-self.ht) % 8
        extra_wd = (-self.wd) % 8

        left = extra_wd // 2
        right = extra_wd - left
        if mode == 'sintel':
            top = extra_ht // 2
            bottom = extra_ht - top
        else:
            top, bottom = 0, extra_ht
        # order expected by F.pad: [left, right, top, bottom]
        self._pad = [left, right, top, bottom]

    def pad(self, *inputs):
        """Return the inputs replicate-padded with the stored amounts."""
        padded = []
        for tensor in inputs:
            padded.append(F.pad(tensor, self._pad, mode='replicate'))
        return padded

    def unpad(self,x):
        """Crop a padded tensor back by removing the stored padding."""
        left, right, top, bottom = self._pad
        ht, wd = x.shape[-2:]
        return x[..., top:ht - bottom, left:wd - right]

def forward_interpolate(flow):
    """Forward-warp a flow field and fill it in by nearest-neighbour lookup.

    Each flow vector is moved to the position it points to; targets that fall
    outside the image are dropped, and the flow on the regular grid is then
    read from the nearest remaining target.

    Args:
        flow: tensor whose first dimension holds the x and y components,
            each of shape [H, W].

    Returns:
        Float tensor of shape [2, H, W].
    """
    flow_np = flow.detach().cpu().numpy()
    dx, dy = flow_np[0], flow_np[1]

    ht, wd = dx.shape
    grid_x, grid_y = np.meshgrid(np.arange(wd), np.arange(ht))

    target_x = (grid_x + dx).reshape(-1)
    target_y = (grid_y + dy).reshape(-1)

    inside = (target_x > 0) & (target_x < wd) & (target_y > 0) & (target_y < ht)
    points = (target_x[inside], target_y[inside])

    components = []
    for component in (dx, dy):
        values = component.reshape(-1)[inside]
        components.append(interpolate.griddata(
            points, values, (grid_x, grid_y), method='nearest', fill_value=0))

    return torch.from_numpy(np.stack(components, axis=0)).float()


def bilinear_sampler(img, coords, mode='bilinear', mask=False):
    """ Wrapper for grid_sample, uses pixel coordinates

    Args:
        img: tensor of shape [N, C, H, W] to sample from.
        coords: tensor of shape [N, H_out, W_out, 2] holding (x, y) pixel
            coordinates into ``img``.
        mode: interpolation mode forwarded to ``F.grid_sample``.
        mask: if True, also return a float mask that is 1 where the sample
            position lies strictly inside the image.

    Returns:
        The sampled tensor, or ``(sampled, mask)`` when ``mask`` is True.
    """
    H, W = img.shape[-2:]
    xgrid, ygrid = coords.split([1,1], dim=-1)
    # map pixel coordinates [0, size-1] to the [-1, 1] range of grid_sample
    xgrid = 2*xgrid/(W-1) - 1
    ygrid = 2*ygrid/(H-1) - 1

    grid = torch.cat([xgrid, ygrid], dim=-1)
    # forward ``mode``; it used to be accepted but silently ignored
    img = F.grid_sample(img, grid, mode=mode, align_corners=True)

    if mask:
        mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
        return img, mask.float()

    return img


def coords_grid(batch, ht, wd, device):
    """Return a float tensor of shape [batch, 2, ht, wd] with pixel coordinates.

    Channel 0 holds the x (column) index and channel 1 the y (row) index.
    """
    rows = torch.arange(ht, device=device)
    cols = torch.arange(wd, device=device)
    yy, xx = torch.meshgrid(rows, cols)
    grid = torch.stack((xx, yy), dim=0).float()
    return grid.unsqueeze(0).repeat(batch, 1, 1, 1)


def upflow8(flow, mode='bilinear'):
    """Upsample a flow field by 8x in both spatial dims and scale its values by 8."""
    height, width = flow.shape[2], flow.shape[3]
    upsampled = F.interpolate(
        flow, size=(8 * height, 8 * width), mode=mode, align_corners=True)
    return 8 * upsampled


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/ViCLIP/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/ViCLIP/simple_tokenizer.py
================================================
import gzip
import html
import os
import subprocess
from functools import lru_cache
import ftfy
import regex as re
from vbench.utils import CACHE_DIR

def default_bpe():
    """Return the path of the cached BPE vocabulary, fetching it with wget if absent."""
    tokenizer_file = os.path.join(CACHE_DIR, "ViCLIP/bpe_simple_vocab_16e6.txt.gz")
    if os.path.exists(tokenizer_file):
        return tokenizer_file

    print(f'Downloading ViCLIP tokenizer to {tokenizer_file}')
    target_dir = os.path.dirname(tokenizer_file)
    subprocess.run([
        'wget',
        'https://raw.githubusercontent.com/openai/CLIP/main/clip/bpe_simple_vocab_16e6.txt.gz',
        '-P',
        target_dir,
    ])
    return tokenizer_file


@lru_cache()
def bytes_to_unicode():
    """
    Return a dict mapping every byte value (0-255) to a unique unicode character.

    The reversible bpe codes work on unicode strings, so each byte needs a
    printable stand-in. Bytes that are already printable, non-whitespace
    characters map to themselves; the remaining bytes (whitespace, control
    characters, ...) are assigned code points from 256 upwards in ascending
    byte order, which keeps whitespace/control characters the bpe code cannot
    handle out of the vocabulary.
    """
    printable = [
        *range(ord("!"), ord("~")+1),
        *range(ord("¡"), ord("¬")+1),
        *range(ord("®"), ord("ÿ")+1),
    ]
    table = {byte: chr(byte) for byte in printable}

    shifted = 0
    for byte in range(2**8):
        if byte not in table:
            table[byte] = chr(2**8 + shifted)
            shifted += 1
    return table


def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    # indexing the first symbol keeps the IndexError for an empty word
    symbols = (word[0], *word[1:])
    return set(zip(symbols, symbols[1:]))


def basic_clean(text):
    """Run ftfy's text fixing, unescape HTML entities twice, and strip the ends."""
    repaired = ftfy.fix_text(text)
    once_unescaped = html.unescape(repaired)
    return html.unescape(once_unescaped).strip()


def whitespace_clean(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()


class SimpleTokenizer(object):
    """Byte-level BPE tokenizer driven by a gzip-compressed merges file.

    The vocabulary consists of the 256 byte stand-ins, the same 256 symbols
    with an end-of-word marker (``</w>``), one entry per merge rule and the
    two special tokens ``<|startoftext|>`` / ``<|endoftext|>``.
    """

    # NOTE: the default argument is evaluated once, when the class is defined,
    # so ``default_bpe()`` runs at import time.
    def __init__(self, bpe_path: str = default_bpe()):
        """Load the merge rules from ``bpe_path`` and build the lookup tables.

        Args:
            bpe_path: path of the gzip-compressed BPE merges file.
        """
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # close the gzip handle deterministically instead of leaking it
        with gzip.open(bpe_path) as bpe_file:
            merges = bpe_file.read().decode("utf-8").split('\n')
        # skip the header line and keep exactly as many merges as the
        # 49152-entry vocabulary leaves room for
        merges = merges[1:49152-256-2+1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v+'</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        # lower rank == merge is applied earlier
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)

    def bpe(self, token):
        """Apply the merge rules to one token.

        Args:
            token: non-empty string of byte stand-in characters.

        Returns:
            The resulting symbols joined by single spaces; the last symbol
            carries the ``</w>`` marker. Results are memoized in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token+'</w>'

        while True:
            # pick the adjacent pair with the best (lowest) merge rank
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # ``first`` does not occur again: copy the rest unchanged
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Clean and lower-case ``text`` and return its list of BPE token ids."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # represent the token's UTF-8 bytes with their stand-in characters
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Turn token ids back into text; each ``</w>`` marker becomes a space."""
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/ViCLIP/viclip.py
================================================
import os
import logging

import torch
from einops import rearrange
from torch import nn
import math

from .simple_tokenizer import SimpleTokenizer as _Tokenizer
from .viclip_vision import clip_joint_l14
from .viclip_text import clip_text_l14

logger = logging.getLogger(__name__)


class ViCLIP(nn.Module):
    """Video-text model pairing a ``clip_joint_l14`` vision encoder with a
    ``clip_text_l14`` text encoder.

    The constructor fixes the architecture hyper-parameters, builds both
    encoders, optionally loads a checkpoint and optionally freezes the text
    encoder. ``get_text_features`` / ``get_vid_features`` return L2-normalized
    embeddings computed without gradients.
    """

    def __init__(self,  tokenizer=None, pretrain=os.path.join(os.path.dirname(os.path.abspath(__file__)), "ViClip-InternVid-10M-FLT.pth"), freeze_text=True):
        """Build the encoders and optionally load pretrained weights.

        Args:
            tokenizer: tokenizer to store on the model; a new ``_Tokenizer``
                is created when this is falsy.
            pretrain: checkpoint path; the default points next to this file.
                A falsy value skips loading.
            freeze_text: if True, disable gradients for the text encoder.
        """
        super(ViCLIP, self).__init__()
        if tokenizer:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = _Tokenizer()
        # context length handed to the text encoder and to its tokenize()
        self.max_txt_l = 32

        self.vision_encoder_name = 'vit_l14'

        self.vision_encoder_pretrained = False
        self.inputs_image_res = 224
        self.vision_encoder_kernel_size = 1
        self.vision_encoder_center = True
        self.video_input_num_frames = 8
        self.vision_encoder_drop_path_rate = 0.1
        self.vision_encoder_checkpoint_num = 24
        # NOTE(review): despite the name this stores the checkpoint path, not a bool
        self.is_pretrain = pretrain
        self.vision_width = 1024
        self.text_width = 768
        self.embed_dim = 768
        # passed to the vision encoder as ``masking_prob`` unless test=True
        self.masking_prob = 0.9

        self.text_encoder_name = 'vit_l14'
        self.text_encoder_pretrained = False#'bert-base-uncased'
        self.text_encoder_d_model = 768

        self.text_encoder_vocab_size = 49408


        # create modules.
        self.vision_encoder = self.build_vision_encoder()
        self.text_encoder = self.build_text_encoder()

        # learnable temperature, initialised to 0.01 and clamped from below
        # at temp_min by clip_contrastive_temperature()
        self.temp = nn.parameter.Parameter(torch.ones([]) * 1 / 100.0)
        self.temp_min = 1 / 100.0

        if pretrain:
            logger.info(f"Load pretrained weights from {pretrain}")
            # the checkpoint is expected to keep the weights under the 'model' key
            state_dict = torch.load(pretrain, map_location='cpu')['model']
            self.load_state_dict(state_dict)

        # Freeze weights
        if freeze_text:
            self.freeze_text()


    def freeze_text(self):
        """freeze text encoder"""
        for p in self.text_encoder.parameters():
            p.requires_grad = False

    def no_weight_decay(self):
        """Return the parameter names to exclude from weight decay: ``temp``
        plus the prefixed names reported by both encoders."""
        ret = {"temp"}
        ret.update(
            {"vision_encoder." + k for k in self.vision_encoder.no_weight_decay()}
        )
        ret.update(
            {"text_encoder." + k for k in self.text_encoder.no_weight_decay()}
        )

        return ret

    def forward(self, image, text, raw_text, idx, log_generation=None, return_sims=False):
        """forward and calculate loss.

        Args:
            image (torch.Tensor): The input images. Shape: [B,T,C,H,W].
            text: not used by this method.
            raw_text: raw text passed to ``encode_text``.
            idx (torch.Tensor): forwarded to the VTC loss.
            log_generation: not used by this method.
            return_sims (bool): if True, return the cosine-similarity matrix
                between vision and text embeddings instead of the loss.

        Returns:
            The similarity matrix when ``return_sims`` is True, otherwise a
            dict with the key ``loss_vtc``.

        """
        self.clip_contrastive_temperature()

        vision_embeds = self.encode_vision(image)
        text_embeds = self.encode_text(raw_text)
        if return_sims:
            sims = torch.nn.functional.normalize(vision_embeds, dim=-1) @ \
                  torch.nn.functional.normalize(text_embeds, dim=-1).transpose(0, 1)
            return sims

        # calculate loss

        ## VTC loss
        # NOTE(review): ``self.clip_loss`` is never assigned in this class, so
        # this path looks like it would raise AttributeError unless the
        # attribute is set from outside - confirm.
        loss_vtc = self.clip_loss.vtc_loss(
            vision_embeds, text_embeds, idx, self.temp, all_gather=True
        )

        return dict(
            loss_vtc=loss_vtc,
        )

    def encode_vision(self, image, test=False):
        """encode image / videos as features.

        Args:
            image (torch.Tensor): The input images. A 5-D input has its
                dims 1 and 2 swapped; any other input gets a new dim
                inserted at position 2.
            test (bool): Whether testing. When False and ``masking_prob`` is
                positive, the encoder is called with ``masking_prob``.

        Returns:
            Whatever ``self.vision_encoder`` returns.
            NOTE(review): the exact output shape depends on the vision
            encoder, which is defined outside this class - confirm.

        """
        if image.ndim == 5:
            image = image.permute(0, 2, 1, 3, 4).contiguous()
        else:
            image = image.unsqueeze(2)

        if not test and self.masking_prob > 0.0:
            return self.vision_encoder(
                image, masking_prob=self.masking_prob
            )

        return self.vision_encoder(image)

    def encode_text(self, text):
        """encode text.
        Args:
            text: raw text handed to ``self.text_encoder.tokenize`` together
                with ``context_length=self.max_txt_l``.
        Returns:
            The output of ``self.text_encoder`` for the tokenized text.
            NOTE(review): shape depends on the text encoder - confirm.

        """
        # run the tokens on whatever device the text encoder lives on
        device = next(self.text_encoder.parameters()).device
        text = self.text_encoder.tokenize(
            text, context_length=self.max_txt_l
        ).to(device)
        text_embeds = self.text_encoder(text)
        return text_embeds

    @torch.no_grad()
    def clip_contrastive_temperature(self, min_val=0.001, max_val=0.5):
        """Clamp the temperature from below at ``self.temp_min`` (in place).

        NOTE: ``min_val`` and ``max_val`` are accepted but not used.
        Seems only used during pre-training.
        """
        self.temp.clamp_(min=self.temp_min)

    def build_vision_encoder(self):
        """build vision encoder
        Returns: the module created by ``clip_joint_l14``.

        Raises:
            ValueError: if ``vision_encoder_name`` is not "vit_l14".

        """
        encoder_name = self.vision_encoder_name
        if encoder_name != "vit_l14":
            raise ValueError(f"Not implemented: {encoder_name}")
        vision_encoder = clip_joint_l14(
            pretrained=self.vision_encoder_pretrained,
            input_resolution=self.inputs_image_res,
            kernel_size=self.vision_encoder_kernel_size,
            center=self.vision_encoder_center,
            num_frames=self.video_input_num_frames,
            drop_path=self.vision_encoder_drop_path_rate,
            checkpoint_num=self.vision_encoder_checkpoint_num,
        )
        return vision_encoder

    def build_text_encoder(self):
        """build text encoder
        Returns: the module created by ``clip_text_l14``.

        Raises:
            ValueError: if ``text_encoder_name`` is not "vit_l14".

        """
        encoder_name = self.text_encoder_name
        if encoder_name != "vit_l14":
            raise ValueError(f"Not implemented: {encoder_name}")
        text_encoder = clip_text_l14(
            pretrained=self.text_encoder_pretrained,
            embed_dim=self.text_encoder_d_model,
            context_length=self.max_txt_l,
            vocab_size=self.text_encoder_vocab_size,
            checkpoint_num=0,
        )

        return text_encoder

    def get_text_encoder(self):
        """get text encoder, used for text and cross-modal encoding"""
        encoder = self.text_encoder
        return encoder.bert if hasattr(encoder, "bert") else encoder

    def get_text_features(self, input_text, tokenizer, text_feature_dict={}):
        """Return the L2-normalized text embedding of ``input_text``.

        Results are cached in ``text_feature_dict`` keyed by ``input_text``.
        NOTE: the default dict is created once and shared by every call (and
        every instance) that does not pass its own dict, so it acts as a
        process-wide cache. ``tokenizer`` is accepted but not used.
        """
        if input_text in text_feature_dict:
            return text_feature_dict[input_text]
        text_template= f"{input_text}"
        with torch.no_grad():
            # text_token = tokenizer.encode(text_template).cuda()
            text_features = self.encode_text(text_template).float()
            text_features /= text_features.norm(dim=-1, keepdim=True)
            text_feature_dict[input_text] = text_features
        return text_features

    def get_vid_features(self, input_frames):
        """Return the L2-normalized vision embedding of ``input_frames``
        (encoded with ``test=True``, i.e. without masking)."""
        with torch.no_grad():
            clip_feat = self.encode_vision(input_frames,test=True).float()
            clip_feat /= clip_feat.norm(dim=-1, keepdim=True)
        return clip_feat

    def get_predict_label(self, clip_feature, text_feats_tensor, top=5):
        """Return the ``top`` highest softmax probabilities and their indices
        for the similarities (scaled by 100) between ``clip_feature`` and
        ``text_feats_tensor``; the top-k is taken on CPU."""
        label_probs = (100.0 * clip_feature @ text_feats_tensor.T).softmax(dim=-1)
        top_probs, top_labels = label_probs.cpu().topk(top, dim=-1)
        return top_probs, top_labels


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/ViCLIP/viclip_text.py
================================================
import os
import logging
from collections import OrderedDict
from pkg_resources import packaging
from .simple_tokenizer import SimpleTokenizer as _Tokenizer

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
import torch.utils.checkpoint as checkpoint
import functools

logger = logging.getLogger(__name__)


# Base location of the text-encoder weights.
# NOTE(review): this is a URL, yet it is combined with os.path.join below,
# which uses the platform path separator - confirm this is what is wanted
# on non-POSIX systems.
MODEL_PATH = 'https://huggingface.co/laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K'
# Maps a model name to the location of its weights file.
_MODELS = {
    "ViT-L/14": os.path.join(MODEL_PATH, "vit_l14_text.pth"),
}


class LayerNorm(nn.LayerNorm):
    """LayerNorm that computes in float32 and returns the input's dtype (fp16-safe)."""

    def forward(self, x: torch.Tensor):
        input_dtype = x.dtype
        normalized = super().forward(x.to(torch.float32))
        return normalized.to(input_dtype)


class QuickGELU(nn.Module):
    """Sigmoid-based GELU approximation: ``x * sigmoid(1.702 * x)``."""

    def forward(self, x: torch.Tensor):
        gate = torch.sigmoid(1.702 * x)
        return x * gate


class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention and a 4x-wide MLP, each
    wrapped in a residual connection."""

    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
        super().__init__()
        hidden = d_model * 4

        # sub-modules are created in the same order as before so parameter
        # registration and initialisation stay unchanged
        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        mlp_layers = OrderedDict()
        mlp_layers["c_fc"] = nn.Linear(d_model, hidden)
        mlp_layers["gelu"] = QuickGELU()
        mlp_layers["c_proj"] = nn.Linear(hidden, d_model)
        self.mlp = nn.Sequential(mlp_layers)
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor):
        """Self-attention over ``x`` using the stored mask, if there is one."""
        if self.attn_mask is not None:
            # keep the mask on the input's dtype/device
            self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device)
        else:
            self.attn_mask = None
        attended = self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)
        return attended[0]

    def forward(self, x: torch.Tensor):
        after_attention = x + self.attention(self.ln_1(x))
        return after_attention + self.mlp(self.ln_2(after_attention))


class Transformer(nn.Module):
    """Stack of ``layers`` residual attention blocks, optionally run with
    activation checkpointing."""

    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None,
                 checkpoint_num: int = 0):
        super().__init__()
        self.width = width
        self.layers = layers
        blocks = [ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]
        self.resblocks = nn.Sequential(*blocks)

        self.checkpoint_num = checkpoint_num

    def forward(self, x: torch.Tensor):
        if self.checkpoint_num <= 0:
            return self.resblocks(x)
        # never ask for more segments than there are blocks
        segments = min(self.checkpoint_num, len(self.resblocks))
        return checkpoint.checkpoint_sequential(self.resblocks, segments, x)


class CLIP_TEXT(nn.Module):
    """CLIP-style text encoder: token + positional embeddings, a causal transformer,
    a final LayerNorm and a linear projection of the end-of-text token feature.
    """

    def __init__(
            self,
            embed_dim: int,
            context_length: int,
            vocab_size: int,
            transformer_width: int,
            transformer_heads: int,
            transformer_layers: int,
            checkpoint_num: int,
        ):
        super().__init__()

        self.context_length = context_length
        self._tokenizer = _Tokenizer()

        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads,
            attn_mask=self.build_attention_mask(),
            checkpoint_num=checkpoint_num,
        )

        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
        # NOTE: positional_embedding and text_projection are allocated with torch.empty,
        # i.e. left uninitialized here; they are expected to be filled by a checkpoint load.
        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
        self.ln_final = LayerNorm(transformer_width)

        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
    
    def no_weight_decay(self):
        """Return the parameter names that should be excluded from weight decay."""
        return {'token_embedding', 'positional_embedding'}

    def build_attention_mask(self):
        """Build the additive causal mask of shape ``[context_length, context_length]``.

        Entries strictly above the diagonal are ``-inf`` (blocked); the diagonal
        and everything below it are ``0`` (allowed).

        This used to be wrapped in ``functools.lru_cache``; on an instance method
        the cache keys on ``self`` and therefore keeps every model instance alive
        for the lifetime of the process. The mask is cheap to build and is only
        requested once per instance, so it is simply rebuilt on demand.
        """
        # pytorch uses additive attention mask; fill with -inf
        mask = torch.empty(self.context_length, self.context_length)
        mask.fill_(float("-inf"))
        mask.triu_(1)  # zero out the diagonal and everything below it
        return mask

    def tokenize(self, texts, context_length=77, truncate=True):
        """
        Returns the tokenized representation of given input string(s)
        Parameters
        ----------
        texts : Union[str, List[str]]
            An input string or a list of input strings to tokenize
        context_length : int
            The context length to use; all CLIP models use 77 as the context length
        truncate: bool
            Whether to truncate the text in case its encoding is longer than the context length
        Returns
        -------
        A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length].
        We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long.
        Raises
        ------
        RuntimeError
            If an encoded text exceeds ``context_length`` and ``truncate`` is False.
        """
        if isinstance(texts, str):
            texts = [texts]

        sot_token = self._tokenizer.encoder["<|startoftext|>"]
        eot_token = self._tokenizer.encoder["<|endoftext|>"]
        all_tokens = [[sot_token] + self._tokenizer.encode(text) + [eot_token] for text in texts]
        if packaging.version.parse(torch.__version__) < packaging.version.parse("1.8.0"):
            result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
        else:
            result = torch.zeros(len(all_tokens), context_length, dtype=torch.int)

        for i, tokens in enumerate(all_tokens):
            if len(tokens) > context_length:
                if truncate:
                    # Cut to length but keep a terminating end-of-text token.
                    tokens = tokens[:context_length]
                    tokens[-1] = eot_token
                else:
                    raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
            # Positions past the end of the text stay zero (padding).
            result[i, :len(tokens)] = torch.tensor(tokens)

        return result

    def forward(self, text):
        """Encode token ids ``text`` ([batch_size, n_ctx]) into projected text features."""
        x = self.token_embedding(text)  # [batch_size, n_ctx, d_model]

        x = x + self.positional_embedding
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x)

        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection

        return x


def clip_text_b16(
    embed_dim=512,
    context_length=77,
    vocab_size=49408,
    transformer_width=512,
    transformer_heads=8,
    transformer_layers=12,
):
    """Placeholder builder for a ViT-B/16 text encoder; not implemented.

    The signature is kept so the name still resolves, but every call raises.
    The code that used to follow the ``raise`` was unreachable (and called
    ``CLIP_TEXT`` without its required ``checkpoint_num`` argument), so it
    has been removed.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("clip_text_b16 is not implemented")


def clip_text_l14(
    embed_dim=768,
    context_length=77,
    vocab_size=49408,
    transformer_width=768,
    transformer_heads=12,
    transformer_layers=12,
    checkpoint_num=0,
    pretrained=True,
):
    """Build the L/14 text encoder and, if ``pretrained`` is truthy, load weights.

    ``pretrained`` may be a key of ``_MODELS``; ``True`` or the string
    ``"bert-base-uncased"`` selects the default ``"ViT-L/14"`` entry. When the
    checkpoint's positional embedding has a different length than
    ``context_length`` it is truncated or zero-padded to fit. The model is
    returned in eval mode.
    """
    model = CLIP_TEXT(
        embed_dim=embed_dim,
        context_length=context_length,
        vocab_size=vocab_size,
        transformer_width=transformer_width,
        transformer_heads=transformer_heads,
        transformer_layers=transformer_layers,
        checkpoint_num=checkpoint_num,
    )
    if not pretrained:
        return model.eval()

    # NOTE(review): "bert-base-uncased" is special-cased to the default weights;
    # presumably a leftover config value - confirm against callers.
    use_named_weights = isinstance(pretrained, str) and pretrained != "bert-base-uncased"
    pretrained = _MODELS[pretrained if use_named_weights else "ViT-L/14"]
    logger.info(f"Load pretrained weights from {pretrained}")
    state_dict = torch.load(pretrained, map_location='cpu')

    pos_embed = state_dict["positional_embedding"]
    ckpt_length = pos_embed.size(0)
    if context_length != ckpt_length:
        print(f"Resize positional embedding from {state_dict['positional_embedding'].size(0)} to {context_length}")
        if context_length < ckpt_length:
            # Shorter context: keep the leading positions only.
            state_dict["positional_embedding"] = pos_embed[:context_length]
        else:
            # Longer context: append zero rows for the extra positions.
            state_dict["positional_embedding"] = F.pad(
                pos_embed,
                (0, 0, 0, context_length - ckpt_length),
                value=0,
            )

    message = model.load_state_dict(state_dict, strict=False)
    print(f"Load pretrained weights from {pretrained}: {message}")
    return model.eval()


def clip_text_l14_336(
    embed_dim=768,
    context_length=77,
    vocab_size=49408,
    transformer_width=768,
    transformer_heads=12,
    transformer_layers=12,
):
    """Placeholder builder for a ViT-L/14@336 text encoder; not implemented.

    The signature is kept so the name still resolves, but every call raises.
    The code that used to follow the ``raise`` was unreachable (and called
    ``CLIP_TEXT`` without its required ``checkpoint_num`` argument), so it
    has been removed.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("clip_text_l14_336 is not implemented")


def build_clip(config):
    """Instantiate the text encoder named by ``config.text_encoder.clip_teacher``.

    The configured string is evaluated and the result is called with no arguments.
    """
    # NOTE(review): eval() runs whatever expression the config contains; only use
    # with trusted configs, or replace with an explicit name -> builder table.
    model_cls = config.text_encoder.clip_teacher
    return eval(model_cls)()


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/ViCLIP/viclip_vision.py
================================================
#!/usr/bin/env python
import os
import logging
from collections import OrderedDict

import torch
from torch import nn
from einops import rearrange
from timm.models.layers import DropPath
from timm.models.registry import register_model

import torch.utils.checkpoint as checkpoint

logger = logging.getLogger(__name__)

def interpolate_temporal_pos_embed(temp_embed_old, num_frames_new):
    """Linearly resample temporal embeddings along the frame axis.

    temp_embed_old: (1, num_frames_old, 1, d)
    Returns a tensor of shape (1, num_frames_new, 1, d) with the input dtype.
    """
    orig_dtype = temp_embed_old.dtype
    # (1, n_old, 1, d) -> (1, d, n_old): interpolate resizes the trailing axis.
    # Work in float32 so half-precision checkpoints interpolate on CPU as well.
    channels_first = temp_embed_old.squeeze(2).permute(0, 2, 1).float()
    resized = torch.nn.functional.interpolate(channels_first, size=num_frames_new, mode="linear")
    # (1, d, n_new) -> (1, n_new, 1, d)
    return resized.permute(0, 2, 1).unsqueeze(2).to(orig_dtype)


def load_temp_embed_with_mismatch(temp_embed_old, temp_embed_new, add_zero=True):
    """
    Add/Remove extra temporal_embeddings as needed.
    https://arxiv.org/abs/2104.00650 shows adding zero paddings works.

    temp_embed_old: (1, num_frames_old, 1, d)
    temp_embed_new: (1, num_frames_new, 1, d)
    add_zero: bool, if True, add zero, else, interpolate trained embeddings.

    Returns the embedding to load, with ``num_frames_new`` frames. Note that
    in the ``add_zero`` growth case ``temp_embed_new`` is written to in place.

    The interpolation branch previously called ``interpolate_temporal_pos_embed``
    without that name being defined or imported in this file (NameError at
    runtime); the helper is now defined directly above.
    """
    # TODO zero pad
    num_frms_new = temp_embed_new.shape[1]
    num_frms_old = temp_embed_old.shape[1]
    logger.info(f"Load temporal_embeddings, lengths: {num_frms_old}-->{num_frms_new}")
    if num_frms_new > num_frms_old:
        if add_zero:
            temp_embed_new[
                :, :num_frms_old
            ] = temp_embed_old  # untrained embeddings are zeros.
        else:
            temp_embed_new = interpolate_temporal_pos_embed(temp_embed_old, num_frms_new)
    elif num_frms_new < num_frms_old:
        # Fewer frames requested: keep the leading trained embeddings.
        temp_embed_new = temp_embed_old[:, :num_frms_new]
    else:  # =
        temp_embed_new = temp_embed_old
    return temp_embed_new


# Base location (an https URL) under which the ViCLIP checkpoint is published.
MODEL_PATH = 'https://pjlab-gvm-data.oss-cn-shanghai.aliyuncs.com/internvideo/viclip/'
# Backbone name -> checkpoint location; looked up by the clip_joint_* builders below.
_MODELS = {
    "ViT-L/14": os.path.join(MODEL_PATH, "ViClip-InternVid-10M-FLT.pth"),
}


class QuickGELU(nn.Module):
    """Sigmoid-gated activation: ``x * sigmoid(1.702 * x)``."""

    def forward(self, x):
        gate = torch.sigmoid(1.702 * x)
        return x * gate


class ResidualAttentionBlock(nn.Module):
    """Transformer block with optional DropPath on both residual branches and dropout in the MLP."""

    def __init__(self, d_model, n_head, drop_path=0., attn_mask=None, dropout=0.):
        super().__init__()

        # DropPath is only instantiated for a positive rate; otherwise the branch is a no-op.
        if drop_path > 0.:
            self.drop_path1, self.drop_path2 = DropPath(drop_path), DropPath(drop_path)
        else:
            self.drop_path1, self.drop_path2 = nn.Identity(), nn.Identity()

        hidden_dim = d_model * 4  # the MLP expands to 4x the model width
        self.attn = nn.MultiheadAttention(d_model, n_head, dropout=dropout)
        self.ln_1 = nn.LayerNorm(d_model)
        self.mlp = nn.Sequential(OrderedDict([
            ("c_fc", nn.Linear(d_model, hidden_dim)),
            ("gelu", QuickGELU()),
            ("drop1", nn.Dropout(dropout)),
            ("c_proj", nn.Linear(hidden_dim, d_model)),
            ("drop2", nn.Dropout(dropout)),
        ]))
        self.ln_2 = nn.LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x):
        """Self-attend over ``x`` using the stored mask (if any); returns the attention output only."""
        if self.attn_mask is not None:
            # Keep the stored mask on the same dtype/device as the activations.
            self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device)
        attended, _ = self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)
        return attended

    def forward(self, x):
        attn_branch = self.attention(self.ln_1(x))
        x = x + self.drop_path1(attn_branch)
        mlp_branch = self.mlp(self.ln_2(x))
        return x + self.drop_path2(mlp_branch)


class Transformer(nn.Module):
    """Stack of residual attention blocks; the first ``checkpoint_num`` blocks use activation checkpointing."""

    def __init__(self, width, layers, heads, drop_path=0., checkpoint_num=0, dropout=0.):
        super().__init__()
        # Per-block drop-path rate, spaced linearly from 0 up to ``drop_path``.
        rates = torch.linspace(0, drop_path, layers).tolist()
        self.resblocks = nn.ModuleList()
        for rate in rates:
            self.resblocks.append(
                ResidualAttentionBlock(width, heads, drop_path=rate, dropout=dropout)
            )
        self.checkpoint_num = checkpoint_num

    def forward(self, x):
        for depth, block in enumerate(self.resblocks):
            use_checkpoint = depth < self.checkpoint_num
            x = checkpoint.checkpoint(block, x) if use_checkpoint else block(x)
        return x


class VisionTransformer(nn.Module):
    """Video ViT: Conv3d patch embedding, spatial + optional temporal position
    embeddings, a transformer stack and an optional output projection.
    """

    def __init__(
        self, input_resolution, patch_size, width, layers, heads, output_dim=None, 
        kernel_size=1, num_frames=8, drop_path=0, checkpoint_num=0, dropout=0.,
        temp_embed=True,
    ):
        super().__init__()
        self.output_dim = output_dim

        # Non-overlapping patches: kernel and stride are the same, no padding.
        patch_shape = (kernel_size, patch_size, patch_size)
        self.conv1 = nn.Conv3d(3, width, patch_shape, patch_shape, (0, 0, 0), bias=False)

        scale = width ** -0.5
        num_tokens = (input_resolution // patch_size) ** 2 + 1  # patches per frame + class token
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn(num_tokens, width))
        self.ln_pre = nn.LayerNorm(width)
        if temp_embed:
            self.temporal_positional_embedding = nn.Parameter(torch.zeros(1, num_frames, width))

        self.transformer = Transformer(
            width, layers, heads, drop_path=drop_path, checkpoint_num=checkpoint_num,
            dropout=dropout)

        self.ln_post = nn.LayerNorm(width)
        # The projection is allocated uninitialized (torch.empty) when requested.
        self.proj = nn.Parameter(torch.empty(width, output_dim)) if output_dim is not None else None

        self.dropout = nn.Dropout(dropout)

    def get_num_layers(self):
        """Number of transformer blocks."""
        return len(self.transformer.resblocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return the parameter names that should be excluded from weight decay."""
        return {'positional_embedding', 'class_embedding', 'temporal_positional_embedding'}

    def mask_tokens(self, inputs, masking_prob=0.0):
        """Randomly drop ``int(masking_prob * L)`` tokens from every sequence of ``inputs`` (B, L, D)."""
        batch, seq_len, _ = inputs.shape

        # Unlike text masking, a fixed number of tokens is removed per sequence.
        num_masked = int(masking_prob * seq_len)
        drop = torch.zeros(batch, seq_len)
        # A random permutation per row; its first num_masked entries are dropped.
        order = torch.argsort(torch.rand_like(drop), dim=-1)
        picked = order[:, :num_masked]
        rows = torch.arange(drop.shape[0]).unsqueeze(-1).expand_as(picked)
        drop[rows, picked] = 1

        keep = ~drop.bool()
        return inputs[keep].reshape(batch, -1, inputs.shape[-1])

    def forward(self, x, masking_prob=0.0):
        x = self.conv1(x)  # 5-D conv output, unpacked below as (B, C, T, H, W)
        B, C, T, H, W = x.shape
        x = x.permute(0, 2, 3, 4, 1).reshape(B * T, H * W, C)

        # Prepend the class embedding to every frame's patch sequence.
        cls_row = self.class_embedding.to(x.dtype) + torch.zeros(
            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
        x = torch.cat([cls_row, x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + self.positional_embedding.to(x.dtype)

        # temporal pos: set the class tokens aside (first B rows), embed patches over time
        cls_tokens = x[:B, :1, :]
        x = rearrange(x[:, 1:], '(b t) n m -> (b n) t m', b=B, t=T)
        if hasattr(self, 'temporal_positional_embedding'):
            temporal = self.temporal_positional_embedding
            if x.size(1) == 1:
                # This is a workaround for unused parameter issue
                temporal = temporal.mean(1)
            x = x + temporal
        x = rearrange(x, '(b n) t m -> b (n t) m', b=B, t=T)

        if masking_prob > 0.0:
            x = self.mask_tokens(x, masking_prob)

        x = self.ln_pre(torch.cat((cls_tokens, x), dim=1))

        x = x.permute(1, 0, 2)  #BND -> NBD
        x = self.ln_post(self.transformer(x))

        if self.proj is None:
            return x.permute(1, 0, 2)  #NBD -> BND
        # Project the first (class) token only.
        return self.dropout(x[0]) @ self.proj


def inflate_weight(weight_2d, time_dim, center=True):
    """Expand a 2-D conv weight with a new temporal axis of length ``time_dim`` at dim 2.

    center=True: zeros everywhere except the middle time index, which holds ``weight_2d``.
    center=False: ``weight_2d`` repeated along time and divided by ``time_dim``.
    (assumes weight_2d is 4-D, e.g. (out, in, h, w) - TODO confirm)
    """
    logger.info(f'Init center: {center}')
    if not center:
        repeated = weight_2d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1)
        return repeated / time_dim

    inflated = torch.zeros(*weight_2d.shape).unsqueeze(2).repeat(1, 1, time_dim, 1, 1)
    inflated[:, :, time_dim // 2, :, :] = weight_2d
    return inflated


def load_state_dict(model, state_dict, input_resolution=224, patch_size=16, center=True):
    """Adapt a checkpoint ``state_dict`` to ``model`` and load it non-strictly.

    Tensors with more than two dims whose shape differs from the model's are
    inflated along a temporal axis; the spatial positional embedding is
    bicubically resized when the patch grid differs. ``state_dict`` is
    modified in place.
    """
    state_dict_3d = model.state_dict()
    for k in state_dict.keys():
        if k not in state_dict_3d.keys():
            continue
        if state_dict[k].shape == state_dict_3d[k].shape:
            continue
        # Shape mismatch: only tensors with more than two dims are inflated.
        if len(state_dict_3d[k].shape) <= 2:
            logger.info(f'Ignore: {k}')
            continue
        logger.info(f'Inflate: {k}, {state_dict[k].shape} => {state_dict_3d[k].shape}')
        state_dict[k] = inflate_weight(state_dict[k], state_dict_3d[k].shape[2], center=center)

    pos_embed_checkpoint = state_dict['positional_embedding']
    embedding_size = pos_embed_checkpoint.shape[-1]
    # Side lengths of the square patch grids (checkpoint vs. requested); the
    # "- 1" discounts the class-token row.
    orig_size = int((pos_embed_checkpoint.shape[-2] - 1) ** 0.5)
    new_size = int(((input_resolution // patch_size) ** 2) ** 0.5)
    if orig_size != new_size:
        logger.info(f'Pos_emb from {orig_size} to {new_size}')
        cls_pos, grid_pos = pos_embed_checkpoint[:1], pos_embed_checkpoint[1:]
        # (N, D) -> (1, D, orig, orig) so the grid can be resized as an image.
        grid_pos = grid_pos.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
        grid_pos = torch.nn.functional.interpolate(
            grid_pos, size=(new_size, new_size), mode='bicubic', align_corners=False)
        grid_pos = grid_pos.permute(0, 2, 3, 1).flatten(0, 2)
        state_dict['positional_embedding'] = torch.cat((cls_pos, grid_pos), dim=0)

    message = model.load_state_dict(state_dict, strict=False)
    logger.info(f"Load pretrained weights: {message}")


@register_model
def clip_joint_b16(
    pretrained=True, input_resolution=224, kernel_size=1,
    center=True, num_frames=8, drop_path=0.
):
    """Placeholder builder for a ViT-B/16 video encoder; not implemented.

    The function used to construct a full ``VisionTransformer`` and then raise
    unconditionally, leaving the weight-loading code unreachable (it also
    looked up ``"ViT-B/16"``, which is not a key of ``_MODELS``). It now raises
    immediately without allocating a model.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("clip_joint_b16 is not implemented")


@register_model
def clip_joint_l14(
    pretrained=False, input_resolution=224, kernel_size=1,
    center=True, num_frames=8, drop_path=0., checkpoint_num=0,
    dropout=0.,
):
    """Build the L/14 video encoder (patch 14, width 1024, 24 layers, 16 heads, 768-d output).

    When ``pretrained`` is truthy, weights are loaded from ``_MODELS``: a string
    is used as the key, anything else selects ``"ViT-L/14"``. The model is
    returned in eval mode.
    """
    model = VisionTransformer(
        input_resolution=input_resolution, patch_size=14,
        width=1024, layers=24, heads=16, output_dim=768,
        kernel_size=kernel_size, num_frames=num_frames, 
        drop_path=drop_path, checkpoint_num=checkpoint_num,
        dropout=dropout,
    )
    if not pretrained:
        return model.eval()

    model_name = pretrained if isinstance(pretrained, str) else "ViT-L/14"
    logger.info('load pretrained weights')
    # NOTE(review): the only _MODELS entry is an https URL, while torch.load takes
    # a local path or file-like object - confirm how this branch is meant to be used.
    state_dict = torch.load(_MODELS[model_name], map_location='cpu')
    load_state_dict(model, state_dict, input_resolution=input_resolution, patch_size=14, center=center)
    return model.eval()


@register_model
def clip_joint_l14_336(
    pretrained=True, input_resolution=336, kernel_size=1,
    center=True, num_frames=8, drop_path=0.
):
    """Placeholder builder for a ViT-L/14@336 video encoder; not implemented.

    Everything after the unconditional ``raise`` was unreachable (and looked up
    ``"ViT-L/14_336"``, which is not a key of ``_MODELS``), so it has been
    removed.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("clip_joint_l14_336 is not implemented")


def interpolate_pos_embed_vit(state_dict, new_model):
    """Resize the vision temporal and text positional embeddings in ``state_dict``
    to the lengths used by ``new_model``.

    Each embedding present in ``state_dict`` is reshaped to (1, n, 1, d), passed
    through ``load_temp_embed_with_mismatch`` (interpolating, not zero-padding)
    and written back in its original layout. Returns the same ``state_dict``.
    """
    vision_key = "vision_encoder.temporal_positional_embedding"
    if vision_key in state_dict:
        # [1, n, d] -> [1, n, 1, d]
        target = new_model.state_dict()[vision_key].unsqueeze(2)
        source = state_dict[vision_key].unsqueeze(2)
        resized = load_temp_embed_with_mismatch(source, target, add_zero=False)
        state_dict[vision_key] = resized.squeeze(2)

    text_key = "text_encoder.positional_embedding"
    if text_key in state_dict:
        # [n, d] -> [1, n, 1, d]
        target = new_model.state_dict()[text_key].unsqueeze(0).unsqueeze(2)
        source = state_dict[text_key].unsqueeze(0).unsqueeze(2)
        resized = load_temp_embed_with_mismatch(source, target, add_zero=False)
        state_dict[text_key] = resized.squeeze(2).squeeze(0)
    return state_dict


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/benchmarks/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/benchmarks/adobe240.py
================================================
import sys
import tqdm
import torch
import argparse
import numpy as np
from omegaconf import OmegaConf

sys.path.append('.')
from utils.build_utils import build_from_cfg
from datasets.adobe_datasets import Adobe240_Dataset
from metrics.psnr_ssim import calculate_psnr, calculate_ssim

# Command-line options: network config, checkpoint to evaluate, dataset root.
parser = argparse.ArgumentParser(
                prog = 'AMT',
                description = 'Adobe240 evaluation',
                )
parser.add_argument('-c', '--config', default='cfgs/AMT-S_gopro.yaml')
parser.add_argument('-p', '--ckpt', default='pretrained/gopro_amt-s.pth',)
parser.add_argument('-r', '--root', default='data/Adobe240/test_frames',)
args = parser.parse_args()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cfg_path = args.config
ckpt_path = args.ckpt
root = args.root

# Build the network described by the config and restore its weights.
network_cfg = OmegaConf.load(cfg_path).network
network_name = network_cfg.name
model = build_from_cfg(network_cfg)
# map_location keeps the load working on the CPU fallback even when the
# checkpoint was saved from a CUDA device.
ckpt = torch.load(ckpt_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
model = model.to(device)
model.eval()

dataset = Adobe240_Dataset(dataset_dir=root, augment=False)

# Running PSNR/SSIM over the dataset, shown live in the progress bar.
psnr_list = []
ssim_list = []
pbar = tqdm.tqdm(dataset, total=len(dataset))
for data in pbar:
    # Move every tensor of the sample to the device and add a batch dimension.
    input_dict = {}
    for k, v in data.items():
        input_dict[k] = v.to(device).unsqueeze(0)
    with torch.no_grad():
        imgt_pred = model(**input_dict)['imgt_pred']
        psnr = calculate_psnr(imgt_pred, input_dict['imgt'])
        ssim = calculate_ssim(imgt_pred, input_dict['imgt'])
    psnr_list.append(psnr)
    ssim_list.append(ssim)
    avg_psnr = np.mean(psnr_list)
    avg_ssim = np.mean(ssim_list)
    desc_str = f'[{network_name}/Adobe240] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}'
    pbar.set_description_str(desc_str)


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/benchmarks/gopro.py
================================================
import sys
import tqdm
import torch
import argparse
import numpy as np
from omegaconf import OmegaConf

sys.path.append('.')
from utils.build_utils import build_from_cfg
from datasets.gopro_datasets import GoPro_Test_Dataset
from metrics.psnr_ssim import calculate_psnr, calculate_ssim

# Command-line options: network config, checkpoint to evaluate, dataset root.
parser = argparse.ArgumentParser(
                prog = 'AMT',
                description = 'GOPRO evaluation',
                )
parser.add_argument('-c', '--config', default='cfgs/AMT-S_gopro.yaml')
parser.add_argument('-p', '--ckpt', default='pretrained/gopro_amt-s.pth',)
parser.add_argument('-r', '--root', default='data/GOPRO',)
args = parser.parse_args()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cfg_path = args.config
ckpt_path = args.ckpt
root = args.root

# Build the network described by the config and restore its weights.
network_cfg = OmegaConf.load(cfg_path).network
network_name = network_cfg.name
model = build_from_cfg(network_cfg)
# map_location keeps the load working on the CPU fallback even when the
# checkpoint was saved from a CUDA device.
ckpt = torch.load(ckpt_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
model = model.to(device)
model.eval()

dataset = GoPro_Test_Dataset(dataset_dir=root)

# Running PSNR/SSIM over the dataset, shown live in the progress bar.
psnr_list = []
ssim_list = []
pbar = tqdm.tqdm(dataset, total=len(dataset))
for data in pbar:
    # Move every tensor of the sample to the device and add a batch dimension.
    input_dict = {}
    for k, v in data.items():
        input_dict[k] = v.to(device).unsqueeze(0)
    with torch.no_grad():
        imgt_pred = model(**input_dict)['imgt_pred']
        psnr = calculate_psnr(imgt_pred, input_dict['imgt'])
        ssim = calculate_ssim(imgt_pred, input_dict['imgt'])
    psnr_list.append(psnr)
    ssim_list.append(ssim)
    avg_psnr = np.mean(psnr_list)
    avg_ssim = np.mean(ssim_list)
    desc_str = f'[{network_name}/GOPRO] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}'
    pbar.set_description_str(desc_str)


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/benchmarks/snu_film.py
================================================
import os
import sys
import tqdm
import torch
import argparse
import numpy as np
import os.path as osp
from omegaconf import OmegaConf

sys.path.append('.')
from utils.build_utils import build_from_cfg
from metrics.psnr_ssim import calculate_psnr, calculate_ssim
from utils.utils import InputPadder, read, img2tensor


def parse_path(path):
    """Return the last three '/'-separated components of *path*, joined with the OS separator."""
    tail = path.split('/')[-3:]
    return osp.join(*tail)

# Command-line options: network config, checkpoint to evaluate, dataset root.
parser = argparse.ArgumentParser(
                prog = 'AMT',
                description = 'SNU-FILM evaluation',
                )
parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml')
parser.add_argument('-p', '--ckpt', default='pretrained/amt-s.pth')
parser.add_argument('-r', '--root', default='data/SNU_FILM')
args = parser.parse_args()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cfg_path = args.config
ckpt_path = args.ckpt
root = args.root

# Build the network described by the config and restore its weights.
network_cfg = OmegaConf.load(cfg_path).network
network_name = network_cfg.name
model = build_from_cfg(network_cfg)
# map_location keeps the load working on the CPU fallback even when the
# checkpoint was saved from a CUDA device.
ckpt = torch.load(ckpt_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
model = model.to(device)
model.eval()

divisor = 20
scale_factor = 0.8
splits = ['easy', 'medium', 'hard', 'extreme']
for split in splits:
    # Each line of the split file lists three space-separated frame paths.
    with open(os.path.join(root, f'test-{split}.txt'), "r") as fr:
        file_list = [line.strip().split(' ') for line in fr]
    pbar = tqdm.tqdm(file_list, total=len(file_list))

    # Metrics are accumulated per split.
    psnr_list = []
    ssim_list = []
    for name in pbar:
        img0 = img2tensor(read(osp.join(root, parse_path(name[0])))).to(device)
        imgt = img2tensor(read(osp.join(root, parse_path(name[1])))).to(device)
        img1 = img2tensor(read(osp.join(root, parse_path(name[2])))).to(device)
        padder = InputPadder(img0.shape, divisor)
        img0, img1 = padder.pad(img0, img1)

        embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device)
        # Evaluation only: skip autograd bookkeeping, as the Adobe240/GOPRO scripts do.
        with torch.no_grad():
            imgt_pred = model(img0, img1, embt, scale_factor=scale_factor, eval=True)['imgt_pred']
            imgt_pred = padder.unpad(imgt_pred)

            psnr = calculate_psnr(imgt_pred, imgt).detach().cpu().numpy()
            ssim = calculate_ssim(imgt_pred, imgt).detach().cpu().numpy()

        psnr_list.append(psnr)
        ssim_list.append(ssim)
        avg_psnr = np.mean(psnr_list)
        avg_ssim = np.mean(ssim_list)
        desc_str = f'[{network_name}/SNU-FILM] [{split}] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}'
        pbar.set_description_str(desc_str)


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/benchmarks/speed_parameters.py
================================================
import sys
import time
import torch
import argparse
from omegaconf import OmegaConf

sys.path.append('.')
from utils.build_utils import build_from_cfg

# Command-line option: the network config to benchmark.
parser = argparse.ArgumentParser(
    prog='AMT',
    description='Speed&parameter benchmark',
)
parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml')
args = parser.parse_args()

cfg_path = args.config
network_cfg = OmegaConf.load(cfg_path).network
model = build_from_cfg(network_cfg).cuda()
model.eval()

# Fixed-size random inputs; embt is the constant 0.5 shaped (1, 1, 1, 1).
img0 = torch.randn(1, 3, 256, 448).cuda()
img1 = torch.randn(1, 3, 256, 448).cuda()
embt = torch.tensor(1/2).float().view(1, 1, 1, 1).cuda()

with torch.no_grad():
    # 100 untimed warm-up passes.
    for i in range(100):
        out = model(img0, img1, embt, eval=True)
    torch.cuda.synchronize()
    time_stamp = time.time()
    # 1000 timed passes; the printed figure is the total for all of them (divided by 1).
    for i in range(1000):
        out = model(img0, img1, embt, eval=True)
    torch.cuda.synchronize()
    elapsed = (time.time() - time_stamp) / 1
    print('Time: {:.5f}s'.format(elapsed))

total = sum(param.nelement() for param in model.parameters())
print('Parameters: {:.2f}M'.format(total / 1e6))


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/benchmarks/ucf101.py
================================================
import os
import sys
import tqdm
import torch
import argparse
import numpy as np
import os.path as osp
from omegaconf import OmegaConf

sys.path.append('.')
from utils.utils import read, img2tensor
from utils.build_utils import build_from_cfg
from metrics.psnr_ssim import calculate_psnr, calculate_ssim

# Command-line options: network config, checkpoint to evaluate, dataset root.
parser = argparse.ArgumentParser(
                prog = 'AMT',
                description = 'UCF101 evaluation',
                )
parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml')
parser.add_argument('-p', '--ckpt', default='pretrained/amt-s.pth')
parser.add_argument('-r', '--root', default='data/ucf101_interp_ours')
args = parser.parse_args()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cfg_path = args.config
ckpt_path = args.ckpt
root = args.root

# Build the network described by the config and restore its weights.
network_cfg = OmegaConf.load(cfg_path).network
network_name = network_cfg.name
model = build_from_cfg(network_cfg)
# map_location keeps the load working on the CPU fallback even when the
# checkpoint was saved from a CUDA device.
ckpt = torch.load(ckpt_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
model = model.to(device)
model.eval()

# One sub-directory per sample, each holding frame_00, frame_01_gt and frame_02.
dirs = sorted(os.listdir(root))
psnr_list = []
ssim_list = []
pbar = tqdm.tqdm(dirs, total=len(dirs))
for d in pbar:
    dir_path = osp.join(root, d)
    I0 = img2tensor(read(osp.join(dir_path, 'frame_00.png'))).to(device)
    I1 = img2tensor(read(osp.join(dir_path, 'frame_01_gt.png'))).to(device)
    I2 = img2tensor(read(osp.join(dir_path, 'frame_02.png'))).to(device)
    embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device)

    # Evaluation only: skip autograd bookkeeping, as the Adobe240/GOPRO scripts do.
    with torch.no_grad():
        I1_pred = model(I0, I2, embt, eval=True)['imgt_pred']

        psnr = calculate_psnr(I1_pred, I1).detach().cpu().numpy()
        ssim = calculate_ssim(I1_pred, I1).detach().cpu().numpy()

    psnr_list.append(psnr)
    ssim_list.append(ssim)

    avg_psnr = np.mean(psnr_list)
    avg_ssim = np.mean(ssim_list)
    desc_str = f'[{network_name}/UCF101] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}'
    pbar.set_description_str(desc_str)

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/benchmarks/vimeo90k.py
================================================
import sys
import tqdm
import torch
import argparse
import numpy as np
import os.path as osp
from omegaconf import OmegaConf

sys.path.append('.')
from utils.utils import read, img2tensor
from utils.build_utils import build_from_cfg
from metrics.psnr_ssim import calculate_psnr, calculate_ssim

# Command-line options: network config, checkpoint to evaluate, dataset root.
parser = argparse.ArgumentParser(
                prog = 'AMT',
                description = 'Vimeo90K evaluation',
                )
parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml')
parser.add_argument('-p', '--ckpt', default='pretrained/amt-s.pth',)
parser.add_argument('-r', '--root', default='data/vimeo_triplet',)
args = parser.parse_args()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cfg_path = args.config
ckpt_path = args.ckpt
root = args.root

# Build the network described by the config and restore its weights.
network_cfg = OmegaConf.load(cfg_path).network
network_name = network_cfg.name
model = build_from_cfg(network_cfg)
# map_location keeps the load working on the CPU fallback even when the
# checkpoint was saved from a CUDA device.
ckpt = torch.load(ckpt_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
model = model.to(device)
model.eval()

with open(osp.join(root, 'tri_testlist.txt'), 'r') as fr:
    file_list = fr.readlines()

psnr_list = []
ssim_list = []

pbar = tqdm.tqdm(file_list, total=len(file_list))
for name in pbar:
    name = str(name).strip()
    # Skip blank (or single-character) lines in the list file.
    if(len(name) <= 1):
        continue
    dir_path = osp.join(root, 'sequences', name)
    I0 = img2tensor(read(osp.join(dir_path, 'im1.png'))).to(device)
    I1 = img2tensor(read(osp.join(dir_path, 'im2.png'))).to(device)
    I2 = img2tensor(read(osp.join(dir_path, 'im3.png'))).to(device)
    embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device)

    # Evaluation only: skip autograd bookkeeping, as the Adobe240/GOPRO scripts do.
    with torch.no_grad():
        I1_pred = model(I0, I2, embt,
                        scale_factor=1.0, eval=True)['imgt_pred']

        psnr = calculate_psnr(I1_pred, I1).detach().cpu().numpy()
        ssim = calculate_ssim(I1_pred, I1).detach().cpu().numpy()

    psnr_list.append(psnr)
    ssim_list.append(ssim)
    avg_psnr = np.mean(psnr_list)
    avg_ssim = np.mean(ssim_list)
    desc_str = f'[{network_name}/Vimeo90K] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}'
    pbar.set_description_str(desc_str)


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/benchmarks/vimeo90k_tta.py
================================================
import sys
import tqdm
import torch
import argparse
import numpy as np
import os.path as osp
from omegaconf import OmegaConf

sys.path.append('.')
from utils.utils import read, img2tensor
from utils.build_utils import build_from_cfg
from metrics.psnr_ssim import calculate_psnr, calculate_ssim

# Command-line interface of the Vimeo90K test-time-augmentation benchmark.
parser = argparse.ArgumentParser(
                prog = 'AMT',
                description = 'Vimeo90K evaluation (with Test-Time Augmentation)',
                )
parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml',
                    help='YAML config whose `network` section describes the model')
# The short flag must be '-p': argparse rejects a mix of a positional name
# ('p') and an option string ('--ckpt') with a ValueError at start-up.
parser.add_argument('-p', '--ckpt', default='pretrained/amt-s.pth',
                    help='checkpoint file containing a `state_dict` entry')
parser.add_argument('-r', '--root', default='data/vimeo_triplet',
                    help='dataset root holding tri_testlist.txt and sequences/')
args = parser.parse_args()

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cfg_path = args.config
ckpt_path = args.ckpt
root = args.root

# The `network` section of the YAML config describes the model to build.
network_cfg = OmegaConf.load(cfg_path).network
network_name = network_cfg.name
model = build_from_cfg(network_cfg)
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved from a GPU still loads when the CPU fallback is in use.
ckpt = torch.load(ckpt_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
model = model.to(device)
model.eval()

# Every non-trivial line of tri_testlist.txt names one triplet directory
# located under <root>/sequences.
with open(osp.join(root, 'tri_testlist.txt'), 'r') as fr:
    file_list = fr.readlines()

psnr_list = []
ssim_list = []

# The time embedding is the same for every triplet, so build it once.
embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device)

pbar = tqdm.tqdm(file_list, total=len(file_list))
for name in pbar:
    name = str(name).strip()
    # Skip blank (or single-character) lines of the list file.
    if(len(name) <= 1):
        continue
    dir_path = osp.join(root, 'sequences', name)
    I0 = img2tensor(read(osp.join(dir_path, 'im1.png'))).to(device)
    I1 = img2tensor(read(osp.join(dir_path, 'im2.png'))).to(device)
    I2 = img2tensor(read(osp.join(dir_path, 'im3.png'))).to(device)

    # Pure inference: disable autograd so the two forward passes do not
    # record graphs (matches the Xiph benchmark, which uses no_grad too).
    with torch.no_grad():
        I1_pred1 = model(I0, I2, embt,
                         scale_factor=1.0, eval=True)['imgt_pred']
        # Test-time augmentation: predict again on inputs flipped along
        # dim 2, flip that prediction back, and average both predictions.
        I1_pred2 = model(torch.flip(I0, [2]), torch.flip(I2, [2]), embt,
                         scale_factor=1.0, eval=True)['imgt_pred']
        I1_pred = I1_pred1 / 2 + torch.flip(I1_pred2, [2]) / 2
    psnr = calculate_psnr(I1_pred, I1).detach().cpu().numpy()
    ssim = calculate_ssim(I1_pred, I1).detach().cpu().numpy()

    psnr_list.append(psnr)
    ssim_list.append(ssim)
    # Running averages over all triplets evaluated so far.
    avg_psnr = np.mean(psnr_list)
    avg_ssim = np.mean(ssim_list)
    desc_str = f'[{network_name}/Vimeo90K] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}'
    pbar.set_description_str(desc_str)


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/benchmarks/xiph.py
================================================
import os
import sys
import cv2
import tqdm
import glob
import torch
import argparse
import numpy as np
import os.path as osp
from omegaconf import OmegaConf

sys.path.append('.')
from utils.utils import InputPadder, read, img2tensor
from utils.build_utils import build_from_cfg
from metrics.psnr_ssim import calculate_psnr, calculate_ssim

# Command-line interface: every option is registered from a
# (short flag, long flag, default value) triple.
parser = argparse.ArgumentParser(prog='AMT', description='Xiph evaluation')
for short_flag, long_flag, default_value in (
        ('-c', '--config', 'cfgs/AMT-S.yaml'),
        ('-p', '--ckpt', 'pretrained/amt-s.pth'),
        ('-r', '--root', 'data/xiph'),
):
    parser.add_argument(short_flag, long_flag, default=default_value)
args = parser.parse_args()

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cfg_path = args.config
ckpt_path = args.ckpt
root = args.root

# The `network` section of the YAML config describes the model to build.
network_cfg = OmegaConf.load(cfg_path).network
network_name = network_cfg.name
model = build_from_cfg(network_cfg)
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved from a GPU still loads when the CPU fallback is in use.
ckpt = torch.load(ckpt_path, map_location=device)
# NOTE(review): the second positional argument is `strict`; False means
# missing/unexpected keys are ignored silently, unlike the Vimeo90K scripts.
model.load_state_dict(ckpt['state_dict'], False)
model = model.to(device)
model.eval()

############################################# Prepare Dataset #############################################
# Source videos, listed in the same order as `file_list` below (the two
# sequences are zipped together when extracting frames).
download_links = [
    'https://media.xiph.org/video/derf/ElFuente/Netflix_BoxingPractice_4096x2160_60fps_10bit_420.y4m',
    'https://media.xiph.org/video/derf/ElFuente/Netflix_Crosswalk_4096x2160_60fps_10bit_420.y4m',
    'https://media.xiph.org/video/derf/Chimera/Netflix_DrivingPOV_4096x2160_60fps_10bit_420.y4m',
    'https://media.xiph.org/video/derf/ElFuente/Netflix_FoodMarket_4096x2160_60fps_10bit_420.y4m',
    'https://media.xiph.org/video/derf/ElFuente/Netflix_FoodMarket2_4096x2160_60fps_10bit_420.y4m',
    'https://media.xiph.org/video/derf/ElFuente/Netflix_RitualDance_4096x2160_60fps_10bit_420.y4m',
    'https://media.xiph.org/video/derf/ElFuente/Netflix_SquareAndTimelapse_4096x2160_60fps_10bit_420.y4m',
    'https://media.xiph.org/video/derf/ElFuente/Netflix_Tango_4096x2160_60fps_10bit_420.y4m',
]
file_list = ['BoxingPractice', 'Crosswalk', 'DrivingPOV', 'FoodMarket', 'FoodMarket2', 'RitualDance', 
             'SquareAndTimelapse', 'Tango']

for file_name, link in zip(file_list, download_links):
    data_dir = osp.join(root, file_name)
    # exist_ok replaces the separate existence check and its race window.
    os.makedirs(data_dir, exist_ok=True)
    # Extract the first 100 frames only when they are not already on disk.
    if len(glob.glob(f'{data_dir}/*.png')) < 100:
        # NOTE(review): the command is a shell string; a `root` containing
        # spaces or shell metacharacters will break it, and the ffmpeg exit
        # status is not checked.
        os.system(f'ffmpeg -i {link} -pix_fmt rgb24 -vframes 100 {data_dir}/%03d.png')
############################################### Prepare End ###############################################


# Inputs are padded to a multiple of `divisor`; `scale_factor` is forwarded
# to the model unchanged.
divisor = 32
scale_factor = 0.5
# The time embedding is the same for every sample, so build it once.
embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device)

for category in ['resized-2k', 'cropped-4k']:
    # Metrics are accumulated separately for each category.
    psnr_list = []
    ssim_list = []
    pbar = tqdm.tqdm(file_list, total=len(file_list))
    for file_name in pbar:
        dir_name = osp.join(root, file_name)
        # Even frames 2..98 are the targets; their odd neighbours are inputs.
        for intFrame in range(2, 99, 2):
            img0 = read(f'{dir_name}/{intFrame - 1:03d}.png')
            img1 = read(f'{dir_name}/{intFrame + 1:03d}.png')
            imgt = read(f'{dir_name}/{intFrame:03d}.png')

            if category == 'resized-2k':
                img0 = cv2.resize(src=img0, dsize=(2048, 1080), fx=0.0, fy=0.0, interpolation=cv2.INTER_AREA)
                img1 = cv2.resize(src=img1, dsize=(2048, 1080), fx=0.0, fy=0.0, interpolation=cv2.INTER_AREA)
                imgt = cv2.resize(src=imgt, dsize=(2048, 1080), fx=0.0, fy=0.0, interpolation=cv2.INTER_AREA)

            elif category == 'cropped-4k':
                # Drop 540 rows and 1024 columns from every border.
                img0 = img0[540:-540, 1024:-1024, :]
                img1 = img1[540:-540, 1024:-1024, :]
                imgt = imgt[540:-540, 1024:-1024, :]
            img0 = img2tensor(img0).to(device)
            imgt = img2tensor(imgt).to(device)
            img1 = img2tensor(img1).to(device)

            padder = InputPadder(img0.shape, divisor)
            img0, img1 = padder.pad(img0, img1)

            with torch.no_grad():
                imgt_pred = model(img0, img1, embt, scale_factor=scale_factor, eval=True)['imgt_pred']
                imgt_pred = padder.unpad(imgt_pred)

            # Convert to host-side numpy values, as the Vimeo90K benchmarks
            # do, so np.mean never has to consume device tensors.
            psnr = calculate_psnr(imgt_pred, imgt).detach().cpu().numpy()
            ssim = calculate_ssim(imgt_pred, imgt).detach().cpu().numpy()

            # Append first, then average: the reported means must include
            # the current sample (and must not be taken over an empty list).
            psnr_list.append(psnr)
            ssim_list.append(ssim)
            avg_psnr = np.mean(psnr_list)
            avg_ssim = np.mean(ssim_list)
            desc_str = f'[{network_name}/Xiph] [{category}/{file_name}] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}'

            pbar.set_description_str(desc_str)

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/datasets/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/datasets/adobe_datasets.py
================================================
'''
    This code is partially borrowed from IFRNet (https://github.com/ltkong218/IFRNet). 
'''
import os
import sys
import torch
import numpy as np
from torch.utils.data import Dataset
sys.path.append('.')
from utils.utils import read, img2tensor
from datasets.gopro_datasets import (
    random_resize_woflow, random_crop_woflow, center_crop_woflow,
    random_reverse_channel_woflow, random_vertical_flip_woflow,
    random_horizontal_flip_woflow, random_rotate_woflow, 
    random_reverse_time_woflow
)


class Adobe240_Dataset(Dataset):
    """Multi-frame interpolation samples built from per-video frame folders.

    Each video folder is cut into overlapping groups of ``interFrames + 2``
    consecutive frames (consecutive groups share one boundary frame). The
    first and last frame of a group are the inputs; each of the
    ``interFrames`` frames in between is one target, so every group yields
    ``interFrames`` samples.

    Args:
        dataset_dir: directory containing one sub-directory per video.
        interFrames: number of intermediate frames between the two inputs.
        augment: apply the random augmentation chain (224x224 random crop)
            when exactly ``True``; otherwise a 512x512 center crop.
    """

    def __init__(self, dataset_dir='data/adobe240/test_frames', interFrames=7, augment=True):
        super().__init__()
        self.augment = augment
        self.interFrames = interFrames
        self.setLength = interFrames + 2
        self.dataset_dir = os.path.join(dataset_dir)
        # os.listdir returns entries in arbitrary order; sort before taking
        # every 10th video so the selected subset is reproducible.
        video_list = sorted(os.listdir(self.dataset_dir))[9::10]
        self.frames_list = []
        self.file_list = []
        stride = interFrames + 1
        for video in video_list:
            frames = sorted(os.listdir(os.path.join(self.dataset_dir, video)))
            # Number of full groups that fit when advancing by `stride`.
            n_sets = (len(frames) - self.setLength) // stride + 1
            for i in range(n_sets):
                group = frames[stride * i: stride * i + self.setLength]
                self.file_list.append([os.path.join(video, f) for f in group])

    def __getitem__(self, idx):
        """Return a dict with ``img0``, ``imgt``, ``img1`` and ``embt``."""
        # idx enumerates (group, intermediate-frame) pairs.
        clip_idx = idx // self.interFrames
        embt_idx = idx % self.interFrames
        imgpaths = [os.path.join(self.dataset_dir, fp) for fp in self.file_list[clip_idx]]
        # Inputs: first and last frame of the group.
        pick_idxs = list(range(0, self.setLength, self.interFrames + 1))
        # Targets: the frames strictly between the two inputs.
        imgt_beg = self.setLength // 2 - self.interFrames // 2
        imgt_end = self.setLength // 2 + self.interFrames // 2 + self.interFrames % 2
        imgt_idx = list(range(imgt_beg, imgt_end))
        input_paths = [imgpaths[i] for i in pick_idxs]
        imgt_paths = [imgpaths[i] for i in imgt_idx]

        img0 = np.array(read(input_paths[0]))
        imgt = np.array(read(imgt_paths[embt_idx]))
        img1 = np.array(read(input_paths[1]))
        # Fractional position of the target between the inputs, shape (1, 1, 1).
        embt = torch.from_numpy(np.array((embt_idx + 1) / (self.interFrames + 1)
                                         ).reshape(1, 1, 1).astype(np.float32))

        if self.augment == True:
            img0, imgt, img1 = random_resize_woflow(img0, imgt, img1, p=0.1)
            img0, imgt, img1 = random_crop_woflow(img0, imgt, img1, crop_size=(224, 224))
            img0, imgt, img1 = random_reverse_channel_woflow(img0, imgt, img1, p=0.5)
            img0, imgt, img1 = random_vertical_flip_woflow(img0, imgt, img1, p=0.3)
            img0, imgt, img1 = random_horizontal_flip_woflow(img0, imgt, img1, p=0.5)
            img0, imgt, img1 = random_rotate_woflow(img0, imgt, img1, p=0.05)
            img0, imgt, img1, embt = random_reverse_time_woflow(img0, imgt, img1,
                                                                embt=embt, p=0.5)
        else:
            img0, imgt, img1 = center_crop_woflow(img0, imgt, img1, crop_size=(512, 512))

        # The flip/reverse augmentations return reversed (negatively strided)
        # views; hand img2tensor a fresh contiguous copy, exactly as
        # GoPro_Train_Dataset does.
        img0 = img2tensor(img0.copy()).squeeze(0)
        imgt = img2tensor(imgt.copy()).squeeze(0)
        img1 = img2tensor(img1.copy()).squeeze(0)

        return {'img0': img0.float(),
                'imgt': imgt.float(),
                'img1': img1.float(),
                'embt': embt}

    def __len__(self):
        """Number of samples: one per (group, intermediate frame) pair."""
        return len(self.file_list) * self.interFrames


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/datasets/gopro_datasets.py
================================================
'''
    This code is partially borrowed from IFRNet (https://github.com/ltkong218/IFRNet). 
    Because generating flow supervision is difficult in this setting, the
    flow loss is omitted in the 8x case.
'''
import os
import cv2
import torch
import random
import numpy as np
from torch.utils.data import Dataset
from utils.utils import read, img2tensor

def random_resize_woflow(img0, imgt, img1, p=0.1):
    """With probability ``p``, upscale all three frames by 2x (bilinear).

    Returns the (possibly resized) frames as ``(img0, imgt, img1)``.
    """
    if not (random.uniform(0, 1) < p):
        return img0, imgt, img1
    # All frames get the same 2x bilinear upscale, in input order.
    return tuple(
        cv2.resize(frame, dsize=None, fx=2.0, fy=2.0, interpolation=cv2.INTER_LINEAR)
        for frame in (img0, imgt, img1)
    )

def random_crop_woflow(img0, imgt, img1, crop_size=(224, 224)):
    """Cut the same randomly placed window out of all three frames.

    ``crop_size`` is ``(height, width)``; the window offset is drawn with
    ``np.random.randint`` from the positions that keep it inside ``img0``.
    """
    crop_h, crop_w = crop_size[0], crop_size[1]
    full_h, full_w, _ = img0.shape
    # Row offset is drawn first, then the column offset.
    top = np.random.randint(0, full_h - crop_h + 1)
    left = np.random.randint(0, full_w - crop_w + 1)
    rows = slice(top, top + crop_h)
    cols = slice(left, left + crop_w)
    return img0[rows, cols, :], imgt[rows, cols, :], img1[rows, cols, :]

def center_crop_woflow(img0, imgt, img1, crop_size=(512, 512)):
    """Cut the same centered window out of all three frames.

    ``crop_size`` is ``(height, width)``; the window spans ``size // 2``
    pixels on either side of the center of ``img0`` along each axis.
    """
    crop_h, crop_w = crop_size[0], crop_size[1]
    full_h, full_w, _ = img0.shape
    mid_row, mid_col = full_h // 2, full_w // 2
    rows = slice(mid_row - crop_h // 2, mid_row + crop_h // 2)
    cols = slice(mid_col - crop_w // 2, mid_col + crop_w // 2)
    return img0[rows, cols, :], imgt[rows, cols, :], img1[rows, cols, :]

def random_reverse_channel_woflow(img0, imgt, img1, p=0.5):
    """With probability ``p``, reverse the last (channel) axis of all frames."""
    frames = (img0, imgt, img1)
    if random.uniform(0, 1) < p:
        frames = tuple(frame[:, :, ::-1] for frame in frames)
    return frames

def random_vertical_flip_woflow(img0, imgt, img1, p=0.3):
    """With probability ``p``, reverse the first (row) axis of all frames."""
    frames = (img0, imgt, img1)
    if random.uniform(0, 1) < p:
        frames = tuple(frame[::-1] for frame in frames)
    return frames

def random_horizontal_flip_woflow(img0, imgt, img1, p=0.5):
    """With probability ``p``, reverse the second (column) axis of all frames."""
    frames = (img0, imgt, img1)
    if random.uniform(0, 1) < p:
        frames = tuple(frame[:, ::-1] for frame in frames)
    return frames

def random_rotate_woflow(img0, imgt, img1, p=0.05):
    """With probability ``p``, swap the first two axes of all frames."""
    frames = (img0, imgt, img1)
    if random.uniform(0, 1) < p:
        frames = tuple(frame.transpose((1, 0, 2)) for frame in frames)
    return frames

def random_reverse_time_woflow(img0, imgt, img1, embt, p=0.5):
    """With probability ``p``, reverse the temporal order of the sample.

    When the reversal fires, the two input frames are swapped and the time
    embedding is mirrored to ``1 - embt`` so it still describes the position
    of ``imgt`` relative to the (new) first frame. Otherwise everything is
    returned unchanged.

    Returns:
        ``(img0, imgt, img1, embt)``.
    """
    if random.uniform(0, 1) < p:
        img0, img1 = img1, img0
        # Mirror the embedding only together with the swap; doing it
        # unconditionally would mislabel every non-reversed sample.
        embt = 1 - embt
    return img0, imgt, img1, embt

class GoPro_Train_Dataset(Dataset):
    """Training samples cut from the GoPro videos under ``<dataset_dir>/train``.

    Every listed video is split into overlapping groups of
    ``interFrames + 2`` consecutive frames. The first and last frame of a
    group are the inputs, and each frame in between is one target, so a
    group contributes ``interFrames`` samples.
    """

    def __init__(self, dataset_dir='data/GOPRO', interFrames=7, augment=True):
        self.dataset_dir = dataset_dir + '/train'
        self.interFrames = interFrames
        self.augment = augment
        self.setLength = interFrames + 2
        video_list = [
            'GOPR0372_07_00', 'GOPR0374_11_01', 'GOPR0378_13_00', 'GOPR0384_11_01',
            'GOPR0384_11_04', 'GOPR0477_11_00', 'GOPR0868_11_02', 'GOPR0884_11_00',
            'GOPR0372_07_01', 'GOPR0374_11_02', 'GOPR0379_11_00', 'GOPR0384_11_02',
            'GOPR0385_11_00', 'GOPR0857_11_00', 'GOPR0871_11_01', 'GOPR0374_11_00',
            'GOPR0374_11_03', 'GOPR0380_11_00', 'GOPR0384_11_03', 'GOPR0386_11_00',
            'GOPR0868_11_01', 'GOPR0881_11_00']
        self.frames_list = []
        self.file_list = []
        # Consecutive groups start `step` frames apart, sharing one frame.
        step = interFrames + 1
        for video in video_list:
            frames = sorted(os.listdir(os.path.join(self.dataset_dir, video)))
            n_sets = (len(frames) - self.setLength) // step + 1
            for set_idx in range(n_sets):
                first = step * set_idx
                window = frames[first:first + self.setLength]
                self.file_list.append([os.path.join(video, frame) for frame in window])

    def __len__(self):
        """One sample per (group, intermediate frame) pair."""
        return self.interFrames * len(self.file_list)

    def __getitem__(self, idx):
        """Return a dict with ``img0``, ``imgt``, ``img1`` and ``embt``."""
        clip_idx = idx // self.interFrames
        embt_idx = idx % self.interFrames
        clip_paths = [os.path.join(self.dataset_dir, rel) for rel in self.file_list[clip_idx]]

        # Inputs are the first and last frame of the group.
        input_paths = [clip_paths[k] for k in range(0, self.setLength, self.interFrames + 1)]
        # Targets are the frames strictly between them.
        half = self.setLength // 2
        target_beg = half - self.interFrames // 2
        target_end = half + self.interFrames // 2 + self.interFrames % 2
        imgt_paths = [clip_paths[k] for k in range(target_beg, target_end)]

        # Fractional position of the target between the inputs, shape (1, 1, 1).
        position = (embt_idx + 1) / (self.interFrames + 1)
        embt = torch.from_numpy(np.array(position).reshape(1, 1, 1).astype(np.float32))
        img0 = np.array(read(input_paths[0]))
        imgt = np.array(read(imgt_paths[embt_idx]))
        img1 = np.array(read(input_paths[1]))

        if self.augment == True:
            # Frame-only augmentations, applied in a fixed order.
            frame_augmentations = (
                (random_resize_woflow, {'p': 0.1}),
                (random_crop_woflow, {'crop_size': (224, 224)}),
                (random_reverse_channel_woflow, {'p': 0.5}),
                (random_vertical_flip_woflow, {'p': 0.3}),
                (random_horizontal_flip_woflow, {'p': 0.5}),
                (random_rotate_woflow, {'p': 0.05}),
            )
            for augmentation, options in frame_augmentations:
                img0, imgt, img1 = augmentation(img0, imgt, img1, **options)
            # Time reversal also has to see the embedding.
            img0, imgt, img1, embt = random_reverse_time_woflow(
                img0, imgt, img1, embt=embt, p=0.5)
        else:
            img0, imgt, img1 = center_crop_woflow(img0, imgt, img1, crop_size=(512, 512))

        # Copy before conversion: the augmentations may return array views.
        sample = {}
        for key, frame in (('img0', img0), ('imgt', imgt), ('img1', img1)):
            sample[key] = img2tensor(frame.copy()).squeeze(0).float()
        sample['embt'] = embt
        return sample

class GoPro_Test_Dataset(Dataset):
    """Evaluation samples cut from the GoPro videos under ``<dataset_dir>/test``.

    Videos are split into overlapping groups of ``interFrames + 2`` frames
    exactly as in the training set; samples are center-cropped to 512x512
    and never augmented.
    """

    def __init__(self, dataset_dir='data/GOPRO', interFrames=7):
        self.dataset_dir = dataset_dir + '/test'
        self.interFrames = interFrames
        self.setLength = interFrames + 2
        video_list = [
            'GOPR0384_11_00', 'GOPR0385_11_01', 'GOPR0410_11_00',
            'GOPR0862_11_00', 'GOPR0869_11_00', 'GOPR0881_11_01',
            'GOPR0384_11_05', 'GOPR0396_11_00', 'GOPR0854_11_00',
            'GOPR0868_11_00', 'GOPR0871_11_00']
        self.frames_list = []
        self.file_list = []
        # Consecutive groups start `step` frames apart, sharing one frame.
        step = interFrames + 1
        for video in video_list:
            frames = sorted(os.listdir(os.path.join(self.dataset_dir, video)))
            n_sets = (len(frames) - self.setLength) // step + 1
            for set_idx in range(n_sets):
                first = step * set_idx
                window = frames[first:first + self.setLength]
                self.file_list.append([os.path.join(video, frame) for frame in window])

    def __len__(self):
        """One sample per (group, intermediate frame) pair."""
        return self.interFrames * len(self.file_list)

    def __getitem__(self, idx):
        """Return a dict with ``img0``, ``imgt``, ``img1`` and ``embt``."""
        clip_idx = idx // self.interFrames
        embt_idx = idx % self.interFrames
        clip_paths = [os.path.join(self.dataset_dir, rel) for rel in self.file_list[clip_idx]]

        # Inputs are the first and last frame of the group.
        input_paths = [clip_paths[k] for k in range(0, self.setLength, self.interFrames + 1)]
        # Targets are the frames strictly between them.
        half = self.setLength // 2
        target_beg = half - self.interFrames // 2
        target_end = half + self.interFrames // 2 + self.interFrames % 2
        imgt_paths = [clip_paths[k] for k in range(target_beg, target_end)]

        img0 = np.array(read(input_paths[0]))
        imgt = np.array(read(imgt_paths[embt_idx]))
        img1 = np.array(read(input_paths[1]))

        img0, imgt, img1 = center_crop_woflow(img0, imgt, img1, crop_size=(512, 512))

        sample = {}
        for key, frame in (('img0', img0), ('imgt', imgt), ('img1', img1)):
            sample[key] = img2tensor(frame).squeeze(0).float()

        # Fractional position of the target between the inputs, shape (1, 1, 1).
        position = (embt_idx + 1) / (self.interFrames + 1)
        sample['embt'] = torch.from_numpy(
            np.array(position).reshape(1, 1, 1).astype(np.float32))
        return sample

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/datasets/vimeo_datasets.py
================================================
'''
    This code is partially borrowed from IFRNet (https://github.com/ltkong218/IFRNet). 
'''
import os
import cv2
import torch
import random
import numpy as np
from torch.utils.data import Dataset
from utils.utils import read


def random_resize(img0, imgt, img1, flow, p=0.1):
    """With probability ``p``, upscale frames and flow by 2x (bilinear).

    The resized flow values are additionally multiplied by 2.0.
    """
    if not (random.uniform(0, 1) < p):
        return img0, imgt, img1, flow

    def _upscale(array):
        return cv2.resize(array, dsize=None, fx=2.0, fy=2.0, interpolation=cv2.INTER_LINEAR)

    img0 = _upscale(img0)
    imgt = _upscale(imgt)
    img1 = _upscale(img1)
    flow = _upscale(flow) * 2.0
    return img0, imgt, img1, flow

def random_crop(img0, imgt, img1, flow, crop_size=(224, 224)):
    """Cut the same randomly placed window out of the frames and the flow.

    ``crop_size`` is ``(height, width)``; the window offset is drawn with
    ``np.random.randint`` from the positions that keep it inside ``img0``.
    """
    crop_h, crop_w = crop_size[0], crop_size[1]
    full_h, full_w, _ = img0.shape
    # Row offset is drawn first, then the column offset.
    top = np.random.randint(0, full_h - crop_h + 1)
    left = np.random.randint(0, full_w - crop_w + 1)
    rows = slice(top, top + crop_h)
    cols = slice(left, left + crop_w)
    return (img0[rows, cols, :], imgt[rows, cols, :],
            img1[rows, cols, :], flow[rows, cols, :])

def random_reverse_channel(img0, imgt, img1, flow, p=0.5):
    """With probability ``p``, reverse the last (channel) axis of the frames.

    The flow is passed through untouched.
    """
    if random.uniform(0, 1) < p:
        img0, imgt, img1 = (frame[:, :, ::-1] for frame in (img0, imgt, img1))
    return img0, imgt, img1, flow

def random_vertical_flip(img0, imgt, img1, flow, p=0.3):
    """With probability ``p``, reverse the first (row) axis of frames and flow.

    When the flip fires, flow channels 1 and 3 are negated and the flow is
    rebuilt from its first four channels.
    """
    if random.uniform(0, 1) < p:
        img0, imgt, img1, flow = (arr[::-1] for arr in (img0, imgt, img1, flow))
        c0, c1, c2, c3 = (flow[:, :, k:k + 1] for k in range(4))
        flow = np.concatenate((c0, -c1, c2, -c3), 2)
    return img0, imgt, img1, flow

def random_horizontal_flip(img0, imgt, img1, flow, p=0.5):
    """With probability ``p``, reverse the second (column) axis of frames and flow.

    When the flip fires, flow channels 0 and 2 are negated and the flow is
    rebuilt from its first four channels.
    """
    if random.uniform(0, 1) < p:
        img0, imgt, img1, flow = (arr[:, ::-1] for arr in (img0, imgt, img1, flow))
        c0, c1, c2, c3 = (flow[:, :, k:k + 1] for k in range(4))
        flow = np.concatenate((-c0, c1, -c2, c3), 2)
    return img0, imgt, img1, flow

def random_rotate(img0, imgt, img1, flow, p=0.05):
    """With probability ``p``, swap the first two axes of frames and flow.

    When the swap fires, flow channels 0/1 and 2/3 are exchanged pairwise
    and the flow is rebuilt from its first four channels.
    """
    if random.uniform(0, 1) < p:
        img0, imgt, img1, flow = (
            arr.transpose((1, 0, 2)) for arr in (img0, imgt, img1, flow))
        c0, c1, c2, c3 = (flow[:, :, k:k + 1] for k in range(4))
        flow = np.concatenate((c1, c0, c3, c2), 2)
    return img0, imgt, img1, flow

def random_reverse_time(img0, imgt, img1, flow, p=0.5):
    """With probability ``p``, swap the two input frames.

    When the swap fires, flow channels 2:4 and 0:2 trade places as well.
    """
    if random.uniform(0, 1) < p:
        img0, img1 = img1, img0
        first_pair, second_pair = flow[:, :, 0:2], flow[:, :, 2:4]
        flow = np.concatenate((second_pair, first_pair), 2)
    return img0, imgt, img1, flow


class Vimeo90K_Train_Dataset(Dataset):
    """Vimeo90K training triplets together with their stored flow files.

    ``tri_trainlist.txt`` inside ``dataset_dir`` lists one triplet per line.
    For each of them the three frames are taken from ``sequences/<name>``
    and the two flow files from ``<flow_dir>/<name>`` (``flow_dir`` falls
    back to ``'flow'`` when ``None``).
    """

    def __init__(self,
                 dataset_dir='data/vimeo_triplet',
                 flow_dir=None,
                 augment=True,
                 crop_size=(224, 224)):
        self.dataset_dir = dataset_dir
        self.augment = augment
        self.crop_size = crop_size
        self.img0_list = []
        self.imgt_list = []
        self.img1_list = []
        self.flow_t0_list = []
        self.flow_t1_list = []
        if flow_dir is None:
            flow_dir = 'flow'
        with open(os.path.join(dataset_dir, 'tri_trainlist.txt'), 'r') as f:
            for line in f:
                name = str(line).strip()
                # Ignore blank (or single-character) lines.
                if len(name) <= 1:
                    continue
                frames_dir = os.path.join(dataset_dir, 'sequences', name)
                flows_dir = os.path.join(dataset_dir, flow_dir, name)
                self.img0_list.append(os.path.join(frames_dir, 'im1.png'))
                self.imgt_list.append(os.path.join(frames_dir, 'im2.png'))
                self.img1_list.append(os.path.join(frames_dir, 'im3.png'))
                self.flow_t0_list.append(os.path.join(flows_dir, 'flow_t0.flo'))
                self.flow_t1_list.append(os.path.join(flows_dir, 'flow_t1.flo'))

    def __len__(self):
        """Number of triplets listed in the train list."""
        return len(self.imgt_list)

    def __getitem__(self, idx):
        """Return a dict with ``img0``, ``imgt``, ``img1``, ``flow`` and ``embt``."""
        img0 = read(self.img0_list[idx])
        imgt = read(self.imgt_list[idx])
        img1 = read(self.img1_list[idx])
        flow_t0 = read(self.flow_t0_list[idx])
        flow_t1 = read(self.flow_t1_list[idx])
        # Both flows are stacked along the last axis.
        flow = np.concatenate((flow_t0, flow_t1), 2).astype(np.float64)

        if self.augment == True:
            # Augmentations run in this fixed order on frames and flow together.
            augmentations = (
                (random_resize, {'p': 0.1}),
                (random_crop, {'crop_size': self.crop_size}),
                (random_reverse_channel, {'p': 0.5}),
                (random_vertical_flip, {'p': 0.3}),
                (random_horizontal_flip, {'p': 0.5}),
                (random_rotate, {'p': 0.05}),
                (random_reverse_time, {'p': 0.5}),
            )
            for augmentation, options in augmentations:
                img0, imgt, img1, flow = augmentation(img0, imgt, img1, flow, **options)

        def _channels_first(array):
            # (H, W, C) -> (C, H, W) as float32.
            return array.transpose((2, 0, 1)).astype(np.float32)

        sample = {}
        for key, frame in (('img0', img0), ('imgt', imgt), ('img1', img1)):
            # Frames are scaled by 1/255.
            sample[key] = torch.from_numpy(_channels_first(frame) / 255.0).float()
        sample['flow'] = torch.from_numpy(_channels_first(flow)).float()
        sample['embt'] = torch.from_numpy(np.array(1/2).reshape(1, 1, 1).astype(np.float32))
        return sample


class Vimeo90K_Test_Dataset(Dataset):
    """Vimeo90K test triplets together with their stored flow files.

    ``tri_testlist.txt`` inside ``dataset_dir`` lists one triplet per line.
    Frames come from ``sequences/<name>`` and flows from ``flow/<name>``;
    no augmentation is applied.
    """

    def __init__(self, dataset_dir='data/vimeo_triplet'):
        self.dataset_dir = dataset_dir
        self.img0_list = []
        self.imgt_list = []
        self.img1_list = []
        self.flow_t0_list = []
        self.flow_t1_list = []
        with open(os.path.join(dataset_dir, 'tri_testlist.txt'), 'r') as f:
            for line in f:
                name = str(line).strip()
                # Ignore blank (or single-character) lines.
                if len(name) <= 1:
                    continue
                frames_dir = os.path.join(dataset_dir, 'sequences', name)
                flows_dir = os.path.join(dataset_dir, 'flow', name)
                self.img0_list.append(os.path.join(frames_dir, 'im1.png'))
                self.imgt_list.append(os.path.join(frames_dir, 'im2.png'))
                self.img1_list.append(os.path.join(frames_dir, 'im3.png'))
                self.flow_t0_list.append(os.path.join(flows_dir, 'flow_t0.flo'))
                self.flow_t1_list.append(os.path.join(flows_dir, 'flow_t1.flo'))

    def __len__(self):
        """Number of triplets listed in the test list."""
        return len(self.imgt_list)

    def __getitem__(self, idx):
        """Return a dict with ``img0``, ``imgt``, ``img1``, ``flow`` and ``embt``."""
        img0 = read(self.img0_list[idx])
        imgt = read(self.imgt_list[idx])
        img1 = read(self.img1_list[idx])
        flow_t0 = read(self.flow_t0_list[idx])
        flow_t1 = read(self.flow_t1_list[idx])
        # Both flows are stacked along the last axis.
        flow = np.concatenate((flow_t0, flow_t1), 2)

        def _channels_first(array):
            # (H, W, C) -> (C, H, W) as float32.
            return array.transpose((2, 0, 1)).astype(np.float32)

        sample = {}
        for key, frame in (('img0', img0), ('imgt', imgt), ('img1', img1)):
            # Frames are scaled by 1/255.
            sample[key] = torch.from_numpy(_channels_first(frame) / 255.0).float()
        sample['flow'] = torch.from_numpy(_channels_first(flow)).float()
        sample['embt'] = torch.from_numpy(np.array(1/2).reshape(1, 1, 1).astype(np.float32))
        return sample


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/flow_generation/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/flow_generation/gen_flow.py
================================================
import os
import sys
import torch
import argparse
import numpy as np
import os.path as osp
import torch.nn.functional as F

sys.path.append('.')
from utils.utils import read, write
from flow_generation.liteflownet.run import estimate

# Command-line interface: the only option is the Vimeo90K root directory.
parser = argparse.ArgumentParser(prog='AMT', description='Flow generation')
parser.add_argument('-r', '--root', default='data/vimeo_triplet')
args = parser.parse_args()

# Frames are read from <root>/sequences and flows are written to <root>/flow.
vimeo90k_dir = args.root
vimeo90k_sequences_dir, vimeo90k_flow_dir = (
    osp.join(vimeo90k_dir, sub_dir) for sub_dir in ('sequences', 'flow'))

def pred_flow(img1, img2):
    """Estimate the flow between two HWC numpy images.

    Both images are converted to float CHW tensors scaled by 1/255 and
    handed to ``estimate``; its result is returned as a channels-last
    numpy array.
    """
    def _as_chw_tensor(image):
        return torch.from_numpy(image).float().permute(2, 0, 1) / 255.0

    first = _as_chw_tensor(img1)
    second = _as_chw_tensor(img2)
    estimated = estimate(first, second)
    return estimated.permute(1, 2, 0).cpu().numpy()

# Mirror the two-level <sequences>/<path>/<id> layout under the flow root so
# the generation loop can write its .flo files straight into place.
print('Built Flow Path')
# exist_ok replaces the exists()-then-mkdir pairs and their race window.
os.makedirs(vimeo90k_flow_dir, exist_ok=True)

for sequences_path in sorted(os.listdir(vimeo90k_sequences_dir)):
    vimeo90k_sequences_path_dir = osp.join(vimeo90k_sequences_dir, sequences_path)
    vimeo90k_flow_path_dir = osp.join(vimeo90k_flow_dir, sequences_path)
    # Created explicitly so a sequence folder with no children is mirrored too.
    os.makedirs(vimeo90k_flow_path_dir, exist_ok=True)

    for sequences_id in sorted(os.listdir(vimeo90k_sequences_path_dir)):
        vimeo90k_flow_id_dir = osp.join(vimeo90k_flow_path_dir, sequences_id)
        os.makedirs(vimeo90k_flow_id_dir, exist_ok=True)

# For every triplet, estimate the flow from the middle frame to each of the
# two outer frames and store both next to each other in the flow tree.
for sequences_path in sorted(os.listdir(vimeo90k_sequences_dir)):
    vimeo90k_sequences_path_dir = os.path.join(vimeo90k_sequences_dir, sequences_path)
    vimeo90k_flow_path_dir = os.path.join(vimeo90k_flow_dir, sequences_path)

    for sequences_id in sorted(os.listdir(vimeo90k_sequences_path_dir)):
        vimeo90k_sequences_id_dir = os.path.join(vimeo90k_sequences_path_dir, sequences_id)
        vimeo90k_flow_id_dir = os.path.join(vimeo90k_flow_path_dir, sequences_id)

        img0_path, imgt_path, img1_path = (
            vimeo90k_sequences_id_dir + suffix
            for suffix in ('/im1.png', '/im2.png', '/im3.png'))
        flow_t0_path, flow_t1_path = (
            vimeo90k_flow_id_dir + suffix
            for suffix in ('/flow_t0.flo', '/flow_t1.flo'))

        img0, imgt, img1 = read(img0_path), read(imgt_path), read(img1_path)

        # Both flows are estimated before either file is written.
        flow_t0 = pred_flow(imgt, img0)
        flow_t1 = pred_flow(imgt, img1)

        for flow_path, flow_data in ((flow_t0_path, flow_t0), (flow_t1_path, flow_t1)):
            write(flow_path, flow_data)

    print('Written Sequences {}'.format(sequences_path))

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/flow_generation/liteflownet/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/flow_generation/liteflownet/run.py
================================================
#!/usr/bin/env python

import getopt
import math
import numpy
import PIL
import PIL.Image
import sys
import torch

# Import the cost-volume layer. The package-relative import is tried first;
# if it raises for any reason (the except clause is bare), './correlation'
# is put at the front of sys.path and a top-level `correlation` is imported.
# NOTE(review): the bare `except:` also hides genuine errors raised inside
# the correlation module itself — consider narrowing it.
try:
    from .correlation import correlation # the custom cost volume layer
except:
    sys.path.insert(0, './correlation'); import correlation # you should consider upgrading python
# end

##########################################################

# Version gate: the first two dot-separated components of torch.__version__
# are concatenated and compared as one integer (e.g. '1.3.x' -> 13).
assert(int(str('').join(torch.__version__.split('.')[0:2])) >= 13) # requires at least pytorch version 1.3.0

# NOTE: this is a process-wide switch executed at import time, so autograd
# is disabled for every module that imports this file, not only for the
# code in it.
torch.set_grad_enabled(False) # make sure to not compute gradients for computational performance

torch.backends.cudnn.enabled = True # make sure to use cudnn for computational performance

##########################################################

# Defaults; each may be overridden by a `--model/--one/--two/--out <value>`
# pair on the command line.
arguments_strModel = 'default' # 'default', or 'kitti', or 'sintel'
arguments_strOne = './images/one.png'
arguments_strTwo = './images/two.png'
arguments_strOut = './out.flo'

# The accepted long options are derived from sys.argv itself (every other
# argument, with its leading '--' stripped). This file is also imported as a
# library (e.g. by the flow-generation script, which has its own `-r` flag);
# in that case sys.argv belongs to the host program and may not parse here.
# Keep the defaults then, but still fail loudly when run as a script.
try:
    arguments_listParsed = getopt.getopt(sys.argv[1:], '', [ strParameter[2:] + '=' for strParameter in sys.argv[1::2] ])[0]
except getopt.GetoptError:
    if __name__ == '__main__':
        raise
    # end

    arguments_listParsed = []
# end

for strOption, strArgument in arguments_listParsed:
    if strOption == '--model' and strArgument != '': arguments_strModel = strArgument # which model to use
    if strOption == '--one' and strArgument != '': arguments_strOne = strArgument # path to the first frame
    if strOption == '--two' and strArgument != '': arguments_strTwo = strArgument # path to the second frame
    if strOption == '--out' and strArgument != '': arguments_strOut = strArgument # path to where the output should be stored
# end

##########################################################

# Cache of base sampling grids, keyed by flow shape and device.
backwarp_tenGrid = {}

def backwarp(tenInput, tenFlow):
    """Sample ``tenInput`` at the positions displaced by ``tenFlow``.

    A base grid spanning ``[-1 + 1/size, 1 - 1/size]`` along each spatial
    axis is built once per (flow shape, device) and cached. The flow's two
    channels are divided by ``(width - 1) / 2`` and ``(height - 1) / 2`` of
    ``tenInput`` respectively, added to the base grid, and the result is
    passed to ``grid_sample`` (bilinear, zero padding, align_corners=False).

    Args:
        tenInput: 4-D tensor to sample from.
        tenFlow: 4-D tensor whose channels 0 and 1 hold the displacement
            along the last and second-to-last axis.

    Returns:
        The sampled tensor.
    """
    # Key on the device as well, and build the grid where the flow lives:
    # a hard-coded .cuda() fails on CPU-only hosts and places the grid on
    # the wrong GPU when the flow is not on the current CUDA device.
    strKey = str(tenFlow.shape) + str(tenFlow.device)

    if strKey not in backwarp_tenGrid:
        tenHor = torch.linspace(-1.0 + (1.0 / tenFlow.shape[3]), 1.0 - (1.0 / tenFlow.shape[3]), tenFlow.shape[3]).view(1, 1, 1, -1).repeat(1, 1, tenFlow.shape[2], 1)
        tenVer = torch.linspace(-1.0 + (1.0 / tenFlow.shape[2]), 1.0 - (1.0 / tenFlow.shape[2]), tenFlow.shape[2]).view(1, 1, -1, 1).repeat(1, 1, 1, tenFlow.shape[3])

        backwarp_tenGrid[strKey] = torch.cat([ tenHor, tenVer ], 1).to(tenFlow.device)
    # end

    tenFlow = torch.cat([ tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0), tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0) ], 1)

    return torch.nn.functional.grid_sample(input=tenInput, grid=(backwarp_tenGrid[strKey] + tenFlow).permute(0, 2, 3, 1), mode='bilinear', padding_mode='zeros', align_corners=False)
# end

##########################################################

class Network(torch.nn.Module):
    """
    Optical flow network made of a feature pyramid and, for each of the pyramid levels 2 to 6,
    a Matching, a Subpixel and a Regularization module that successively update the flow.

    The constructor downloads the weights; the file is selected by the module-level
    arguments_strModel. The attribute names of the submodules are part of the state dict keys
    and must therefore stay as they are.
    """
    def __init__(self):
        super().__init__()

        class Features(torch.nn.Module):
            """
            Feature pyramid with six convolutional stages. The first stage keeps the resolution,
            every later stage starts with a stride-2 convolution.
            """
            def __init__(self):
                super().__init__()

                self.netOne = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=3, out_channels=32, kernel_size=7, stride=1, padding=3),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
                )

                self.netTwo = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
                )

                self.netThr = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
                )

                self.netFou = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=64, out_channels=96, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
                )

                self.netFiv = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=96, out_channels=128, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
                )

                self.netSix = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=128, out_channels=192, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
                )
            # end

            def forward(self, tenInput):
                """Run the six stages in sequence and return the output of every stage, finest first."""
                tenOne = self.netOne(tenInput)
                tenTwo = self.netTwo(tenOne)
                tenThr = self.netThr(tenTwo)
                tenFou = self.netFou(tenThr)
                tenFiv = self.netFiv(tenFou)
                tenSix = self.netSix(tenFiv)

                return [ tenOne, tenTwo, tenThr, tenFou, tenFiv, tenSix ]
            # end
        # end

        class Matching(torch.nn.Module):
            """
            Flow update from the correlation between the features of the first frame and the
            (warped) features of the second frame at pyramid level intLevel (2 to 6).
            """
            def __init__(self, intLevel):
                super().__init__()

                # factor applied to the flow before it is used for warping, one entry per level
                self.fltBackwarp = [ 0.0, 0.0, 10.0, 5.0, 2.5, 1.25, 0.625 ][intLevel]

                # only level 2 transforms its features (1x1 convolution from 32 to 64 channels), the other levels use them as they are
                if intLevel != 2:
                    self.netFeat = torch.nn.Sequential()

                elif intLevel == 2:
                    self.netFeat = torch.nn.Sequential(
                        torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=1, stride=1, padding=0),
                        torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
                    )

                # end

                # level 6 is the first level that is run and receives no flow, hence it has nothing to upsample
                if intLevel == 6:
                    self.netUpflow = None

                elif intLevel != 6:
                    self.netUpflow = torch.nn.ConvTranspose2d(in_channels=2, out_channels=2, kernel_size=4, stride=2, padding=1, bias=False, groups=2)

                # end

                # levels below 4 compute the correlation with a stride of 2 and upsample it afterwards (see forward)
                if intLevel >= 4:
                    self.netUpcorr = None

                elif intLevel < 4:
                    self.netUpcorr = torch.nn.ConvTranspose2d(in_channels=49, out_channels=49, kernel_size=4, stride=2, padding=1, bias=False, groups=49)

                # end

                self.netMain = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=49, out_channels=128, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=64, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=32, out_channels=2, kernel_size=[ 0, 0, 7, 5, 5, 3, 3 ][intLevel], stride=1, padding=[ 0, 0, 3, 2, 2, 1, 1 ][intLevel])
                )
            # end

            def forward(self, tenOne, tenTwo, tenFeaturesOne, tenFeaturesTwo, tenFlow):
                """
                Return the upsampled incoming flow (or 0.0 if tenFlow is None) plus a residual
                predicted from the feature correlation. tenOne and tenTwo are not used here;
                the signature is shared with Subpixel and Regularization.
                """
                tenFeaturesOne = self.netFeat(tenFeaturesOne)
                tenFeaturesTwo = self.netFeat(tenFeaturesTwo)

                # bring the flow of the coarser level to the resolution of this level
                if tenFlow is not None:
                    tenFlow = self.netUpflow(tenFlow)
                # end

                # warp the second frame's features towards the first frame using the current flow
                if tenFlow is not None:
                    tenFeaturesTwo = backwarp(tenInput=tenFeaturesTwo, tenFlow=tenFlow * self.fltBackwarp)
                # end

                if self.netUpcorr is None:
                    tenCorrelation = torch.nn.functional.leaky_relu(input=correlation.FunctionCorrelation(tenOne=tenFeaturesOne, tenTwo=tenFeaturesTwo, intStride=1), negative_slope=0.1, inplace=False)

                elif self.netUpcorr is not None:
                    tenCorrelation = self.netUpcorr(torch.nn.functional.leaky_relu(input=correlation.FunctionCorrelation(tenOne=tenFeaturesOne, tenTwo=tenFeaturesTwo, intStride=2), negative_slope=0.1, inplace=False))

                # end

                return (tenFlow if tenFlow is not None else 0.0) + self.netMain(tenCorrelation)
            # end
        # end

        class Subpixel(torch.nn.Module):
            """
            Flow update predicted from the concatenation of the first frame's features, the warped
            features of the second frame and the current flow at pyramid level intLevel (2 to 6).
            """
            def __init__(self, intLevel):
                super().__init__()

                # factor applied to the flow before it is used for warping, one entry per level
                self.fltBackward = [ 0.0, 0.0, 10.0, 5.0, 2.5, 1.25, 0.625 ][intLevel]

                # only level 2 transforms its features (1x1 convolution from 32 to 64 channels), the other levels use them as they are
                if intLevel != 2:
                    self.netFeat = torch.nn.Sequential()

                elif intLevel == 2:
                    self.netFeat = torch.nn.Sequential(
                        torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=1, stride=1, padding=0),
                        torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
                    )

                # end

                # the input channels are twice the feature channels of the level plus the two flow channels
                self.netMain = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=[ 0, 0, 130, 130, 194, 258, 386 ][intLevel], out_channels=128, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=64, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=32, out_channels=2, kernel_size=[ 0, 0, 7, 5, 5, 3, 3 ][intLevel], stride=1, padding=[ 0, 0, 3, 2, 2, 1, 1 ][intLevel])
                )
            # end

            def forward(self, tenOne, tenTwo, tenFeaturesOne, tenFeaturesTwo, tenFlow):
                """
                Return tenFlow plus a residual predicted by netMain. tenOne and tenTwo are not used here.
                NOTE: the concatenation below needs tenFlow to be a tensor; Network.forward always
                passes the output of the Matching module of the same level.
                """
                tenFeaturesOne = self.netFeat(tenFeaturesOne)
                tenFeaturesTwo = self.netFeat(tenFeaturesTwo)

                if tenFlow is not None:
                    tenFeaturesTwo = backwarp(tenInput=tenFeaturesTwo, tenFlow=tenFlow * self.fltBackward)
                # end

                return (tenFlow if tenFlow is not None else 0.0) + self.netMain(torch.cat([ tenFeaturesOne, tenFeaturesTwo, tenFlow ], 1))
            # end
        # end

        class Regularization(torch.nn.Module):
            """
            Replaces the flow at every pixel by a weighted combination of the flow values in its
            intUnfold x intUnfold neighbourhood, with the weights predicted per pixel, at pyramid
            level intLevel (2 to 6).
            """
            def __init__(self, intLevel):
                super().__init__()

                # factor applied to the flow before it is used for warping, one entry per level
                self.fltBackward = [ 0.0, 0.0, 10.0, 5.0, 2.5, 1.25, 0.625 ][intLevel]

                # side length of the neighbourhood whose flow values are combined, one entry per level
                self.intUnfold = [ 0, 0, 7, 5, 5, 3, 3 ][intLevel]

                # levels below 5 first map their features to 128 channels with a 1x1 convolution
                if intLevel >= 5:
                    self.netFeat = torch.nn.Sequential()

                elif intLevel < 5:
                    self.netFeat = torch.nn.Sequential(
                        torch.nn.Conv2d(in_channels=[ 0, 0, 32, 64, 96, 128, 192 ][intLevel], out_channels=128, kernel_size=1, stride=1, padding=0),
                        torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
                    )

                # end

                # the input is one difference channel, two flow channels and the (transformed) features
                self.netMain = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=[ 0, 0, 131, 131, 131, 131, 195 ][intLevel], out_channels=128, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=64, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1)
                )

                # one output channel per neighbourhood position (intUnfold squared); levels 5 and 6 use a
                # single square convolution, the lower levels a vertical followed by a horizontal one
                if intLevel >= 5:
                    self.netDist = torch.nn.Sequential(
                        torch.nn.Conv2d(in_channels=32, out_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], kernel_size=[ 0, 0, 7, 5, 5, 3, 3 ][intLevel], stride=1, padding=[ 0, 0, 3, 2, 2, 1, 1 ][intLevel])
                    )

                elif intLevel < 5:
                    self.netDist = torch.nn.Sequential(
                        torch.nn.Conv2d(in_channels=32, out_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], kernel_size=([ 0, 0, 7, 5, 5, 3, 3 ][intLevel], 1), stride=1, padding=([ 0, 0, 3, 2, 2, 1, 1 ][intLevel], 0)),
                        torch.nn.Conv2d(in_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], out_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], kernel_size=(1, [ 0, 0, 7, 5, 5, 3, 3 ][intLevel]), stride=1, padding=(0, [ 0, 0, 3, 2, 2, 1, 1 ][intLevel]))
                    )

                # end

                self.netScaleX = torch.nn.Conv2d(in_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], out_channels=1, kernel_size=1, stride=1, padding=0)
                self.netScaleY = torch.nn.Conv2d(in_channels=[ 0, 0, 49, 25, 25, 9, 9 ][intLevel], out_channels=1, kernel_size=1, stride=1, padding=0)
            # end

            def forward(self, tenOne, tenTwo, tenFeaturesOne, tenFeaturesTwo, tenFlow):
                """Return the regularized two-channel flow; tenFeaturesTwo is not used here."""
                # per-pixel magnitude of the difference between the first frame and the warped second frame, excluded from the graph
                tenDifference = ((tenOne - backwarp(tenInput=tenTwo, tenFlow=tenFlow * self.fltBackward)) ** 2).sum(1, True).sqrt().detach()

                # the flow is fed to the network with its per-sample spatial mean removed
                tenDist = self.netDist(self.netMain(torch.cat([ tenDifference, tenFlow - tenFlow.view(tenFlow.shape[0], 2, -1).mean(2, True).view(tenFlow.shape[0], 2, 1, 1), self.netFeat(tenFeaturesOne) ], 1)))
                # turn the outputs into positive weights; subtracting the channel-wise maximum keeps the exponential at or below one
                tenDist = (tenDist ** 2).neg()
                tenDist = (tenDist - tenDist.max(1, True)[0]).exp()

                # normalisation factor: reciprocal of the summed weights at every pixel
                tenDivisor = tenDist.sum(1, True).reciprocal()

                # unfold gathers the neighbourhood of every pixel into the channel dimension so that it lines up with the weights
                tenScaleX = self.netScaleX(tenDist * torch.nn.functional.unfold(input=tenFlow[:, 0:1, :, :], kernel_size=self.intUnfold, stride=1, padding=int((self.intUnfold - 1) / 2)).view_as(tenDist)) * tenDivisor
                tenScaleY = self.netScaleY(tenDist * torch.nn.functional.unfold(input=tenFlow[:, 1:2, :, :], kernel_size=self.intUnfold, stride=1, padding=int((self.intUnfold - 1) / 2)).view_as(tenDist)) * tenDivisor

                return torch.cat([ tenScaleX, tenScaleY ], 1)
            # end
        # end

        # index 0 of each list belongs to level 2 and index -1 to level 6
        self.netFeatures = Features()
        self.netMatching = torch.nn.ModuleList([ Matching(intLevel) for intLevel in [ 2, 3, 4, 5, 6 ] ])
        self.netSubpixel = torch.nn.ModuleList([ Subpixel(intLevel) for intLevel in [ 2, 3, 4, 5, 6 ] ])
        self.netRegularization = torch.nn.ModuleList([ Regularization(intLevel) for intLevel in [ 2, 3, 4, 5, 6 ] ])

        # the weights are fetched over the network; 'module' in the stored keys is renamed to 'net' to match the attribute names used here
        self.load_state_dict({ strKey.replace('module', 'net'): tenWeight for strKey, tenWeight in torch.hub.load_state_dict_from_url(url='http://content.sniklaus.com/github/pytorch-liteflownet/network-' + arguments_strModel + '.pytorch').items() })
        # self.load_state_dict(torch.load('./liteflownet/network-default.pth'))
    # end

    def forward(self, tenOne, tenTwo):
        """
        Estimate the flow between two batches of three-channel frames.

        NOTE: tenOne and tenTwo are modified in place, a constant is subtracted from each of
        their channels. Returns the flow of the last processed level multiplied by 20.0.
        """
        tenOne[:, 0, :, :] = tenOne[:, 0, :, :] - 0.411618
        tenOne[:, 1, :, :] = tenOne[:, 1, :, :] - 0.434631
        tenOne[:, 2, :, :] = tenOne[:, 2, :, :] - 0.454253

        tenTwo[:, 0, :, :] = tenTwo[:, 0, :, :] - 0.410782
        tenTwo[:, 1, :, :] = tenTwo[:, 1, :, :] - 0.433645
        tenTwo[:, 2, :, :] = tenTwo[:, 2, :, :] - 0.452793

        tenFeaturesOne = self.netFeatures(tenOne)
        tenFeaturesTwo = self.netFeatures(tenTwo)

        tenOne = [ tenOne ]
        tenTwo = [ tenTwo ]

        # image pyramid: each entry is the previous one resized to the spatial size of the features of that level
        for intLevel in [ 1, 2, 3, 4, 5 ]:
            tenOne.append(torch.nn.functional.interpolate(input=tenOne[-1], size=(tenFeaturesOne[intLevel].shape[2], tenFeaturesOne[intLevel].shape[3]), mode='bilinear', align_corners=False))
            tenTwo.append(torch.nn.functional.interpolate(input=tenTwo[-1], size=(tenFeaturesTwo[intLevel].shape[2], tenFeaturesTwo[intLevel].shape[3]), mode='bilinear', align_corners=False))
        # end

        tenFlow = None

        # negative indices walk the lists from their last (coarsest) entry towards finer ones
        for intLevel in [ -1, -2, -3, -4, -5 ]:
            tenFlow = self.netMatching[intLevel](tenOne[intLevel], tenTwo[intLevel], tenFeaturesOne[intLevel], tenFeaturesTwo[intLevel], tenFlow)
            tenFlow = self.netSubpixel[intLevel](tenOne[intLevel], tenTwo[intLevel], tenFeaturesOne[intLevel], tenFeaturesTwo[intLevel], tenFlow)
            tenFlow = self.netRegularization[intLevel](tenOne[intLevel], tenTwo[intLevel], tenFeaturesOne[intLevel], tenFeaturesTwo[intLevel], tenFlow)
        # end

        return tenFlow * 20.0
    # end
# end

netNetwork = None

##########################################################

def estimate(tenOne, tenTwo):
    """
    Estimate the flow from tenOne to tenTwo with the shared, lazily created network.

    Both frames must agree in their second and third dimension and are viewed as
    (1, 3, height, width). They are resized so that both sides are multiples of 32,
    run through the network, and the flow is resized back and rescaled accordingly.

    Returns the flow of the single sample as a CPU tensor.

    NOTE: the original code carried commented-out assertions for a frame size of 1024x436
    together with the remark that there is no guarantee for correctness at other sizes.
    """
    global netNetwork

    # the network is built once, on first use, and then reused by every later call
    if netNetwork is None:
        netNetwork = Network().cuda().eval()
    # end

    assert(tenOne.shape[1] == tenTwo.shape[1])
    assert(tenOne.shape[2] == tenTwo.shape[2])

    intHeight, intWidth = tenOne.shape[1], tenOne.shape[2]

    # round both sides up to the next multiple of 32
    intPaddedWidth = int(math.floor(math.ceil(intWidth / 32.0) * 32.0))
    intPaddedHeight = int(math.floor(math.ceil(intHeight / 32.0) * 32.0))

    tenBatchOne = tenOne.cuda().view(1, 3, intHeight, intWidth)
    tenBatchTwo = tenTwo.cuda().view(1, 3, intHeight, intWidth)

    tenResizedOne, tenResizedTwo = [
        torch.nn.functional.interpolate(input=tenBatch, size=(intPaddedHeight, intPaddedWidth), mode='bilinear', align_corners=False)
        for tenBatch in (tenBatchOne, tenBatchTwo)
    ]

    tenRawFlow = netNetwork(tenResizedOne, tenResizedTwo)
    tenFlow = torch.nn.functional.interpolate(input=tenRawFlow, size=(intHeight, intWidth), mode='bilinear', align_corners=False)

    # the displacements were estimated at the resized resolution, scale them back to the original one
    tenFlow[:, 0, :, :] *= float(intWidth) / float(intPaddedWidth)
    tenFlow[:, 1, :, :] *= float(intHeight) / float(intPaddedHeight)

    return tenFlow[0, :, :, :].cpu()
# end

##########################################################

if __name__ == '__main__':
    # load both frames, reverse their channel order (RGB to BGR for a standard RGB image - TODO confirm
    # the expected input format), move the channels to the front and scale the values by 1 / 255
    tenOne = torch.FloatTensor(numpy.ascontiguousarray(numpy.array(PIL.Image.open(arguments_strOne))[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0)))
    tenTwo = torch.FloatTensor(numpy.ascontiguousarray(numpy.array(PIL.Image.open(arguments_strTwo))[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0)))

    tenOutput = estimate(tenOne, tenTwo)

    # the context manager closes the file even if one of the writes raises
    with open(arguments_strOut, 'wb') as objOutput:
        # four header bytes (ASCII 'PIEH'), then width and height as int32, then the flow as float32 in (height, width, 2) order
        numpy.array([ 80, 73, 69, 72 ], numpy.uint8).tofile(objOutput)
        numpy.array([ tenOutput.shape[2], tenOutput.shape[1] ], numpy.int32).tofile(objOutput)
        numpy.array(tenOutput.numpy().transpose(1, 2, 0), numpy.float32).tofile(objOutput)
    # end
# end

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/losses/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/losses/loss.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class Loss(nn.Module):
    """
    Base class of the losses: selects the keyword arguments a loss is interested in, optionally
    renames some of them, calls ``_forward`` and scales the result by ``loss_weight``.

    Subclasses implement ``_forward`` with the selected names as parameters.
    """
    def __init__(self, loss_weight, keys, mapping=None) -> None:
        '''
            loss_weight: factor the value returned by ``_forward`` is multiplied with.
            keys: names of the keyword arguments that are passed on to ``_forward``.
            mapping: map the kwargs keys into desired ones. Entries whose target name is
                not contained in ``keys`` are dropped.
        '''
        super().__init__()
        self.loss_weight = loss_weight
        self.keys = keys
        self.mapping = mapping
        if isinstance(mapping, dict):
            # iterate over (source, target) pairs; iterating the dict itself only yields its keys
            self.mapping = {k: v for k, v in mapping.items() if v in keys}

    
    def forward(self, **kwargs):
        '''
            Pass the arguments named in ``keys`` to ``_forward``; arguments that have an entry
            in ``mapping`` are passed under their mapped name.
        '''
        params = {k: v for k, v in kwargs.items() if k in self.keys}
        if self.mapping is not None:
            for k, v in kwargs.items(): 
                if self.mapping.get(k) is not None: 
                    params[self.mapping[k]] = v 
        
        return self._forward(**params) * self.loss_weight

    def _forward(self, **kwargs):
        '''
            Compute the unweighted loss; to be overridden by subclasses.
        '''
        pass


class CharbonnierLoss(Loss):
    """Mean over all elements of ``sqrt(diff ** 2 + 1e-6)`` between prediction and target."""

    def __init__(self, loss_weight, keys) -> None:
        super().__init__(loss_weight, keys)

    def _forward(self, imgt_pred, imgt):
        residual = imgt_pred - imgt
        # the small constant keeps the root away from zero
        smoothed = (residual ** 2 + 1e-6) ** 0.5
        return smoothed.mean()


class AdaCharbonnierLoss(Loss):
    """
    Charbonnier-style loss whose exponent and smoothing constant are derived from ``weight``:
    the exponent is ``weight / 2`` and the constant is ``10 ** (-(10 * weight - 1) / 3)``.
    """

    def __init__(self, loss_weight, keys) -> None:
        super().__init__(loss_weight, keys)

    def _forward(self, imgt_pred, imgt, weight):
        exponent = weight / 2
        smoothing = 10 ** (-(10 * weight - 1) / 3)

        residual = imgt_pred - imgt
        return ((residual ** 2 + smoothing ** 2) ** exponent).mean()
  
  
class TernaryLoss(Loss):
    """
    Compares two images through the normalised differences between every pixel of a
    ``patch_size`` x ``patch_size`` neighbourhood and its centre, computed on the channel mean.
    """

    def __init__(self, loss_weight, keys, patch_size=7):
        super().__init__(loss_weight, keys)
        self.patch_size = patch_size
        num_offsets = patch_size * patch_size
        # one-hot kernels: output channel i picks the i-th pixel of the neighbourhood
        one_hot = np.eye(num_offsets).reshape((patch_size, patch_size, 1, num_offsets))
        self.w = torch.tensor(np.transpose(one_hot, (3, 2, 0, 1)), dtype=torch.float32)

    def transform(self, tensor):
        """Return the normalised neighbourhood differences of the channel mean of ``tensor``."""
        self.w = self.w.to(tensor.device)
        gray = tensor.mean(dim=1, keepdim=True)
        neighbours = F.conv2d(gray, self.w, padding=self.patch_size//2, bias=None)
        offsets = neighbours - gray
        return offsets / torch.sqrt(0.81 + offsets ** 2)

    def valid_mask(self, tensor):
        """Return a single-channel mask that is one in the interior and zero on a border of half the patch size."""
        border = self.patch_size//2
        batch, _, height, width = tensor.size()
        interior = torch.ones(batch, 1, height - 2 * border, width - 2 * border).type_as(tensor)
        return F.pad(interior, [border] * 4)

    def _forward(self, imgt_pred, imgt):
        predicted = self.transform(imgt_pred)
        # no gradient is propagated through the target
        target = self.transform(imgt).detach()
        delta = predicted - target
        distance = (delta ** 2 / (0.1 + delta ** 2)).mean(dim=1, keepdim=True)
        return (distance * self.valid_mask(imgt_pred)).mean()
 

class GeometryLoss(Loss):
    """
    Sums, over pairs of feature maps, a distance between the normalised differences of every
    pixel of a ``patch_size`` x ``patch_size`` neighbourhood and its centre, computed per channel.
    """

    def __init__(self, loss_weight, keys, patch_size=3):
        super().__init__(loss_weight, keys)
        self.patch_size = patch_size
        num_offsets = patch_size * patch_size
        # one-hot kernels: output channel i picks the i-th pixel of the neighbourhood
        one_hot = np.eye(num_offsets).reshape((patch_size, patch_size, 1, num_offsets))
        self.w = torch.tensor(np.transpose(one_hot, (3, 2, 0, 1))).float()

    def transform(self, tensor):
        """Return the normalised neighbourhood differences of every channel of ``tensor``."""
        batch, channels, height, width = tensor.size()
        self.w = self.w.to(tensor.device)
        # fold the channels into the batch so that every channel is handled on its own
        planes = tensor.reshape(batch*channels, 1, height, width)
        neighbours = F.conv2d(planes, self.w, padding=self.patch_size // 2, bias=None)
        offsets = (neighbours - planes).reshape(batch, channels*(self.patch_size ** 2), height, width)
        return offsets / torch.sqrt(0.81 + offsets ** 2)

    def valid_mask(self, tensor):
        """Return a single-channel mask that is one in the interior and zero on a border of half the patch size."""
        border = self.patch_size // 2
        batch, _, height, width = tensor.size()
        interior = torch.ones(batch, 1, height - 2 * border, width - 2 * border).type_as(tensor)
        return F.pad(interior, [border] * 4)

    def _forward(self, ft_pred, ft_gt):
        total = 0.
        for predicted, target in zip(ft_pred, ft_gt):
            delta = self.transform(predicted) - self.transform(target)
            distance = (delta ** 2 / (0.1 + delta ** 2)).mean(dim=1, keepdim=True)
            total = total + (distance * self.valid_mask(predicted)).mean()
        return total
    

class IFRFlowLoss(Loss):
    """
    Adaptive Charbonnier loss between upsampled flow predictions and a reference flow.

    ``flow`` holds two flows in its channels (0:2 and 2:4). The predictions at index 0 only
    provide the per-pixel robustness weights; the predictions at index ``lvl >= 1`` are
    upsampled by ``2 ** lvl`` and compared against the reference.
    """

    def __init__(self, loss_weight, keys, beta=0.3) -> None:
        super().__init__(loss_weight, keys)
        self.beta = beta
        self.ada_cb_loss = AdaCharbonnierLoss(1.0, ['imgt_pred', 'imgt', 'weight'])

    def _forward(self, flow0_pred, flow1_pred, flow):
        weight0 = self.get_robust_weight(flow0_pred[0], flow[:, 0:2])
        weight1 = self.get_robust_weight(flow1_pred[0], flow[:, 2:4])

        total = 0
        for lvl in range(1, len(flow0_pred)):
            scale_factor = 2**lvl
            total = total + self.ada_cb_loss(
                imgt_pred=self.resize(flow0_pred[lvl], scale_factor),
                imgt=flow[:, 0:2],
                weight=weight0,
            )
            total = total + self.ada_cb_loss(
                imgt_pred=self.resize(flow1_pred[lvl], scale_factor),
                imgt=flow[:, 2:4],
                weight=weight1,
            )
        return total

    def resize(self, x, scale_factor):
        """Upsample ``x`` bilinearly and multiply its values by the same factor."""
        upsampled = F.interpolate(x, scale_factor=scale_factor, mode="bilinear", align_corners=False)
        return scale_factor * upsampled

    def get_robust_weight(self, flow_pred, flow_gt):
        """Return ``exp(-beta * epe)`` where ``epe`` is the per-pixel distance between the detached prediction and the reference."""
        squared_error = ((flow_pred.detach() - flow_gt) ** 2).sum(dim=1, keepdim=True)
        epe = squared_error ** 0.5
        return torch.exp(-self.beta * epe)


class MultipleFlowLoss(Loss):
    """
    Adaptive Charbonnier loss between upsampled flow predictions and a reference flow, with the
    robustness weights taken from a prediction that holds several flows per pixel.

    ``flow`` holds two flows in its channels (0:2 and 2:4). The predictions at index 0 only
    provide the per-pixel robustness weights; the predictions at index ``lvl >= 1`` are
    upsampled by ``2 ** lvl`` and compared against the reference.
    """

    def __init__(self, loss_weight, keys, beta=0.3) -> None:
        super().__init__(loss_weight, keys)
        self.beta = beta
        self.ada_cb_loss = AdaCharbonnierLoss(1.0, ['imgt_pred', 'imgt', 'weight'])

    def _forward(self, flow0_pred, flow1_pred, flow):
        weight0 = self.get_mutli_flow_robust_weight(flow0_pred[0], flow[:, 0:2])
        weight1 = self.get_mutli_flow_robust_weight(flow1_pred[0], flow[:, 2:4])

        total = 0
        for lvl in range(1, len(flow0_pred)):
            scale_factor = 2**lvl
            total = total + self.ada_cb_loss(
                imgt_pred=self.resize(flow0_pred[lvl], scale_factor),
                imgt=flow[:, 0:2],
                weight=weight0,
            )
            total = total + self.ada_cb_loss(
                imgt_pred=self.resize(flow1_pred[lvl], scale_factor),
                imgt=flow[:, 2:4],
                weight=weight1,
            )
        return total

    def resize(self, x, scale_factor):
        """Upsample ``x`` bilinearly and multiply its values by the same factor."""
        upsampled = F.interpolate(x, scale_factor=scale_factor, mode="bilinear", align_corners=False)
        return scale_factor * upsampled

    def get_mutli_flow_robust_weight(self, flow_pred, flow_gt):
        """
        Return ``exp(-beta * epe)`` where ``epe`` is, per pixel, the largest distance between
        any of the ``num_flows`` detached predictions and the reference.
        """
        b, num_flows, c, h, w = flow_pred.shape
        grouped_shape = (b, num_flows, c, h, w)
        predicted = flow_pred.view(*grouped_shape)
        # the reference is repeated along the channels so that every predicted flow gets its own copy
        reference = flow_gt.repeat(1, num_flows, 1, 1).view(*grouped_shape)
        squared_error = ((predicted.detach() - reference) ** 2).sum(dim=2, keepdim=True)
        epe = squared_error.max(1)[0] ** 0.5
        return torch.exp(-self.beta * epe)

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/metrics/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/metrics/psnr_ssim.py
================================================
import torch
import torch.nn.functional as F
from math import exp

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def gaussian(window_size, sigma):
    """Return a 1D Gaussian of length ``window_size`` centred at ``window_size // 2``, normalised to sum to one."""
    center = window_size // 2
    denominator = float(2 * sigma ** 2)
    weights = []
    for position in range(window_size):
        weights.append(exp(-(position - center) ** 2 / denominator))
    kernel = torch.Tensor(weights)
    return kernel / kernel.sum()


def create_window(window_size, channel=1):
    """
    Return a 2D Gaussian window (sigma 1.5) of shape (channel, 1, window_size, window_size)
    on the module-level ``device``, the same kernel being repeated for every channel.
    """
    column = gaussian(window_size, 1.5).unsqueeze(1)
    # the outer product of the 1D kernel with itself gives the 2D kernel
    kernel_2d = column.mm(column.t()).float()
    kernel_2d = kernel_2d.unsqueeze(0).unsqueeze(0).to(device)
    return kernel_2d.expand(channel, 1, window_size, window_size).contiguous()


def create_window_3d(window_size, channel=1):
    """
    Return a 3D Gaussian window (sigma 1.5) of shape
    (1, channel, window_size, window_size, window_size) on the module-level ``device``.
    """
    column = gaussian(window_size, 1.5).unsqueeze(1)
    row = column.t()
    kernel_2d = column.mm(row)
    # multiplying every entry of the 2D kernel with the 1D kernel adds the third dimension
    kernel_3d = kernel_2d.unsqueeze(2) @ row
    expanded = kernel_3d.expand(1, channel, window_size, window_size, window_size)
    return expanded.contiguous().to(device)


def ssim(img1, img2, window_size=11, window=None, size_average=True, full=False, val_range=None):
    """
    Structural similarity between two batches of images of shape (N, C, H, W).

    window_size: side length of the Gaussian window created when ``window`` is None; it is
        capped by the image height and width.
    window: optional precomputed window of shape (C, 1, k, k).
    size_average: return the mean over everything if True, otherwise one value per sample.
    full: additionally return the mean of the contrast-structure term.
    val_range: dynamic range of the values; if None it is guessed from ``img1``.
    """
    if val_range is None:
        # guess the range from the first image: 0..255 or 0..1 at the top, -1 or 0 at the bottom
        upper = 255 if torch.max(img1) > 128 else 1
        lower = -1 if torch.min(img1) < -0.5 else 0
        L = upper - lower
    else:
        L = val_range

    (_, channel, height, width) = img1.size()
    if window is None:
        window = create_window(min(window_size, height, width), channel=channel).to(img1.device)

    def local_mean(tensor):
        # replicate the border by five pixels on every side, then filter every channel separately
        padded = F.pad(tensor, (5, 5, 5, 5), mode='replicate')
        return F.conv2d(padded, window, padding=0, groups=channel)

    mu1 = local_mean(img1)
    mu2 = local_mean(img2)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    sigma1_sq = local_mean(img1 * img1) - mu1_sq
    sigma2_sq = local_mean(img2 * img2) - mu2_sq
    sigma12 = local_mean(img1 * img2) - mu1_mu2

    C1 = (0.01 * L) ** 2
    C2 = (0.03 * L) ** 2

    contrast_num = 2.0 * sigma12 + C2
    contrast_den = sigma1_sq + sigma2_sq + C2
    cs = torch.mean(contrast_num / contrast_den)

    ssim_map = ((2 * mu1_mu2 + C1) * contrast_num) / ((mu1_sq + mu2_sq + C1) * contrast_den)

    ret = ssim_map.mean() if size_average else ssim_map.mean(1).mean(1).mean(1)

    return (ret, cs) if full else ret


def calculate_ssim(img1, img2, window_size=11, window=None, size_average=True, full=False, val_range=None):
    """
    Structural similarity computed with a 3D window over the channel, height and width of
    two batches of shape (N, C, H, W).

    window_size: side length of the Gaussian window created when ``window`` is None; it is
        capped by the image height and width.
    window: optional precomputed window of shape (1, 1, k, k, k).
    size_average: reduce to the mean over everything if True, otherwise apply three means over dimension 1.
    full: return the tensors ``(ret, cs)`` instead of a numpy value.
    val_range: dynamic range of the values; if None it is guessed from ``img1``.

    Returns a numpy array unless ``full`` is set.
    """
    if val_range is None:
        # guess the range from the first image: 0..255 or 0..1 at the top, -1 or 0 at the bottom
        upper = 255 if torch.max(img1) > 128 else 1
        lower = -1 if torch.min(img1) < -0.5 else 0
        L = upper - lower
    else:
        L = val_range

    (_, _, height, width) = img1.size()
    if window is None:
        window = create_window_3d(min(window_size, height, width), channel=1).to(img1.device)

    # add a singleton channel so that the original channels become the depth of a 3D volume
    img1 = img1.unsqueeze(1)
    img2 = img2.unsqueeze(1)

    def local_mean(volume):
        # replicate the border by five elements on every side of all three dimensions, then filter
        padded = F.pad(volume, (5, 5, 5, 5, 5, 5), mode='replicate')
        return F.conv3d(padded, window, padding=0, groups=1)

    mu1 = local_mean(img1)
    mu2 = local_mean(img2)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    sigma1_sq = local_mean(img1 * img1) - mu1_sq
    sigma2_sq = local_mean(img2 * img2) - mu2_sq
    sigma12 = local_mean(img1 * img2) - mu1_mu2

    C1 = (0.01 * L) ** 2
    C2 = (0.03 * L) ** 2

    contrast_num = 2.0 * sigma12 + C2
    contrast_den = sigma1_sq + sigma2_sq + C2
    cs = torch.mean(contrast_num / contrast_den)

    ssim_map = ((2 * mu1_mu2 + C1) * contrast_num) / ((mu1_sq + mu2_sq + C1) * contrast_den)

    ret = ssim_map.mean() if size_average else ssim_map.mean(1).mean(1).mean(1)

    if full:
        return ret, cs
    return ret.detach().cpu().numpy()


def calculate_psnr(img1, img2):
    """Return ``-10 * log10`` of the mean squared difference of the two tensors as a numpy value."""
    difference = img1 - img2
    mean_squared_error = (difference * difference).mean()
    psnr = -10 * torch.log10(mean_squared_error)
    return psnr.detach().cpu().numpy()


def calculate_ie(img1, img2):
    """Return the mean absolute difference of the two tensors after scaling by 255 and rounding, as a numpy value."""
    quantized1 = torch.round(img1 * 255.0)
    quantized2 = torch.round(img2 * 255.0)
    ie = torch.abs(quantized1 - quantized2).mean()
    return ie.detach().cpu().numpy()


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/networks/AMT-G.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from vbench.third_party.amt.networks.blocks.raft import (
    coords_grid,
    BasicUpdateBlock, BidirCorrBlock
)
from vbench.third_party.amt.networks.blocks.feat_enc import (
    LargeEncoder
)
from vbench.third_party.amt.networks.blocks.ifrnet import (
    resize,
    Encoder,
    InitDecoder,
    IntermediateDecoder
)
from vbench.third_party.amt.networks.blocks.multi_flow import (
    multi_flow_combine,
    MultiFlowDecoder
)


class Model(nn.Module):
    """AMT-G frame-interpolation network.

    Combines a correlation feature encoder (``LargeEncoder``), a four-level
    pyramid encoder, three coarse-to-fine decoders with correlation-guided
    update blocks (each of the two finer levels has a low-resolution and a
    high-resolution update), and a multi-flow decoder whose predictions are
    merged by ``comb_block``.
    """

    def __init__(self,
                 corr_radius=3,
                 corr_lvls=4,
                 num_flows=5,
                 channels=[84, 96, 112, 128],
                 skip_channels=84):
        """Build the sub-modules.

        Args:
            corr_radius: lookup radius passed to the correlation block and
                to every update block.
            corr_lvls: number of correlation pyramid levels.
            num_flows: number of flow pairs predicted by the last decoder.
            channels: feature widths of the four pyramid levels, finest
                first. The default list is only read here, never mutated.
            skip_channels: width of the skip connections in the decoders.
        """
        super(Model, self).__init__()
        self.radius = corr_radius
        self.corr_levels = corr_lvls
        self.num_flows = num_flows

        self.feat_encoder = LargeEncoder(output_dim=128, norm_fn='instance', dropout=0.)
        self.encoder = Encoder(channels, large=True)
        self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels)
        self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels)
        self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels)
        self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows)

        # Each update block refines the feature produced by the decoder of the
        # same level, so its width follows ``channels`` instead of being
        # hard-coded to the default values (112 / 96 / 84).
        self.update4 = self._get_updateblock(channels[2], None)
        self.update3_low = self._get_updateblock(channels[1], 2.0)
        self.update2_low = self._get_updateblock(channels[0], 4.0)

        self.update3_high = self._get_updateblock(channels[1], None)
        self.update2_high = self._get_updateblock(channels[0], None)

        self.comb_block = nn.Sequential(
            nn.Conv2d(3*self.num_flows, 6*self.num_flows, 7, 1, 3),
            nn.PReLU(6*self.num_flows),
            nn.Conv2d(6*self.num_flows, 3, 7, 1, 3),
        )

    def _get_updateblock(self, cdim, scale_factor=None):
        """Create a ``BasicUpdateBlock`` for features of width ``cdim``."""
        return BasicUpdateBlock(cdim=cdim, hidden_dim=192, flow_dim=64,
                                corr_dim=256, corr_dim2=192, fc_dim=188,
                                scale_factor=scale_factor, corr_levels=self.corr_levels,
                                radius=self.radius)

    def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1):
        """Look up bidirectional correlations for the current flow estimates.

        The flows are optionally brought down to the correlation resolution
        (``downsample``) and rescaled by ``1 / embt`` and ``1 / (1 - embt)``.
        ``embt`` must therefore lie strictly between 0 and 1, otherwise the
        scales become infinite.

        Returns:
            ``(corr, flow)``: both lookups and both (possibly downsampled)
            flows, each concatenated along the channel dimension.
        """
        # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0
        # based on linear assumption
        t1_scale = 1. / embt
        t0_scale = 1. / (1. - embt)
        if downsample != 1:
            inv = 1 / downsample
            # Flow vectors shrink together with the spatial resolution.
            flow0 = inv * resize(flow0, scale_factor=inv)
            flow1 = inv * resize(flow1, scale_factor=inv)

        corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale)
        corr = torch.cat([corr0, corr1], dim=1)
        flow = torch.cat([flow0, flow1], dim=1)
        return corr, flow

    def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs):
        """Predict the intermediate frame between ``img0`` and ``img1``.

        Args:
            img0, img1: the two input frames (presumably ``(B, 3, H, W)`` in
                ``[0, 1]`` -- the prediction is clamped to that range).
            embt: interpolation time; see ``_corr_scale_lookup`` for the
                valid range.
            scale_factor: optional rescaling applied to the inputs before
                flow estimation; predictions are resized back afterwards.
            eval: when true only ``imgt_pred`` is returned.

        Returns:
            A dict with ``imgt_pred`` and, unless ``eval`` is set, the
            per-level flows (``flow0_pred`` / ``flow1_pred``) and the
            intermediate features (``ft_pred``).
        """
        # Per-sample mean over both frames; removed here and handed to
        # ``multi_flow_combine`` together with the centred frames.
        mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True)
        img0 = img0 - mean_
        img1 = img1 - mean_
        img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0
        img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1
        b, _, h, w = img0_.shape
        coord = coords_grid(b, h // 8, w // 8, img0.device)

        fmap0, fmap1 = self.feat_encoder([img0_, img1_]) # [1, 128, H//8, W//8]
        corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels)

        # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4]
        # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16]
        f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
        f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)

        ######################################### the 4th decoder #########################################
        up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt)
        corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord,
                                                 up_flow0_4, up_flow1_4,
                                                 embt, downsample=1)

        # residue update with lookup corr
        delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4)
        delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1)
        up_flow0_4 = up_flow0_4 + delta_flow0_4
        up_flow1_4 = up_flow1_4 + delta_flow1_4
        ft_3_ = ft_3_ + delta_ft_3_

        ######################################### the 3rd decoder #########################################
        up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
        corr_3, flow_3 = self._corr_scale_lookup(corr_fn,
                                                 coord, up_flow0_3, up_flow1_3,
                                                 embt, downsample=2)

        # residue update with lookup corr
        delta_ft_2_, delta_flow_3 = self.update3_low(ft_2_, flow_3, corr_3)
        delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1)
        up_flow0_3 = up_flow0_3 + delta_flow0_3
        up_flow1_3 = up_flow1_3 + delta_flow1_3
        ft_2_ = ft_2_ + delta_ft_2_

        # residue update with lookup corr (hr)
        corr_3 = resize(corr_3, scale_factor=2.0)
        up_flow_3 = torch.cat([up_flow0_3, up_flow1_3], dim=1)
        delta_ft_2_, delta_up_flow_3 = self.update3_high(ft_2_, up_flow_3, corr_3)
        ft_2_ += delta_ft_2_
        up_flow0_3 += delta_up_flow_3[:, 0:2]
        up_flow1_3 += delta_up_flow_3[:, 2:4]

        ######################################### the 2nd decoder #########################################
        up_flow0_2, up_flow1_2, ft_1_  = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
        corr_2, flow_2 = self._corr_scale_lookup(corr_fn,
                                                 coord, up_flow0_2, up_flow1_2,
                                                 embt, downsample=4)

        # residue update with lookup corr
        delta_ft_1_, delta_flow_2 = self.update2_low(ft_1_, flow_2, corr_2)
        delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1)
        up_flow0_2 = up_flow0_2 + delta_flow0_2
        up_flow1_2 = up_flow1_2 + delta_flow1_2
        ft_1_ = ft_1_ + delta_ft_1_

        # residue update with lookup corr (hr)
        corr_2 = resize(corr_2, scale_factor=4.0)
        up_flow_2 = torch.cat([up_flow0_2, up_flow1_2], dim=1)
        delta_ft_1_, delta_up_flow_2 = self.update2_high(ft_1_, up_flow_2, corr_2)
        ft_1_ += delta_ft_1_
        up_flow0_2 += delta_up_flow_2[:, 0:2]
        up_flow1_2 += delta_up_flow_2[:, 2:4]

        ######################################### the 1st decoder #########################################
        up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)

        if scale_factor != 1.0:
            # Undo the input rescaling: flow magnitudes scale with resolution.
            up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            mask = resize(mask, scale_factor=(1.0/scale_factor))
            img_res = resize(img_res, scale_factor=(1.0/scale_factor))

        # Merge multiple predictions
        imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, up_flow1_1,
                                                                        mask, img_res, mean_)
        imgt_pred = torch.clamp(imgt_pred, 0, 1)

        if eval:
            return  { 'imgt_pred': imgt_pred, }
        else:
            # Use the flows' own spatial size: after the resize above it no
            # longer equals the scaled (h, w) when scale_factor != 1.0.
            up_flow0_1 = up_flow0_1.reshape(b, self.num_flows, 2, *up_flow0_1.shape[-2:])
            up_flow1_1 = up_flow1_1.reshape(b, self.num_flows, 2, *up_flow1_1.shape[-2:])
            return {
                'imgt_pred': imgt_pred,
                'flow0_pred': [up_flow0_1, up_flow0_2, up_flow0_3, up_flow0_4],
                'flow1_pred': [up_flow1_1, up_flow1_2, up_flow1_3, up_flow1_4],
                'ft_pred': [ft_1_, ft_2_, ft_3_],
            }


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/networks/AMT-L.py
================================================
import torch
import torch.nn as nn
from vbench.third_party.amt.networks.blocks.raft import (
    coords_grid,
    BasicUpdateBlock, BidirCorrBlock
)
from vbench.third_party.amt.networks.blocks.feat_enc import (
    BasicEncoder,
)
from vbench.third_party.amt.networks.blocks.ifrnet import (
    resize,
    Encoder,
    InitDecoder,
    IntermediateDecoder
)
from vbench.third_party.amt.networks.blocks.multi_flow import (
    multi_flow_combine,
    MultiFlowDecoder
)

class Model(nn.Module):
    """AMT-L frame-interpolation network.

    Combines a correlation feature encoder (``BasicEncoder``), a four-level
    pyramid encoder, three coarse-to-fine decoders each followed by a
    correlation-guided update block, and a multi-flow decoder whose
    predictions are merged by ``comb_block``.
    """

    def __init__(self,
                 corr_radius=3,
                 corr_lvls=4,
                 num_flows=5,
                 channels=[48, 64, 72, 128],
                 skip_channels=48
                 ):
        """Build the sub-modules.

        Args:
            corr_radius: lookup radius passed to the correlation block and
                to every update block.
            corr_lvls: number of correlation pyramid levels.
            num_flows: number of flow pairs predicted by the last decoder.
            channels: feature widths of the four pyramid levels, finest
                first. The default list is only read here, never mutated.
            skip_channels: width of the skip connections in the decoders.
        """
        super(Model, self).__init__()
        self.radius = corr_radius
        self.corr_levels = corr_lvls
        self.num_flows = num_flows

        self.feat_encoder = BasicEncoder(output_dim=128, norm_fn='instance', dropout=0.)
        # Honour the ``channels`` argument (previously a hard-coded copy of
        # the default list, so custom widths were silently ignored here).
        self.encoder = Encoder(channels, large=True)

        self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels)
        self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels)
        self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels)
        self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows)

        # Each update block refines the feature produced by the decoder of the
        # same level, so its width follows ``channels`` (defaults: 72/64/48).
        self.update4 = self._get_updateblock(channels[2], None)
        self.update3 = self._get_updateblock(channels[1], 2.0)
        self.update2 = self._get_updateblock(channels[0], 4.0)

        self.comb_block = nn.Sequential(
            nn.Conv2d(3*self.num_flows, 6*self.num_flows, 7, 1, 3),
            nn.PReLU(6*self.num_flows),
            nn.Conv2d(6*self.num_flows, 3, 7, 1, 3),
        )

    def _get_updateblock(self, cdim, scale_factor=None):
        """Create a ``BasicUpdateBlock`` for features of width ``cdim``."""
        return BasicUpdateBlock(cdim=cdim, hidden_dim=128, flow_dim=48,
                                corr_dim=256, corr_dim2=160, fc_dim=124,
                                scale_factor=scale_factor, corr_levels=self.corr_levels,
                                radius=self.radius)

    def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1):
        """Look up bidirectional correlations for the current flow estimates.

        The flows are optionally brought down to the correlation resolution
        (``downsample``) and rescaled by ``1 / embt`` and ``1 / (1 - embt)``.
        ``embt`` must therefore lie strictly between 0 and 1, otherwise the
        scales become infinite.

        Returns:
            ``(corr, flow)``: both lookups and both (possibly downsampled)
            flows, each concatenated along the channel dimension.
        """
        # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0
        # based on linear assumption
        t1_scale = 1. / embt
        t0_scale = 1. / (1. - embt)
        if downsample != 1:
            inv = 1 / downsample
            # Flow vectors shrink together with the spatial resolution.
            flow0 = inv * resize(flow0, scale_factor=inv)
            flow1 = inv * resize(flow1, scale_factor=inv)

        corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale)
        corr = torch.cat([corr0, corr1], dim=1)
        flow = torch.cat([flow0, flow1], dim=1)
        return corr, flow

    def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs):
        """Predict the intermediate frame between ``img0`` and ``img1``.

        Args:
            img0, img1: the two input frames (presumably ``(B, 3, H, W)`` in
                ``[0, 1]`` -- the prediction is clamped to that range).
            embt: interpolation time; see ``_corr_scale_lookup`` for the
                valid range.
            scale_factor: optional rescaling applied to the inputs before
                flow estimation; predictions are resized back afterwards.
            eval: when true only ``imgt_pred`` is returned.

        Returns:
            A dict with ``imgt_pred`` and, unless ``eval`` is set, the
            per-level flows (``flow0_pred`` / ``flow1_pred``) and the
            intermediate features (``ft_pred``).
        """
        # Per-sample mean over both frames; removed here and handed to
        # ``multi_flow_combine`` together with the centred frames.
        mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True)
        img0 = img0 - mean_
        img1 = img1 - mean_
        img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0
        img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1
        b, _, h, w = img0_.shape
        coord = coords_grid(b, h // 8, w // 8, img0.device)

        fmap0, fmap1 = self.feat_encoder([img0_, img1_]) # [1, 128, H//8, W//8]
        corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels)

        # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4]
        # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16]
        f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
        f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)

        ######################################### the 4th decoder #########################################
        up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt)
        corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord,
                                                 up_flow0_4, up_flow1_4,
                                                 embt, downsample=1)

        # residue update with lookup corr
        delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4)
        delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1)
        up_flow0_4 = up_flow0_4 + delta_flow0_4
        up_flow1_4 = up_flow1_4 + delta_flow1_4
        ft_3_ = ft_3_ + delta_ft_3_

        ######################################### the 3rd decoder #########################################
        up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
        corr_3, flow_3 = self._corr_scale_lookup(corr_fn,
                                                 coord, up_flow0_3, up_flow1_3,
                                                 embt, downsample=2)

        # residue update with lookup corr
        delta_ft_2_, delta_flow_3 = self.update3(ft_2_, flow_3, corr_3)
        delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1)
        up_flow0_3 = up_flow0_3 + delta_flow0_3
        up_flow1_3 = up_flow1_3 + delta_flow1_3
        ft_2_ = ft_2_ + delta_ft_2_

        ######################################### the 2nd decoder #########################################
        up_flow0_2, up_flow1_2, ft_1_  = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
        corr_2, flow_2 = self._corr_scale_lookup(corr_fn,
                                                 coord, up_flow0_2, up_flow1_2,
                                                 embt, downsample=4)

        # residue update with lookup corr
        delta_ft_1_, delta_flow_2 = self.update2(ft_1_, flow_2, corr_2)
        delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1)
        up_flow0_2 = up_flow0_2 + delta_flow0_2
        up_flow1_2 = up_flow1_2 + delta_flow1_2
        ft_1_ = ft_1_ + delta_ft_1_

        ######################################### the 1st decoder #########################################
        up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)

        if scale_factor != 1.0:
            # Undo the input rescaling: flow magnitudes scale with resolution.
            up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            mask = resize(mask, scale_factor=(1.0/scale_factor))
            img_res = resize(img_res, scale_factor=(1.0/scale_factor))

        # Merge multiple predictions
        imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, up_flow1_1,
                                                                        mask, img_res, mean_)
        imgt_pred = torch.clamp(imgt_pred, 0, 1)

        if eval:
            return  { 'imgt_pred': imgt_pred, }
        else:
            # Use the flows' own spatial size: after the resize above it no
            # longer equals the scaled (h, w) when scale_factor != 1.0.
            up_flow0_1 = up_flow0_1.reshape(b, self.num_flows, 2, *up_flow0_1.shape[-2:])
            up_flow1_1 = up_flow1_1.reshape(b, self.num_flows, 2, *up_flow1_1.shape[-2:])
            return {
                'imgt_pred': imgt_pred,
                'flow0_pred': [up_flow0_1, up_flow0_2, up_flow0_3, up_flow0_4],
                'flow1_pred': [up_flow1_1, up_flow1_2, up_flow1_3, up_flow1_4],
                'ft_pred': [ft_1_, ft_2_, ft_3_],
            }
    

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/networks/AMT-S.py
================================================
import torch
import torch.nn as nn
from vbench.third_party.amt.networks.blocks.raft import (
    SmallUpdateBlock,
    coords_grid,
    BidirCorrBlock
)
from vbench.third_party.amt.networks.blocks.feat_enc import (
    SmallEncoder
)
from vbench.third_party.amt.networks.blocks.ifrnet import (
    resize,
    Encoder,
    InitDecoder,
    IntermediateDecoder
)
from vbench.third_party.amt.networks.blocks.multi_flow import (
    multi_flow_combine,
    MultiFlowDecoder
)

class Model(nn.Module):
    """AMT-S frame-interpolation network.

    Combines a correlation feature encoder (``SmallEncoder``), a four-level
    pyramid encoder, three coarse-to-fine decoders each followed by a
    correlation-guided update block, and a multi-flow decoder whose
    predictions are merged by ``comb_block``.
    """

    def __init__(self,
                 corr_radius=3,
                 corr_lvls=4,
                 num_flows=3,
                 channels=[20, 32, 44, 56],
                 skip_channels=20):
        """Build the sub-modules.

        Args:
            corr_radius: lookup radius passed to the correlation block and
                to every update block.
            corr_lvls: number of correlation pyramid levels.
            num_flows: number of flow pairs predicted by the last decoder.
            channels: feature widths of the four pyramid levels, finest
                first. The default list is stored on the instance but is
                not mutated here.
            skip_channels: width of the skip connections in the decoders.
        """
        super(Model, self).__init__()
        self.radius = corr_radius
        self.corr_levels = corr_lvls
        self.num_flows = num_flows
        self.channels = channels
        self.skip_channels = skip_channels

        self.feat_encoder = SmallEncoder(output_dim=84, norm_fn='instance', dropout=0.)
        self.encoder = Encoder(channels)

        self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels)
        self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels)
        self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels)
        self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows)

        # Each update block refines the feature produced by the decoder of the
        # same level, so its width follows ``channels`` (defaults: 44/32/20).
        self.update4 = self._get_updateblock(channels[2])
        self.update3 = self._get_updateblock(channels[1], 2)
        self.update2 = self._get_updateblock(channels[0], 4)

        self.comb_block = nn.Sequential(
            nn.Conv2d(3*num_flows, 6*num_flows, 3, 1, 1),
            nn.PReLU(6*num_flows),
            nn.Conv2d(6*num_flows, 3, 3, 1, 1),
        )

    def _get_updateblock(self, cdim, scale_factor=None):
        """Create a ``SmallUpdateBlock`` for features of width ``cdim``."""
        return SmallUpdateBlock(cdim=cdim, hidden_dim=76, flow_dim=20, corr_dim=64,
                                fc_dim=68, scale_factor=scale_factor,
                                corr_levels=self.corr_levels, radius=self.radius)

    def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1):
        """Look up bidirectional correlations for the current flow estimates.

        The flows are optionally brought down to the correlation resolution
        (``downsample``) and rescaled by ``1 / embt`` and ``1 / (1 - embt)``.
        ``embt`` must therefore lie strictly between 0 and 1, otherwise the
        scales become infinite.

        Returns:
            ``(corr, flow)``: both lookups and both (possibly downsampled)
            flows, each concatenated along the channel dimension.
        """
        # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0
        # based on linear assumption
        t1_scale = 1. / embt
        t0_scale = 1. / (1. - embt)
        if downsample != 1:
            inv = 1 / downsample
            # Flow vectors shrink together with the spatial resolution.
            flow0 = inv * resize(flow0, scale_factor=inv)
            flow1 = inv * resize(flow1, scale_factor=inv)

        corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale)
        corr = torch.cat([corr0, corr1], dim=1)
        flow = torch.cat([flow0, flow1], dim=1)
        return corr, flow

    def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs):
        """Predict the intermediate frame between ``img0`` and ``img1``.

        Args:
            img0, img1: the two input frames (presumably ``(B, 3, H, W)`` in
                ``[0, 1]`` -- the prediction is clamped to that range).
            embt: interpolation time; see ``_corr_scale_lookup`` for the
                valid range.
            scale_factor: optional rescaling applied to the inputs before
                flow estimation; predictions are resized back afterwards.
            eval: when true only ``imgt_pred`` is returned.

        Returns:
            A dict with ``imgt_pred`` and, unless ``eval`` is set, the
            per-level flows (``flow0_pred`` / ``flow1_pred``) and the
            intermediate features (``ft_pred``).
        """
        # Per-sample mean over both frames; removed here and handed to
        # ``multi_flow_combine`` together with the centred frames.
        mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True)
        img0 = img0 - mean_
        img1 = img1 - mean_
        img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0
        img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1
        b, _, h, w = img0_.shape
        coord = coords_grid(b, h // 8, w // 8, img0.device)

        # The feature encoder is built with output_dim=84 in __init__;
        # presumably the maps are [B, 84, H//8, W//8] -- TODO confirm.
        fmap0, fmap1 = self.feat_encoder([img0_, img1_])
        corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels)

        # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4]
        # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16]
        f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
        f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)

        ######################################### the 4th decoder #########################################
        up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt)
        corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord,
                                                 up_flow0_4, up_flow1_4,
                                                 embt, downsample=1)

        # residue update with lookup corr
        delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4)
        delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1)
        up_flow0_4 = up_flow0_4 + delta_flow0_4
        up_flow1_4 = up_flow1_4 + delta_flow1_4
        ft_3_ = ft_3_ + delta_ft_3_

        ######################################### the 3rd decoder #########################################
        up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
        corr_3, flow_3 = self._corr_scale_lookup(corr_fn,
                                                 coord, up_flow0_3, up_flow1_3,
                                                 embt, downsample=2)

        # residue update with lookup corr
        delta_ft_2_, delta_flow_3 = self.update3(ft_2_, flow_3, corr_3)
        delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1)
        up_flow0_3 = up_flow0_3 + delta_flow0_3
        up_flow1_3 = up_flow1_3 + delta_flow1_3
        ft_2_ = ft_2_ + delta_ft_2_

        ######################################### the 2nd decoder #########################################
        up_flow0_2, up_flow1_2, ft_1_  = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
        corr_2, flow_2 = self._corr_scale_lookup(corr_fn,
                                                 coord, up_flow0_2, up_flow1_2,
                                                 embt, downsample=4)

        # residue update with lookup corr
        delta_ft_1_, delta_flow_2 = self.update2(ft_1_, flow_2, corr_2)
        delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1)
        up_flow0_2 = up_flow0_2 + delta_flow0_2
        up_flow1_2 = up_flow1_2 + delta_flow1_2
        ft_1_ = ft_1_ + delta_ft_1_

        ######################################### the 1st decoder #########################################
        up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)

        if scale_factor != 1.0:
            # Undo the input rescaling: flow magnitudes scale with resolution.
            up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            mask = resize(mask, scale_factor=(1.0/scale_factor))
            img_res = resize(img_res, scale_factor=(1.0/scale_factor))

        # Merge multiple predictions
        imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, up_flow1_1,
                                                                        mask, img_res, mean_)
        imgt_pred = torch.clamp(imgt_pred, 0, 1)

        if eval:
            return  { 'imgt_pred': imgt_pred, }
        else:
            # Use the flows' own spatial size: after the resize above it no
            # longer equals the scaled (h, w) when scale_factor != 1.0.
            up_flow0_1 = up_flow0_1.reshape(b, self.num_flows, 2, *up_flow0_1.shape[-2:])
            up_flow1_1 = up_flow1_1.reshape(b, self.num_flows, 2, *up_flow1_1.shape[-2:])
            return {
                'imgt_pred': imgt_pred,
                'flow0_pred': [up_flow0_1, up_flow0_2, up_flow0_3, up_flow0_4],
                'flow1_pred': [up_flow1_1, up_flow1_2, up_flow1_3, up_flow1_4],
                'ft_pred': [ft_1_, ft_2_, ft_3_],
            }


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/networks/blocks/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/networks/blocks/feat_enc.py
================================================
import torch
import torch.nn as nn


class BottleneckBlock(nn.Module):
    """Residual bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions plus shortcut.

    The two inner convolutions work on ``planes // 4`` channels. The shortcut
    is the identity when the input already has the output shape; otherwise
    (``stride != 1`` or ``in_planes != planes``) it is a strided 1x1
    convolution followed by a normalisation layer.
    """

    def __init__(self, in_planes, planes, norm_fn='group', stride=1):
        """Build the block.

        Args:
            in_planes: number of input channels.
            planes: number of output channels.
            norm_fn: one of ``'group'``, ``'batch'``, ``'instance'``, ``'none'``.
            stride: stride of the 3x3 convolution (and of the shortcut).

        Raises:
            ValueError: if ``norm_fn`` is not one of the supported names.
        """
        super(BottleneckBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_planes, planes//4, kernel_size=1, padding=0)
        self.conv2 = nn.Conv2d(planes//4, planes//4, kernel_size=3, padding=1, stride=stride)
        self.conv3 = nn.Conv2d(planes//4, planes, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8
        # A projection is needed whenever the residual sum would otherwise
        # mix tensors of different shape. The original only projected for
        # stride != 1, so a channel change at stride 1 failed in forward().
        needs_projection = stride != 1 or in_planes != planes

        self.norm1 = self._make_norm(norm_fn, num_groups, planes//4)
        self.norm2 = self._make_norm(norm_fn, num_groups, planes//4)
        self.norm3 = self._make_norm(norm_fn, num_groups, planes)

        if needs_projection:
            # norm4 stays registered under its own name as well as inside
            # ``downsample`` so existing state_dict keys are preserved.
            self.norm4 = self._make_norm(norm_fn, num_groups, planes)
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4)
        else:
            self.downsample = None

    @staticmethod
    def _make_norm(norm_fn, num_groups, num_channels):
        """Return the normalisation layer selected by ``norm_fn``."""
        if norm_fn == 'group':
            return nn.GroupNorm(num_groups=num_groups, num_channels=num_channels)
        if norm_fn == 'batch':
            return nn.BatchNorm2d(num_channels)
        if norm_fn == 'instance':
            return nn.InstanceNorm2d(num_channels)
        if norm_fn == 'none':
            return nn.Sequential()
        raise ValueError(
            "norm_fn must be 'group', 'batch', 'instance' or 'none', got {!r}".format(norm_fn))

    def forward(self, x):
        """Apply the three conv/norm/ReLU stages and add the shortcut."""
        y = x
        y = self.relu(self.norm1(self.conv1(y)))
        y = self.relu(self.norm2(self.conv2(y)))
        y = self.relu(self.norm3(self.conv3(y)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x+y)


class ResidualBlock(nn.Module):
    """Basic residual block: two 3x3 convolutions plus shortcut.

    The shortcut is the identity when the input already has the output
    shape; otherwise (``stride != 1`` or ``in_planes != planes``) it is a
    strided 1x1 convolution followed by a normalisation layer.
    """

    def __init__(self, in_planes, planes, norm_fn='group', stride=1):
        """Build the block.

        Args:
            in_planes: number of input channels.
            planes: number of output channels.
            norm_fn: one of ``'group'``, ``'batch'``, ``'instance'``, ``'none'``.
            stride: stride of the first convolution (and of the shortcut).

        Raises:
            ValueError: if ``norm_fn`` is not one of the supported names.
        """
        super(ResidualBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8
        # A projection is needed whenever the residual sum would otherwise
        # mix tensors of different shape. The original only projected for
        # stride != 1, so a channel change at stride 1 failed in forward().
        needs_projection = stride != 1 or in_planes != planes

        self.norm1 = self._make_norm(norm_fn, num_groups, planes)
        self.norm2 = self._make_norm(norm_fn, num_groups, planes)

        if needs_projection:
            # norm3 stays registered under its own name as well as inside
            # ``downsample`` so existing state_dict keys are preserved.
            self.norm3 = self._make_norm(norm_fn, num_groups, planes)
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)
        else:
            self.downsample = None

    @staticmethod
    def _make_norm(norm_fn, num_groups, num_channels):
        """Return the normalisation layer selected by ``norm_fn``."""
        if norm_fn == 'group':
            return nn.GroupNorm(num_groups=num_groups, num_channels=num_channels)
        if norm_fn == 'batch':
            return nn.BatchNorm2d(num_channels)
        if norm_fn == 'instance':
            return nn.InstanceNorm2d(num_channels)
        if norm_fn == 'none':
            return nn.Sequential()
        raise ValueError(
            "norm_fn must be 'group', 'batch', 'instance' or 'none', got {!r}".format(norm_fn))

    def forward(self, x):
        """Apply the two conv/norm/ReLU stages and add the shortcut."""
        y = x
        y = self.relu(self.norm1(self.conv1(y)))
        y = self.relu(self.norm2(self.conv2(y)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x+y)


class SmallEncoder(nn.Module):
    """Small convolutional feature encoder built from ``BottleneckBlock`` stages.

    A stride-2 7x7 stem is followed by three two-block stages (32, 64 and 96
    channels; the last two have stride 2) and a 1x1 output convolution, so
    the output has ``output_dim`` channels at 1/8 of the input resolution.
    """

    def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0):
        """Build the encoder.

        Args:
            output_dim: number of output channels.
            norm_fn: one of ``'group'``, ``'batch'``, ``'instance'``, ``'none'``.
            dropout: ``Dropout2d`` probability applied to the output while
                training; ``0`` disables dropout.

        Raises:
            ValueError: if ``norm_fn`` is not one of the supported names.
        """
        super(SmallEncoder, self).__init__()
        self.norm_fn = norm_fn

        if self.norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32)

        elif self.norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(32)

        elif self.norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(32)

        elif self.norm_fn == 'none':
            self.norm1 = nn.Sequential()

        else:
            # Fail early with a clear message instead of an AttributeError
            # raised later from a missing norm layer.
            raise ValueError(
                "norm_fn must be 'group', 'batch', 'instance' or 'none', got {!r}".format(norm_fn))

        self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        # ``in_planes`` tracks the running channel count for _make_layer.
        self.in_planes = 32
        self.layer1 = self._make_layer(32,  stride=1)
        self.layer2 = self._make_layer(64, stride=2)
        self.layer3 = self._make_layer(96, stride=2)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)

        self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1)

        # Re-initialise every convolution (Kaiming normal) and reset the
        # affine parameters of norm layers that have them.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Return a stage of two bottleneck blocks ending with ``dim`` channels."""
        layer1 = BottleneckBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = BottleneckBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode a tensor, or a list/tuple of tensors in one batched pass.

        For a list or tuple the tensors are concatenated along the batch
        dimension, encoded together, and split back into a tuple with one
        entry per input (in the same order, each keeping its batch size).
        """
        # if input is list, combine batch dimension
        is_list = isinstance(x, (tuple, list))
        if is_list:
            # Remember every input's batch size so any number of tensors
            # (not just two equally sized ones) can be split back.
            split_sizes = [item.shape[0] for item in x]
            x = torch.cat(x, dim=0)

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, split_sizes, dim=0)

        return x

class BasicEncoder(nn.Module):
    """Convolutional feature encoder built from ``ResidualBlock`` stages.

    A stride-2 7x7 stem is followed by three two-block stages (64, 72 and
    128 channels; the last two have stride 2) and a 1x1 output convolution,
    so the output has ``output_dim`` channels at 1/8 of the input resolution.
    """

    def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0):
        """Build the encoder.

        Args:
            output_dim: number of output channels.
            norm_fn: one of ``'group'``, ``'batch'``, ``'instance'``, ``'none'``.
            dropout: ``Dropout2d`` probability applied to the output while
                training; ``0`` disables dropout.

        Raises:
            ValueError: if ``norm_fn`` is not one of the supported names.
        """
        super(BasicEncoder, self).__init__()
        self.norm_fn = norm_fn

        if self.norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)

        elif self.norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(64)

        elif self.norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(64)

        elif self.norm_fn == 'none':
            self.norm1 = nn.Sequential()

        else:
            # Fail early with a clear message instead of an AttributeError
            # raised later from a missing norm layer.
            raise ValueError(
                "norm_fn must be 'group', 'batch', 'instance' or 'none', got {!r}".format(norm_fn))

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        # ``in_planes`` tracks the running channel count for _make_layer.
        self.in_planes = 64
        self.layer1 = self._make_layer(64,  stride=1)
        self.layer2 = self._make_layer(72, stride=2)
        self.layer3 = self._make_layer(128, stride=2)

        # output convolution
        self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)

        # Re-initialise every convolution (Kaiming normal) and reset the
        # affine parameters of norm layers that have them.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Return a stage of two residual blocks ending with ``dim`` channels."""
        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode a tensor, or a list/tuple of tensors in one batched pass.

        For a list or tuple the tensors are concatenated along the batch
        dimension, encoded together, and split back into a tuple with one
        entry per input (in the same order, each keeping its batch size).
        """
        # if input is list, combine batch dimension
        is_list = isinstance(x, (tuple, list))
        if is_list:
            # Remember every input's batch size so any number of tensors
            # (not just two equally sized ones) can be split back.
            split_sizes = [item.shape[0] for item in x]
            x = torch.cat(x, dim=0)

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, split_sizes, dim=0)

        return x

class LargeEncoder(nn.Module):
    """Convolutional feature encoder built from four residual stages.

    The stem is a stride-2 7x7 convolution followed by the selected
    normalization and a ReLU. Four stages of two ``ResidualBlock``s each
    follow (``layer2`` and ``layer3`` are created with ``stride=2``), and a
    1x1 convolution projects the result to ``output_dim`` channels.

    Args:
        output_dim: Number of channels produced by the final 1x1 convolution.
        norm_fn: One of ``'group'``, ``'batch'``, ``'instance'`` or
            ``'none'``. Any other value leaves ``self.norm1`` undefined.
        dropout: Probability for ``nn.Dropout2d`` on the output; dropout is
            only created when this is greater than zero.
    """

    def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0):
        super(LargeEncoder, self).__init__()
        self.norm_fn = norm_fn

        if self.norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)

        elif self.norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(64)

        elif self.norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(64)

        elif self.norm_fn == 'none':
            # An empty Sequential acts as an identity.
            self.norm1 = nn.Sequential()

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        # ``_make_layer`` reads and updates ``in_planes`` as stages are added.
        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(112, stride=2)
        self.layer3 = self._make_layer(160, stride=2)
        self.layer3_2 = self._make_layer(160, stride=1)

        # output convolution
        self.conv2 = nn.Conv2d(self.in_planes, output_dim, kernel_size=1)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)

        # Kaiming init for convolutions; unit scale / zero shift for norms
        # that carry affine parameters.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Return a stage of two ``ResidualBlock``s and set ``in_planes`` to ``dim``."""
        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode a tensor, or a list/tuple of tensors, into feature maps.

        Args:
            x: Either a single input tensor, or a list/tuple of tensors that
                can be concatenated along dim 0.

        Returns:
            The encoded tensor for a tensor input. For a list/tuple input, a
            tuple with one encoded tensor per input, in the same order.
        """
        # If the input is a sequence, fold it into the batch dimension and
        # remember each element's batch size so the result can be split back.
        is_list = isinstance(x, (tuple, list))
        if is_list:
            # Previously the split was hard-coded to two equally sized
            # chunks; using the real sizes supports any number of inputs.
            batch_sizes = [item.shape[0] for item in x]
            x = torch.cat(x, dim=0)

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer3_2(x)

        x = self.conv2(x)

        # Dropout is only applied while the module is in training mode.
        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, batch_sizes, dim=0)

        return x


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/networks/blocks/ifrnet.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from vbench.third_party.amt.utils.flow_utils import warp


def resize(x, scale_factor):
    """Rescale ``x`` by ``scale_factor`` using bilinear interpolation.

    Interpolation is done with ``align_corners=False``.
    """
    return F.interpolate(
        x,
        mode="bilinear",
        scale_factor=scale_factor,
        align_corners=False,
    )

def convrelu(in_channels, out_channels, kernel_size=3, stride=1, padding=1, dilation=1, groups=1, bias=True):
    """Return a ``Conv2d`` followed by a per-channel ``PReLU``.

    All arguments are forwarded to ``nn.Conv2d``; the ``PReLU`` has one
    learnable slope per output channel.
    """
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=groups,
        bias=bias,
    )
    activation = nn.PReLU(out_channels)
    return nn.Sequential(conv, activation)

class ResBlock(nn.Module):
    """Residual block that separately refines the last ``side_channels`` channels.

    The block applies five 3x3 convolutions. After ``conv1`` and again after
    ``conv3`` the trailing ``side_channels`` channels are sliced off, passed
    through their own conv + PReLU (``conv2`` / ``conv4``) and concatenated
    back behind the untouched leading channels. The input is added to the
    output of ``conv5`` before a final PReLU.

    Args:
        in_channels: Channel count of the input and the output.
        side_channels: Number of trailing channels refined separately.
        bias: Whether the convolutions carry a bias term.
    """

    def __init__(self, in_channels, side_channels, bias=True):
        super().__init__()
        self.side_channels = side_channels

        def conv_prelu(width):
            # 3x3, stride-1, shape-preserving conv followed by PReLU.
            return nn.Sequential(
                nn.Conv2d(width, width, kernel_size=3, stride=1, padding=1, bias=bias),
                nn.PReLU(width),
            )

        # Attribute names and creation order are kept so state dicts and
        # seeded initialisation stay the same.
        self.conv1 = conv_prelu(in_channels)
        self.conv2 = conv_prelu(side_channels)
        self.conv3 = conv_prelu(in_channels)
        self.conv4 = conv_prelu(side_channels)
        self.conv5 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=bias)
        self.prelu = nn.PReLU(in_channels)

    def _refine_side(self, feat, side_conv):
        """Run ``side_conv`` on the trailing side channels and re-join them."""
        keep = feat[:, :-self.side_channels, ...]
        side = side_conv(feat[:, -self.side_channels:, :, :])
        return torch.cat([keep, side], 1)

    def forward(self, x):
        hidden = self.conv1(x)
        hidden = self.conv3(self._refine_side(hidden, self.conv2))
        hidden = self.conv5(self._refine_side(hidden, self.conv4))
        return self.prelu(x + hidden)
    
class Encoder(nn.Module):
    """Feature pyramid made of stride-2 ``convrelu`` stages.

    One stage named ``pyramid{i}`` (1-based) is registered per entry of
    ``channels``. Each stage is a stride-2 ``convrelu`` followed by a
    stride-1 ``convrelu``. The first stage consumes 3 input channels.

    Args:
        channels: Output channel count of each pyramid level, in order.
        large: When true, the first stage's first convolution uses a 7x7
            kernel with padding 3 instead of 3x3 with padding 1.
    """

    def __init__(self, channels, large=False):
        super().__init__()
        self.channels = channels
        in_ch = 3
        for level, out_ch in enumerate(channels, start=1):
            if large and level == 1:
                kernel, pad = 7, 3
            else:
                kernel, pad = 3, 1
            stage = nn.Sequential(
                convrelu(in_ch, out_ch, kernel, 2, pad),
                convrelu(out_ch, out_ch, 3, 1, 1),
            )
            self.register_module(f'pyramid{level}', stage)
            in_ch = out_ch

    def forward(self, in_x):
        """Return the list of feature maps produced by each level, first level first."""
        fs = []
        feat = in_x
        for level in range(1, len(self.channels) + 1):
            feat = getattr(self, f'pyramid{level}')(feat)
            fs.append(feat)
        return fs
    
class InitDecoder(nn.Module):
    """Decoder that produces two initial flows and a feature map from two inputs.

    Args:
        in_ch: Channel count of each of the two input feature maps.
        out_ch: Channel count of the returned feature map.
        skip_ch: ``side_channels`` handed to the internal ``ResBlock``.
    """

    def __init__(self, in_ch, out_ch, skip_ch) -> None:
        super().__init__()
        width = in_ch * 2
        # Input is both feature maps plus one embedding channel; the output
        # holds 4 flow channels followed by ``out_ch`` feature channels.
        self.convblock = nn.Sequential(
            convrelu(width + 1, width),
            ResBlock(width, skip_ch),
            nn.ConvTranspose2d(width, out_ch + 4, 4, 2, 1, bias=True),
        )

    def forward(self, f0, f1, embt):
        """Return ``(flow0, flow1, ft_)``.

        ``embt`` is tiled with ``repeat(1, 1, h, w)`` where ``h, w`` are the
        spatial size of ``f0``, then concatenated with ``f0`` and ``f1``
        along the channel dim. The first four output channels are split into
        two 2-channel flows; the remaining channels are returned as ``ft_``.
        """
        height, width = f0.shape[2:]
        tiled_embt = embt.repeat(1, 1, height, width)
        decoded = self.convblock(torch.cat([f0, f1, tiled_embt], 1))
        flow_part = decoded[:, :4, ...]
        ft_ = decoded[:, 4:, ...]
        flow0, flow1 = torch.chunk(flow_part, 2, 1)
        return flow0, flow1, ft_
    
class IntermediateDecoder(nn.Module):
    """Decoder stage that refines two flows and a feature map.

    Args:
        in_ch: Channel count of each of the three feature inputs.
        out_ch: Channel count of the returned feature map.
        skip_ch: ``side_channels`` handed to the internal ``ResBlock``.
    """

    def __init__(self, in_ch, out_ch, skip_ch) -> None:
        super().__init__()
        width = in_ch * 3
        # Input is three feature maps plus two 2-channel flows; the output
        # holds 4 flow channels followed by ``out_ch`` feature channels.
        self.convblock = nn.Sequential(
            convrelu(width + 4, width),
            ResBlock(width, skip_ch),
            nn.ConvTranspose2d(width, out_ch + 4, 4, 2, 1, bias=True),
        )

    def forward(self, ft_, f0, f1, flow0_in, flow1_in):
        """Return ``(flow0, flow1, ft_)`` after one refinement step.

        ``f0`` and ``f1`` are warped by the incoming flows before decoding.
        Each predicted flow is added to the incoming flow resized by 2x and
        multiplied by 2.
        """
        warped0 = warp(f0, flow0_in)
        warped1 = warp(f1, flow1_in)
        decoded = self.convblock(
            torch.cat([ft_, warped0, warped1, flow0_in, flow1_in], 1)
        )
        residual0, residual1 = torch.chunk(decoded[:, :4, ...], 2, 1)
        ft_ = decoded[:, 4:, ...]
        flow0 = residual0 + 2.0 * resize(flow0_in, scale_factor=2.0)
        flow1 = residual1 + 2.0 * resize(flow1_in, scale_factor=2.0)
        return flow0, flow1, ft_


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/networks/blocks/multi_flow.py
================================================
import torch
import torch.nn as nn
from vbench.third_party.amt.utils.flow_utils import warp
from vbench.third_party.amt.networks.blocks.ifrnet import (
    convrelu, resize,
    ResBlock,
)


def multi_flow_combine(comb_block, img0, img1, flow0, flow1, 
                       mask=None, img_res=None, mean=None):
    """Warp both images with several flow fields at once and fuse the results.

    Args:
        comb_block: Callable (an ``nn.Sequential`` in practice) applied to
            the stacked warped candidates of shape
            ``[b, 3 * num_flows, h, w]``; its output is added to their mean.
        img0, img1: Images of shape ``[b, 3, h, w]``.
        flow0, flow1: Flows of shape ``[b, 2 * num_flows, h, w]``.
        mask: Optional blend weights of shape ``[b, num_flows, h, w]``.
            When ``None``, the two warped images are simply averaged.
        img_res: Optional residual of shape ``[b, 3 * num_flows, h, w]``.
            When ``None``, zero is added instead.
        mean: Optional per-sample offset with ``b`` elements. When ``None``,
            zero is added instead.

    Returns:
        The predicted image of shape ``[b, 3, h, w]``.
    """
    b, c, h, w = flow0.shape
    num_flows = c // 2

    # Fold the flow index into the batch dim so one warp call handles all
    # flow fields.
    flow0 = flow0.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)
    flow1 = flow1.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)

    if mask is None:
        # Documented fallback: a plain average of the two warped images.
        # The original code multiplied by None here and raised TypeError.
        mask = 0.5
    else:
        mask = mask.reshape(b, num_flows, 1, h, w).reshape(-1, 1, h, w)

    if img_res is None:
        img_res = 0
    else:
        img_res = img_res.reshape(b, num_flows, 3, h, w).reshape(-1, 3, h, w)

    # Repeat each image once per flow field, matching the folded flows.
    img0 = torch.stack([img0] * num_flows, 1).reshape(-1, 3, h, w)
    img1 = torch.stack([img1] * num_flows, 1).reshape(-1, 3, h, w)

    if mean is None:
        mean = 0
    else:
        mean = torch.stack([mean] * num_flows, 1).reshape(-1, 1, 1, 1)

    img0_warp = warp(img0, flow0)
    img1_warp = warp(img1, flow1)
    img_warps = mask * img0_warp + (1 - mask) * img1_warp + mean + img_res
    img_warps = img_warps.reshape(b, num_flows, 3, h, w)
    imgt_pred = img_warps.mean(1) + comb_block(img_warps.reshape(b, -1, h, w))
    return imgt_pred


class MultiFlowDecoder(nn.Module):
    """Decoder stage predicting ``num_flows`` flow pairs, masks and residuals.

    Args:
        in_ch: Channel count of each of the three feature inputs.
        skip_ch: ``side_channels`` handed to the internal ``ResBlock``.
        num_flows: Number of flow fields predicted per direction.
    """

    def __init__(self, in_ch, skip_ch, num_flows=3):
        super().__init__()
        self.num_flows = num_flows
        width = in_ch * 3
        # 8 output channels per flow: 2 + 2 flow, 1 mask, 3 image residual.
        self.convblock = nn.Sequential(
            convrelu(width + 4, width),
            ResBlock(width, skip_ch),
            nn.ConvTranspose2d(width, 8 * num_flows, 4, 2, 1, bias=True),
        )

    def forward(self, ft_, f0, f1, flow0, flow1):
        """Return ``(flow0, flow1, mask, img_res)``.

        ``f0`` and ``f1`` are warped by the incoming flows before decoding.
        The mask is passed through a sigmoid. Each returned flow is the
        predicted delta plus the incoming flow resized by 2x, multiplied by
        2 and repeated ``num_flows`` times along the channel dim.
        """
        n = self.num_flows
        warped0 = warp(f0, flow0)
        warped1 = warp(f1, flow1)
        decoded = self.convblock(torch.cat([ft_, warped0, warped1, flow0, flow1], 1))
        delta0, delta1, mask_logits, img_res = torch.split(
            decoded, [2 * n, 2 * n, n, 3 * n], 1
        )
        mask = torch.sigmoid(mask_logits)

        base0 = 2.0 * resize(flow0, scale_factor=2.0).repeat(1, self.num_flows, 1, 1)
        out_flow0 = delta0 + base0
        base1 = 2.0 * resize(flow1, scale_factor=2.0).repeat(1, self.num_flows, 1, 1)
        out_flow1 = delta1 + base1

        return out_flow0, out_flow1, mask, img_res


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/amt/networks/blocks/raft.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F


def resize(x, scale_factor):
    """Bilinearly interpolate ``x`` to ``scale_factor`` times its spatial size.

    Uses ``align_corners=False``.
    """
    interpolation = dict(mode="bilinear", align_corners=False)
    return F.interpolate(x, scale_factor=scale_factor, **interpolation)


def bilinear_sampler(img, coords, mask=False):
    """Sample ``img`` at pixel coordinates using ``F.grid_sample``.

    Args:
        img: Tensor whose last two dims are height and width.
        coords: Tensor whose last dim holds ``(x, y)`` pixel coordinates.
        mask: When true, also return a float mask that is 1 where the
            normalised coordinates lie strictly inside ``(-1, 1)``.

    Returns:
        The sampled tensor, or ``(sampled, mask)`` when ``mask`` is true.
    """
    height, width = img.shape[-2:]
    x_pix, y_pix = coords.split([1, 1], dim=-1)

    # Map pixel indices [0, size - 1] onto grid_sample's [-1, 1] range.
    x_norm = 2 * x_pix / (width - 1) - 1
    y_norm = 2 * y_pix / (height - 1) - 1

    sampled = F.grid_sample(
        img, torch.cat([x_norm, y_norm], dim=-1), align_corners=True
    )
    if not mask:
        return sampled

    inside = (x_norm > -1) & (y_norm > -1) & (x_norm < 1) & (y_norm < 1)
    return sampled, inside.float()


def coords_grid(batch, ht, wd, device):
    """Return a ``(batch, 2, ht, wd)`` float grid of pixel coordinates.

    Channel 0 holds the x (column) index and channel 1 the y (row) index.
    """
    rows = torch.arange(ht, device=device)
    cols = torch.arange(wd, device=device)
    y_idx, x_idx = torch.meshgrid(rows, cols, indexing='ij')
    grid = torch.stack((x_idx, y_idx), dim=0).float()
    return grid.unsqueeze(0).repeat(batch, 1, 1, 1)


class SmallUpdateBlock(nn.Module):
    """Convolutional update block predicting feature and flow residuals.

    Args:
        cdim: Channel count of the ``net`` input and of ``delta_net``.
        hidden_dim: Width of the internal hidden representation.
        flow_dim: Width of the flow encoder output.
        corr_dim: Width of the correlation encoder output.
        fc_dim: Width of the fused flow/correlation features.
        corr_levels: Number of correlation pyramid levels.
        radius: Correlation lookup radius. The expected ``corr`` input has
            ``2 * corr_levels * (2 * radius + 1) ** 2`` channels.
        scale_factor: When not ``None``, ``net`` is resized by
            ``1 / scale_factor`` on entry and both outputs are resized back
            by ``scale_factor``.
    """

    def __init__(self, cdim, hidden_dim, flow_dim, corr_dim, fc_dim,
                 corr_levels=4, radius=3, scale_factor=None):
        super().__init__()
        window = 2 * radius + 1
        cor_planes = corr_levels * window ** 2
        self.scale_factor = scale_factor

        def conv_stack(in_ch, out_ch):
            # conv -> LeakyReLU -> conv, with ``hidden_dim`` in the middle.
            return nn.Sequential(
                nn.Conv2d(in_ch, hidden_dim, 3, padding=1),
                nn.LeakyReLU(negative_slope=0.1, inplace=True),
                nn.Conv2d(hidden_dim, out_ch, 3, padding=1),
            )

        # Attribute names and creation order are kept so state dicts and
        # seeded initialisation stay the same.
        self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
        self.convf1 = nn.Conv2d(4, flow_dim * 2, 7, padding=3)
        self.convf2 = nn.Conv2d(flow_dim * 2, flow_dim, 3, padding=1)
        self.conv = nn.Conv2d(corr_dim + flow_dim, fc_dim, 3, padding=1)

        self.gru = conv_stack(fc_dim + 4 + cdim, hidden_dim)
        self.feat_head = conv_stack(hidden_dim, cdim)
        self.flow_head = conv_stack(hidden_dim, 4)

        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)

    def forward(self, net, flow, corr):
        """Return ``(delta_net, delta_flow)`` for the given state, flow and correlation."""
        rescale = self.scale_factor is not None
        if rescale:
            net = resize(net, 1 / self.scale_factor)

        corr_feat = self.lrelu(self.convc1(corr))
        flow_feat = self.lrelu(self.convf1(flow))
        flow_feat = self.lrelu(self.convf2(flow_feat))
        fused = self.lrelu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))

        hidden = self.gru(torch.cat([fused, flow, net], dim=1))
        delta_net = self.feat_head(hidden)
        delta_flow = self.flow_head(hidden)

        if not rescale:
            return delta_net, delta_flow

        # Flow values are multiplied by the same factor as the spatial size.
        delta_net = resize(delta_net, scale_factor=self.scale_factor)
        delta_flow = self.scale_factor * resize(delta_flow, scale_factor=self.scale_factor)
        return delta_net, delta_flow


class BasicUpdateBlock(nn.Module):
    """Convolutional update block with a two-layer correlation encoder.

    Args:
        cdim: Channel count of the ``net`` input and of ``delta_net``.
        hidden_dim: Width of the internal hidden representation.
        flow_dim: Width of the flow encoder output.
        corr_dim: Width of the first correlation convolution.
        corr_dim2: Width of the second correlation convolution.
        fc_dim: Width of the fused flow/correlation features.
        corr_levels: Number of correlation pyramid levels.
        radius: Correlation lookup radius. The expected ``corr`` input has
            ``2 * corr_levels * (2 * radius + 1) ** 2`` channels.
        scale_factor: When not ``None``, ``net`` is resized by
            ``1 / scale_factor`` on entry and both outputs are resized back
            by ``scale_factor``.
        out_num: The flow head emits ``4 * out_num`` channels.
    """

    def __init__(self, cdim, hidden_dim, flow_dim, corr_dim, corr_dim2, 
                 fc_dim, corr_levels=4, radius=3, scale_factor=None, out_num=1):
        super().__init__()
        window = 2 * radius + 1
        cor_planes = corr_levels * window ** 2

        self.scale_factor = scale_factor

        def conv_stack(in_ch, out_ch):
            # conv -> LeakyReLU -> conv, with ``hidden_dim`` in the middle.
            return nn.Sequential(
                nn.Conv2d(in_ch, hidden_dim, 3, padding=1),
                nn.LeakyReLU(negative_slope=0.1, inplace=True),
                nn.Conv2d(hidden_dim, out_ch, 3, padding=1),
            )

        # Attribute names and creation order are kept so state dicts and
        # seeded initialisation stay the same.
        self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
        self.convc2 = nn.Conv2d(corr_dim, corr_dim2, 3, padding=1)
        self.convf1 = nn.Conv2d(4, flow_dim * 2, 7, padding=3)
        self.convf2 = nn.Conv2d(flow_dim * 2, flow_dim, 3, padding=1)
        self.conv = nn.Conv2d(flow_dim + corr_dim2, fc_dim, 3, padding=1)

        self.gru = conv_stack(fc_dim + 4 + cdim, hidden_dim)
        self.feat_head = conv_stack(hidden_dim, cdim)
        self.flow_head = conv_stack(hidden_dim, 4 * out_num)

        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)

    def forward(self, net, flow, corr):
        """Return ``(delta_net, delta_flow)`` for the given state, flow and correlation."""
        rescale = self.scale_factor is not None
        if rescale:
            net = resize(net, 1 / self.scale_factor)

        corr_feat = self.lrelu(self.convc1(corr))
        corr_feat = self.lrelu(self.convc2(corr_feat))
        flow_feat = self.lrelu(self.convf1(flow))
        flow_feat = self.lrelu(self.convf2(flow_feat))
        fused = self.lrelu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))

        hidden = self.gru(torch.cat([fused, flow, net], dim=1))
        delta_net = self.feat_head(hidden)
        delta_flow = self.flow_head(hidden)

        if not rescale:
            return delta_net, delta_flow

        # Flow values are multiplied by the same factor as the spatial size.
        delta_net = resize(delta_net, scale_factor=self.scale_factor)
        delta_flow = self.scale_factor * resize(delta_flow, scale_factor=self.scale_factor)
        return delta_net, delta_flow


class BidirCorrBlock:
    """All-pairs correlation pyramid that can be looked up in both directions.

    On construction the full correlation volume between ``fmap1`` and
    ``fmap2`` is computed once, together with its transpose (roles of the
    two maps swapped). Both are average-pooled ``num_levels - 1`` times to
    form two pyramids. Calling the object samples a
    ``(2 * radius + 1) ** 2`` window around the given coordinates at every
    level of each pyramid.

    Args:
        fmap1, fmap2: Feature maps of shape ``(batch, dim, ht, wd)``.
        num_levels: Number of pyramid levels.
        radius: Lookup radius in pixels at each level.
    """

    def __init__(self, fmap1, fmap2, num_levels=4, radius=4):
        self.num_levels = num_levels
        self.radius = radius
        self.corr_pyramid = []
        self.corr_pyramid_T = []

        corr = BidirCorrBlock.corr(fmap1, fmap2)
        batch, h1, w1, dim, h2, w2 = corr.shape
        # The transposed volume is only read afterwards, so permuting the
        # volume directly is enough; the former ``.clone()`` just made an
        # extra full-size copy.
        corr_T = corr.permute(0, 4, 5, 3, 1, 2)

        corr = corr.reshape(batch*h1*w1, dim, h2, w2)
        corr_T = corr_T.reshape(batch*h2*w2, dim, h1, w1)

        self.corr_pyramid.append(corr)
        self.corr_pyramid_T.append(corr_T)

        for _ in range(self.num_levels-1):
            corr = F.avg_pool2d(corr, 2, stride=2)
            corr_T = F.avg_pool2d(corr_T, 2, stride=2)
            self.corr_pyramid.append(corr)
            self.corr_pyramid_T.append(corr_T)

    def __call__(self, coords0, coords1):
        """Sample both pyramids around ``coords0`` / ``coords1``.

        Args:
            coords0: ``(batch, 2, h1, w1)`` coordinates used on the forward
                pyramid.
            coords1: Coordinates of the same shape used on the transposed
                pyramid.

        Returns:
            Two contiguous float tensors of shape
            ``(batch, num_levels * (2 * radius + 1) ** 2, h1, w1)``: the
            forward lookup and the transposed lookup.
        """
        r = self.radius
        coords0 = coords0.permute(0, 2, 3, 1)
        coords1 = coords1.permute(0, 2, 3, 1)
        assert coords0.shape == coords1.shape, f"coords0 shape: [{coords0.shape}] is not equal to [{coords1.shape}]"
        batch, h1, w1, _ = coords0.shape

        # The window offsets are the same at every level, so build them once.
        # Both meshgrid inputs are the same linspace, which matches the
        # original (dy, dx) construction exactly.
        offsets = torch.linspace(-r, r, 2*r+1, device=coords0.device)
        delta = torch.stack(torch.meshgrid(offsets, offsets, indexing='ij'), dim=-1)
        delta_lvl = delta.view(1, 2*r+1, 2*r+1, 2)

        centroid_0 = coords0.reshape(batch*h1*w1, 1, 1, 2)
        centroid_1 = coords1.reshape(batch*h1*w1, 1, 1, 2)

        out_pyramid = []
        out_pyramid_T = []
        for i in range(self.num_levels):
            # Level i is downsampled by 2**i, so coordinates shrink with it.
            coords_lvl_0 = centroid_0 / 2**i + delta_lvl
            coords_lvl_1 = centroid_1 / 2**i + delta_lvl

            corr = bilinear_sampler(self.corr_pyramid[i], coords_lvl_0)
            corr_T = bilinear_sampler(self.corr_pyramid_T[i], coords_lvl_1)
            out_pyramid.append(corr.view(batch, h1, w1, -1))
            out_pyramid_T.append(corr_T.view(batch, h1, w1, -1))

        out = torch.cat(out_pyramid, dim=-1)
        out_T = torch.cat(out_pyramid_T, dim=-1)
        return out.permute(0, 3, 1, 2).contiguous().float(), out_T.permute(0, 3, 1, 2).contiguous().float()

    @staticmethod
    def corr(fmap1, fmap2):
        """Return the all-pairs dot-product volume, scaled by ``1 / sqrt(dim)``.

        Both inputs are viewed with ``fmap1``'s ``(batch, dim, ht, wd)``
        shape; the result has shape ``(batch, ht, wd, 1, ht, wd)``.
        """
        batch, dim, ht, wd = fmap1.shape
        fmap1 = fmap1.view(batch, dim, ht*wd)
        fmap2 = fmap2.view(batch, dim, ht*wd)

        corr = torch.matmul(fmap1.transpose(1,2), fmap2)
        corr = corr.view(batch, ht, wd, 1, ht, wd)
        return corr  / torch.sqrt(torch.tensor(dim).float())

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_model.py
================================================
import os
import sys

from .grit_src.image_dense_captions import image_caption_api, init_demo, dense_pred_to_caption, dense_pred_to_caption_only_name,dense_pred_to_caption_tuple
from detectron2.data.detection_utils import read_image

class DenseCaptioning:
    """Wrapper around a demo predictor used for dense captioning and detection.

    ``self.demo`` starts out as ``None``; call ``initialize_model`` or
    ``initialize_model_det`` before any of the ``run_*`` methods.
    """

    def __init__(self, device):
        self.demo = None
        self.device = device

    def initialize_model(self, model_weight):
        """Create the demo predictor from ``model_weight`` with the default task."""
        self.demo = init_demo(self.device, model_weight=model_weight)

    def initialize_model_det(self, model_weight):
        """Create the demo predictor from ``model_weight`` with ``task="ObjectDet"``."""
        self.demo = init_demo(self.device, model_weight=model_weight, task="ObjectDet")

    def image_dense_caption(self, image_src):
        """Caption ``image_src`` via ``image_caption_api``, print it and return it."""
        banner = '\033[1;35m' + '*' * 100 + '\033[0m'
        dense_caption = image_caption_api(image_src, self.device)
        print(banner)
        print("Step2, Dense Caption:\n")
        print(dense_caption)
        print(banner)
        return dense_caption

    def run_caption_api(self, image_src):
        """Read ``image_src`` as BGR, run the predictor and return name-only captions."""
        image = read_image(image_src, format="BGR")
        print(image.shape)
        predictions, _ = self.demo.run_on_image(image)
        return dense_pred_to_caption_only_name(predictions)

    def run_caption_tensor(self, img):
        """Run the predictor on ``img``; return ``(caption tuples, visualized output)``."""
        predictions, visualized_output = self.demo.run_on_image(img)
        return dense_pred_to_caption_tuple(predictions), visualized_output

    def run_det_tensor(self, img):
        """Run the predictor on ``img``; return ``(predictions, visualized output)``."""
        predictions, visualized_output = self.demo.run_on_image(img)
        return predictions, visualized_output


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/__init__.py
================================================
from .modeling.meta_arch.centernet_detector import CenterNetDetector
from .modeling.dense_heads.centernet import CenterNet
from .modeling.roi_heads.custom_roi_heads import CustomROIHeads, CustomCascadeROIHeads

from .modeling.backbone.fpn_p5 import build_p67_resnet_fpn_backbone
from .modeling.backbone.dla import build_dla_backbone
from .modeling.backbone.dlafpn import build_dla_fpn3_backbone
from .modeling.backbone.bifpn import build_resnet_bifpn_backbone
from .modeling.backbone.bifpn_fcos import build_fcos_resnet_bifpn_backbone
from .modeling.backbone.res2net import build_p67_res2net_fpn_backbone


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/config.py
================================================
from detectron2.config import CfgNode as CN

def add_centernet_config(cfg):
    """Add the CenterNet-related default options to ``cfg`` in place.

    New ``MODEL.CENTERNET``, ``MODEL.BIFPN`` and ``MODEL.DLA`` nodes are
    created. Extra keys are added to the existing ``MODEL.ROI_BOX_HEAD``,
    ``SOLVER`` and ``INPUT`` nodes, and a few debug flags are set on the
    root. Nothing is returned.
    """
    _C = cfg

    # CenterNet head options.
    _C.MODEL.CENTERNET = CN()
    centernet = _C.MODEL.CENTERNET
    centernet.NUM_CLASSES = 80
    centernet.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]
    centernet.FPN_STRIDES = [8, 16, 32, 64, 128]
    centernet.PRIOR_PROB = 0.01
    centernet.INFERENCE_TH = 0.05
    centernet.CENTER_NMS = False
    centernet.NMS_TH_TRAIN = 0.6
    centernet.NMS_TH_TEST = 0.6
    centernet.PRE_NMS_TOPK_TRAIN = 1000
    centernet.POST_NMS_TOPK_TRAIN = 100
    centernet.PRE_NMS_TOPK_TEST = 1000
    centernet.POST_NMS_TOPK_TEST = 100
    centernet.NORM = "GN"
    centernet.USE_DEFORMABLE = False
    centernet.NUM_CLS_CONVS = 4
    centernet.NUM_BOX_CONVS = 4
    centernet.NUM_SHARE_CONVS = 0
    centernet.LOC_LOSS_TYPE = 'giou'
    centernet.SIGMOID_CLAMP = 1e-4
    centernet.HM_MIN_OVERLAP = 0.8
    centernet.MIN_RADIUS = 4
    centernet.SOI = [[0, 80], [64, 160], [128, 320], [256, 640], [512, 10000000]]
    centernet.POS_WEIGHT = 1.
    centernet.NEG_WEIGHT = 1.
    centernet.REG_WEIGHT = 2.
    centernet.HM_FOCAL_BETA = 4
    centernet.HM_FOCAL_ALPHA = 0.25
    centernet.LOSS_GAMMA = 2.0
    centernet.WITH_AGN_HM = False
    centernet.ONLY_PROPOSAL = False
    centernet.AS_PROPOSAL = False
    centernet.IGNORE_HIGH_FP = -1.
    centernet.MORE_POS = False
    centernet.MORE_POS_THRESH = 0.2
    centernet.MORE_POS_TOPK = 9
    centernet.NOT_NORM_REG = True
    centernet.NOT_NMS = False
    centernet.NO_REDUCE = False

    # Extra keys on the existing ROI box head node.
    box_head = _C.MODEL.ROI_BOX_HEAD
    box_head.USE_SIGMOID_CE = False
    box_head.PRIOR_PROB = 0.01
    box_head.USE_EQL_LOSS = False
    box_head.CAT_FREQ_PATH = \
        'datasets/lvis/lvis_v1_train_cat_info.json'
    box_head.EQL_FREQ_CAT = 200
    box_head.USE_FED_LOSS = False
    box_head.FED_LOSS_NUM_CAT = 50
    box_head.FED_LOSS_FREQ_WEIGHT = 0.5
    box_head.MULT_PROPOSAL_SCORE = False

    # BiFPN backbone options.
    _C.MODEL.BIFPN = CN()
    bifpn = _C.MODEL.BIFPN
    bifpn.NUM_LEVELS = 5
    bifpn.NUM_BIFPN = 6
    bifpn.NORM = 'GN'
    bifpn.OUT_CHANNELS = 160
    bifpn.SEPARABLE_CONV = False

    # DLA backbone options.
    _C.MODEL.DLA = CN()
    dla = _C.MODEL.DLA
    dla.OUT_FEATURES = ['dla2']
    dla.USE_DLA_UP = True
    dla.NUM_LAYERS = 34
    dla.MS_OUTPUT = False
    dla.NORM = 'BN'
    dla.DLAUP_IN_FEATURES = ['dla3', 'dla4', 'dla5']
    dla.DLAUP_NODE = 'conv'

    solver = _C.SOLVER
    solver.RESET_ITER = False
    solver.TRAIN_ITER = -1

    input_cfg = _C.INPUT
    input_cfg.CUSTOM_AUG = ''
    input_cfg.TRAIN_SIZE = 640
    input_cfg.TEST_SIZE = 640
    input_cfg.SCALE_RANGE = (0.1, 2.)
    # 'default' for fixed short/ long edge, 'square' for max size=INPUT.SIZE
    input_cfg.TEST_INPUT_TYPE = 'default'

    # Debug / visualisation flags on the root node.
    _C.DEBUG = False
    _C.SAVE_DEBUG = False
    _C.SAVE_PTH = False
    _C.VIS_THRESH = 0.3
    _C.DEBUG_SHOW_NAME = False


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/backbone/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/backbone/bifpn.py
================================================
# Modified from https://github.com/rwightman/efficientdet-pytorch/blob/master/effdet/efficientdet.py
# The original file is under Apache-2.0 License
import math
from os.path import join
import numpy as np
from collections import OrderedDict
from typing import List

import torch
from torch import nn
import torch.utils.model_zoo as model_zoo
import torch.nn.functional as F
import fvcore.nn.weight_init as weight_init

from detectron2.layers import ShapeSpec, Conv2d
from detectron2.modeling.backbone.resnet import build_resnet_backbone
from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
from detectron2.layers.batch_norm import get_norm
from detectron2.modeling.backbone import Backbone
from .dlafpn import dla34

def get_fpn_config(base_reduction=8):
    """Return the BiFPN node layout with ``'fastattn'`` weighting.

    Each node has a ``'reduction'`` (``base_reduction`` left-shifted by a
    per-node amount) and the ``'inputs_offsets'`` it combines.
    """
    # (left shift applied to base_reduction, input offsets) for each node.
    node_specs = (
        (3, [3, 4]),
        (2, [2, 5]),
        (1, [1, 6]),
        (0, [0, 7]),
        (1, [1, 7, 8]),
        (2, [2, 6, 9]),
        (3, [3, 5, 10]),
        (4, [4, 11]),
    )
    nodes = [
        {'reduction': base_reduction << shift, 'inputs_offsets': offsets}
        for shift, offsets in node_specs
    ]
    return {'nodes': nodes, 'weight_method': 'fastattn'}


def swish(x, inplace: bool = False):
    """Swish activation, ``x * sigmoid(x)`` (https://arxiv.org/abs/1710.05941).

    With ``inplace=True`` the product is written into ``x`` and ``x`` itself
    is returned; otherwise a new tensor is returned.
    """
    gate = x.sigmoid()
    if inplace:
        return x.mul_(gate)
    return x.mul(gate)


class Swish(nn.Module):
    """Module form of the Swish activation, ``x * sigmoid(x)``.

    Args:
        inplace: When true, the result is written into the input tensor.
    """

    def __init__(self, inplace: bool = False):
        super().__init__()
        self.inplace = inplace

    def forward(self, x):
        gate = x.sigmoid()
        if self.inplace:
            return x.mul_(gate)
        return x.mul(gate)


class SequentialAppend(nn.Sequential):
    """Sequential container whose modules each see the whole running list.

    ``forward`` takes a list, calls every module in order with that list,
    and appends each result to it. The input list is mutated and returned.
    """

    def __init__(self, *args):
        super().__init__(*args)

    def forward(self, x):
        for stage in self:
            produced = stage(x)
            x.append(produced)
        return x


class SequentialAppendLast(nn.Sequential):
    """Sequential container that feeds each module the newest list element.

    ``forward`` takes a list, calls every module in order with the current
    last element, and appends each result. The input list is mutated and
    returned.
    """

    def __init__(self, *args):
        super().__init__(*args)

    def forward(self, x):
        for stage in self:
            newest = x[-1]
            x.append(stage(newest))
        return x


class ConvBnAct2d(nn.Module):
    """Convolution followed by an optional norm and an optional activation.

    Args:
        in_channels, out_channels, kernel_size, stride: Forwarded to the
            convolution.
        dilation, padding, bias: Accepted for signature compatibility but
            not used here. Padding is always ``kernel_size // 2`` and the
            conv has a bias exactly when ``norm == ''``.
        norm: Norm spec handed to ``get_norm``.
        act_layer: Activation class constructed with ``inplace=True``, or
            ``None`` for no activation.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, padding='', bias=False,
                 norm='', act_layer=Swish):
        super().__init__()
        use_conv_bias = (norm == '')
        self.conv = Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=kernel_size // 2,
            bias=use_conv_bias,
        )
        self.bn = get_norm(norm, out_channels)
        if act_layer is None:
            self.act = None
        else:
            self.act = act_layer(inplace=True)

    def forward(self, x):
        out = self.conv(x)
        # Norm and activation are each skipped when they are None.
        for stage in (self.bn, self.act):
            if stage is not None:
                out = stage(out)
        return out


class SeparableConv2d(nn.Module):
    """Depthwise-separable convolution with optional norm and activation.

    A depthwise convolution (``conv_dw``) is followed by a pointwise
    convolution (``conv_pw``), then the norm and activation when present.

    Args:
        in_channels: Input channel count.
        out_channels: Output channel count of the pointwise convolution.
        kernel_size: Kernel size of the depthwise convolution; its padding
            is ``kernel_size // 2``.
        stride: Stride of the depthwise convolution.
        dilation, padding: Accepted for signature compatibility but not
            used here.
        bias: Whether the depthwise convolution has a bias.
        channel_multiplier: The depthwise convolution outputs
            ``int(in_channels * channel_multiplier)`` channels.
        pw_kernel_size: Kernel size of the pointwise convolution.
        act_layer: Activation class constructed with ``inplace=True``, or
            ``None`` for no activation.
        norm: Norm spec handed to ``get_norm``; the pointwise convolution
            has a bias exactly when ``norm == ''``.
    """
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False,
                 channel_multiplier=1.0, pw_kernel_size=1, act_layer=Swish,
                 norm=''):
        super(SeparableConv2d, self).__init__()

        mid_channels = int(in_channels * channel_multiplier)

        # A depthwise conv needs one group per *input* channel. The previous
        # ``groups=out_channels`` used the pointwise output width, which is
        # only depthwise when in_channels == out_channels; that case is
        # unchanged by this fix.
        self.conv_dw = Conv2d(
            in_channels, mid_channels,
            kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, bias=bias,
            groups=in_channels)

        self.conv_pw = Conv2d(
            mid_channels, out_channels,
            kernel_size=pw_kernel_size, padding=pw_kernel_size // 2, bias=(norm==''))

        self.bn = get_norm(norm, out_channels)
        self.act = None if act_layer is None else act_layer(inplace=True)

    def forward(self, x):
        x = self.conv_dw(x)
        x = self.conv_pw(x)
        # Norm and activation are each skipped when they are None.
        if self.bn is not None:
            x = self.bn(x)
        if self.act is not None:
            x = self.act(x)
        return x


class ResampleFeatureMap(nn.Sequential):
    """Sequential that matches a feature map's channels and resolution.

    Up to two children are registered, depending on the arguments:

    * ``'conv'``: a 1x1 ``ConvBnAct2d`` without activation, only when
      ``in_channels != out_channels``.
    * ``'downsample'``: max pooling with kernel and stride
      ``int(reduction_ratio)``, when ``reduction_ratio > 1``.
    * ``'upsample'``: nearest-neighbour upsampling by
      ``int(1 // reduction_ratio)``, when ``reduction_ratio < 1``.

    Args:
        in_channels, out_channels: Channel counts before and after.
        reduction_ratio: Target reduction divided by input reduction.
        pad_type: Forwarded to ``ConvBnAct2d`` as ``padding``.
        pooling_type: Accepted for signature compatibility; max pooling is
            always used.
        norm: Norm spec for the 1x1 conv, used only when ``apply_bn`` is
            true.
        apply_bn: Whether the 1x1 conv gets a norm.
        conv_after_downsample: Put the 1x1 conv after the pooling instead
            of before it.
        redundant_bias: Forwarded as part of the conv's ``bias`` argument.
    """

    def __init__(self, in_channels, out_channels, reduction_ratio=1., pad_type='', pooling_type='max',
                 norm='', apply_bn=False, conv_after_downsample=False,
                 redundant_bias=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.reduction_ratio = reduction_ratio
        self.conv_after_downsample = conv_after_downsample

        # The channel projection is only needed when the widths differ.
        projection = None
        if in_channels != out_channels:
            projection = ConvBnAct2d(
                in_channels, out_channels, kernel_size=1, padding=pad_type,
                norm=norm if apply_bn else '',
                bias=not apply_bn or redundant_bias, act_layer=None)

        # Collect (name, module) pairs in the order they must run.
        stages = []
        if reduction_ratio > 1:
            stride_size = int(reduction_ratio)
            pool = ('downsample', nn.MaxPool2d(kernel_size=stride_size, stride=stride_size))
            if projection is None:
                stages = [pool]
            elif self.conv_after_downsample:
                stages = [pool, ('conv', projection)]
            else:
                stages = [('conv', projection), pool]
        else:
            if projection is not None:
                stages.append(('conv', projection))
            if reduction_ratio < 1:
                scale = int(1 // reduction_ratio)
                stages.append(('upsample', nn.UpsamplingNearest2d(scale_factor=scale)))

        for stage_name, stage in stages:
            self.add_module(stage_name, stage)


class FpnCombine(nn.Module):
    """Resample several pyramid features to one reduction and fuse them.

    Every offset in ``inputs_offsets`` gets its own ``ResampleFeatureMap``,
    stored in ``self.resample`` under ``str(offset)``.  Offsets below
    ``len(feature_info)`` refer to entries of ``feature_info``; larger offsets
    refer to ``fpn_config['nodes']`` (shifted by ``len(feature_info)``) and are
    assumed to already have ``fpn_channels`` channels.

    ``weight_method`` selects the fusion rule used by :meth:`forward`:

    * ``'attn'`` - softmax-normalised learnable weights;
    * ``'fastattn'`` - ReLU weights divided by ``sum + 0.0001``;
    * ``'sum'`` - plain unweighted sum.
    """

    def __init__(self, feature_info, fpn_config, fpn_channels, inputs_offsets, target_reduction, pad_type='',
                 pooling_type='max', norm='', apply_bn_for_resampling=False,
                 conv_after_downsample=False, redundant_bias=False, weight_method='attn'):
        super().__init__()
        self.inputs_offsets = inputs_offsets
        self.weight_method = weight_method

        num_base_feats = len(feature_info)
        self.resample = nn.ModuleDict()
        for offset in inputs_offsets:
            if offset < num_base_feats:
                # Input comes straight from the features described by feature_info.
                source = feature_info[offset]
                in_channels = source['num_chs']
            else:
                # Input is the output of an earlier node of this layer.
                source = fpn_config['nodes'][offset - num_base_feats]
                in_channels = fpn_channels
            reduction_ratio = target_reduction / source['reduction']
            self.resample[str(offset)] = ResampleFeatureMap(
                in_channels, fpn_channels, reduction_ratio=reduction_ratio, pad_type=pad_type,
                pooling_type=pooling_type, norm=norm,
                apply_bn=apply_bn_for_resampling, conv_after_downsample=conv_after_downsample,
                redundant_bias=redundant_bias)

        # One learnable scalar per input edge, only for the weighted methods.
        self.edge_weights = None
        if weight_method in ('attn', 'fastattn'):
            self.edge_weights = nn.Parameter(torch.ones(len(inputs_offsets)), requires_grad=True)

    def forward(self, x):
        """Fuse the selected entries of ``x`` into a single tensor.

        Args:
            x: indexable collection of feature tensors, addressed by the
                offsets in ``self.inputs_offsets``.

        Returns:
            The weighted (or plain) sum of the resampled inputs.

        Raises:
            ValueError: if ``self.weight_method`` is not a known method.
        """
        dtype = x[0].dtype
        nodes = [self.resample[str(offset)](x[offset]) for offset in self.inputs_offsets]

        method = self.weight_method
        if method == 'attn':
            normalized_weights = torch.softmax(self.edge_weights.type(dtype), dim=0)
            stacked = torch.stack(nodes, dim=-1) * normalized_weights
        elif method == 'fastattn':
            edge_weights = nn.functional.relu(self.edge_weights.type(dtype))
            denominator = torch.sum(edge_weights) + 0.0001
            stacked = torch.stack(
                [(node * edge_weights[i]) / denominator for i, node in enumerate(nodes)], dim=-1)
        elif method == 'sum':
            stacked = torch.stack(nodes, dim=-1)
        else:
            raise ValueError('unknown weight_method {}'.format(self.weight_method))
        return torch.sum(stacked, dim=-1)


class BiFpnLayer(nn.Module):
    """One BiFPN cell: a chain of fusion nodes described by ``fpn_config``.

    For every entry of ``fpn_config['nodes']`` a sub-module is registered in
    ``self.fnode`` under its index.  Each sub-module is a ``combine`` stage
    (``FpnCombine``) followed by ``after_combine`` (activation and/or conv).

    ``self.feature_info`` holds the ``num_chs`` / ``reduction`` records of the
    last ``num_levels`` nodes, i.e. the features returned by :meth:`forward`.
    """

    def __init__(self, feature_info, fpn_config, fpn_channels, num_levels=5, pad_type='',
                 pooling_type='max', norm='', act_layer=Swish,
                 apply_bn_for_resampling=False, conv_after_downsample=True, conv_bn_relu_pattern=False,
                 separable_conv=True, redundant_bias=False):
        super().__init__()
        self.fpn_config = fpn_config
        self.num_levels = num_levels
        # The stored flag is always False, independent of the argument.
        self.conv_bn_relu_pattern = False

        # Either a stand-alone activation precedes the conv, or the conv
        # carries the activation itself (conv-bn-relu pattern).
        if conv_bn_relu_pattern:
            conv_bias, conv_act = False, act_layer
        else:
            conv_bias, conv_act = redundant_bias, None
        conv_cls = SeparableConv2d if separable_conv else ConvBnAct2d

        node_records = []
        self.fnode = SequentialAppend()
        for i, fnode_cfg in enumerate(fpn_config['nodes']):
            reduction = fnode_cfg['reduction']

            stages = OrderedDict()
            stages['combine'] = FpnCombine(
                feature_info, fpn_config, fpn_channels, fnode_cfg['inputs_offsets'], target_reduction=reduction,
                pad_type=pad_type, pooling_type=pooling_type, norm=norm,
                apply_bn_for_resampling=apply_bn_for_resampling, conv_after_downsample=conv_after_downsample,
                redundant_bias=redundant_bias, weight_method=fpn_config['weight_method'])
            node_records.append(dict(num_chs=fpn_channels, reduction=reduction))

            post = OrderedDict()
            if not conv_bn_relu_pattern:
                post['act'] = act_layer(inplace=True)
            post['conv'] = conv_cls(
                in_channels=fpn_channels, out_channels=fpn_channels, kernel_size=3, padding=pad_type,
                bias=conv_bias, norm=norm, act_layer=conv_act)
            stages['after_combine'] = nn.Sequential(post)

            self.fnode.add_module(str(i), nn.Sequential(stages))

        self.feature_info = node_records[-num_levels::]

    def forward(self, x):
        """Run all nodes and return the last ``num_levels`` resulting features."""
        return self.fnode(x)[-self.num_levels::]


class BiFPN(Backbone):
    """Backbone that runs ``num_bifpn`` stacked ``BiFpnLayer`` cells on a bottom-up network.

    Output features are named ``"p<k>"`` where ``k = log2(stride)``.  When the
    bottom-up network supplies fewer than ``num_levels`` features, the missing
    coarser levels are created by stride-2 ``ResampleFeatureMap`` modules held
    in ``self.resample``.
    """

    def __init__(
        self, cfg, bottom_up, in_features, out_channels, norm='',
        num_levels=5, num_bifpn=4, separable_conv=False,
    ):
        """
        Args:
            cfg: accepted for interface compatibility; not read here.
            bottom_up (Backbone): network producing the input feature maps.
            in_features (list[str]): names of the bottom-up outputs to use.
            out_channels (int): channel count of every output feature.
            norm (str): normalization passed to the resample and BiFPN layers.
            num_levels (int): total number of pyramid levels.
            num_bifpn (int): number of stacked BiFPN cells.
            separable_conv (bool): forwarded to each ``BiFpnLayer``.
        """
        super().__init__()
        assert isinstance(bottom_up, Backbone)

        # Strides and channels of the selected bottom-up outputs.
        bottom_up_shapes = bottom_up.output_shape()
        in_strides = [bottom_up_shapes[name].stride for name in in_features]
        in_channels = [bottom_up_shapes[name].channels for name in in_features]

        self.num_levels = num_levels
        self.num_bifpn = num_bifpn
        self.bottom_up = bottom_up
        self.in_features = in_features
        self._size_divisibility = 128

        levels = [int(math.log2(stride)) for stride in in_strides]
        self._out_feature_strides = {
            "p{}".format(lvl): stride for lvl, stride in zip(levels, in_strides)}
        # Name the extra, coarser levels that continue past the last input.
        for extra in range(1, num_levels - len(in_features) + 1):
            lvl = levels[-1] + extra
            self._out_feature_strides["p{}".format(lvl)] = 2 ** lvl
        self._out_features = sorted(self._out_feature_strides)
        self._out_feature_channels = {k: out_channels for k in self._out_features}

        feature_info = [
            {'num_chs': chs, 'reduction': stride}
            for chs, stride in zip(in_channels, in_strides)
        ]
        fpn_config = get_fpn_config()
        self.resample = SequentialAppendLast()
        for level in range(num_levels):
            if level < len(feature_info):
                # Existing level: just remember its shape for a later extension.
                in_chs = in_channels[level]
                reduction = in_strides[level]
                continue
            # Missing level: derive it by downsampling the previous one by 2.
            reduction_ratio = 2
            self.resample.add_module(str(level), ResampleFeatureMap(
                in_channels=in_chs,
                out_channels=out_channels,
                pad_type='same',
                pooling_type=None,
                norm=norm,
                reduction_ratio=reduction_ratio,
                apply_bn=True,
                conv_after_downsample=False,
                redundant_bias=False,
            ))
            in_chs = out_channels
            reduction = int(reduction * reduction_ratio)
            feature_info.append(dict(num_chs=in_chs, reduction=reduction))

        self.cell = nn.Sequential()
        for rep in range(self.num_bifpn):
            fpn_layer = BiFpnLayer(
                feature_info=feature_info,
                fpn_config=fpn_config,
                fpn_channels=out_channels,
                num_levels=self.num_levels,
                pad_type='same',
                pooling_type=None,
                norm=norm,
                act_layer=Swish,
                separable_conv=separable_conv,
                apply_bn_for_resampling=True,
                conv_after_downsample=False,
                conv_bn_relu_pattern=False,
                redundant_bias=False,
            )
            self.cell.add_module(str(rep), fpn_layer)
            # The next cell consumes what this one produces.
            feature_info = fpn_layer.feature_info

    @property
    def size_divisibility(self):
        """Required input size divisibility (fixed at 128)."""
        return self._size_divisibility

    def forward(self, x):
        """Compute the pyramid for input ``x``.

        Returns:
            dict[str, Tensor]: features keyed by the names in
            ``self._out_features``, paired positionally with the cell outputs.
        """
        backbone_feats = self.bottom_up(x)
        feats = [backbone_feats[name] for name in self.in_features]
        assert len(self.resample) == self.num_levels - len(feats)
        feats = self.cell(self.resample(feats))
        return dict(zip(self._out_features, feats))


@BACKBONE_REGISTRY.register()
def build_resnet_bifpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a BiFPN backbone on top of a ResNet bottom-up network.

    Args:
        cfg: a detectron2 CfgNode; ``MODEL.FPN.IN_FEATURES`` and the
            ``MODEL.BIFPN`` options are read from it.
        input_shape (ShapeSpec): forwarded to ``build_resnet_backbone``.

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    resnet = build_resnet_backbone(cfg, input_shape)
    feature_names = cfg.MODEL.FPN.IN_FEATURES
    bifpn_cfg = cfg.MODEL.BIFPN
    return BiFPN(
        cfg=cfg,
        bottom_up=resnet,
        in_features=feature_names,
        out_channels=bifpn_cfg.OUT_CHANNELS,
        norm=bifpn_cfg.NORM,
        num_levels=bifpn_cfg.NUM_LEVELS,
        num_bifpn=bifpn_cfg.NUM_BIFPN,
        separable_conv=bifpn_cfg.SEPARABLE_CONV,
    )

@BACKBONE_REGISTRY.register()
def build_p37_dla_bifpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a five-level BiFPN backbone on top of a DLA-34 bottom-up network.

    Args:
        cfg: a detectron2 CfgNode; ``MODEL.BIFPN.NUM_LEVELS`` must be 5.
        input_shape (ShapeSpec): accepted for the registry interface; not
            used by this builder.

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    dla = dla34(cfg)
    feature_names = cfg.MODEL.FPN.IN_FEATURES
    assert cfg.MODEL.BIFPN.NUM_LEVELS == 5

    bifpn_cfg = cfg.MODEL.BIFPN
    return BiFPN(
        cfg=cfg,
        bottom_up=dla,
        in_features=feature_names,
        out_channels=bifpn_cfg.OUT_CHANNELS,
        norm=bifpn_cfg.NORM,
        num_levels=bifpn_cfg.NUM_LEVELS,
        num_bifpn=bifpn_cfg.NUM_BIFPN,
        separable_conv=bifpn_cfg.SEPARABLE_CONV,
    )


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/backbone/bifpn_fcos.py
================================================
# This file is modified from https://github.com/aim-uofa/AdelaiDet/blob/master/adet/modeling/backbone/bifpn.py
# The original file is under 2-clause BSD License for academic use, and *non-commercial use*.
import torch
import torch.nn.functional as F
from torch import nn

from detectron2.layers import Conv2d, ShapeSpec, get_norm

from detectron2.modeling.backbone import Backbone, build_resnet_backbone
from detectron2.modeling import BACKBONE_REGISTRY
from .dlafpn import dla34

# Empty export list: a star-import of this module exposes no names.
__all__ = []


def swish(x):
    """Return ``x * sigmoid(x)`` computed element-wise."""
    gate = x.sigmoid()
    return x * gate


def split_name(name):
    """Split a feature name such as ``"res5"`` into ``("res", 5)``.

    The prefix is the leading run of alphabetic characters; everything from
    the first non-alphabetic character onwards is parsed with ``int``.

    Args:
        name (str): feature name to split.

    Returns:
        tuple[str, int]: the alphabetic prefix and the integer suffix.

    Raises:
        ValueError: if ``name`` has no non-alphabetic character (including the
            empty string) or if the suffix is not a valid integer literal.
            The message always contains the offending name.
    """
    for i, c in enumerate(name):
        if not c.isalpha():
            prefix, suffix = name[:i], name[i:]
            try:
                return prefix, int(suffix)
            except ValueError:
                # Re-raise with the full feature name for easier debugging.
                raise ValueError(
                    "feature name {!r} has a non-integer suffix {!r}".format(name, suffix)
                ) from None
    raise ValueError("feature name {!r} has no numeric suffix".format(name))


class FeatureMapResampler(nn.Module):
    """Optionally project channels with a 1x1 conv, then optionally halve the resolution.

    Args:
        in_channels (int): channels of the incoming feature map.
        out_channels (int): channels after the projection.  No projection
            layer is created when this equals ``in_channels``.
        stride (int): ``1`` keeps the resolution, ``2`` applies a 3x3 max
            pool with stride 2 and padding 1.  Values above 2 are rejected
            by an assertion.
        norm (str): normalization for the projection; an empty string gives
            the projection a bias instead.
    """

    def __init__(self, in_channels, out_channels, stride, norm=""):
        super().__init__()
        self.reduction = Conv2d(
            in_channels, out_channels, kernel_size=1,
            bias=(norm == ""),
            norm=get_norm(norm, out_channels),
            activation=None
        ) if in_channels != out_channels else None

        assert stride <= 2
        self.stride = stride

    def forward(self, x):
        """Project (if configured) and resample ``x``.

        Raises:
            NotImplementedError: if ``self.stride`` is neither 1 nor 2.
        """
        if self.reduction is not None:
            x = self.reduction(x)

        if self.stride == 1:
            return x
        if self.stride != 2:
            raise NotImplementedError()
        return F.max_pool2d(
            x, kernel_size=self.stride + 1,
            stride=self.stride, padding=1
        )


class BackboneWithTopLevels(Backbone):
    """Wrap a backbone and append extra, coarser feature levels.

    The feature with the highest numeric suffix is taken as the last level.
    ``num_top_levels`` ``FeatureMapResampler`` modules (stride 2) are chained
    after it; each is registered, and its output published, under the same
    prefix with the suffix incremented by one per level.
    """

    def __init__(self, backbone, out_channels, num_top_levels, norm=""):
        super().__init__()
        self.backbone = backbone

        # Start from the wrapped backbone's own output description.
        self._out_feature_channels = {}
        self._out_feature_strides = {}
        for feat_name, shape in backbone.output_shape().items():
            self._out_feature_channels[feat_name] = shape.channels
            self._out_feature_strides[feat_name] = shape.stride
        self._out_features = list(self._out_feature_strides)

        self.last_feature_name = max(
            self._out_feature_strides, key=lambda feat_name: split_name(feat_name)[1])
        self.num_top_levels = num_top_levels

        prefix, suffix = split_name(self.last_feature_name)
        last_stride = self._out_feature_strides[self.last_feature_name]
        prev_channels = self._out_feature_channels[self.last_feature_name]
        for step in range(1, num_top_levels + 1):
            name = prefix + str(suffix + step)
            self.add_module(name, FeatureMapResampler(
                prev_channels, out_channels, 2, norm
            ))
            # Every level after the first extra one already has out_channels.
            prev_channels = out_channels

            self._out_feature_channels[name] = out_channels
            self._out_feature_strides[name] = last_stride * 2 ** step
            self._out_features.append(name)

    def forward(self, x):
        """Run the wrapped backbone and add the extra levels to its output dict."""
        outputs = self.backbone(x)
        prefix, suffix = split_name(self.last_feature_name)

        current = outputs[self.last_feature_name]
        for step in range(1, self.num_top_levels + 1):
            name = prefix + str(suffix + step)
            current = getattr(self, name)(current)
            outputs[name] = current

        return outputs


class SingleBiFPN(Backbone):
    """
    One bidirectional feature-pyramid pass over a fixed number of levels.

    The fusion topology is hard-coded for exactly 3 or 5 input levels. Each
    node fuses earlier features with learnable, ReLU-normalized weights,
    applies swish, and then a 3x3 convolution.
    """

    def __init__(
        self, in_channels_list, out_channels, norm=""
    ):
        """
        Args:
            in_channels_list (list[int]): channel count of every input feature
                level; its length must be 3 or 5.
            out_channels (int): number of channels in the output feature maps.
            norm (str): the normalization to use.

        Raises:
            NotImplementedError: if the number of input levels is not 3 or 5.
        """
        super(SingleBiFPN, self).__init__()

        self.out_channels = out_channels
        # Node table. 'feat_level' is the input level whose spatial size the
        # node is resized to; 'inputs_offsets' index the running feature list:
        # the inputs first, then the outputs of earlier nodes in order.
        if len(in_channels_list) == 5:
            self.nodes = [
                {'feat_level': 3, 'inputs_offsets': [3, 4]},
                {'feat_level': 2, 'inputs_offsets': [2, 5]},
                {'feat_level': 1, 'inputs_offsets': [1, 6]},
                {'feat_level': 0, 'inputs_offsets': [0, 7]},
                {'feat_level': 1, 'inputs_offsets': [1, 7, 8]},
                {'feat_level': 2, 'inputs_offsets': [2, 6, 9]},
                {'feat_level': 3, 'inputs_offsets': [3, 5, 10]},
                {'feat_level': 4, 'inputs_offsets': [4, 11]},
            ]
        elif len(in_channels_list) == 3:
            self.nodes = [
                {'feat_level': 1, 'inputs_offsets': [1, 2]},
                {'feat_level': 0, 'inputs_offsets': [0, 3]},
                {'feat_level': 1, 'inputs_offsets': [1, 3, 4]},
                {'feat_level': 2, 'inputs_offsets': [2, 5]},
            ]
        else:
            raise NotImplementedError

        # Channel count of every feature in the running list (inputs + nodes).
        node_info = [_ for _ in in_channels_list]

        # Usage counter per feature; it is updated but never read in this class.
        num_output_connections = [0 for _ in in_channels_list]
        for fnode in self.nodes:
            feat_level = fnode["feat_level"]
            inputs_offsets = fnode["inputs_offsets"]
            inputs_offsets_str = "_".join(map(str, inputs_offsets))
            for input_offset in inputs_offsets:
                num_output_connections[input_offset] += 1

                in_channels = node_info[input_offset]
                # A 1x1 lateral conv is only needed when the widths differ.
                # NOTE(review): the module name depends only on
                # (input_offset, feat_level); a pair appearing in two nodes
                # maps to the same name - confirm this sharing is intended.
                if in_channels != out_channels:
                    lateral_conv = Conv2d(
                        in_channels,
                        out_channels,
                        kernel_size=1,
                        norm=get_norm(norm, out_channels)
                    )
                    self.add_module(
                        "lateral_{}_f{}".format(input_offset, feat_level), lateral_conv
                    )
            # Every node outputs out_channels and joins the running list.
            node_info.append(out_channels)
            num_output_connections.append(0)

            # generate attention weights: one scalar per input, initialized to 1
            name = "weights_f{}_{}".format(feat_level, inputs_offsets_str)
            self.__setattr__(name, nn.Parameter(
                    torch.ones(len(inputs_offsets), dtype=torch.float32),
                    requires_grad=True
                ))

            # generate convolutions after combination
            name = "outputs_f{}_{}".format(feat_level, inputs_offsets_str)
            self.add_module(name, Conv2d(
                out_channels,
                out_channels,
                kernel_size=3,
                padding=1,
                norm=get_norm(norm, out_channels),
                bias=(norm == "")
            ))

    def forward(self, feats):
        """
        Args:
            feats (iterable[Tensor]): one 4-D feature map per input level,
                indexed by position (level 0 first). The node table must match
                the number of levels given at construction.
        Returns:
            list[Tensor]:
                one fused feature map per input level, in the same order as
                the inputs. For each level this is the output of the last
                node in the node table whose 'feat_level' equals that level.
        Raises:
            NotImplementedError: if an input is larger than the target in one
                spatial dimension but not in the other.
            ValueError: if some level has no node producing it.
        """
        # Copy so that node outputs can be appended without touching the caller's list.
        feats = [_ for _ in feats]
        num_levels = len(feats)
        # Usage counter per feature; it is updated but never read in this method.
        num_output_connections = [0 for _ in feats]
        for fnode in self.nodes:
            feat_level = fnode["feat_level"]
            inputs_offsets = fnode["inputs_offsets"]
            inputs_offsets_str = "_".join(map(str, inputs_offsets))
            input_nodes = []
            # The node's spatial size is that of the input at 'feat_level'.
            _, _, target_h, target_w = feats[feat_level].size()
            for input_offset in inputs_offsets:
                num_output_connections[input_offset] += 1
                input_node = feats[input_offset]

                # reduction
                if input_node.size(1) != self.out_channels:
                    name = "lateral_{}_f{}".format(input_offset, feat_level)
                    input_node = self.__getattr__(name)(input_node)

                # maybe downsample
                _, _, h, w = input_node.size()
                if h > target_h and w > target_w:
                    # Ceil-divide to get the stride; only a factor of 2 is supported.
                    height_stride_size = int((h - 1) // target_h + 1)
                    width_stride_size = int((w - 1) // target_w + 1)
                    assert height_stride_size == width_stride_size == 2
                    input_node = F.max_pool2d(
                        input_node, kernel_size=(height_stride_size + 1, width_stride_size + 1),
                        stride=(height_stride_size, width_stride_size), padding=1
                    )
                elif h <= target_h and w <= target_w:
                    # Upsample with nearest interpolation unless the size already matches.
                    if h < target_h or w < target_w:
                        input_node = F.interpolate(
                            input_node,
                            size=(target_h, target_w),
                            mode="nearest"
                        )
                else:
                    # Mixed case: larger in one dimension only.
                    raise NotImplementedError()
                input_nodes.append(input_node)

            # attention: ReLU keeps weights non-negative, then normalize by their sum
            name = "weights_f{}_{}".format(feat_level, inputs_offsets_str)
            weights = F.relu(self.__getattr__(name))
            norm_weights = weights / (weights.sum() + 0.0001)

            # Weighted sum over the stacked inputs, followed by swish.
            new_node = torch.stack(input_nodes, dim=-1)
            new_node = (norm_weights * new_node).sum(dim=-1)
            new_node = swish(new_node)

            name = "outputs_f{}_{}".format(feat_level, inputs_offsets_str)
            feats.append(self.__getattr__(name)(new_node))

            num_output_connections.append(0)

        # Pick, for each level, the output of the last node targeting it.
        # Node outputs were appended in node order, so the i-th node from the
        # end of the table is feats[-1 - i].
        output_feats = []
        for idx in range(num_levels):
            for i, fnode in enumerate(reversed(self.nodes)):
                if fnode['feat_level'] == idx:
                    output_feats.append(feats[-1 - i])
                    break
            else:
                raise ValueError()
        return output_feats


class BiFPN(Backbone):
    """
    Backbone that stacks repeated :class:`SingleBiFPN` passes on a bottom-up network.

    The bottom-up network is first wrapped in :class:`BackboneWithTopLevels`
    to add ``num_top_levels`` coarser levels; the selected features are then
    refined by ``num_repeats`` BiFPN passes.
    """

    def __init__(
        self, bottom_up, in_features, out_channels, num_top_levels, num_repeats, norm=""
    ):
        """
        Args:
            bottom_up (Backbone): module representing the bottom up subnetwork.
                Must be a subclass of :class:`Backbone`.
            in_features (list[str]): names of the bottom-up feature maps to
                use.  They are sorted here by their numeric suffix; the
                caller's list is not modified.
            out_channels (int): number of channels in the output feature maps.
            num_top_levels (int): number of extra levels appended after the
                last input feature (e.g. p6 and p7).
            num_repeats (int): the number of repeats of BiFPN.
            norm (str): the normalization to use.
        """
        super().__init__()
        assert isinstance(bottom_up, Backbone)

        # add extra feature levels (i.e., 6 and 7)
        self.bottom_up = BackboneWithTopLevels(
            bottom_up, out_channels,
            num_top_levels, norm
        )
        shapes = self.bottom_up.output_shape()

        def level_of(feature_name):
            return split_name(feature_name)[1]

        in_features = sorted(in_features, key=level_of)
        self._size_divisibility = 128
        self.out_channels = out_channels
        self.min_level = level_of(in_features[0])

        # Extend the (sorted copy of the) names with those of the top levels.
        prefix, last_suffix = split_name(in_features[-1])
        in_features.extend(
            prefix + str(last_suffix + step) for step in range(1, num_top_levels + 1))
        self.in_features = in_features

        # Outputs are renamed to "p<level>" and all carry out_channels.
        self._out_features = ["p{}".format(level_of(name)) for name in in_features]
        self._out_feature_strides = {
            out_name: shapes[in_name].stride
            for out_name, in_name in zip(self._out_features, in_features)
        }
        self._out_feature_channels = dict.fromkeys(self._out_features, out_channels)

        # Only the first pass sees the raw bottom-up channel counts.
        self.repeated_bifpn = nn.ModuleList()
        for i in range(num_repeats):
            if i == 0:
                channels = [shapes[name].channels for name in in_features]
            else:
                channels = [self._out_feature_channels[name] for name in self._out_features]
            self.repeated_bifpn.append(SingleBiFPN(channels, out_channels, norm))

    @property
    def size_divisibility(self):
        """Required input size divisibility (fixed at 128)."""
        return self._size_divisibility

    def forward(self, x):
        """
        Args:
            x: input passed unchanged to the wrapped bottom-up network.
        Returns:
            dict[str->Tensor]:
                mapping from output feature name ("p<level>") to the feature
                map produced by the last BiFPN pass, in ``in_features`` order.
        """
        bottom_up_features = self.bottom_up(x)
        feats = [bottom_up_features[name] for name in self.in_features]

        for single_pass in self.repeated_bifpn:
            feats = single_pass(feats)

        return dict(zip(self._out_features, feats))


def _assert_strides_are_log2_contiguous(strides):
    """
    Assert that every stride is exactly twice the one before it.

    Sequences with fewer than two entries pass trivially.
    """
    for previous, stride in zip(strides, strides[1:]):
        assert stride == 2 * previous, "Strides {} {} are not log2 contiguous".format(
            stride, previous
        )


@BACKBONE_REGISTRY.register()
def build_fcos_resnet_bifpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a ResNet + BiFPN backbone with two extra top levels.

    Args:
        cfg: a detectron2 CfgNode; ``MODEL.FPN.IN_FEATURES`` and
            ``MODEL.BIFPN.{OUT_CHANNELS,NUM_BIFPN,NORM}`` are read from it.
        input_shape (ShapeSpec): forwarded to ``build_resnet_backbone``.
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    resnet = build_resnet_backbone(cfg, input_shape)
    feature_names = cfg.MODEL.FPN.IN_FEATURES
    width = cfg.MODEL.BIFPN.OUT_CHANNELS
    repeats = cfg.MODEL.BIFPN.NUM_BIFPN

    return BiFPN(
        bottom_up=resnet,
        in_features=feature_names,
        out_channels=width,
        num_top_levels=2,
        num_repeats=repeats,
        norm=cfg.MODEL.BIFPN.NORM
    )


@BACKBONE_REGISTRY.register()
def build_p35_fcos_resnet_bifpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a ResNet + BiFPN backbone without any extra top levels.

    Args:
        cfg: a detectron2 CfgNode; ``MODEL.FPN.IN_FEATURES`` and
            ``MODEL.BIFPN.{OUT_CHANNELS,NUM_BIFPN,NORM}`` are read from it.
        input_shape (ShapeSpec): forwarded to ``build_resnet_backbone``.
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    resnet = build_resnet_backbone(cfg, input_shape)
    feature_names = cfg.MODEL.FPN.IN_FEATURES
    width = cfg.MODEL.BIFPN.OUT_CHANNELS
    repeats = cfg.MODEL.BIFPN.NUM_BIFPN

    return BiFPN(
        bottom_up=resnet,
        in_features=feature_names,
        out_channels=width,
        num_top_levels=0,
        num_repeats=repeats,
        norm=cfg.MODEL.BIFPN.NORM
    )


@BACKBONE_REGISTRY.register()
def build_p35_fcos_dla_bifpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a DLA-34 + BiFPN backbone without any extra top levels.

    Args:
        cfg: a detectron2 CfgNode; ``MODEL.FPN.IN_FEATURES`` and
            ``MODEL.BIFPN.{OUT_CHANNELS,NUM_BIFPN,NORM}`` are read from it.
        input_shape (ShapeSpec): accepted for the registry interface; not
            used by this builder.
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    dla = dla34(cfg)
    feature_names = cfg.MODEL.FPN.IN_FEATURES
    width = cfg.MODEL.BIFPN.OUT_CHANNELS
    repeats = cfg.MODEL.BIFPN.NUM_BIFPN

    return BiFPN(
        bottom_up=dla,
        in_features=feature_names,
        out_channels=width,
        num_top_levels=0,
        num_repeats=repeats,
        norm=cfg.MODEL.BIFPN.NORM
    )

@BACKBONE_REGISTRY.register()
def build_p37_fcos_dla_bifpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a DLA-34 + BiFPN backbone with two extra top levels.

    Args:
        cfg: a detectron2 CfgNode; ``MODEL.BIFPN.NUM_LEVELS`` must be 5.
            ``MODEL.FPN.IN_FEATURES`` and
            ``MODEL.BIFPN.{OUT_CHANNELS,NUM_BIFPN,NORM}`` are also read.
        input_shape (ShapeSpec): accepted for the registry interface; not
            used by this builder.
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    dla = dla34(cfg)
    feature_names = cfg.MODEL.FPN.IN_FEATURES
    width = cfg.MODEL.BIFPN.OUT_CHANNELS
    repeats = cfg.MODEL.BIFPN.NUM_BIFPN
    assert cfg.MODEL.BIFPN.NUM_LEVELS == 5

    return BiFPN(
        bottom_up=dla,
        in_features=feature_names,
        out_channels=width,
        num_top_levels=2,
        num_repeats=repeats,
        norm=cfg.MODEL.BIFPN.NORM
    )


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/backbone/dla.py
================================================
import numpy as np
import math
from os.path import join
import fvcore.nn.weight_init as weight_init
import torch
import torch.nn.functional as F
from torch import nn
import torch.utils.model_zoo as model_zoo

from detectron2.modeling.backbone.resnet import (
    BasicStem, BottleneckBlock, DeformBottleneckBlock)
from detectron2.layers import (
    Conv2d,
    DeformConv,
    FrozenBatchNorm2d,
    ModulatedDeformConv,
    ShapeSpec,
    get_norm,
)

from detectron2.modeling.backbone.backbone import Backbone
from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
from detectron2.modeling.backbone.fpn import FPN

# Names exposed by a star-import of this module: only the block and stem
# classes imported above from detectron2's resnet module.
__all__ = [
    "BottleneckBlock",
    "DeformBottleneckBlock",
    "BasicStem",
]

# NOTE(review): presumably toggles the original (v1) deformable convolution
# variant; its use is not visible in this part of the file - confirm.
DCNV1 = False

# Hash fragments keyed by an integer (presumably the DLA depth) - they look
# like the ``hash`` component of the checkpoint file name built by
# get_model_url; TODO confirm against the caller.
HASH = {
    34: 'ba72cf86',
    60: '24839fc4',
}

def get_model_url(data, name, hash):
    """Return the download URL of a pretrained checkpoint.

    The URL has the form
    ``http://dl.yf.io/dla/models/<data>/<name>-<hash>.pth``.

    Args:
        data (str): sub-directory on the server.
        name (str): model name used as the file-name stem.
        hash (str): hash fragment appended to the file name.

    Returns:
        str: the assembled URL.
    """
    # URLs always use forward slashes.  os.path.join would insert backslashes
    # on Windows, so the POSIX flavour is used explicitly; on POSIX systems
    # the result is identical to the previous os.path.join call.
    import posixpath
    return posixpath.join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash))

class BasicBlock(nn.Module):
    """Residual block made of two 3x3 conv + norm stages.

    Only the first convolution uses ``stride``.  Both use ``dilation`` for
    their dilation and padding, and neither has a bias.
    """

    def __init__(self, inplanes, planes, stride=1, dilation=1, norm='BN'):
        super().__init__()
        # Settings shared by both 3x3 convolutions.
        conv3x3 = dict(kernel_size=3, padding=dilation, bias=False, dilation=dilation)
        self.conv1 = nn.Conv2d(inplanes, planes, stride=stride, **conv3x3)
        self.bn1 = get_norm(norm, planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(planes, planes, stride=1, **conv3x3)
        self.bn2 = get_norm(norm, planes)
        self.stride = stride

    def forward(self, x, residual=None):
        """Apply the block; ``residual`` defaults to the input ``x``."""
        shortcut = x if residual is None else residual

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        out += shortcut
        return self.relu(out)

class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 reduce, 3x3, 1x1 expand.

    The inner width is ``planes // expansion``.  Only the 3x3 convolution
    uses ``stride`` and ``dilation``; none of the convolutions has a bias.
    """

    expansion = 2

    def __init__(self, inplanes, planes, stride=1, dilation=1, norm='BN'):
        super().__init__()
        bottle_planes = planes // Bottleneck.expansion

        self.conv1 = nn.Conv2d(inplanes, bottle_planes, kernel_size=1, bias=False)
        self.bn1 = get_norm(norm, bottle_planes)

        self.conv2 = nn.Conv2d(
            bottle_planes, bottle_planes, kernel_size=3,
            stride=stride, padding=dilation,
            bias=False, dilation=dilation)
        self.bn2 = get_norm(norm, bottle_planes)

        self.conv3 = nn.Conv2d(bottle_planes, planes, kernel_size=1, bias=False)
        self.bn3 = get_norm(norm, planes)

        self.relu = nn.ReLU(inplace=True)
        self.stride = stride

    def forward(self, x, residual=None):
        """Apply the block; ``residual`` defaults to the input ``x``."""
        shortcut = x if residual is None else residual

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += shortcut
        return self.relu(out)

class Root(nn.Module):
    """Aggregation node: concatenate inputs, then conv -> norm -> ReLU.

    Args:
        in_channels: total channel count of the concatenated inputs.
        out_channels: channel count produced by the node.
        kernel_size: size of the fusing convolution; padding is chosen as
            ``(kernel_size - 1) // 2``.
        residual: when truthy, the first input is added to the normalized
            output before the ReLU.
        norm: normalization spec forwarded to ``get_norm``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, residual, norm='BN'):
        super(Root, self).__init__()
        # The kernel size must agree with the padding below. The previous
        # hard-coded 1x1 kernel combined with padding for a larger
        # `kernel_size` would have grown the feature map and broken the
        # residual add. Behaviour is unchanged for the default size of 1.
        self.conv = nn.Conv2d(
            in_channels, out_channels, kernel_size,
            stride=1, bias=False, padding=(kernel_size - 1) // 2)
        self.bn = get_norm(norm, out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.residual = residual

    def forward(self, *x):
        """Fuse the given tensors along dim 1 and return the activated result."""
        children = x
        x = self.conv(torch.cat(x, 1))
        x = self.bn(x)
        if self.residual:
            # Skip connection from the first input.
            x += children[0]
        x = self.relu(x)

        return x


class Tree(nn.Module):
    """Recursive aggregation tree built from `block` modules and a `Root`.

    With ``levels == 1`` the tree holds two blocks (``tree1``, ``tree2``) and
    a ``Root`` that fuses their outputs. With more levels, ``tree1`` and
    ``tree2`` are themselves ``Tree`` instances with ``levels - 1``; only the
    innermost level owns a ``Root``.

    Args:
        levels: recursion depth of the tree.
        block: block class, called as
            ``block(in_ch, out_ch, stride, dilation=..., norm=...)``.
        in_channels: channels of the input tensor.
        out_channels: channels produced by the tree.
        stride: stride of ``tree1``; values > 1 also enable a max-pool
            ``downsample`` for the skip path.
        level_root: when true, the (downsampled) input is appended to the
            children passed to the root, and ``root_dim`` grows by
            ``in_channels``.
        root_dim: input channels of the root; 0 means ``2 * out_channels``.
        root_kernel_size: kernel size forwarded to ``Root``.
        dilation: dilation forwarded to the blocks.
        root_residual: residual flag forwarded to ``Root``.
        norm: normalization spec forwarded to ``get_norm``.
    """
    def __init__(self, levels, block, in_channels, out_channels, stride=1,
                 level_root=False, root_dim=0, root_kernel_size=1,
                 dilation=1, root_residual=False, norm='BN'):
        super(Tree, self).__init__()
        if root_dim == 0:
            root_dim = 2 * out_channels
        if level_root:
            root_dim += in_channels
        if levels == 1:
            self.tree1 = block(in_channels, out_channels, stride,
                               dilation=dilation, norm=norm)
            self.tree2 = block(out_channels, out_channels, 1,
                               dilation=dilation, norm=norm)
        else:
            # tree1 starts a fresh root_dim; tree2 inherits this level's
            # root_dim plus room for tree1's output, which is passed to it as
            # an extra child in forward().
            self.tree1 = Tree(levels - 1, block, in_channels, out_channels,
                              stride, root_dim=0,
                              root_kernel_size=root_kernel_size,
                              dilation=dilation, root_residual=root_residual, 
                              norm=norm)
            self.tree2 = Tree(levels - 1, block, out_channels, out_channels,
                              root_dim=root_dim + out_channels,
                              root_kernel_size=root_kernel_size,
                              dilation=dilation, root_residual=root_residual, 
                              norm=norm)
        if levels == 1:
            self.root = Root(root_dim, out_channels, root_kernel_size,
                             root_residual, norm=norm)
        self.level_root = level_root
        self.root_dim = root_dim
        self.downsample = None
        self.project = None
        self.levels = levels
        if stride > 1:
            self.downsample = nn.MaxPool2d(stride, stride=stride)
        if in_channels != out_channels:
            # 1x1 conv + norm so the skip tensor matches out_channels.
            self.project = nn.Sequential(
                nn.Conv2d(in_channels, out_channels,
                          kernel_size=1, stride=1, bias=False),
                get_norm(norm, out_channels)
            )

    def forward(self, x, residual=None, children=None):
        """Run the tree on `x`.

        The incoming ``residual`` argument is not read: it is recomputed from
        ``x`` below. ``children`` is the list of tensors collected for the
        root; a list passed in by the caller is appended to in place.
        """
        children = [] if children is None else children
        bottom = self.downsample(x) if self.downsample else x
        residual = self.project(bottom) if self.project else bottom
        if self.level_root:
            children.append(bottom)
        x1 = self.tree1(x, residual)
        if self.levels == 1:
            x2 = self.tree2(x1)
            # Root concatenates x2, x1 and every collected child.
            x = self.root(x2, x1, *children)
        else:
            children.append(x1)
            x = self.tree2(x1, children=children)
        return x

class DLA(nn.Module):
    """DLA feature extractor: a base layer followed by six levels.

    ``level0``/``level1`` are plain conv stacks; ``level2``..``level5`` are
    ``Tree`` modules with stride 2. Pretrained weights are downloaded and
    loaded at the end of construction.
    """
    def __init__(self, num_layers, levels, channels, 
        block=BasicBlock, residual_root=False, norm='BN'):
        """
        Args:
            num_layers: depth identifier; used to form the pretrained model
                name ``'dla<num_layers>'`` and to look up ``HASH[num_layers]``.
            levels: six integers; conv count for level0/level1 and tree depth
                for level2..level5.
            channels: six integers, the output channels of each level.
            block: block class handed to every ``Tree``.
            residual_root: forwarded to ``Tree`` as ``root_residual``.
            norm: normalization spec forwarded to ``get_norm``.
        """
        super(DLA, self).__init__()
        self.norm = norm
        self.channels = channels
        self.base_layer = nn.Sequential(
            nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
                      padding=3, bias=False),
            get_norm(self.norm, channels[0]),
            nn.ReLU(inplace=True))
        self.level0 = self._make_conv_level(
            channels[0], channels[0], levels[0])
        self.level1 = self._make_conv_level(
            channels[0], channels[1], levels[1], stride=2)
        self.level2 = Tree(levels[2], block, channels[1], channels[2], 2,
                           level_root=False,
                           root_residual=residual_root, norm=norm)
        self.level3 = Tree(levels[3], block, channels[2], channels[3], 2,
                           level_root=True, root_residual=residual_root, 
                           norm=norm)
        self.level4 = Tree(levels[4], block, channels[3], channels[4], 2,
                           level_root=True, root_residual=residual_root, 
                           norm=norm)
        self.level5 = Tree(levels[5], block, channels[4], channels[5], 2,
                           level_root=True, root_residual=residual_root, 
                           norm=norm)
        # NOTE: constructing the model always fetches pretrained weights.
        self.load_pretrained_model(
            data='imagenet', name='dla{}'.format(num_layers), 
            hash=HASH[num_layers])

    def load_pretrained_model(self, data, name, hash):
        """Fetch the checkpoint for (data, name, hash) and load it non-strictly.

        An ``fc`` 1x1 convolution is created first, sized from the length of
        the last entry of the downloaded state dict.
        """
        model_url = get_model_url(data, name, hash)
        model_weights = model_zoo.load_url(model_url)
        # Length of the last tensor in the checkpoint gives the class count.
        num_classes = len(model_weights[list(model_weights.keys())[-1]])
        self.fc = nn.Conv2d(
            self.channels[-1], num_classes,
            kernel_size=1, stride=1, padding=0, bias=True)
        print('Loading pretrained')
        self.load_state_dict(model_weights, strict=False)

    def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):
        """Build `convs` repetitions of 3x3 conv -> norm -> ReLU.

        Only the first convolution uses `stride`; the rest use stride 1.
        """
        modules = []
        for i in range(convs):
            modules.extend([
                nn.Conv2d(inplanes, planes, kernel_size=3,
                          stride=stride if i == 0 else 1,
                          padding=dilation, bias=False, dilation=dilation),
                get_norm(self.norm, planes),
                nn.ReLU(inplace=True)])
            inplanes = planes
        return nn.Sequential(*modules)

    def forward(self, x):
        """Return the list of outputs of level0..level5, in order."""
        y = []
        x = self.base_layer(x)
        for i in range(6):
            x = getattr(self, 'level{}'.format(i))(x)
            y.append(x)
        return y


def fill_up_weights(up):
    """Overwrite ``up.weight`` in place with a separable tent-shaped kernel.

    The kernel for ``[0, 0]`` is the product of a 1-D triangular profile along
    each spatial axis (the profile parameters are derived from the kernel
    height); it is then copied to ``[c, 0]`` for every other output index.

    Args:
        up: module exposing a 4-D ``weight`` tensor, e.g. a transposed conv.
    """
    weights = up.weight.data
    rows, cols = weights.size(2), weights.size(3)
    half = math.ceil(rows / 2)
    center = (2 * half - 1 - half % 2) / (2. * half)

    def tent(position):
        # 1-D triangular profile centred on `center`.
        return 1 - math.fabs(position / half - center)

    for row in range(rows):
        for col in range(cols):
            weights[0, 0, row, col] = tent(row) * tent(col)

    # Replicate the first kernel across the remaining leading indices.
    for channel in range(1, weights.size(0)):
        weights[channel, 0, :, :] = weights[0, 0, :, :]


class _DeformConv(nn.Module):
    """3x3 deformable convolution followed by norm and ReLU.

    The module-level flag ``DCNV1`` selects the variant: ``DeformConv`` with
    an 18-channel offset branch, or ``ModulatedDeformConv`` with a 27-channel
    branch that is split into two offset chunks and a mask. The offset
    branch is zero-initialised.
    """

    def __init__(self, chi, cho, norm='BN'):
        super().__init__()
        self.actf = nn.Sequential(get_norm(norm, cho), nn.ReLU(inplace=True))

        offset_channels = 18 if DCNV1 else 27
        self.offset = Conv2d(
            chi, offset_channels, kernel_size=3, stride=1,
            padding=1, dilation=1)
        if DCNV1:
            self.conv = DeformConv(
                chi, cho, kernel_size=(3,3), stride=1, padding=1,
                dilation=1, deformable_groups=1)
        else:
            self.conv = ModulatedDeformConv(
                chi, cho, kernel_size=3, stride=1, padding=1,
                dilation=1, deformable_groups=1)

        # Start with zero offsets.
        for tensor in (self.offset.weight, self.offset.bias):
            nn.init.constant_(tensor, 0)

    def forward(self, x):
        """Predict offsets from `x`, apply the deformable conv, norm and ReLU."""
        predicted = self.offset(x)
        if DCNV1:
            out = self.conv(x, predicted)
        else:
            first, second, mask = torch.chunk(predicted, 3, dim=1)
            out = self.conv(
                x, torch.cat((first, second), dim=1), mask.sigmoid())
        return self.actf(out)


class IDAUp(nn.Module):
    """Iterative upsample-and-merge over a list of feature maps.

    For every index ``i >= 1`` of ``channels`` three submodules are stored:
    ``proj_i`` (``channels[i]`` -> ``o``), ``up_i`` (depthwise transposed
    conv with stride ``up_f[i]``) and ``node_i`` (``o`` -> ``o``).
    """

    def __init__(self, o, channels, up_f, norm='BN'):
        super().__init__()
        for idx in range(1, len(channels)):
            factor = int(up_f[idx])
            proj = _DeformConv(channels[idx], o, norm=norm)
            node = _DeformConv(o, o, norm=norm)

            up = nn.ConvTranspose2d(
                o, o, factor * 2, stride=factor, padding=factor // 2,
                output_padding=0, groups=o, bias=False)
            fill_up_weights(up)

            suffix = str(idx)
            setattr(self, 'proj_' + suffix, proj)
            setattr(self, 'up_' + suffix, up)
            setattr(self, 'node_' + suffix, node)

    def forward(self, layers, startp, endp):
        """Update ``layers[startp + 1:endp]`` in place; returns nothing.

        Each entry is projected, upsampled, summed with its predecessor and
        passed through the matching node.
        """
        for pos in range(startp + 1, endp):
            suffix = str(pos - startp)
            upsample = getattr(self, 'up_' + suffix)
            project = getattr(self, 'proj_' + suffix)
            layers[pos] = upsample(project(layers[pos]))
            node = getattr(self, 'node_' + suffix)
            layers[pos] = node(layers[pos] + layers[pos - 1])


class DLAUp(nn.Module):
    """Stack of ``IDAUp`` modules that merges feature maps from coarse to fine.

    Args:
        startp: index of the first level taking part in the upsampling;
            used by ``forward`` to decide how many IDA steps to run.
        channels: channel count per level.
        scales: relative scale per level (same length as ``channels``).
        in_channels: optional per-level input channels; defaults to
            ``channels``.
        norm: normalization spec forwarded to ``IDAUp``.
    """

    def __init__(self, startp, channels, scales, in_channels=None, norm='BN'):
        super(DLAUp, self).__init__()
        self.startp = startp
        self.channels = channels
        channels = list(channels)
        # Work on a private copy: the loop below rewrites `in_channels` via
        # slice assignment, which previously mutated the caller's list (and,
        # through aliasing, `self.channels`).
        in_channels = list(channels if in_channels is None else in_channels)
        scales = np.array(scales, dtype=int)
        for i in range(len(channels) - 1):
            j = -i - 2
            setattr(self, 'ida_{}'.format(i),
                    IDAUp(channels[j], in_channels[j:],
                          scales[j:] // scales[j], norm=norm))
            # Everything after level j now lives at level j's scale/width.
            scales[j + 1:] = scales[j]
            in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]]

    def forward(self, layers):
        """Run every IDA step on `layers` (modified in place).

        Returns:
            A list starting with the last layer after the final step and
            ending with the original last layer.
        """
        out = [layers[-1]] # start with 32
        for i in range(len(layers) - self.startp - 1):
            ida = getattr(self, 'ida_{}'.format(i))
            ida(layers, len(layers) -i - 2, len(layers))
            out.insert(0, layers[-1])
        return out

# Maps a DLA depth (the `num_layers` argument of DLASeg) to a triple of
# (per-level `levels`, per-level `channels`, block class).
DLA_CONFIGS = {
    34: ([1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512], BasicBlock),
    60: ([1, 1, 1, 2, 3, 1], [16, 32, 128, 256, 512, 1024], Bottleneck)
}


class DLASeg(Backbone):
    """Backbone wrapping ``DLA`` with optional ``DLAUp`` / ``IDAUp`` heads.

    Args:
        num_layers: key into ``DLA_CONFIGS``.
        out_features: names (``'dla<i>'``) of the features to return.
        use_dla_up: when true, the base outputs go through ``DLAUp``.
        ms_output: when false, an extra ``IDAUp`` merges the first levels;
            when true, the per-level outputs are returned directly.
        norm: normalization spec forwarded to the submodules.
    """

    def __init__(self, num_layers, out_features, use_dla_up=True, 
        ms_output=False, norm='BN'):
        super().__init__()
        stage_levels, stage_channels, block_cls = DLA_CONFIGS[num_layers]
        self.base = DLA(
            num_layers=num_layers, levels=stage_levels,
            channels=stage_channels, block=block_cls, norm=norm)

        # down_ratio of 4 -> first level index 2.
        self.first_level = int(np.log2(4))
        self.ms_output = ms_output
        self.last_level = 6 if self.ms_output else 5
        self.use_dla_up = use_dla_up

        channels = self.base.channels
        first, last = self.first_level, self.last_level
        if self.use_dla_up:
            up_scales = [2 ** i for i in range(len(channels[first:]))]
            self.dla_up = DLAUp(first, channels[first:], up_scales, norm=norm)
        if not self.ms_output:
            # stride 4 DLA
            self.ida_up = IDAUp(
                channels[first], channels[first:last],
                [2 ** i for i in range(last - first)], norm=norm)

        self._out_features = out_features
        self._out_feature_channels = {}
        self._out_feature_strides = {}
        for i in range(6):
            key = 'dla{}'.format(i)
            self._out_feature_channels[key] = channels[i]
            self._out_feature_strides[key] = 2 ** i
        self._size_divisibility = 32

    @property
    def size_divisibility(self):
        return self._size_divisibility

    def forward(self, x):
        """Return a dict of the requested ``'dla<i>'`` features."""
        feats = self.base(x)
        if self.use_dla_up:
            feats = self.dla_up(feats)

        ret = {}
        if self.ms_output:
            # Names are offset by first_level only when DLAUp was applied.
            offset = self.first_level if self.use_dla_up else 0
            for i in range(self.last_level - offset):
                key = 'dla{}'.format(i + offset)
                if key in self._out_features:
                    ret[key] = feats[i]
            return ret

        # stride 4 dla: merge clones of the first levels with IDAUp.
        count = self.last_level - self.first_level
        merged = [feats[i].clone() for i in range(count)]
        self.ida_up(merged, 0, len(merged))
        for i in range(count):
            key = 'dla{}'.format(i)
            if key in self._out_features:
                ret[key] = merged[i]
        return ret


@BACKBONE_REGISTRY.register()
def build_dla_backbone(cfg, input_shape):
    """
    Create a DLASeg backbone from config.

    Args:
        cfg: config node; the ``MODEL.DLA`` section is read.
        input_shape: not used by this builder.

    Returns:
        DLASeg: a :class:`DLASeg` instance.
    """
    dla_cfg = cfg.MODEL.DLA
    return DLASeg(
        num_layers=dla_cfg.NUM_LAYERS,
        out_features=dla_cfg.OUT_FEATURES,
        use_dla_up=dla_cfg.USE_DLA_UP,
        ms_output=dla_cfg.MS_OUTPUT,
        norm=dla_cfg.NORM,
    )

class LastLevelP6P7(nn.Module):
    """
    Extra top block producing two more levels, P6 and P7, with stride-2
    3x3 convolutions. Its declared input feature is "dla5".
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.num_levels = 2
        self.in_feature = "dla5"
        self.p6 = nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=2, padding=1)
        self.p7 = nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=2, padding=1)
        weight_init.c2_xavier_fill(self.p6)
        weight_init.c2_xavier_fill(self.p7)

    def forward(self, c5):
        """Return ``[p6, p7]``; P7 is computed from ReLU(P6)."""
        p6 = self.p6(c5)
        return [p6, self.p7(F.relu(p6))]

@BACKBONE_REGISTRY.register()
def build_retinanet_dla_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build an FPN on top of the DLA backbone, with a P6/P7 top block fed
    from the 'dla5' feature.

    Args:
        cfg: a detectron2 CfgNode
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    bottom_up = build_dla_backbone(cfg, input_shape)
    fpn_cfg = cfg.MODEL.FPN
    top_in_channels = bottom_up.output_shape()['dla5'].channels
    return FPN(
        bottom_up=bottom_up,
        in_features=fpn_cfg.IN_FEATURES,
        out_channels=fpn_cfg.OUT_CHANNELS,
        norm=fpn_cfg.NORM,
        top_block=LastLevelP6P7(top_in_channels, fpn_cfg.OUT_CHANNELS),
        fuse_type=fpn_cfg.FUSE_TYPE,
    )


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/backbone/dlafpn.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# this file is from https://github.com/ucbdrive/dla/blob/master/dla.py.

import math
from os.path import join
import numpy as np

import torch
from torch import nn
import torch.utils.model_zoo as model_zoo
import torch.nn.functional as F
import fvcore.nn.weight_init as weight_init

from detectron2.modeling.backbone import FPN
from detectron2.layers import ShapeSpec, ModulatedDeformConv, Conv2d
from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
from detectron2.layers.batch_norm import get_norm
from detectron2.modeling.backbone import Backbone

# Base URL under which the pretrained DLA checkpoints are published.
WEB_ROOT = 'http://dl.yf.io/dla/models'


def get_model_url(data, name, hash):
    """Return the download URL of a pretrained checkpoint.

    Args:
        data: dataset directory under ``WEB_ROOT`` (e.g. ``'imagenet'``).
        name: model name (e.g. ``'dla34'``).
        hash: checkpoint hash suffix used in the file name.

    Returns:
        ``'<WEB_ROOT>/<data>/<name>-<hash>.pth'``.
    """
    # URLs always use '/', so join explicitly instead of os.path.join, whose
    # separator depends on the platform (backslash on Windows).
    return '/'.join((WEB_ROOT, data, '{}-{}.pth'.format(name, hash)))


def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 convolution with padding 1."""
    return nn.Conv2d(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
    )


class BasicBlock(nn.Module):
    """Residual block of two 3x3 convolutions, each followed by a norm layer.

    The normalization type is read from ``cfg.MODEL.DLA.NORM``.
    """

    def __init__(self, cfg, inplanes, planes, stride=1, dilation=1):
        super().__init__()
        norm = cfg.MODEL.DLA.NORM

        # First conv carries the stride of the block.
        self.conv1 = nn.Conv2d(
            inplanes, planes, kernel_size=3, stride=stride,
            padding=dilation, bias=False, dilation=dilation)
        self.bn1 = get_norm(norm, planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=1,
            padding=dilation, bias=False, dilation=dilation)
        self.bn2 = get_norm(norm, planes)
        self.stride = stride

    def forward(self, x, residual=None):
        """Run both stages and add ``residual`` (``x`` when omitted)."""
        shortcut = x if residual is None else residual

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    """Residual block made of a 1x1, a 3x3 and a 1x1 convolution.

    The inner stages run at ``planes // expansion`` channels. The
    normalization type is read from ``cfg.MODEL.DLA.NORM``.
    """
    expansion = 2

    def __init__(self, cfg, inplanes, planes, stride=1, dilation=1):
        super().__init__()
        norm = cfg.MODEL.DLA.NORM
        inner_planes = planes // Bottleneck.expansion

        # 1x1 reduction.
        self.conv1 = nn.Conv2d(
            inplanes, inner_planes, kernel_size=1, bias=False)
        self.bn1 = get_norm(norm, inner_planes)

        # 3x3 convolution; carries stride and dilation.
        self.conv2 = nn.Conv2d(
            inner_planes, inner_planes, kernel_size=3, stride=stride,
            padding=dilation, bias=False, dilation=dilation)
        self.bn2 = get_norm(norm, inner_planes)

        # 1x1 expansion back to `planes`.
        self.conv3 = nn.Conv2d(
            inner_planes, planes, kernel_size=1, bias=False)
        self.bn3 = get_norm(norm, planes)

        self.relu = nn.ReLU(inplace=True)
        self.stride = stride

    def forward(self, x, residual=None):
        """Run the three stages and add ``residual`` (``x`` when omitted)."""
        shortcut = x if residual is None else residual

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += shortcut
        return self.relu(out)


class Root(nn.Module):
    """Aggregation node: concatenate inputs on dim 1, then conv -> norm -> ReLU.

    When ``residual`` is truthy the first input is added to the normalized
    output before the activation.
    """

    def __init__(self, cfg, in_channels, out_channels, kernel_size, residual):
        super().__init__()
        same_padding = (kernel_size - 1) // 2
        self.conv = nn.Conv2d(
            in_channels, out_channels, kernel_size,
            stride=1, bias=False, padding=same_padding)
        self.bn = get_norm(cfg.MODEL.DLA.NORM, out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.residual = residual

    def forward(self, *x):
        """Fuse the given tensors and return the activated result."""
        fused = self.bn(self.conv(torch.cat(x, 1)))
        if self.residual:
            # Skip connection from the first input.
            fused += x[0]
        return self.relu(fused)


class Tree(nn.Module):
    """Recursive aggregation tree built from `block` modules and a `Root`.

    With ``levels == 1`` the tree holds two blocks (``tree1``, ``tree2``) and
    a ``Root`` that fuses their outputs. With more levels, ``tree1`` and
    ``tree2`` are themselves ``Tree`` instances with ``levels - 1``; only the
    innermost level owns a ``Root``.

    Args:
        cfg: config node; ``cfg.MODEL.DLA.NORM`` selects the norm layer and
            the node is forwarded to blocks, sub-trees and the root.
        levels: recursion depth of the tree.
        block: block class, called as
            ``block(cfg, in_ch, out_ch, stride, dilation=...)``.
        in_channels: channels of the input tensor.
        out_channels: channels produced by the tree.
        stride: stride of ``tree1``; values > 1 also enable a max-pool
            ``downsample`` for the skip path.
        level_root: when true, the (downsampled) input is appended to the
            children passed to the root, and ``root_dim`` grows by
            ``in_channels``.
        root_dim: input channels of the root; 0 means ``2 * out_channels``.
        root_kernel_size: kernel size forwarded to ``Root``.
        dilation: dilation forwarded to the blocks.
        root_residual: residual flag forwarded to ``Root``.
    """
    def __init__(self, cfg, levels, block, in_channels, out_channels, stride=1,
                 level_root=False, root_dim=0, root_kernel_size=1,
                 dilation=1, root_residual=False):
        super(Tree, self).__init__()
        if root_dim == 0:
            root_dim = 2 * out_channels
        if level_root:
            root_dim += in_channels
        if levels == 1:
            self.tree1 = block(cfg, in_channels, out_channels, stride,
                               dilation=dilation)
            self.tree2 = block(cfg, out_channels, out_channels, 1,
                               dilation=dilation)
        else:
            # tree1 starts a fresh root_dim; tree2 inherits this level's
            # root_dim plus room for tree1's output, which is passed to it as
            # an extra child in forward().
            self.tree1 = Tree(cfg, levels - 1, block, in_channels, out_channels,
                              stride, root_dim=0,
                              root_kernel_size=root_kernel_size,
                              dilation=dilation, root_residual=root_residual)
            self.tree2 = Tree(cfg, levels - 1, block, out_channels, out_channels,
                              root_dim=root_dim + out_channels,
                              root_kernel_size=root_kernel_size,
                              dilation=dilation, root_residual=root_residual)
        if levels == 1:
            self.root = Root(cfg, root_dim, out_channels, root_kernel_size,
                             root_residual)
        self.level_root = level_root
        self.root_dim = root_dim
        self.downsample = None
        self.project = None
        self.levels = levels
        if stride > 1:
            self.downsample = nn.MaxPool2d(stride, stride=stride)
        if in_channels != out_channels:
            # 1x1 conv + norm so the skip tensor matches out_channels.
            self.project = nn.Sequential(
                nn.Conv2d(in_channels, out_channels,
                          kernel_size=1, stride=1, bias=False),
                get_norm(cfg.MODEL.DLA.NORM, out_channels)
            )

    def forward(self, x, residual=None, children=None):
        """Run the tree on `x`.

        The value of the incoming ``residual`` is not used for the result: it
        is recomputed from ``x`` below. ``children`` is the list of tensors
        collected for the root; a list passed in by the caller is appended to
        in place.
        """
        if self.training and residual is not None:
            # Adds exactly zero, but ties the otherwise-ignored `residual`
            # into the autograd graph. NOTE(review): presumably a workaround
            # for unused-tensor errors in distributed training -- confirm.
            x = x + residual.sum() * 0.0
        children = [] if children is None else children
        bottom = self.downsample(x) if self.downsample else x
        residual = self.project(bottom) if self.project else bottom
        if self.level_root:
            children.append(bottom)
        x1 = self.tree1(x, residual)
        if self.levels == 1:
            x2 = self.tree2(x1)
            # Root concatenates x2, x1 and every collected child.
            x = self.root(x2, x1, *children)
        else:
            children.append(x1)
            x = self.tree2(x1, children=children)
        return x


class DLA(Backbone):
    """DLA backbone exposing the six level outputs as 'dla0'..'dla5'.

    Args:
        cfg: config node; ``cfg.MODEL.DLA.NORM`` selects the norm layer.
        levels: six integers; conv count for level0/level1 and tree depth
            for level2..level5.
        channels: six integers, the output channels of each level.
        block: block class handed to every ``Tree``.
        residual_root: forwarded to ``Tree`` as ``root_residual``.
    """
    def __init__(self, cfg, levels, channels, block=BasicBlock, residual_root=False):
        super(DLA, self).__init__()
        self.cfg = cfg
        self.channels = channels

        # Feature 'dla<i>' has channels[i] channels and stride 2 ** i.
        self._out_features = ["dla{}".format(i) for i in range(6)]
        self._out_feature_channels = {k: channels[i] for i, k in enumerate(self._out_features)}
        self._out_feature_strides = {k: 2 ** i for i, k in enumerate(self._out_features)}

        self.base_layer = nn.Sequential(
            nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
                      padding=3, bias=False),
            get_norm(cfg.MODEL.DLA.NORM, channels[0]),
            nn.ReLU(inplace=True))
        self.level0 = self._make_conv_level(
            channels[0], channels[0], levels[0])
        self.level1 = self._make_conv_level(
            channels[0], channels[1], levels[1], stride=2)
        self.level2 = Tree(cfg, levels[2], block, channels[1], channels[2], 2,
                           level_root=False,
                           root_residual=residual_root)
        self.level3 = Tree(cfg, levels[3], block, channels[2], channels[3], 2,
                           level_root=True, root_residual=residual_root)
        self.level4 = Tree(cfg, levels[4], block, channels[3], channels[4], 2,
                           level_root=True, root_residual=residual_root)
        self.level5 = Tree(cfg, levels[5], block, channels[4], channels[5], 2,
                           level_root=True, root_residual=residual_root)

        # Normal init with std sqrt(2 / (kh * kw * out_channels)) for every
        # Conv2d; the pretrained weights loaded below are applied afterwards.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))

        # NOTE: always loads the 'dla34' checkpoint, whatever `levels` and
        # `channels` were passed; constructing the model triggers a download.
        self.load_pretrained_model(
            data='imagenet', name='dla34', hash='ba72cf86')

    def load_pretrained_model(self, data, name, hash):
        """Fetch the checkpoint for (data, name, hash) and load it strictly.

        The classifier entries ``fc.weight`` / ``fc.bias`` are dropped first
        because this module defines no ``fc`` layer.
        """
        model_url = get_model_url(data, name, hash)
        model_weights = model_zoo.load_url(model_url)
        del model_weights['fc.weight']
        del model_weights['fc.bias']
        print('Loading pretrained DLA!')
        self.load_state_dict(model_weights, strict=True)

    def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):
        """Build `convs` repetitions of 3x3 conv -> norm -> ReLU.

        Only the first convolution uses `stride`; the rest use stride 1.
        """
        modules = []
        for i in range(convs):
            modules.extend([
                nn.Conv2d(inplanes, planes, kernel_size=3,
                          stride=stride if i == 0 else 1,
                          padding=dilation, bias=False, dilation=dilation),
                get_norm(self.cfg.MODEL.DLA.NORM, planes),
                nn.ReLU(inplace=True)])
            inplanes = planes
        return nn.Sequential(*modules)

    def forward(self, x):
        """Return a dict mapping 'dla0'..'dla5' to the output of each level."""
        y = {}
        x = self.base_layer(x)
        for i in range(6):
            name = 'level{}'.format(i)
            x = getattr(self, name)(x)
            y['dla{}'.format(i)] = x
        return y


def fill_up_weights(up):
    """Overwrite ``up.weight`` in place with a separable tent-shaped kernel.

    The kernel for ``[0, 0]`` is the product of a 1-D triangular profile along
    each spatial axis (the profile parameters are derived from the kernel
    height); it is then copied to ``[c, 0]`` for every other output index.

    Args:
        up: module exposing a 4-D ``weight`` tensor, e.g. a transposed conv.
    """
    weights = up.weight.data
    rows, cols = weights.size(2), weights.size(3)
    half = math.ceil(rows / 2)
    center = (2 * half - 1 - half % 2) / (2. * half)

    def tent(position):
        # 1-D triangular profile centred on `center`.
        return 1 - math.fabs(position / half - center)

    for row in range(rows):
        for col in range(cols):
            weights[0, 0, row, col] = tent(row) * tent(col)

    # Replicate the first kernel across the remaining leading indices.
    for channel in range(1, weights.size(0)):
        weights[channel, 0, :, :] = weights[0, 0, :, :]


class Conv(nn.Module):
    """Bias-free 1x1 convolution followed by a norm layer and ReLU."""

    def __init__(self, chi, cho, norm):
        super().__init__()
        pointwise = nn.Conv2d(chi, cho, kernel_size=1, stride=1, bias=False)
        self.conv = nn.Sequential(
            pointwise,
            get_norm(norm, cho),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Apply conv -> norm -> ReLU to `x`."""
        out = self.conv(x)
        return out


class DeformConv(nn.Module):
    """3x3 modulated deformable convolution followed by norm and ReLU.

    A zero-initialised 27-channel branch predicts, per position, two offset
    chunks and one mask chunk for the deformable convolution.
    """

    def __init__(self, chi, cho, norm):
        super().__init__()
        self.actf = nn.Sequential(get_norm(norm, cho), nn.ReLU(inplace=True))
        self.offset = Conv2d(
            chi, 27, kernel_size=3, stride=1, padding=1, dilation=1)
        self.conv = ModulatedDeformConv(
            chi, cho, kernel_size=3, stride=1, padding=1,
            dilation=1, deformable_groups=1)
        # Start with zero offsets.
        for tensor in (self.offset.weight, self.offset.bias):
            nn.init.constant_(tensor, 0)

    def forward(self, x):
        """Predict offsets/mask from `x`, apply the deformable conv, norm, ReLU."""
        first, second, mask = torch.chunk(self.offset(x), 3, dim=1)
        offsets = torch.cat((first, second), dim=1)
        out = self.conv(x, offsets, mask.sigmoid())
        return self.actf(out)


class IDAUp(nn.Module):
    """Iterative upsample-and-merge over a list of feature maps.

    For every index ``i >= 1`` of ``channels`` three submodules are stored:
    ``proj_i`` (``channels[i]`` -> ``o``), ``up_i`` (depthwise transposed
    conv with stride ``up_f[i]``) and ``node_i`` (``o`` -> ``o``). Projection
    and node modules are instances of ``node_type``.
    """

    def __init__(self, o, channels, up_f, norm='FrozenBN', node_type=Conv):
        super().__init__()
        for idx in range(1, len(channels)):
            factor = int(up_f[idx])
            proj = node_type(channels[idx], o, norm)
            node = node_type(o, o, norm)

            up = nn.ConvTranspose2d(
                o, o, factor * 2, stride=factor, padding=factor // 2,
                output_padding=0, groups=o, bias=False)
            fill_up_weights(up)

            suffix = str(idx)
            setattr(self, 'proj_' + suffix, proj)
            setattr(self, 'up_' + suffix, up)
            setattr(self, 'node_' + suffix, node)

    def forward(self, layers, startp, endp):
        """Update ``layers[startp + 1:endp]`` in place; returns nothing.

        Each entry is projected, upsampled, summed with its predecessor and
        passed through the matching node.
        """
        for pos in range(startp + 1, endp):
            suffix = str(pos - startp)
            upsample = getattr(self, 'up_' + suffix)
            project = getattr(self, 'proj_' + suffix)
            layers[pos] = upsample(project(layers[pos]))
            node = getattr(self, 'node_' + suffix)
            layers[pos] = node(layers[pos] + layers[pos - 1])


# Maps the `dlaup_node` option of DLAUP to the module class used for the
# projection and node layers inside IDAUp.
DLAUP_NODE_MAP = {
    'conv': Conv,
    'dcn': DeformConv,
}

class DLAUP(Backbone):
    """Backbone that merges selected bottom-up features with ``IDAUp`` stages.

    Args:
        bottom_up: a ``Backbone`` whose ``output_shape()`` describes the
            features named in ``in_features``.
        in_features: names of the bottom-up features to merge, in order.
        norm: normalization spec forwarded to ``IDAUp``.
        dlaup_node: key into ``DLAUP_NODE_MAP`` selecting the node module.
    """

    def __init__(self, bottom_up, in_features, norm, dlaup_node='conv'):
        super().__init__()
        assert isinstance(bottom_up, Backbone)
        self.bottom_up = bottom_up
        self.in_features = in_features

        shapes = bottom_up.output_shape()
        in_strides = [shapes[name].stride for name in in_features]
        in_channels = [shapes[name].channels for name in in_features]
        in_levels = [int(math.log2(stride)) for stride in in_strides]

        # One output per input level, named after log2 of its stride.
        out_features = ['dlaup{}'.format(level) for level in in_levels]
        self._out_features = out_features
        self._out_feature_channels = dict(zip(out_features, in_channels))
        self._out_feature_strides = {
            name: 2 ** level for name, level in zip(out_features, in_levels)}

        print('self._out_features', self._out_features)
        print('self._out_feature_channels', self._out_feature_channels)
        print('self._out_feature_strides', self._out_feature_strides)
        self._size_divisibility = 32

        node_type = DLAUP_NODE_MAP[dlaup_node]

        self.startp = int(math.log2(in_strides[0]))
        # `self.channels` aliases `in_channels`, which the loop rewrites.
        self.channels = in_channels
        channels = list(in_channels)
        scales = np.array(
            [2 ** i for i in range(len(out_features))], dtype=int)
        for step in range(len(channels) - 1):
            j = -step - 2
            stage = IDAUp(
                channels[j], in_channels[j:], scales[j:] // scales[j],
                norm=norm, node_type=node_type)
            setattr(self, 'ida_{}'.format(step), stage)
            # Everything after level j now lives at level j's scale/width.
            scales[j + 1:] = scales[j]
            in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]]

    @property
    def size_divisibility(self):
        return self._size_divisibility

    def forward(self, x):
        """Return a dict pairing ``self._out_features`` with the merged maps."""
        bottom_up_features = self.bottom_up(x)
        layers = [bottom_up_features[name] for name in self.in_features]
        total = len(layers)
        out = [layers[-1]]  # start with 32
        for step in range(total - 1):
            stage = getattr(self, 'ida_{}'.format(step))
            stage(layers, total - step - 2, total)
            out.insert(0, layers[-1])
        return dict(zip(self._out_features, out))


def dla34(cfg, pretrained=None):  # DLA-34
    """Build the DLA-34 backbone; the ``pretrained`` argument is not read."""
    stage_levels = [1, 1, 1, 2, 2, 1]
    stage_channels = [16, 32, 64, 128, 256, 512]
    return DLA(cfg, stage_levels, stage_channels, block=BasicBlock)


class LastLevelP6P7(nn.Module):
    """
    Extra top block producing two more levels, P6 and P7, with stride-2
    3x3 convolutions. Its declared input feature is "dla5".
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.num_levels = 2
        self.in_feature = "dla5"
        self.p6 = nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=2, padding=1)
        self.p7 = nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=2, padding=1)
        weight_init.c2_xavier_fill(self.p6)
        weight_init.c2_xavier_fill(self.p7)

    def forward(self, c5):
        """Return ``[p6, p7]``; P7 is computed from ReLU(P6)."""
        p6 = self.p6(c5)
        return [p6, self.p7(F.relu(p6))]


@BACKBONE_REGISTRY.register()
def build_dla_fpn3_backbone(cfg, input_shape: ShapeSpec):
    """
    Build an FPN over the DLA backbone without any extra top block.

    Args:
        cfg: a detectron2 CfgNode
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    creators = {"dla34": dla34}
    arch = 'dla{}'.format(cfg.MODEL.DLA.NUM_LAYERS)
    bottom_up = creators[arch](cfg)

    fpn_cfg = cfg.MODEL.FPN
    return FPN(
        bottom_up=bottom_up,
        in_features=fpn_cfg.IN_FEATURES,
        out_channels=fpn_cfg.OUT_CHANNELS,
        norm=fpn_cfg.NORM,
        top_block=None,
        fuse_type=fpn_cfg.FUSE_TYPE,
    )

@BACKBONE_REGISTRY.register()
def build_dla_fpn5_backbone(cfg, input_shape: ShapeSpec):
    """
    Build an FPN over the DLA backbone with a P6/P7 top block fed from the
    'dla5' feature.

    Args:
        cfg: a detectron2 CfgNode
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    creators = {"dla34": dla34}
    arch = 'dla{}'.format(cfg.MODEL.DLA.NUM_LAYERS)
    bottom_up = creators[arch](cfg)

    fpn_cfg = cfg.MODEL.FPN
    top_in_channels = bottom_up.output_shape()['dla5'].channels
    return FPN(
        bottom_up=bottom_up,
        in_features=fpn_cfg.IN_FEATURES,
        out_channels=fpn_cfg.OUT_CHANNELS,
        norm=fpn_cfg.NORM,
        top_block=LastLevelP6P7(top_in_channels, fpn_cfg.OUT_CHANNELS),
        fuse_type=fpn_cfg.FUSE_TYPE,
    )


@BACKBONE_REGISTRY.register()
def build_dlaup_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a DLAUP backbone over the DLA bottom-up network.

    Args:
        cfg: a detectron2 CfgNode
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    creators = {"dla34": dla34}
    dla_cfg = cfg.MODEL.DLA
    bottom_up = creators['dla{}'.format(dla_cfg.NUM_LAYERS)](cfg)

    return DLAUP(
        bottom_up=bottom_up,
        in_features=dla_cfg.DLAUP_IN_FEATURES,
        norm=dla_cfg.NORM,
        dlaup_node=dla_cfg.DLAUP_NODE,
    )


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/backbone/fpn_p5.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
import fvcore.nn.weight_init as weight_init
import torch.nn.functional as F
from torch import nn

from detectron2.layers import Conv2d, ShapeSpec, get_norm

from detectron2.modeling.backbone import Backbone
from detectron2.modeling.backbone.fpn import FPN 
from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
from detectron2.modeling.backbone.resnet import build_resnet_backbone


class LastLevelP6P7_P5(nn.Module):
    """
    Top block that derives two extra levels, P6 and P7, with stride-2 3x3 convs.

    It declares ``in_feature = "p5"``, so it is presumably fed the FPN P5 output
    rather than the backbone C5 map (the builders in this file pass the FPN
    ``out_channels`` as its input width) -- confirm against the FPN top_block
    handling in detectron2.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.num_levels = 2
        self.in_feature = "p5"
        self.p6 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)
        self.p7 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1)
        weight_init.c2_xavier_fill(self.p6)
        weight_init.c2_xavier_fill(self.p7)

    def forward(self, c5):
        # P7 is computed from the ReLU of P6; P6 itself is returned without ReLU.
        p6_out = self.p6(c5)
        p7_out = self.p7(F.relu(p6_out))
        return [p6_out, p7_out]


@BACKBONE_REGISTRY.register()
def build_p67_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a ResNet + FPN backbone with a ``LastLevelP6P7_P5`` top block.

    Args:
        cfg: a detectron2 CfgNode
        input_shape (ShapeSpec): forwarded to ``build_resnet_backbone``.

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    resnet = build_resnet_backbone(cfg, input_shape)
    fpn_cfg = cfg.MODEL.FPN
    fpn_channels = fpn_cfg.OUT_CHANNELS
    return FPN(
        bottom_up=resnet,
        in_features=fpn_cfg.IN_FEATURES,
        out_channels=fpn_channels,
        norm=fpn_cfg.NORM,
        # The top block reads an FPN-width map, so both widths are the FPN width.
        top_block=LastLevelP6P7_P5(fpn_channels, fpn_channels),
        fuse_type=fpn_cfg.FUSE_TYPE,
    )

@BACKBONE_REGISTRY.register()
def build_p35_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a ResNet + FPN backbone without any extra top block.

    Args:
        cfg: a detectron2 CfgNode
        input_shape (ShapeSpec): forwarded to ``build_resnet_backbone``.

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    resnet = build_resnet_backbone(cfg, input_shape)
    fpn_cfg = cfg.MODEL.FPN
    return FPN(
        bottom_up=resnet,
        in_features=fpn_cfg.IN_FEATURES,
        out_channels=fpn_cfg.OUT_CHANNELS,
        norm=fpn_cfg.NORM,
        top_block=None,
        fuse_type=fpn_cfg.FUSE_TYPE,
    )


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/backbone/res2net.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# This file is modified from https://github.com/Res2Net/Res2Net-detectron2/blob/master/detectron2/modeling/backbone/resnet.py
# The original file is under Apache-2.0 License
import numpy as np
import fvcore.nn.weight_init as weight_init
import torch
import torch.nn.functional as F
from torch import nn

from detectron2.layers import (
    CNNBlockBase,
    Conv2d,
    DeformConv,
    ModulatedDeformConv,
    ShapeSpec,
    get_norm,
)

from detectron2.modeling.backbone import Backbone
from detectron2.modeling.backbone.fpn import FPN 
from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
from .fpn_p5 import LastLevelP6P7_P5
from .bifpn import BiFPN

# Names exported by a star-import of this module. The FPN/BiFPN builder
# functions defined further down are not listed here.
__all__ = [
    "ResNetBlockBase",
    "BasicBlock",
    "BottleneckBlock",
    "DeformBottleneckBlock",
    "BasicStem",
    "ResNet",
    "make_stage",
    "build_res2net_backbone",
]


# ResNetBlockBase is bound to the very same class object as CNNBlockBase.
ResNetBlockBase = CNNBlockBase
"""
Alias for backward compatibiltiy.
"""


class BasicBlock(CNNBlockBase):
    """
    Two-layer 3x3 residual block (the ResNet-18 / ResNet-34 building block).

    A 1x1 projection shortcut is created only when the number of channels
    changes between input and output; otherwise the input is added as-is.
    """

    def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"):
        """
        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            stride (int): Stride for the first conv (and for the projection
                shortcut, when one is created).
            norm (str or callable): normalization for all conv layers.
                See :func:`layers.get_norm` for supported format.
        """
        super().__init__(in_channels, out_channels, stride)

        def conv3x3(conv_in, conv_stride):
            # Bias-free 3x3 conv followed by the configured norm.
            return Conv2d(
                conv_in,
                out_channels,
                kernel_size=3,
                stride=conv_stride,
                padding=1,
                bias=False,
                norm=get_norm(norm, out_channels),
            )

        # The shortcut is built before the main branch, as in the original layout.
        self.shortcut = (
            Conv2d(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride,
                bias=False,
                norm=get_norm(norm, out_channels),
            )
            if in_channels != out_channels
            else None
        )
        self.conv1 = conv3x3(in_channels, stride)
        self.conv2 = conv3x3(out_channels, 1)

        for conv in (self.conv1, self.conv2, self.shortcut):
            if conv is None:  # identity shortcut has nothing to initialise
                continue
            weight_init.c2_msra_fill(conv)

    def forward(self, x):
        out = F.relu_(self.conv1(x))
        out = self.conv2(out)

        residual = x if self.shortcut is None else self.shortcut(x)

        out += residual
        return F.relu_(out)


class BottleneckBlock(CNNBlockBase):
    """
    The standard bottle2neck residual block used by Res2Net-50, 101 and 152.

    After the first 1x1 conv, the ``bottleneck_channels`` feature map is split
    along the channel axis into chunks of ``bottleneck_channels // scale``
    channels. The first ``nums`` chunks each pass through their own
    3x3 conv + norm + ReLU. When ``scale != 1`` the remaining chunk is forwarded
    unchanged (3x3 stride 1) or average-pooled (3x3 stride 2). All pieces are
    concatenated and projected to ``out_channels`` by a final 1x1 conv before
    the shortcut is added.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        *,
        bottleneck_channels,
        stride=1,
        num_groups=1,
        norm="BN",
        stride_in_1x1=False,
        dilation=1,
        basewidth=26,
        scale=4,
    ):
        """
        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            bottleneck_channels (int): number of output channels for the 3x3
                "bottleneck" conv layers.
            stride (int): stride of the block.
            num_groups (int): number of groups for the 3x3 conv layer.
            norm (str or callable): normalization for all conv layers.
                See :func:`layers.get_norm` for supported format.
            stride_in_1x1 (bool): when stride>1, whether to put stride in the
                first 1x1 convolution or the bottleneck 3x3 convolution.
            dilation (int): the dilation rate of the 3x3 conv layer.
            basewidth (int): accepted for interface compatibility; not read
                by this implementation.
            scale (int): number of channel chunks the bottleneck is split into.
        """
        super().__init__(in_channels, out_channels, stride)

        if in_channels != out_channels:
            # Downsample with average pooling first, then project with a
            # stride-1 1x1 conv.
            self.shortcut = nn.Sequential(
                nn.AvgPool2d(kernel_size=stride, stride=stride,
                    ceil_mode=True, count_include_pad=False),
                Conv2d(
                    in_channels,
                    out_channels,
                    kernel_size=1,
                    stride=1,
                    bias=False,
                    norm=get_norm(norm, out_channels),
                )
            )
        else:
            self.shortcut = None

        # The original MSRA ResNet models have stride in the first 1x1 conv
        # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have
        # stride in the 3x3 conv
        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
        width = bottleneck_channels // scale

        self.conv1 = Conv2d(
            in_channels,
            bottleneck_channels,
            kernel_size=1,
            stride=stride_1x1,
            bias=False,
            norm=get_norm(norm, bottleneck_channels),
        )
        if scale == 1:
            self.nums = 1
        else:
            self.nums = scale - 1
        # forward() pools the untouched last chunk exactly when stride_3x3 == 2,
        # so the pool must exist in that case. Previously it was only created
        # when stride_3x3 != 2, which made stride-2 blocks (stride_in_1x1=False)
        # fail with AttributeError. The old creation condition is kept as a
        # subset so existing module trees are unchanged; AvgPool2d holds no
        # parameters, so checkpoints are unaffected.
        if stride_3x3 == 2 or in_channels != out_channels:
            self.pool = nn.AvgPool2d(kernel_size=3, stride=stride_3x3, padding=1)

        convs = []
        bns = []
        for i in range(self.nums):
            convs.append(nn.Conv2d(
                            width,
                            width,
                            kernel_size=3,
                            stride=stride_3x3,
                            padding=1 * dilation,
                            bias=False,
                            groups=num_groups,
                            dilation=dilation,
                            ))
            bns.append(get_norm(norm, width))
        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)

        self.conv3 = Conv2d(
            bottleneck_channels,
            out_channels,
            kernel_size=1,
            bias=False,
            norm=get_norm(norm, out_channels),
        )
        self.scale = scale
        self.width = width
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride_3x3 = stride_3x3
        for layer in [self.conv1, self.conv3]:
            if layer is not None:
                weight_init.c2_msra_fill(layer)
        if self.shortcut is not None:
            for layer in self.shortcut.modules():
                if isinstance(layer, Conv2d):
                    weight_init.c2_msra_fill(layer)

        for layer in self.convs:
            if layer is not None:
                weight_init.c2_msra_fill(layer)

        # Zero-initialize the last normalization in each residual branch,
        # so that at the beginning, the residual branch starts with zeros,
        # and each residual block behaves like an identity.
        # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
        # "For BN layers, the learnable scaling coefficient γ is initialized
        # to be 1, except for each residual block's last BN
        # where γ is initialized to be 0."

        # nn.init.constant_(self.conv3.norm.weight, 0)
        # TODO this somehow hurts performance when training GN models from scratch.
        # Add it as an option when we need to use this code to train a backbone.

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu_(out)

        # Channel-wise chunks of `width` channels each.
        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            # Channel-changing blocks process every chunk independently;
            # otherwise each chunk is summed with the previous chunk's output.
            if i==0 or self.in_channels!=self.out_channels:
                sp = spx[i]
            else:
                sp = sp + spx[i]
            sp = self.convs[i](sp)
            sp = F.relu_(self.bns[i](sp))
            if i==0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)
        # Re-attach the chunk that skipped the 3x3 convs, pooled when the
        # processed chunks were spatially downsampled.
        if self.scale!=1 and self.stride_3x3==1:
            out = torch.cat((out, spx[self.nums]), 1)
        elif self.scale != 1 and self.stride_3x3==2:
            out = torch.cat((out, self.pool(spx[self.nums])), 1)

        out = self.conv3(out)

        if self.shortcut is not None:
            shortcut = self.shortcut(x)
        else:
            shortcut = x

        out += shortcut
        out = F.relu_(out)
        return out


class DeformBottleneckBlock(ResNetBlockBase):
    """
    Similar to :class:`BottleneckBlock`, but with deformable conv in the 3x3 convolution.

    Each channel chunk gets its own offset-predicting conv and its own
    (optionally modulated) deformable 3x3 conv.

    NOTE(review): an earlier version of this docstring said "Not implemented
    for res2net yet", while the code below does build per-chunk deformable
    convolutions -- confirm the intended status.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        *,
        bottleneck_channels,
        stride=1,
        num_groups=1,
        norm="BN",
        stride_in_1x1=False,
        dilation=1,
        deform_modulated=False,
        deform_num_groups=1,
        basewidth=26,
        scale=4,
    ):
        """
        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            bottleneck_channels (int): number of output channels of the first
                1x1 conv; it is split into chunks of
                ``bottleneck_channels // scale`` channels.
            stride (int): stride of the block.
            num_groups (int): number of groups for the 3x3 conv layers.
            norm (str or callable): normalization for all conv layers.
                See :func:`layers.get_norm` for supported format.
            stride_in_1x1 (bool): when stride>1, whether to put stride in the
                first 1x1 convolution or the bottleneck 3x3 convolution.
            dilation (int): the dilation rate of the 3x3 conv layers.
            deform_modulated (bool): use ``ModulatedDeformConv`` (offsets plus
                a sigmoid mask) instead of ``DeformConv``.
            deform_num_groups (int): number of deformable groups.
            basewidth (int): accepted for interface compatibility; not read
                by this implementation.
            scale (int): number of channel chunks the bottleneck is split into.
        """
        super().__init__(in_channels, out_channels, stride)
        self.deform_modulated = deform_modulated

        if in_channels != out_channels:
            # Downsample with average pooling first, then project with a
            # stride-1 1x1 conv.
            self.shortcut = nn.Sequential(
                nn.AvgPool2d(kernel_size=stride, stride=stride,
                    ceil_mode=True, count_include_pad=False),
                Conv2d(
                    in_channels,
                    out_channels,
                    kernel_size=1,
                    stride=1,
                    bias=False,
                    norm=get_norm(norm, out_channels),
                )
            )
        else:
            self.shortcut = None

        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
        width = bottleneck_channels // scale

        self.conv1 = Conv2d(
            in_channels,
            bottleneck_channels,
            kernel_size=1,
            stride=stride_1x1,
            bias=False,
            norm=get_norm(norm, bottleneck_channels),
        )

        if scale == 1:
            self.nums = 1
        else:
            self.nums = scale - 1
        # forward() pools the untouched last chunk exactly when stride_3x3 == 2,
        # so the pool must exist in that case. Previously it was only created
        # when stride_3x3 != 2, which made stride-2 blocks (stride_in_1x1=False)
        # fail with AttributeError. The old creation condition is kept as a
        # subset so existing module trees are unchanged; AvgPool2d holds no
        # parameters, so checkpoints are unaffected.
        if stride_3x3 == 2 or in_channels != out_channels:
            self.pool = nn.AvgPool2d(kernel_size=3, stride=stride_3x3, padding=1)

        if deform_modulated:
            deform_conv_op = ModulatedDeformConv
            # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size
            offset_channels = 27
        else:
            deform_conv_op = DeformConv
            offset_channels = 18

        conv2_offsets = []
        convs = []
        bns = []
        for i in range(self.nums):
            conv2_offsets.append(Conv2d(
                            width,
                            offset_channels * deform_num_groups,
                            kernel_size=3,
                            stride=stride_3x3,
                            padding=1 * dilation,
                            bias=False,
                            groups=num_groups,
                            dilation=dilation,
                            ))
            convs.append(deform_conv_op(
                            width,
                            width,
                            kernel_size=3,
                            stride=stride_3x3,
                            padding=1 * dilation,
                            bias=False,
                            groups=num_groups,
                            dilation=dilation,
                            deformable_groups=deform_num_groups,
                            ))
            bns.append(get_norm(norm, width))
        self.conv2_offsets = nn.ModuleList(conv2_offsets)
        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)

        self.conv3 = Conv2d(
            bottleneck_channels,
            out_channels,
            kernel_size=1,
            bias=False,
            norm=get_norm(norm, out_channels),
        )
        self.scale = scale
        self.width = width
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride_3x3 = stride_3x3

        for layer in [self.conv1, self.conv3]:
            if layer is not None:
                weight_init.c2_msra_fill(layer)
        if self.shortcut is not None:
            for layer in self.shortcut.modules():
                if isinstance(layer, Conv2d):
                    weight_init.c2_msra_fill(layer)

        for layer in self.convs:
            if layer is not None:
                weight_init.c2_msra_fill(layer)

        # Offset predictors start at zero, i.e. with zero predicted offsets.
        for layer in self.conv2_offsets:
            if layer.weight is not None:
                nn.init.constant_(layer.weight, 0)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu_(out)

        # Channel-wise chunks of `width` channels each.
        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            # Channel-changing blocks process every chunk independently;
            # otherwise each chunk is summed with the previous chunk's output.
            if i==0 or self.in_channels!=self.out_channels:
                sp = spx[i].contiguous()
            else:
                sp = sp + spx[i].contiguous()

            if self.deform_modulated:
                # The offset conv emits three equal channel groups:
                # x-offsets, y-offsets and a modulation mask.
                offset_mask = self.conv2_offsets[i](sp)
                offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1)
                offset = torch.cat((offset_x, offset_y), dim=1)
                mask = mask.sigmoid()
                sp = self.convs[i](sp, offset, mask)
            else:
                offset = self.conv2_offsets[i](sp)
                sp = self.convs[i](sp, offset)
            sp = F.relu_(self.bns[i](sp))
            if i==0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)
        # Re-attach the chunk that skipped the 3x3 convs, pooled when the
        # processed chunks were spatially downsampled.
        if self.scale!=1 and self.stride_3x3==1:
            out = torch.cat((out, spx[self.nums]), 1)
        elif self.scale != 1 and self.stride_3x3==2:
            out = torch.cat((out, self.pool(spx[self.nums])), 1)

        out = self.conv3(out)

        if self.shortcut is not None:
            shortcut = self.shortcut(x)
        else:
            shortcut = x

        out += shortcut
        out = F.relu_(out)
        return out


def make_stage(block_class, num_blocks, first_stride, *, in_channels, out_channels, **kwargs):
    """
    Instantiate the consecutive blocks that make up one ResNet-style stage.

    Only the first block receives ``in_channels`` and ``first_stride``; every
    later block takes ``out_channels`` as its input width and uses stride 1.

    Args:
        block_class (type): a subclass of ResNetBlockBase
        num_blocks (int): how many blocks to create.
        first_stride (int): the stride of the first block.
        in_channels (int): input channels of the entire stage.
        out_channels (int): output channels of **every block** in the stage.
        kwargs: other arguments passed to the constructor of every block;
            must not contain ``stride``.
    Returns:
        list[nn.Module]: a list of block module.
    """
    assert "stride" not in kwargs, "Stride of blocks in make_stage cannot be changed."
    return [
        block_class(
            in_channels=out_channels if position > 0 else in_channels,
            out_channels=out_channels,
            stride=1 if position > 0 else first_stride,
            **kwargs,
        )
        for position in range(num_blocks)
    ]


class BasicStem(CNNBlockBase):
    """
    Stem made of three stacked 3x3 convs (the first with stride 2), followed
    by a norm, ReLU and a stride-2 3x3 max pool. The block reports an overall
    stride of 4 to its base class.
    """

    def __init__(self, in_channels=3, out_channels=64, norm="BN"):
        """
        Args:
            in_channels (int): channels of the input image tensor.
            out_channels (int): channels produced by the last stem conv.
            norm (str or callable): norm used after each conv layer.
                See :func:`layers.get_norm` for supported format.
        """
        super().__init__(in_channels, out_channels, 4)
        self.in_channels = in_channels

        def conv3x3(conv_in, conv_out, conv_stride):
            # Bias-free 3x3 conv; the norm is added as a separate layer.
            return Conv2d(
                conv_in,
                conv_out,
                kernel_size=3,
                stride=conv_stride,
                padding=1,
                bias=False,
            )

        # Layers are created in the same order they are applied.
        stem_layers = [
            conv3x3(in_channels, 32, 2),
            get_norm(norm, 32),
            nn.ReLU(inplace=True),
            conv3x3(32, 32, 1),
            get_norm(norm, 32),
            nn.ReLU(inplace=True),
            conv3x3(32, out_channels, 1),
        ]
        self.conv1 = nn.Sequential(*stem_layers)
        self.bn1 = get_norm(norm, out_channels)

        for conv in (m for m in self.conv1 if isinstance(m, Conv2d)):
            weight_init.c2_msra_fill(conv)

    def forward(self, x):
        x = F.relu_(self.bn1(self.conv1(x)))
        return F.max_pool2d(x, kernel_size=3, stride=2, padding=1)


class ResNet(Backbone):
    """
    Backbone assembled from a stem, a list of residual stages named
    ``res2``, ``res3``, ... and an optional linear classification head.
    """

    def __init__(self, stem, stages, num_classes=None, out_features=None):
        """
        Args:
            stem (nn.Module): a stem module
            stages (list[list[CNNBlockBase]]): several (typically 4) stages,
                each contains multiple :class:`CNNBlockBase`.
            num_classes (None or int): if None, will not perform classification.
                Otherwise, will create a linear layer.
            out_features (list[str]): name of the layers whose outputs should
                be returned in forward. Can be anything in "stem", "linear", or "res2" ...
                If None, will return the output of the last layer.
        """
        super(ResNet, self).__init__()
        self.stem = stem
        self.num_classes = num_classes

        current_stride = self.stem.stride
        self._out_feature_strides = {"stem": current_stride}
        self._out_feature_channels = {"stem": self.stem.out_channels}

        self.stages_and_names = []
        for stage_number, blocks in enumerate(stages, start=2):
            assert len(blocks) > 0, len(blocks)
            for block in blocks:
                assert isinstance(block, CNNBlockBase), block

            name = "res" + str(stage_number)
            stage = nn.Sequential(*blocks)

            self.add_module(name, stage)
            self.stages_and_names.append((stage, name))

            # Accumulated stride = stride so far times every block stride in this stage.
            stage_stride = np.prod([k.stride for k in blocks])
            current_stride = int(current_stride * stage_stride)
            self._out_feature_strides[name] = current_stride
            curr_channels = blocks[-1].out_channels
            self._out_feature_channels[name] = curr_channels

        if num_classes is not None:
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
            self.linear = nn.Linear(curr_channels, num_classes)

            # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
            # "The 1000-way fully-connected layer is initialized by
            # drawing weights from a zero-mean Gaussian with standard deviation of 0.01."
            nn.init.normal_(self.linear.weight, std=0.01)
            name = "linear"

        # Default output: the last layer created above (last stage or "linear").
        self._out_features = [name] if out_features is None else out_features
        assert len(self._out_features)
        children = [child_name for child_name, _ in self.named_children()]
        for out_feature in self._out_features:
            assert out_feature in children, "Available children: {}".format(", ".join(children))

    def forward(self, x):
        wanted = self._out_features
        outputs = {}

        x = self.stem(x)
        if "stem" in wanted:
            outputs["stem"] = x

        for stage, name in self.stages_and_names:
            x = stage(x)
            if name in wanted:
                outputs[name] = x

        if self.num_classes is not None:
            x = self.linear(torch.flatten(self.avgpool(x), 1))
            if "linear" in wanted:
                outputs["linear"] = x
        return outputs

    def output_shape(self):
        shapes = {}
        for name in self._out_features:
            shapes[name] = ShapeSpec(
                channels=self._out_feature_channels[name],
                stride=self._out_feature_strides[name],
            )
        return shapes

    def freeze(self, freeze_at=0):
        """
        Freeze the first several stages of the ResNet. Commonly used in
        fine-tuning.
        Args:
            freeze_at (int): number of stem and stages to freeze.
                `1` means freezing the stem. `2` means freezing the stem and
                the first stage, etc.
        Returns:
            nn.Module: this ResNet itself
        """
        if freeze_at >= 1:
            self.stem.freeze()
        # The stem counts as level 1, so the first residual stage is level 2.
        for level, stage_and_name in enumerate(self.stages_and_names, start=2):
            if level > freeze_at:
                continue
            for block in stage_and_name[0].children():
                block.freeze()
        return self


@BACKBONE_REGISTRY.register()
def build_res2net_backbone(cfg, input_shape):
    """
    Create a Res2Net instance from config.

    Stages are only built up to the deepest one named in
    ``cfg.MODEL.RESNETS.OUT_FEATURES``. The chunk count (``scale``) is fixed
    to 4 here.

    Returns:
        ResNet: a :class:`ResNet` instance.
    """
    # need registration of new blocks/stems?
    resnets_cfg = cfg.MODEL.RESNETS
    norm = resnets_cfg.NORM
    stem = BasicStem(
        in_channels=input_shape.channels,
        out_channels=resnets_cfg.STEM_OUT_CHANNELS,
        norm=norm,
    )

    freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT
    out_features = resnets_cfg.OUT_FEATURES
    depth = resnets_cfg.DEPTH
    num_groups = resnets_cfg.NUM_GROUPS
    width_per_group = resnets_cfg.WIDTH_PER_GROUP
    scale = 4
    bottleneck_channels = num_groups * width_per_group * scale
    in_channels = resnets_cfg.STEM_OUT_CHANNELS
    out_channels = resnets_cfg.RES2_OUT_CHANNELS
    stride_in_1x1 = resnets_cfg.STRIDE_IN_1X1
    res5_dilation = resnets_cfg.RES5_DILATION
    deform_on_per_stage = resnets_cfg.DEFORM_ON_PER_STAGE
    deform_modulated = resnets_cfg.DEFORM_MODULATED
    deform_num_groups = resnets_cfg.DEFORM_NUM_GROUPS
    assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)

    # Unknown depths fail with KeyError.
    num_blocks_per_stage = {
        18: [2, 2, 2, 2],
        34: [3, 4, 6, 3],
        50: [3, 4, 6, 3],
        101: [3, 4, 23, 3],
        152: [3, 8, 36, 3],
    }[depth]

    uses_basic_block = depth in [18, 34]
    if uses_basic_block:
        assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34"
        assert not any(
            deform_on_per_stage
        ), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34"
        assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34"
        assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34"

    # Avoid creating variables without gradients
    # It consumes extra memory and may cause allreduce to fail
    stage_index_of = {"res2": 2, "res3": 3, "res4": 4, "res5": 5}
    max_stage_idx = max([stage_index_of[f] for f in out_features])

    stages = []
    for stage_idx in range(2, max_stage_idx + 1):
        idx = stage_idx - 2  # zero-based position of this stage
        is_res5 = stage_idx == 5
        dilation = res5_dilation if is_res5 else 1
        # The first stage, and a dilated res5, keep the spatial resolution.
        keeps_resolution = idx == 0 or (is_res5 and dilation == 2)

        stage_kargs = dict(
            num_blocks=num_blocks_per_stage[idx],
            first_stride=1 if keeps_resolution else 2,
            in_channels=in_channels,
            out_channels=out_channels,
            norm=norm,
        )
        if uses_basic_block:
            # Use BasicBlock for R18 and R34.
            stage_kargs["block_class"] = BasicBlock
        else:
            stage_kargs.update(
                bottleneck_channels=bottleneck_channels,
                stride_in_1x1=stride_in_1x1,
                dilation=dilation,
                num_groups=num_groups,
                scale=scale,
            )
            if deform_on_per_stage[idx]:
                stage_kargs.update(
                    block_class=DeformBottleneckBlock,
                    deform_modulated=deform_modulated,
                    deform_num_groups=deform_num_groups,
                )
            else:
                stage_kargs["block_class"] = BottleneckBlock

        stages.append(make_stage(**stage_kargs))

        # Every following stage doubles both widths.
        in_channels = out_channels
        out_channels *= 2
        bottleneck_channels *= 2
    return ResNet(stem, stages, out_features=out_features).freeze(freeze_at)


@BACKBONE_REGISTRY.register()
def build_p67_res2net_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a Res2Net + FPN backbone with a ``LastLevelP6P7_P5`` top block.

    Args:
        cfg: a detectron2 CfgNode
        input_shape (ShapeSpec): forwarded to ``build_res2net_backbone``.

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    res2net = build_res2net_backbone(cfg, input_shape)
    fpn_cfg = cfg.MODEL.FPN
    fpn_channels = fpn_cfg.OUT_CHANNELS
    return FPN(
        bottom_up=res2net,
        in_features=fpn_cfg.IN_FEATURES,
        out_channels=fpn_channels,
        norm=fpn_cfg.NORM,
        # The top block reads an FPN-width map, so both widths are the FPN width.
        top_block=LastLevelP6P7_P5(fpn_channels, fpn_channels),
        fuse_type=fpn_cfg.FUSE_TYPE,
    )


@BACKBONE_REGISTRY.register()
def build_res2net_bifpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a Res2Net trunk wrapped in a ``BiFPN``.

    The input feature names come from ``cfg.MODEL.FPN.IN_FEATURES``; every other
    BiFPN setting is read from ``cfg.MODEL.BIFPN``.

    Args:
        cfg: a detectron2 CfgNode
        input_shape (ShapeSpec): forwarded to ``build_res2net_backbone``.

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    res2net = build_res2net_backbone(cfg, input_shape)
    feature_names = cfg.MODEL.FPN.IN_FEATURES
    bifpn_cfg = cfg.MODEL.BIFPN
    return BiFPN(
        cfg=cfg,
        bottom_up=res2net,
        in_features=feature_names,
        out_channels=bifpn_cfg.OUT_CHANNELS,
        norm=bifpn_cfg.NORM,
        num_levels=bifpn_cfg.NUM_LEVELS,
        num_bifpn=bifpn_cfg.NUM_BIFPN,
        separable_conv=bifpn_cfg.SEPARABLE_CONV,
    )


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/debug.py
================================================
import cv2
import numpy as np
import torch
import torch.nn.functional as F

# Palette of 1300 random 3-channel colours, drawn from NumPy's global RNG at
# import time. Each channel is scaled into [0.6, 1.0) * 255 before the uint8
# cast, so the colours are bright. The (1300, 1, 1, 3) shape lets a slice of it
# broadcast against a (C, H, W, 1) heatmap in _get_color_image.
COLORS = ((np.random.rand(1300, 3) * 0.4 + 0.6) * 255).astype(
  np.uint8).reshape(1300, 1, 1, 3)

def _get_color_image(heatmap):
  heatmap = heatmap.reshape(
    heatmap.shape[0], heatmap.shape[1], heatmap.shape[2], 1)
  if heatmap.shape[0] == 1:
      color_map = (heatmap * np.ones((1, 1, 1, 3), np.uint8) * 255).max(
          axis=0).astype(np.uint8) # H, W, 3
  else:
      color_map = (heatmap * COLORS[:heatmap.shape[0]]).max(axis=0).astype(np.uint8) # H, W, 3

  return color_map

def _blend_image(image, color_map, a=0.7):
    """
    Alpha-blend ``color_map`` over ``image`` and return a uint8 array.

    ``color_map`` is first resized with ``cv2.resize`` to
    ``(image.shape[1], image.shape[0])``; ``a`` is the weight of the colour
    map and ``1 - a`` the weight of the image. The sum is clipped to [0, 255].
    """
    target_size = (image.shape[1], image.shape[0])
    resized_map = cv2.resize(color_map, target_size)
    mixed = image * (1 - a) + resized_map * a
    return np.clip(mixed, 0, 255).astype(np.uint8)

def _blend_image_heatmaps(image, color_maps, a=0.7):
    merges = np.zeros((image.shape[0], image.shape[1], 3), np.float32)
    for color_map in color_maps:
        color_map = cv2.resize(color_map, (image.shape[1], image.shape[0]))
        merges = np.maximum(merges, color_map)
    ret = np.clip(image * (1 - a) + merges * a, 0, 255).astype(np.uint8)
    return ret

def _decompose_level(x, shapes_per_level, N):
    '''
    x: LNHiWi x C
    '''
    x = x.view(x.shape[0], -1)
    ret = []
    st = 0
    for l in range(len(shapes_per_level)):
        ret.append([])
        h = shapes_per_level[l][0].int().item()
        w = shapes_per_level[l][1].int().item()
        for i in range(N):
            ret[l].append(x[st + h * w * i:st + h * w * (i + 1)].view(
                h, w, -1).permute(2, 0, 1))
        st += h * w * N
    return ret

def _imagelist_to_tensor(images):
    images = [x for x in images]
    image_sizes = [x.shape[-2:] for x in images]
    h = max([size[0] for size in image_sizes])
    w = max([size[1] for size in image_sizes])
    S = 32
    h, w = ((h - 1) // S + 1) * S, ((w - 1) // S + 1) * S
    images = [F.pad(x, (0, w - x.shape[2], 0, h - x.shape[1], 0, 0)) \
        for x in images]
    images = torch.stack(images)
    return images


def _ind2il(ind, shapes_per_level, N):
    r = ind
    l = 0
    S = 0
    while r - S >= N * shapes_per_level[l][0] * shapes_per_level[l][1]:
        S += N * shapes_per_level[l][0] * shapes_per_level[l][1]
        l += 1
    i = (r - S) // (shapes_per_level[l][0] * shapes_per_level[l][1])
    return i, l

def debug_train(
    images, gt_instances, flattened_hms, reg_targets, labels, pos_inds,
    shapes_per_level, locations, strides):
    '''
    Show the training targets of a batch in OpenCV windows, image by image.

    For every image this displays the ground-truth heatmap of each level
    ('gthm_<level>') and a 'blend' window with the heatmaps blended over the
    image plus: ground-truth boxes, a marker on every positive location
    (marker size grows with the level) and the box decoded from every
    location that has a regression target. It then blocks on cv2.waitKey().

    images: N x 3 x H x W
    gt_instances: per-image instances exposing gt_boxes.tensor, or None
    flattened_hms: LNHiWi x C
    reg_targets: LNHiWi x 4 (l, t, r, b); multiplied by the level stride
        before being drawn
    labels: unused
    pos_inds: indices of positive locations in the flattened layout
    shapes_per_level: L x 2 [(H_i, W_i)]
    locations: list over levels of HiWi x 2 grids (repeated N times here to
        obtain LNHiWi x 2)
    strides: per-level strides
    '''
    reg_inds = torch.nonzero(
        reg_targets.max(dim=1)[0] > 0).squeeze(1)
    N = len(images)
    images = _imagelist_to_tensor(images)
    repeated_locations = [torch.cat([loc] * N, dim=0) \
        for loc in locations]
    locations = torch.cat(repeated_locations, dim=0)
    gt_hms = _decompose_level(flattened_hms, shapes_per_level, N)
    # One-hot mask of the positive locations, split per level and image.
    masks = flattened_hms.new_zeros((flattened_hms.shape[0], 1))
    masks[pos_inds] = 1
    masks = _decompose_level(masks, shapes_per_level, N)
    for i in range(len(images)):
        image = images[i].detach().cpu().numpy().transpose(1, 2, 0)
        color_maps = []
        for l in range(len(gt_hms)):
            color_map = _get_color_image(
                gt_hms[l][i].detach().cpu().numpy())
            color_maps.append(color_map)
            cv2.imshow('gthm_{}'.format(l), color_map)
        blend = _blend_image_heatmaps(image.copy(), color_maps)
        if gt_instances is not None:
            bboxes = gt_instances[i].gt_boxes.tensor
            for j in range(len(bboxes)):
                bbox = bboxes[j]
                cv2.rectangle(
                    blend,
                    (int(bbox[0]), int(bbox[1])),
                    (int(bbox[2]), int(bbox[3])),
                    (0, 0, 255), 3, cv2.LINE_AA)

        for j in range(len(pos_inds)):
            image_id, l = _ind2il(pos_inds[j], shapes_per_level, N)
            if image_id != i:
                continue
            loc = locations[pos_inds[j]]
            cv2.drawMarker(
                blend, (int(loc[0]), int(loc[1])), (0, 255, 255),
                markerSize=(l + 1) * 16)

        for j in range(len(reg_inds)):
            image_id, l = _ind2il(reg_inds[j], shapes_per_level, N)
            if image_id != i:
                continue
            # Out-of-place multiply: an in-place `*=` here would scale the
            # caller's reg_targets as a side effect of visualizing them.
            ltrb = reg_targets[reg_inds[j]] * strides[l]
            loc = locations[reg_inds[j]]
            bbox = [(loc[0] - ltrb[0]), (loc[1] - ltrb[1]),
                    (loc[0] + ltrb[2]), (loc[1] + ltrb[3])]
            cv2.rectangle(
                blend,
                (int(bbox[0]), int(bbox[1])),
                (int(bbox[2]), int(bbox[3])),
                (255, 0, 0), 1, cv2.LINE_AA)
            cv2.circle(blend, (int(loc[0]), int(loc[1])), 2, (255, 0, 0), -1)

        cv2.imshow('blend', blend)
        cv2.waitKey()


def debug_test(
    images, logits_pred, reg_pred, agn_hm_pred=None, preds=None,
    vis_thresh=0.3, debug_show_name=False, mult_agn=False):
    '''
    Show inference outputs of a batch in OpenCV windows, image by image.

    For every image this displays the predicted heatmap of each level
    ('predhm_<level>'), the class-agnostic heatmap of each level if present
    ('agn_hm_<level>'), the heatmaps blended over the image ('blend') and
    the predictions scoring above vis_thresh ('preds'). It then blocks on
    cv2.waitKey().

    images: N x 3 x H x W
    logits_pred: list over levels of N x C x Hi x Wi tensors, or a list
        whose first entry is None to skip the heatmap windows
    reg_pred: unused
    agn_hm_pred: list over levels of N x 1 x Hi x Wi tensors or None
        entries; None or [] means no agnostic heatmaps
    preds: per-image predictions exposing scores and proposal_boxes or
        pred_boxes (and optionally pred_classes); None or [] draws no boxes
    vis_thresh: minimum score for a prediction to be drawn
    debug_show_name: also draw the LVIS category name and the score
    mult_agn: multiply each level's logits by its agnostic heatmap before
        rendering (the inputs are left untouched)
    '''
    # None replaces the former mutable [] defaults; both mean "nothing".
    if agn_hm_pred is None:
        agn_hm_pred = []
    if preds is None:
        preds = []
    cat2name = None
    if debug_show_name:
        # Loop-invariant, so resolved once instead of per image and level.
        from detectron2.data.datasets.lvis_v1_categories import LVIS_CATEGORIES
        cat2name = [x['name'] for x in LVIS_CATEGORIES]
    for i in range(len(images)):
        image = images[i].detach().cpu().numpy().transpose(1, 2, 0)
        pred_image = image.copy().astype(np.uint8)
        color_maps = []
        num_preds = len(preds[i].scores) if i < len(preds) else 0
        for l in range(len(logits_pred)):
            if logits_pred[0] is not None:
                level_hm = logits_pred[l][i]
                if mult_agn:
                    # Local product: do not overwrite the caller's tensor.
                    level_hm = level_hm * agn_hm_pred[l][i]
                color_map = _get_color_image(
                    level_hm.detach().cpu().numpy())
                color_maps.append(color_map)
                cv2.imshow('predhm_{}'.format(l), color_map)

            # NOTE: as before, predictions are (re)drawn once per level.
            for j in range(num_preds):
                if preds[i].scores[j] > vis_thresh:
                    bbox = preds[i].proposal_boxes[j] \
                        if preds[i].has('proposal_boxes') else \
                        preds[i].pred_boxes[j]
                    bbox = bbox.tensor[0].detach().cpu().numpy().astype(np.int32)
                    cat = int(preds[i].pred_classes[j]) \
                        if preds[i].has('pred_classes') else 0
                    cl = COLORS[cat, 0, 0]
                    cv2.rectangle(
                        pred_image, (int(bbox[0]), int(bbox[1])),
                        (int(bbox[2]), int(bbox[3])),
                        (int(cl[0]), int(cl[1]), int(cl[2])), 2, cv2.LINE_AA)
                    if debug_show_name:
                        txt = '{}{:.1f}'.format(
                            cat2name[cat] if cat > 0 else '',
                            preds[i].scores[j])
                        font = cv2.FONT_HERSHEY_SIMPLEX
                        cat_size = cv2.getTextSize(txt, font, 0.5, 2)[0]
                        # Filled label background above the box corner.
                        cv2.rectangle(
                            pred_image,
                            (int(bbox[0]), int(bbox[1] - cat_size[1] - 2)),
                            (int(bbox[0] + cat_size[0]), int(bbox[1] - 2)),
                            (int(cl[0]), int(cl[1]), int(cl[2])), -1)
                        cv2.putText(
                            pred_image, txt, (int(bbox[0]), int(bbox[1] - 2)),
                            font, 0.5, (0, 0, 0), thickness=1, lineType=cv2.LINE_AA)

            if l < len(agn_hm_pred) and agn_hm_pred[l] is not None:
                agn_hm_ = agn_hm_pred[l][i, 0, :, :, None].detach().cpu().numpy()
                agn_hm_ = (agn_hm_ * np.array([255, 255, 255]).reshape(
                    1, 1, 3)).astype(np.uint8)
                cv2.imshow('agn_hm_{}'.format(l), agn_hm_)
        blend = _blend_image_heatmaps(image.copy(), color_maps)
        cv2.imshow('blend', blend)
        cv2.imshow('preds', pred_image)
        cv2.waitKey()

# Running count of proposal images written by debug_second_stage when
# save_debug is set; it numbers the output files.
# NOTE: a `global` statement at module level has no effect.
global cnt
cnt = 0

def debug_second_stage(images, instances, proposals=None, vis_thresh=0.3, 
    save_debug=False, debug_show_name=False):
    '''
    Show second-stage boxes (and optionally proposals) in OpenCV windows.

    For every image, the 'image' window shows the ground-truth boxes when
    the instances carry 'gt_boxes' (all with score 1), otherwise the
    predicted boxes; only boxes scoring above vis_thresh are drawn, colored
    by class. When proposals are given, a 'proposals' window shows the
    proposal boxes above vis_thresh, and with save_debug that image is also
    written to output/save_debug/<cnt>.jpg. Blocks on cv2.waitKey() per
    image.
    '''
    global cnt
    images = _imagelist_to_tensor(images)
    if debug_show_name:
        from detectron2.data.datasets.lvis_v1_categories import LVIS_CATEGORIES
        cat2name = [x['name'] for x in LVIS_CATEGORIES]
    for i in range(len(images)):
        image = images[i].detach().cpu().numpy().transpose(1, 2, 0).astype(np.uint8).copy()
        per_image = instances[i]
        if per_image.has('gt_boxes'):
            bboxes = per_image.gt_boxes.tensor.cpu().numpy()
            scores = np.ones(bboxes.shape[0])
            cats = per_image.gt_classes.cpu().numpy()
        else:
            bboxes = per_image.pred_boxes.tensor.cpu().numpy()
            scores = per_image.scores.cpu().numpy()
            cats = per_image.pred_classes.cpu().numpy()
        for j, bbox in enumerate(bboxes):
            if not scores[j] > vis_thresh:
                continue
            cat = cats[j]
            color = COLORS[cat, 0, 0]
            color = (int(color[0]), int(color[1]), int(color[2]))
            top_left = (int(bbox[0]), int(bbox[1]))
            bottom_right = (int(bbox[2]), int(bbox[3]))
            cv2.rectangle(
                image, top_left, bottom_right, color, 2, cv2.LINE_AA)
            if not debug_show_name:
                continue
            txt = '{}{:.1f}'.format(
                cat2name[cat] if cat > 0 else '',
                scores[j])
            font = cv2.FONT_HERSHEY_SIMPLEX
            cat_size = cv2.getTextSize(txt, font, 0.5, 2)[0]
            # Filled label background just above the box corner.
            cv2.rectangle(
                image,
                (int(bbox[0]), int(bbox[1] - cat_size[1] - 2)),
                (int(bbox[0] + cat_size[0]), int(bbox[1] - 2)),
                color, -1)
            cv2.putText(
                image, txt, (int(bbox[0]), int(bbox[1] - 2)),
                font, 0.5, (0, 0, 0), thickness=1, lineType=cv2.LINE_AA)

        show_proposals = proposals is not None
        if show_proposals:
            proposal_image = images[i].detach().cpu().numpy().transpose(1, 2, 0).astype(np.uint8).copy()
            bboxes = proposals[i].proposal_boxes.tensor.cpu().numpy()
            if proposals[i].has('scores'):
                scores = proposals[i].scores.cpu().numpy()
            else:
                scores = proposals[i].objectness_logits.sigmoid().cpu().numpy()
            proposal_color = (209, 159, 83)
            for j, bbox in enumerate(bboxes):
                if not scores[j] > vis_thresh:
                    continue
                cv2.rectangle(
                    proposal_image,
                    (int(bbox[0]), int(bbox[1])),
                    (int(bbox[2]), int(bbox[3])),
                    proposal_color, 2, cv2.LINE_AA)

        cv2.imshow('image', image)
        if show_proposals:
            cv2.imshow('proposals', proposal_image)
            if save_debug:
                cnt += 1
                cv2.imwrite('output/save_debug/{}.jpg'.format(cnt), proposal_image)
        cv2.waitKey()

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/dense_heads/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/dense_heads/centernet.py
================================================

import math
import json
import copy
from typing import List, Dict
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

from detectron2.modeling.proposal_generator.build import PROPOSAL_GENERATOR_REGISTRY
from detectron2.layers import ShapeSpec, cat
from detectron2.structures import Instances, Boxes
from detectron2.modeling import detector_postprocess
from detectron2.utils.comm import get_world_size
from detectron2.config import configurable

from ..layers.heatmap_focal_loss import heatmap_focal_loss_jit
from ..layers.heatmap_focal_loss import  binary_heatmap_focal_loss
from ..layers.iou_loss import IOULoss
from ..layers.ml_nms import ml_nms
from ..debug import debug_train, debug_test
from .utils import reduce_sum, _transpose
from .centernet_head import CenterNetHead

__all__ = ["CenterNet"]

# Large finite sentinel: regression targets are set to -INF where no object is
# assigned, and masked-out distances are set to INF before taking the minimum.
INF = 100000000

@PROPOSAL_GENERATOR_REGISTRY.register()
class CenterNet(nn.Module):
    @configurable
    def __init__(self, 
        # input_shape: Dict[str, ShapeSpec],
        in_channels=256,
        *,
        num_classes=80,
        in_features=("p3", "p4", "p5", "p6", "p7"),
        strides=(8, 16, 32, 64, 128),
        score_thresh=0.05,
        hm_min_overlap=0.8,
        loc_loss_type='giou',
        min_radius=4,
        hm_focal_alpha=0.25,
        hm_focal_beta=4,
        loss_gamma=2.0,
        reg_weight=2.0,
        not_norm_reg=True,
        with_agn_hm=False,
        only_proposal=False,
        as_proposal=False,
        not_nms=False,
        pos_weight=1.,
        neg_weight=1.,
        sigmoid_clamp=1e-4,
        ignore_high_fp=-1.,
        center_nms=False,
        sizes_of_interest=[[0,80],[64,160],[128,320],[256,640],[512,10000000]],
        more_pos=False,
        more_pos_thresh=0.2,
        more_pos_topk=9,
        pre_nms_topk_train=1000,
        pre_nms_topk_test=1000,
        post_nms_topk_train=100,
        post_nms_topk_test=100,
        nms_thresh_train=0.6,
        nms_thresh_test=0.6,
        no_reduce=False,
        debug=False,
        vis_thresh=0.5,
        pixel_mean=[103.530,116.280,123.675],
        pixel_std=[1.0,1.0,1.0],
        device='cuda',
        centernet_head=None,
    ):
        '''
        Store the configuration and build the sub-modules.

        Most arguments are stored unchanged as attributes of the same name.
        The ones handled specially here:
            in_channels: only used to build the default CenterNetHead.
            in_features: feature names read from the feature dict in
                forward; its length is the number of levels of the default
                head.
            strides: one stride per level in in_features.
            hm_min_overlap: not stored; converted to
                self.delta = (1 - hm_min_overlap) / (1 + hm_min_overlap).
            loc_loss_type: not stored; passed to IOULoss.
            only_proposal: requires with_agn_hm (asserted).
            center_nms: when true, not_nms is forced to True.
            sizes_of_interest: one [min, max] size range per level.
            debug: when true, self.denormalizer (x * pixel_std + pixel_mean)
                is built from pixel_mean / pixel_std on `device`.
            centernet_head: head module to use; a CenterNetHead is built
                from in_channels, in_features, with_agn_hm and only_proposal
                when it is None.
        NOTE(review): the list defaults (sizes_of_interest, pixel_mean,
        pixel_std) are shared between calls; they are only read here.
        '''
        super().__init__()
        self.num_classes = num_classes
        self.in_features = in_features
        self.strides = strides
        self.score_thresh = score_thresh
        self.min_radius = min_radius
        self.hm_focal_alpha = hm_focal_alpha
        self.hm_focal_beta = hm_focal_beta
        self.loss_gamma = loss_gamma
        self.reg_weight = reg_weight
        self.not_norm_reg = not_norm_reg
        self.with_agn_hm = with_agn_hm
        self.only_proposal = only_proposal
        self.as_proposal = as_proposal
        self.not_nms = not_nms
        self.pos_weight = pos_weight
        self.neg_weight = neg_weight
        self.sigmoid_clamp = sigmoid_clamp
        self.ignore_high_fp = ignore_high_fp
        self.center_nms = center_nms
        self.sizes_of_interest = sizes_of_interest
        self.more_pos = more_pos
        self.more_pos_thresh = more_pos_thresh
        self.more_pos_topk = more_pos_topk
        self.pre_nms_topk_train = pre_nms_topk_train
        self.pre_nms_topk_test = pre_nms_topk_test
        self.post_nms_topk_train = post_nms_topk_train
        self.post_nms_topk_test = post_nms_topk_test
        self.nms_thresh_train = nms_thresh_train
        self.nms_thresh_test = nms_thresh_test
        self.no_reduce = no_reduce
        self.debug = debug
        self.vis_thresh = vis_thresh
        # center_nms overrides the not_nms argument.
        if self.center_nms:
            self.not_nms = True
        self.iou_loss = IOULoss(loc_loss_type)
        # Proposal-only mode relies on the class-agnostic heatmap.
        assert (not self.only_proposal) or self.with_agn_hm
        # delta for rendering heatmap
        self.delta = (1 - hm_min_overlap) / (1 + hm_min_overlap)
        if centernet_head is None:
            self.centernet_head = CenterNetHead(
                in_channels=in_channels,
                num_levels=len(in_features),
                with_agn_hm=with_agn_hm,
                only_proposal=only_proposal)
        else:
            self.centernet_head = centernet_head
        if self.debug:
            # Only needed to undo input normalization for visualization.
            pixel_mean = torch.Tensor(pixel_mean).to(
                torch.device(device)).view(3, 1, 1)
            pixel_std = torch.Tensor(pixel_std).to(
                torch.device(device)).view(3, 1, 1)
            self.denormalizer = lambda x: x * pixel_std + pixel_mean

    @classmethod
    def from_config(cls, cfg, input_shape):
        '''
        Translate a config node into the keyword arguments of __init__.

        in_channels is taken from the input shape of the first configured
        feature, and the head is built from the config and the input shapes
        of all configured features.
        '''
        cn = cfg.MODEL.CENTERNET
        return {
            # 'input_shape': input_shape,
            'in_channels': input_shape[cn.IN_FEATURES[0]].channels,
            'num_classes': cn.NUM_CLASSES,
            'in_features': cn.IN_FEATURES,
            'strides': cn.FPN_STRIDES,
            'score_thresh': cn.INFERENCE_TH,
            'loc_loss_type': cn.LOC_LOSS_TYPE,
            'hm_min_overlap': cn.HM_MIN_OVERLAP,
            'min_radius': cn.MIN_RADIUS,
            'hm_focal_alpha': cn.HM_FOCAL_ALPHA,
            'hm_focal_beta': cn.HM_FOCAL_BETA,
            'loss_gamma': cn.LOSS_GAMMA,
            'reg_weight': cn.REG_WEIGHT,
            'not_norm_reg': cn.NOT_NORM_REG,
            'with_agn_hm': cn.WITH_AGN_HM,
            'only_proposal': cn.ONLY_PROPOSAL,
            'as_proposal': cn.AS_PROPOSAL,
            'not_nms': cn.NOT_NMS,
            'pos_weight': cn.POS_WEIGHT,
            'neg_weight': cn.NEG_WEIGHT,
            'sigmoid_clamp': cn.SIGMOID_CLAMP,
            'ignore_high_fp': cn.IGNORE_HIGH_FP,
            'center_nms': cn.CENTER_NMS,
            'sizes_of_interest': cn.SOI,
            'more_pos': cn.MORE_POS,
            'more_pos_thresh': cn.MORE_POS_THRESH,
            'more_pos_topk': cn.MORE_POS_TOPK,
            'pre_nms_topk_train': cn.PRE_NMS_TOPK_TRAIN,
            'pre_nms_topk_test': cn.PRE_NMS_TOPK_TEST,
            'post_nms_topk_train': cn.POST_NMS_TOPK_TRAIN,
            'post_nms_topk_test': cn.POST_NMS_TOPK_TEST,
            'nms_thresh_train': cn.NMS_TH_TRAIN,
            'nms_thresh_test': cn.NMS_TH_TEST,
            'no_reduce': cn.NO_REDUCE,
            'debug': cfg.DEBUG,
            'vis_thresh': cfg.VIS_THRESH,
            'pixel_mean': cfg.MODEL.PIXEL_MEAN,
            'pixel_std': cfg.MODEL.PIXEL_STD,
            'device': cfg.MODEL.DEVICE,
            'centernet_head': CenterNetHead(
                cfg, [input_shape[f] for f in cn.IN_FEATURES]),
        }


    def forward(self, images, features_dict, gt_instances):
        '''
        Run the head on the configured feature levels.

        In eval mode this returns the result of self.inference. In training
        mode it builds the targets, computes the losses and returns
        (proposals, losses); proposals is None unless only_proposal or
        as_proposal is set, in which case the predictions are renamed to
        proposal_boxes / objectness_logits.
        '''
        features = [features_dict[name] for name in self.in_features]
        head_out = self.centernet_head(features)
        clss_per_level, reg_pred_per_level, agn_hm_pred_per_level = head_out
        grids = self.compute_grids(features)
        level_shapes = [(x.shape[2], x.shape[3]) for x in reg_pred_per_level]
        shapes_per_level = grids[0].new_tensor(level_shapes)

        if not self.training:
            return self.inference(
                images, clss_per_level, reg_pred_per_level,
                agn_hm_pred_per_level, grids)

        pos_inds, labels, reg_targets, flattened_hms = \
            self._get_ground_truth(grids, shapes_per_level, gt_instances)
        # logits_pred: M x F, reg_pred: M x 4, agn_hm_pred: M
        logits_pred, reg_pred, agn_hm_pred = self._flatten_outputs(
            clss_per_level, reg_pred_per_level, agn_hm_pred_per_level)

        if self.more_pos:
            # add more pixels as positive if \
            #   1. they are within the center3x3 region of an object
            #   2. their regression losses are small (<self.more_pos_thresh)
            pos_inds, labels = self._add_more_pos(
                reg_pred, gt_instances, shapes_per_level)

        losses = self.losses(
            pos_inds, labels, reg_targets, flattened_hms,
            logits_pred, reg_pred, agn_hm_pred)

        proposals = None
        if self.only_proposal:
            agn_hm_pred_per_level = [
                x.sigmoid() for x in agn_hm_pred_per_level]
            no_agn = [None for _ in agn_hm_pred_per_level]
            proposals = self.predict_instances(
                grids, agn_hm_pred_per_level, reg_pred_per_level,
                images.image_sizes, no_agn)
        elif self.as_proposal:
            # category specific bbox as agnostic proposals
            clss_per_level = [x.sigmoid() for x in clss_per_level]
            proposals = self.predict_instances(
                grids, clss_per_level, reg_pred_per_level,
                images.image_sizes, agn_hm_pred_per_level)
        if self.only_proposal or self.as_proposal:
            # Rename the detection fields to the proposal field names.
            for p in range(len(proposals)):
                proposal = proposals[p]
                proposal.proposal_boxes = proposal.get('pred_boxes')
                proposal.objectness_logits = proposal.get('scores')
                for field in ('pred_boxes', 'scores', 'pred_classes'):
                    proposal.remove(field)

        if self.debug:
            debug_train(
                [self.denormalizer(x) for x in images],
                gt_instances, flattened_hms, reg_targets,
                labels, pos_inds, shapes_per_level, grids, self.strides)
        return proposals, losses


    def losses(
        self, pos_inds, labels, reg_targets, flattened_hms,
        logits_pred, reg_pred, agn_hm_pred):
        '''
        Compute the heatmap and box-regression losses.

        Inputs:
            pos_inds: N
            labels: N
            reg_targets: M x 4
            flattened_hms: M x C
            logits_pred: M x C
            reg_pred: M x 4
            agn_hm_pred: M x 1 or None
            N: number of positive locations in all images
            M: number of pixels from all FPN levels
            C: number of classes
        Returns a dict with 'loss_centernet_loc', plus
        'loss_centernet_pos' / 'loss_centernet_neg' unless only_proposal,
        plus 'loss_centernet_agn_pos' / 'loss_centernet_agn_neg' when
        with_agn_hm.
        '''
        assert (torch.isfinite(reg_pred).all().item())
        num_gpus = get_world_size()
        num_pos_local = pos_inds.numel()
        if self.no_reduce:
            total_num_pos = num_pos_local * num_gpus
        else:
            local_count = pos_inds.new_tensor([num_pos_local])
            total_num_pos = reduce_sum(local_count).item()
        # Average number of positives per GPU, never below one.
        num_pos_avg = max(total_num_pos / num_gpus, 1.0)

        losses = {}
        if not self.only_proposal:
            pos_loss, neg_loss = heatmap_focal_loss_jit(
                logits_pred, flattened_hms, pos_inds, labels,
                alpha=self.hm_focal_alpha,
                beta=self.hm_focal_beta,
                gamma=self.loss_gamma,
                reduction='sum',
                sigmoid_clamp=self.sigmoid_clamp,
                ignore_high_fp=self.ignore_high_fp,
            )
            losses['loss_centernet_pos'] = \
                self.pos_weight * pos_loss / num_pos_avg
            losses['loss_centernet_neg'] = \
                self.neg_weight * neg_loss / num_pos_avg

        # Only locations with a valid (non-negative) target are regressed.
        has_target = reg_targets.max(dim=1)[0] >= 0
        reg_inds = torch.nonzero(has_target).squeeze(1)
        reg_pred = reg_pred[reg_inds]
        reg_targets_pos = reg_targets[reg_inds]
        reg_weight_map = flattened_hms.max(dim=1)[0]
        reg_weight_map = reg_weight_map[reg_inds]
        if self.not_norm_reg:
            # Uniform weights of one for every regressed location.
            reg_weight_map = reg_weight_map * 0 + 1
        if self.no_reduce:
            reg_norm = max(reg_weight_map.sum(), 1)
        else:
            reg_norm = max(
                reduce_sum(reg_weight_map.sum()).item() / num_gpus, 1)

        loc_loss = self.iou_loss(
            reg_pred, reg_targets_pos, reg_weight_map, reduction='sum')
        losses['loss_centernet_loc'] = self.reg_weight * loc_loss / reg_norm

        if self.with_agn_hm:
            cat_agn_heatmap = flattened_hms.max(dim=1)[0] # M
            agn_pos_loss, agn_neg_loss = binary_heatmap_focal_loss(
                agn_hm_pred, cat_agn_heatmap, pos_inds,
                alpha=self.hm_focal_alpha,
                beta=self.hm_focal_beta,
                gamma=self.loss_gamma,
                sigmoid_clamp=self.sigmoid_clamp,
                ignore_high_fp=self.ignore_high_fp,
            )
            losses['loss_centernet_agn_pos'] = \
                self.pos_weight * agn_pos_loss / num_pos_avg
            losses['loss_centernet_agn_neg'] = \
                self.neg_weight * agn_neg_loss / num_pos_avg

        if self.debug:
            print('losses', losses)
            print('total_num_pos', total_num_pos)
        return losses


    def compute_grids(self, features):
        grids = []
        for level, feature in enumerate(features):
            h, w = feature.size()[-2:]
            shifts_x = torch.arange(
                0, w * self.strides[level], 
                step=self.strides[level],
                dtype=torch.float32, device=feature.device)
            shifts_y = torch.arange(
                0, h * self.strides[level], 
                step=self.strides[level],
                dtype=torch.float32, device=feature.device)
            shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
            shift_x = shift_x.reshape(-1)
            shift_y = shift_y.reshape(-1)
            grids_per_level = torch.stack((shift_x, shift_y), dim=1) + \
                self.strides[level] // 2
            grids.append(grids_per_level)
        return grids


    def _get_ground_truth(self, grids, shapes_per_level, gt_instances):
        '''
        Build the per-location training targets for a batch.

        Input:
            grids: list of tensors [(hl x wl, 2)]_l
            shapes_per_level: list of tuples L x 2:
            gt_instances: gt instances
        Return:
            pos_inds: N (None when self.more_pos is set)
            labels: N (None when self.more_pos is set)
            reg_targets: M x 4, divided by the stride of the level; rows
                without an assigned object hold -INF (before the division)
            flattened_hms: M x C or M x 1
            N: number of objects in all images
            M: number of pixels from all FPN levels
        The returned reg_targets / flattened_hms are level-first
        concatenations over the whole batch.
        '''

        # get positive pixel index
        if not self.more_pos:
            pos_inds, labels = self._get_label_inds(
                gt_instances, shapes_per_level) 
        else:
            pos_inds, labels = None, None
        heatmap_channels = self.num_classes
        L = len(grids)
        num_loc_list = [len(loc) for loc in grids]
        # Stride and [min, max] size range of the level each location is on.
        strides = torch.cat([
            shapes_per_level.new_ones(num_loc_list[l]) * self.strides[l] \
            for l in range(L)]).float() # M
        reg_size_ranges = torch.cat([
            shapes_per_level.new_tensor(self.sizes_of_interest[l]).float().view(
            1, 2).expand(num_loc_list[l], 2) for l in range(L)]) # M x 2
        grids = torch.cat(grids, dim=0) # M x 2
        M = grids.shape[0]

        reg_targets = []
        flattened_hms = []
        for i in range(len(gt_instances)): # images
            boxes = gt_instances[i].gt_boxes.tensor # N x 4
            area = gt_instances[i].gt_boxes.area() # N
            gt_classes = gt_instances[i].gt_classes # N in [0, self.num_classes]

            N = boxes.shape[0]
            if N == 0:
                # No objects: every location is unassigned and the heatmap
                # is all zero.
                reg_targets.append(grids.new_zeros((M, 4)) - INF)
                flattened_hms.append(
                    grids.new_zeros((
                        M, 1 if self.only_proposal else heatmap_channels)))
                continue
            
            # Distances from every location to the four sides of every box.
            l = grids[:, 0].view(M, 1) - boxes[:, 0].view(1, N) # M x N
            t = grids[:, 1].view(M, 1) - boxes[:, 1].view(1, N) # M x N
            r = boxes[:, 2].view(1, N) - grids[:, 0].view(M, 1) # M x N
            b = boxes[:, 3].view(1, N) - grids[:, 1].view(M, 1) # M x N
            reg_target = torch.stack([l, t, r, b], dim=2) # M x N x 4

            centers = ((boxes[:, [0, 1]] + boxes[:, [2, 3]]) / 2) # N x 2
            centers_expanded = centers.view(1, N, 2).expand(M, N, 2) # M x N x 2
            strides_expanded = strides.view(M, 1, 1).expand(M, N, 2)
            # Box centers snapped to the grid location of their cell, per
            # level stride.
            centers_discret = ((centers_expanded / strides_expanded).int() * \
                strides_expanded).float() + strides_expanded / 2 # M x N x 2
            
            # Locations that coincide with a snapped box center.
            is_peak = (((grids.view(M, 1, 2).expand(M, N, 2) - \
                centers_discret) ** 2).sum(dim=2) == 0) # M x N
            is_in_boxes = reg_target.min(dim=2)[0] > 0 # M x N
            is_center3x3 = self.get_center3x3(
                grids, centers, strides) & is_in_boxes # M x N
            is_cared_in_the_level = self.assign_reg_fpn(
                reg_target, reg_size_ranges) # M x N
            reg_mask = is_center3x3 & is_cared_in_the_level # M x N

            dist2 = ((grids.view(M, 1, 2).expand(M, N, 2) - \
                centers_expanded) ** 2).sum(dim=2) # M x N
            # Peaks get distance zero, i.e. heatmap value exp(0) = 1.
            dist2[is_peak] = 0
            radius2 = self.delta ** 2 * 2 * area # N
            radius2 = torch.clamp(
                radius2, min=self.min_radius ** 2)
            weighted_dist2 = dist2 / radius2.view(1, N).expand(M, N) # M x N            
            # Clones are passed because the helpers modify dist in place.
            reg_target = self._get_reg_targets(
                reg_target, weighted_dist2.clone(), reg_mask, area) # M x 4

            if self.only_proposal:
                flattened_hm = self._create_agn_heatmaps_from_dist(
                    weighted_dist2.clone()) # M x 1
            else:
                flattened_hm = self._create_heatmaps_from_dist(
                    weighted_dist2.clone(), gt_classes, 
                    channels=heatmap_channels) # M x C

            reg_targets.append(reg_target)
            flattened_hms.append(flattened_hm)
        
        # transpose im first training_targets to level first ones
        reg_targets = _transpose(reg_targets, num_loc_list)
        flattened_hms = _transpose(flattened_hms, num_loc_list)
        # Express the regression targets in units of the level stride.
        for l in range(len(reg_targets)):
            reg_targets[l] = reg_targets[l] / float(self.strides[l])
        reg_targets = cat([x for x in reg_targets], dim=0) # MB x 4
        flattened_hms = cat([x for x in flattened_hms], dim=0) # MB x C
        
        return pos_inds, labels, reg_targets, flattened_hms


    def _get_label_inds(self, gt_instances, shapes_per_level):
        '''
        Inputs:
            gt_instances: [n_i], sum n_i = N
            shapes_per_level: L x 2 [(h_l, w_l)]_L
        Returns:
            pos_inds: N'
            labels: N'
        '''
        pos_inds = []
        labels = []
        L = len(self.strides)
        B = len(gt_instances)
        shapes_per_level = shapes_per_level.long()
        loc_per_level = (shapes_per_level[:, 0] * shapes_per_level[:, 1]).long() # L
        level_bases = []
        s = 0
        for l in range(L):
            level_bases.append(s)
            s = s + B * loc_per_level[l]
        level_bases = shapes_per_level.new_tensor(level_bases).long() # L
        strides_default = shapes_per_level.new_tensor(self.strides).float() # L
        for im_i in range(B):
            targets_per_im = gt_instances[im_i]
            bboxes = targets_per_im.gt_boxes.tensor # n x 4
            n = bboxes.shape[0]
            centers = ((bboxes[:, [0, 1]] + bboxes[:, [2, 3]]) / 2) # n x 2
            centers = centers.view(n, 1, 2).expand(n, L, 2)
            strides = strides_default.view(1, L, 1).expand(n, L, 2)
            centers_inds = (centers / strides).long() # n x L x 2
            Ws = shapes_per_level[:, 1].view(1, L).expand(n, L)
            pos_ind = level_bases.view(1, L).expand(n, L) + \
                       im_i * loc_per_level.view(1, L).expand(n, L) + \
                       centers_inds[:, :, 1] * Ws + \
                       centers_inds[:, :, 0] # n x L
            is_cared_in_the_level = self.assign_fpn_level(bboxes)
            pos_ind = pos_ind[is_cared_in_the_level].view(-1)
            label = targets_per_im.gt_classes.view(
                n, 1).expand(n, L)[is_cared_in_the_level].view(-1)

            pos_inds.append(pos_ind) # n'
            labels.append(label) # n'
        pos_inds = torch.cat(pos_inds, dim=0).long()
        labels = torch.cat(labels, dim=0)
        return pos_inds, labels # N, N


    def assign_fpn_level(self, boxes):
        '''
        Inputs:
            boxes: n x 4
            size_ranges: L x 2
        Return:
            is_cared_in_the_level: n x L
        '''
        size_ranges = boxes.new_tensor(
            self.sizes_of_interest).view(len(self.sizes_of_interest), 2) # L x 2
        crit = ((boxes[:, 2:] - boxes[:, :2]) **2).sum(dim=1) ** 0.5 / 2 # n
        n, L = crit.shape[0], size_ranges.shape[0]
        crit = crit.view(n, 1).expand(n, L)
        size_ranges_expand = size_ranges.view(1, L, 2).expand(n, L, 2)
        is_cared_in_the_level = (crit >= size_ranges_expand[:, :, 0]) & \
            (crit <= size_ranges_expand[:, :, 1])
        return is_cared_in_the_level
    

    def assign_reg_fpn(self, reg_targets_per_im, size_ranges):
        '''
        TODO (Xingyi): merge it with assign_fpn_level
        Inputs:
            reg_targets_per_im: M x N x 4
            size_ranges: M x 2
        '''
        crit = ((reg_targets_per_im[:, :, :2] + \
            reg_targets_per_im[:, :, 2:])**2).sum(dim=2) ** 0.5 / 2 # M x N
        is_cared_in_the_level = (crit >= size_ranges[:, [0]]) & \
            (crit <= size_ranges[:, [1]])
        return is_cared_in_the_level


    def _get_reg_targets(self, reg_targets, dist, mask, area):
        '''
        Pick, for every location, the regression target of the closest
        object among those allowed by mask.

          reg_targets (M x N x 4)
          dist (M x N): modified in place (masked-out entries become INF)
          mask (M x N): non-zero where an object may be assigned
          area: unused
        Returns M x 4; rows with no allowed object are filled with -INF.
        '''
        dist.masked_fill_(mask == 0, INF * 1.0)
        min_dist, min_inds = dist.min(dim=1) # M
        rows = torch.arange(len(reg_targets), device=reg_targets.device)
        # M x N x 4 --> M x 4 (advanced indexing, so this is a new tensor)
        picked = reg_targets[rows, min_inds]
        unassigned = min_dist == INF
        picked[unassigned] = - INF
        return picked


    def _create_heatmaps_from_dist(self, dist, labels, channels):
        '''
        Build per-class heatmaps as exp(-distance to the nearest object of
        that class). Responses below 1e-4 are zeroed; classes without any
        object keep an all-zero column.

        dist: M x N
        labels: N
        return:
          heatmaps: M x C
        '''
        num_locs = dist.shape[0]
        heatmaps = dist.new_zeros((num_locs, channels))
        for cls_id in range(channels):
            same_cls = labels == cls_id # N
            if not same_cls.any():
                continue
            nearest = dist[:, same_cls].min(dim=1)[0] # M
            response = torch.exp(-nearest)
            response[response < 1e-4] = 0
            heatmaps[:, cls_id] = response
        return heatmaps


    def _create_agn_heatmaps_from_dist(self, dist):
        '''
        TODO (Xingyi): merge it with _create_heatmaps_from_dist
        Class-agnostic heatmap: exp(-distance to the nearest object), with
        responses below 1e-4 set to zero.

        dist: M x N
        return:
          heatmaps: M x 1
        '''
        heatmaps = dist.new_zeros((dist.shape[0], 1))
        response = torch.exp(-dist.min(dim=1)[0]) # M
        response[response < 1e-4] = 0
        heatmaps[:, 0] = response
        return heatmaps


    def _flatten_outputs(self, clss, reg_pred, agn_hm_pred):
        '''
        Flatten per-level head outputs and concatenate them over levels.

        Each level goes (N, F, Hl, Wl) -> (N, Hl, Wl, F) -> (N*Hl*Wl, F).
        clss becomes None when its first entry is None, and agn_hm_pred
        becomes None when self.with_agn_hm is false.
        '''
        def to_rows(feat, width):
            # channels-last, then one row per (image, y, x) location
            return feat.permute(0, 2, 3, 1).reshape(-1, width)

        if clss[0] is not None:
            clss = cat([to_rows(x, x.shape[1]) for x in clss], dim=0)
        else:
            clss = None

        reg_pred = cat([to_rows(x, 4) for x in reg_pred], dim=0)

        if self.with_agn_hm:
            agn_hm_pred = cat(
                [x.permute(0, 2, 3, 1).reshape(-1) for x in agn_hm_pred],
                dim=0)
        else:
            agn_hm_pred = None
        return clss, reg_pred, agn_hm_pred


    def get_center3x3(self, locations, centers, strides):
        '''
        Mark the locations lying within one stride (in both x and y) of the
        grid cell centre that each object centre snaps to.

        Inputs:
            locations: M x 2
            centers: N x 2
            strides: M
        Return:
            M x N boolean mask
        '''
        M, N = locations.shape[0], centers.shape[0]
        loc = locations.view(M, 1, 2) # broadcast over the N objects
        ctr = centers.view(1, N, 2) # broadcast over the M locations
        step = strides.view(M, 1, 1)
        # snap each centre to the middle of its stride-sized cell
        snapped = ((ctr / step).int() * step).float() + step / 2 # M x N x 2
        offset = (loc - snapped).abs() # M x N x 2
        reach = strides.view(M, 1)
        return (offset[:, :, 0] <= reach) & (offset[:, :, 1] <= reach)


    def inference(self, images, clss_per_level, reg_pred_per_level, 
        agn_hm_pred_per_level, grids):
        '''
        Turn raw per-level head outputs into per-image predictions.

        Class logits and agnostic heatmap logits are passed through a
        sigmoid (None entries stay None). With self.only_proposal the
        agnostic heatmap is decoded in place of the class scores. When the
        results serve as proposals, pred_boxes / scores are exposed as
        proposal_boxes / objectness_logits and pred_boxes is removed.

        Returns (proposals, {}).
        '''
        def to_prob(x):
            return None if x is None else x.sigmoid()

        logits_pred = [to_prob(x) for x in clss_per_level]
        agn_hm_pred_per_level = [to_prob(x) for x in agn_hm_pred_per_level]

        if self.only_proposal:
            score_maps = agn_hm_pred_per_level
            agn_maps = [None for _ in agn_hm_pred_per_level]
        else:
            score_maps = logits_pred
            agn_maps = agn_hm_pred_per_level
        proposals = self.predict_instances(
            grids, score_maps, reg_pred_per_level,
            images.image_sizes, agn_maps)

        if self.as_proposal or self.only_proposal:
            for inst in proposals:
                inst.proposal_boxes = inst.get('pred_boxes')
                inst.objectness_logits = inst.get('scores')
                inst.remove('pred_boxes')

        if self.debug:
            debug_test(
                [self.denormalizer(x) for x in images],
                logits_pred, reg_pred_per_level,
                agn_hm_pred_per_level, preds=proposals,
                vis_thresh=self.vis_thresh,
                debug_show_name=False)
        return proposals, {}


    def predict_instances(
        self, grids, logits_pred, reg_pred, image_sizes, agn_hm_pred, 
        is_proposal=False):
        '''
        Decode every FPN level, merge the levels per image and run the
        NMS / top-k post-processing.

        The regression maps are multiplied by the level's stride before
        decoding. NMS is skipped when self.not_nms is set.
        '''
        per_level = [
            self.predict_single_level(
                grid, logits_pred[lvl], reg_pred[lvl] * self.strides[lvl],
                image_sizes, agn_hm_pred[lvl], lvl, is_proposal=is_proposal)
            for lvl, grid in enumerate(grids)
        ]
        # level-major -> image-major, then concatenate each image's levels
        per_image = [Instances.cat(group) for group in zip(*per_level)]
        return self.nms_and_topK(per_image, nms=not self.not_nms)


    def predict_single_level(
        self, grids, heatmap, reg_pred, image_sizes, agn_hm, level, 
        is_proposal=False):
        '''
        Decode the detections of one FPN level for every image in the batch.

        Inputs:
            grids: indexed by flattened location to obtain an (x, y) point
            heatmap: N x C x H x W scores
            reg_pred: reshaped to N x 4 x H x W; the four channels are used
                as offsets from the grid point to the left, top, right and
                bottom box sides
            image_sizes: per-image size handed to Instances
            agn_hm: None, or a tensor viewable as N x 1 x H x W that is
                multiplied into the scores
            level, is_proposal: not used inside this method
        Return:
            list of N Instances with scores, pred_boxes and pred_classes
        '''
        N, C, H, W = heatmap.shape
        # put in the same format as grids
        if self.center_nms:
            # keep only local maxima of each 3x3 window
            heatmap_nms = nn.functional.max_pool2d(
                heatmap, (3, 3), stride=1, padding=1)
            heatmap = heatmap * (heatmap_nms == heatmap).float()
        heatmap = heatmap.permute(0, 2, 3, 1) # N x H x W x C
        heatmap = heatmap.reshape(N, -1, C) # N x HW x C
        box_regression = reg_pred.view(N, 4, H, W).permute(0, 2, 3, 1) # N x H x W x 4
        box_regression = box_regression.reshape(N, -1, 4) # N x HW x 4

        # candidates are selected on the heatmap BEFORE it is multiplied by
        # the agnostic heatmap below
        candidate_inds = heatmap > self.score_thresh # N x HW x C, bool
        pre_nms_top_n = candidate_inds.view(N, -1).sum(1) # N
        pre_nms_topk = self.pre_nms_topk_train if self.training else self.pre_nms_topk_test
        pre_nms_top_n = pre_nms_top_n.clamp(max=pre_nms_topk) # N

        if agn_hm is not None:
            agn_hm = agn_hm.view(N, 1, H, W).permute(0, 2, 3, 1)
            agn_hm = agn_hm.reshape(N, -1) # N x HW
            heatmap = heatmap * agn_hm[:, :, None]

        results = []
        for i in range(N):
            per_box_cls = heatmap[i] # HW x C
            per_candidate_inds = candidate_inds[i] # HW x C, bool
            per_box_cls = per_box_cls[per_candidate_inds] # n

            # each row is a (location index, class index) pair
            per_candidate_nonzeros = per_candidate_inds.nonzero() # n x 2
            per_box_loc = per_candidate_nonzeros[:, 0] # n
            per_class = per_candidate_nonzeros[:, 1] # n

            per_box_regression = box_regression[i] # HW x 4
            per_box_regression = per_box_regression[per_box_loc] # n x 4
            per_grids = grids[per_box_loc] # n x 2

            per_pre_nms_top_n = pre_nms_top_n[i] # scalar tensor

            # more candidates than the pre-NMS budget: keep the best ones
            if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
                per_box_cls, top_k_indices = \
                    per_box_cls.topk(per_pre_nms_top_n, sorted=False)
                per_class = per_class[top_k_indices]
                per_box_regression = per_box_regression[top_k_indices]
                per_grids = per_grids[top_k_indices]
            
            # grid point minus/plus side offsets -> (x1, y1, x2, y2)
            detections = torch.stack([
                per_grids[:, 0] - per_box_regression[:, 0],
                per_grids[:, 1] - per_box_regression[:, 1],
                per_grids[:, 0] + per_box_regression[:, 2],
                per_grids[:, 1] + per_box_regression[:, 3],
            ], dim=1) # n x 4

            # avoid invalid boxes in RoI heads
            detections[:, 2] = torch.max(detections[:, 2], detections[:, 0] + 0.01)
            detections[:, 3] = torch.max(detections[:, 3], detections[:, 1] + 0.01)
            boxlist = Instances(image_sizes[i])
            # the square root is applied whenever self.with_agn_hm is set
            boxlist.scores = torch.sqrt(per_box_cls) \
                if self.with_agn_hm else per_box_cls # n
            boxlist.pred_boxes = Boxes(detections)
            boxlist.pred_classes = per_class
            results.append(boxlist)
        return results


    def nms_and_topK(self, boxlists, nms=True):
        '''
        Post-process the detections of every image: optional multi-label
        NMS, then keep the detections whose score reaches the value of the
        post_nms_topk-th best one. Train or test thresholds are selected by
        self.training.
        '''
        if self.training:
            iou_thresh = self.nms_thresh_train
            max_dets = self.post_nms_topk_train
        else:
            iou_thresh = self.nms_thresh_test
            max_dets = self.post_nms_topk_test

        kept_per_image = []
        for boxlist in boxlists:
            kept = ml_nms(boxlist, iou_thresh) if nms else boxlist
            if self.debug:
                print('#proposals before nms', len(boxlist))
                print('#proposals after nms', len(kept))
            total = len(kept)
            if total > max_dets:
                scores = kept.scores
                # score of the max_dets-th best detection
                cutoff, _ = torch.kthvalue(
                    scores.float().cpu(), total - max_dets + 1)
                survivors = torch.nonzero(scores >= cutoff.item()).squeeze(1)
                kept = kept[survivors]
            if self.debug:
                print('#proposals after filter', len(kept))
            kept_per_image.append(kept)
        return kept_per_image


    def _add_more_pos(self, reg_pred, gt_instances, shapes_per_level):
        '''
        Select extra positive locations from the 3x3 neighbourhood of every
        object centre, ranked by the regression loss of the current
        predictions.

        Inputs:
            reg_pred: indexed with flattened location indices and viewed as
                rows of 4 values
            gt_instances, shapes_per_level: forwarded to _get_c33_inds
        Return:
            pos_inds: P flattened location indices
            labels: P class labels of the owning objects
        '''
        labels, level_masks, c33_inds, c33_masks, c33_regs = \
            self._get_c33_inds(gt_instances, shapes_per_level)
        N, L, K = labels.shape[0], len(self.strides), 9
        # masked-out entries are redirected to index 0 before the lookup
        # below; their loss is overwritten with INF afterwards
        c33_inds[c33_masks == 0] = 0
        reg_pred_c33 = reg_pred[c33_inds].detach() # viewed as (N*L*K) x 4 below
        invalid_reg = c33_masks == 0
        c33_regs_expand = c33_regs.view(N * L * K, 4).clamp(min=0)
        if N > 0:
            # the loss is only used for ranking, no gradient is needed
            with torch.no_grad():
                c33_reg_loss = self.iou_loss(
                    reg_pred_c33.view(N * L * K, 4), 
                    c33_regs_expand, None,
                    reduction='none').view(N, L, K).detach() # N x L x K
        else:
            c33_reg_loss = reg_pred_c33.new_zeros((N, L, K)).detach()
        c33_reg_loss[invalid_reg] = INF # N x L x K
        # index 4 of the 3x3 window is the centre itself; on the levels
        # flagged by level_masks its loss is forced to 0
        c33_reg_loss.view(N * L, K)[level_masks.view(N * L), 4] = 0 # real center
        c33_reg_loss = c33_reg_loss.view(N, L * K)
        if N == 0:
            # no objects: kthvalue is skipped
            loss_thresh = c33_reg_loss.new_ones((N)).float()
        else:
            # per object: the more_pos_topk-th smallest loss over all levels
            loss_thresh = torch.kthvalue(
                c33_reg_loss, self.more_pos_topk, dim=1)[0] # N
        # the threshold is capped at more_pos_thresh
        loss_thresh[loss_thresh > self.more_pos_thresh] = self.more_pos_thresh # N
        # strict comparison: a location must be below the threshold
        new_pos = c33_reg_loss.view(N, L, K) < \
            loss_thresh.view(N, 1, 1).expand(N, L, K)
        pos_inds = c33_inds[new_pos].view(-1) # P
        labels = labels.view(N, 1, 1).expand(N, L, K)[new_pos].view(-1)
        return pos_inds, labels
        
    
    def _get_c33_inds(self, gt_instances, shapes_per_level):
        '''
        TODO (Xingyi): The current implementation is ugly. Refactor.
        Get the center (and the 3x3 region near center) locations of each objects
        Inputs:
            gt_instances: [n_i], sum n_i = N
            shapes_per_level: L x 2 [(h_l, w_l)]_L
        Return:
            labels: N class labels
            level_masks: N x L, true where the centre cell lies inside the
                box and assign_fpn_level accepts the level
            c33_inds: N x L x K flattened location indices (K = 9)
            c33_masks: N x L x K, true where the neighbour is inside the
                feature map and its regression target is non-negative
            c33_regs: N x L x K x 4 regression targets divided by the stride
        Images without boxes contribute nothing; when no image has boxes,
        empty tensors of the matching shapes are returned.
        '''
        labels = []
        level_masks = []
        c33_inds = []
        c33_masks = []
        c33_regs = []
        L = len(self.strides)
        B = len(gt_instances)
        shapes_per_level = shapes_per_level.long()
        loc_per_level = (shapes_per_level[:, 0] * shapes_per_level[:, 1]).long() # L
        # offset of each level in the flattened index space: every level
        # occupies B * h_l * w_l consecutive entries
        level_bases = []
        s = 0
        for l in range(L):
            level_bases.append(s)
            s = s + B * loc_per_level[l]
        level_bases = shapes_per_level.new_tensor(level_bases).long() # L
        strides_default = shapes_per_level.new_tensor(self.strides).float() # L
        K = 9
        # offsets of the 3x3 window in row-major order; entry 4 is (0, 0)
        dx = shapes_per_level.new_tensor([-1, 0, 1, -1, 0, 1, -1, 0, 1]).long()
        dy = shapes_per_level.new_tensor([-1, -1, -1, 0, 0, 0, 1, 1, 1]).long()
        for im_i in range(B):
            targets_per_im = gt_instances[im_i]
            bboxes = targets_per_im.gt_boxes.tensor # n x 4
            n = bboxes.shape[0]
            if n == 0:
                continue
            centers = ((bboxes[:, [0, 1]] + bboxes[:, [2, 3]]) / 2) # n x 2
            centers = centers.view(n, 1, 2).expand(n, L, 2)

            strides = strides_default.view(1, L, 1).expand(n, L, 2) # n x L x 2
            # cell containing the centre, and the point in the middle of it
            centers_inds = (centers / strides).long() # n x L x 2
            center_grids = centers_inds * strides + strides // 2 # n x L x 2
            # distances from that point to the four box sides
            l = center_grids[:, :, 0] - bboxes[:, 0].view(n, 1).expand(n, L)
            t = center_grids[:, :, 1] - bboxes[:, 1].view(n, 1).expand(n, L)
            r = bboxes[:, 2].view(n, 1).expand(n, L) - center_grids[:, :, 0]
            b = bboxes[:, 3].view(n, 1).expand(n, L) - center_grids[:, :, 1] # n x L
            reg = torch.stack([l, t, r, b], dim=2) # n x L x 4
            # expressed in units of the level stride
            reg = reg / strides_default.view(1, L, 1).expand(n, L, 4).float()
            
            Ws = shapes_per_level[:, 1].view(1, L).expand(n, L)
            Hs = shapes_per_level[:, 0].view(1, L).expand(n, L)
            expand_Ws = Ws.view(n, L, 1).expand(n, L, K)
            expand_Hs = Hs.view(n, L, 1).expand(n, L, K)
            label = targets_per_im.gt_classes.view(n).clone()
            # the centre point must lie inside the box ...
            mask = reg.min(dim=2)[0] >= 0 # n x L
            # ... and the level must be responsible for the box size
            mask = mask & self.assign_fpn_level(bboxes)
            labels.append(label) # n
            level_masks.append(mask) # n x L

            Dy = dy.view(1, 1, K).expand(n, L, K)
            Dx = dx.view(1, 1, K).expand(n, L, K)
            # flattened index = level base + image offset + row * W + column
            c33_ind = level_bases.view(1, L, 1).expand(n, L, K) + \
                       im_i * loc_per_level.view(1, L, 1).expand(n, L, K) + \
                       (centers_inds[:, :, 1:2].expand(n, L, K) + Dy) * expand_Ws + \
                       (centers_inds[:, :, 0:1].expand(n, L, K) + Dx) # n x L x K
            
            # neighbours must stay inside the feature map (dx / dy broadcast
            # over the last dimension)
            c33_mask = \
                ((centers_inds[:, :, 1:2].expand(n, L, K) + dy) < expand_Hs) & \
                ((centers_inds[:, :, 1:2].expand(n, L, K) + dy) >= 0) & \
                ((centers_inds[:, :, 0:1].expand(n, L, K) + dx) < expand_Ws) & \
                ((centers_inds[:, :, 0:1].expand(n, L, K) + dx) >= 0)
            # TODO (Xingyi): think about better way to implement this
            # Currently it hard codes the 3x3 region
            # Shifting by one cell changes the opposite side distances by one
            # stride unit: window columns [0, 3, 6] have dx = -1, columns
            # [2, 5, 8] have dx = +1, rows [0, 1, 2] have dy = -1 and rows
            # [6, 7, 8] have dy = +1.
            c33_reg = reg.view(n, L, 1, 4).expand(n, L, K, 4).clone()
            c33_reg[:, :, [0, 3, 6], 0] -= 1
            c33_reg[:, :, [0, 3, 6], 2] += 1
            c33_reg[:, :, [2, 5, 8], 0] += 1
            c33_reg[:, :, [2, 5, 8], 2] -= 1
            c33_reg[:, :, [0, 1, 2], 1] -= 1
            c33_reg[:, :, [0, 1, 2], 3] += 1
            c33_reg[:, :, [6, 7, 8], 1] += 1
            c33_reg[:, :, [6, 7, 8], 3] -= 1
            # drop neighbours whose shifted target has a negative distance
            c33_mask = c33_mask & (c33_reg.min(dim=3)[0] >= 0) # n x L x K
            c33_inds.append(c33_ind)
            c33_masks.append(c33_mask)
            c33_regs.append(c33_reg)
        
        if len(level_masks) > 0:
            labels = torch.cat(labels, dim=0)
            level_masks = torch.cat(level_masks, dim=0)
            c33_inds = torch.cat(c33_inds, dim=0).long()
            c33_regs = torch.cat(c33_regs, dim=0)
            c33_masks = torch.cat(c33_masks, dim=0)
        else:
            # no ground-truth box in the whole batch
            labels = shapes_per_level.new_zeros((0)).long()
            level_masks = shapes_per_level.new_zeros((0, L)).bool()
            c33_inds = shapes_per_level.new_zeros((0, L, K)).long()
            c33_regs = shapes_per_level.new_zeros((0, L, K, 4)).float()
            c33_masks = shapes_per_level.new_zeros((0, L, K)).bool()
        return labels, level_masks, c33_inds, c33_masks, c33_regs # N x L, N x L x K


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/dense_heads/centernet_head.py
================================================
import math
from typing import List
import torch
from torch import nn
from torch.nn import functional as F

from detectron2.layers import ShapeSpec, get_norm
from detectron2.config import configurable
from ..layers.deform_conv import DFConv2d

# Names exported by a star-import of this module.
__all__ = ["CenterNetHead"]

class Scale(nn.Module):
    """Multiply the input by a single learnable scalar.

    The scalar is stored as a one-element float parameter named ``scale``
    and initialised to ``init_value``.
    """

    def __init__(self, init_value=1.0):
        super().__init__()
        initial = torch.FloatTensor([init_value])
        self.scale = nn.Parameter(initial)

    def forward(self, input):
        """Return ``input`` scaled by the learnable factor."""
        scaled = input * self.scale
        return scaled

class CenterNetHead(nn.Module):
    """Convolutional head producing, for every feature level, class logits,
    box regression values and (optionally) a class-agnostic heatmap.

    The head is made of three towers of 3x3 convolutions ("share", "cls"
    and "bbox") followed by 3x3 output convolutions. The same modules are
    applied to every level; only the regression ``Scale`` is per level.
    """
    @configurable
    def __init__(self, 
        # input_shape: List[ShapeSpec],
        in_channels,
        num_levels,
        *,
        num_classes=80,
        with_agn_hm=False,
        only_proposal=False,
        norm='GN',
        num_cls_convs=4,
        num_box_convs=4,
        num_share_convs=0,
        use_deformable=False,
        prior_prob=0.01):
        """
        Args:
            in_channels: channel count of every input feature map; also the
                width of all towers.
            num_levels: number of feature levels (one ``Scale`` each).
            num_classes: output channels of the class logits.
            with_agn_hm: add the 1-channel class-agnostic heatmap output.
            only_proposal: skip the class branch (no cls tower convs and no
                ``cls_logits`` layer).
            norm: normalisation after each tower conv; '' disables it.
            num_cls_convs, num_box_convs, num_share_convs: tower depths.
            use_deformable: use ``DFConv2d`` for the last conv of each tower.
            prior_prob: initial sigmoid output of the heatmap / class layers.
        """
        super().__init__()
        self.num_classes = num_classes
        self.with_agn_hm = with_agn_hm
        self.only_proposal = only_proposal
        self.out_kernel = 3

        # tower name -> (number of convs, whether to use deformable conv)
        head_configs = {
            "cls": (num_cls_convs if not self.only_proposal else 0, \
                use_deformable),
            "bbox": (num_box_convs, use_deformable),
            "share": (num_share_convs, use_deformable)}

        # in_channels = [s.channels for s in input_shape]
        # assert len(set(in_channels)) == 1, \
        #     "Each level must have the same channel!"
        # in_channels = in_channels[0]
        channels = {
            'cls': in_channels,
            'bbox': in_channels,
            'share': in_channels,
        }
        for head in head_configs:
            tower = []
            num_convs, use_deformable = head_configs[head]
            channel = channels[head]
            for i in range(num_convs):
                # only the last conv of a tower may be deformable
                if use_deformable and i == num_convs - 1:
                    conv_func = DFConv2d
                else:
                    conv_func = nn.Conv2d
                tower.append(conv_func(
                        in_channels if i == 0 else channel,
                        channel, 
                        kernel_size=3, stride=1,
                        padding=1, bias=True
                ))
                # 25 groups are used when the width is not a multiple of 32
                if norm == 'GN' and channel % 32 != 0:
                    tower.append(nn.GroupNorm(25, channel))
                elif norm != '':
                    tower.append(get_norm(norm, channel))
                tower.append(nn.ReLU())
            # registers self.cls_tower / self.bbox_tower / self.share_tower
            self.add_module('{}_tower'.format(head),
                            nn.Sequential(*tower))

        # 4 regression channels per location
        self.bbox_pred = nn.Conv2d(
            in_channels, 4, kernel_size=self.out_kernel,
            stride=1, padding=self.out_kernel // 2
        )

        # one learnable multiplier per feature level
        self.scales = nn.ModuleList(
            [Scale(init_value=1.0) for _ in range(num_levels)])

        # normal(std=0.01) weights and zero bias for every nn.Conv2d found
        # in the towers and in bbox_pred
        for modules in [
            self.cls_tower, self.bbox_tower,
            self.share_tower,
            self.bbox_pred,
        ]:
            for l in modules.modules():
                if isinstance(l, nn.Conv2d):
                    torch.nn.init.normal_(l.weight, std=0.01)
                    torch.nn.init.constant_(l.bias, 0)
        
        # the regression bias starts at 8 instead of 0
        torch.nn.init.constant_(self.bbox_pred.bias, 8.)
        prior_prob = prior_prob
        # bias b such that sigmoid(b) == prior_prob
        bias_value = -math.log((1 - prior_prob) / prior_prob)

        if self.with_agn_hm:
            self.agn_hm = nn.Conv2d(
                in_channels, 1, kernel_size=self.out_kernel,
                stride=1, padding=self.out_kernel // 2
            )
            torch.nn.init.constant_(self.agn_hm.bias, bias_value)
            torch.nn.init.normal_(self.agn_hm.weight, std=0.01)

        if not self.only_proposal:
            cls_kernel_size = self.out_kernel
            self.cls_logits = nn.Conv2d(
                in_channels, self.num_classes,
                kernel_size=cls_kernel_size, 
                stride=1,
                padding=cls_kernel_size // 2,
            )

            torch.nn.init.constant_(self.cls_logits.bias, bias_value)
            torch.nn.init.normal_(self.cls_logits.weight, std=0.01)

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Translate ``cfg.MODEL.CENTERNET`` and the input shapes into the
        keyword arguments of ``__init__``.

        The channel count is read from the first entry of ``input_shape``
        and the number of levels is ``len(input_shape)``.
        """
        ret = {
            # 'input_shape': input_shape,
            'in_channels': [s.channels for s in input_shape][0],
            'num_levels': len(input_shape),
            'num_classes': cfg.MODEL.CENTERNET.NUM_CLASSES,
            'with_agn_hm': cfg.MODEL.CENTERNET.WITH_AGN_HM,
            'only_proposal': cfg.MODEL.CENTERNET.ONLY_PROPOSAL,
            'norm': cfg.MODEL.CENTERNET.NORM,
            'num_cls_convs': cfg.MODEL.CENTERNET.NUM_CLS_CONVS,
            'num_box_convs': cfg.MODEL.CENTERNET.NUM_BOX_CONVS,
            'num_share_convs': cfg.MODEL.CENTERNET.NUM_SHARE_CONVS,
            'use_deformable': cfg.MODEL.CENTERNET.USE_DEFORMABLE,
            'prior_prob': cfg.MODEL.CENTERNET.PRIOR_PROB,
        }
        return ret

    def forward(self, x):
        """Run the head on a sequence of feature maps, one per level.

        Returns three lists with one entry per level:
            clss: class logits, or None when ``only_proposal`` is set
            bbox_reg: non-negative regression values (scaled, then ReLU)
            agn_hms: agnostic heatmap logits, or None without ``with_agn_hm``
        """
        clss = []
        bbox_reg = []
        agn_hms = []
        for l, feature in enumerate(x):
            feature = self.share_tower(feature)
            cls_tower = self.cls_tower(feature)
            bbox_tower = self.bbox_tower(feature)
            if not self.only_proposal:
                clss.append(self.cls_logits(cls_tower))
            else:
                clss.append(None)

            # the agnostic heatmap is predicted from the bbox tower features
            if self.with_agn_hm:
                agn_hms.append(self.agn_hm(bbox_tower))
            else:
                agn_hms.append(None)
            reg = self.bbox_pred(bbox_tower)
            reg = self.scales[l](reg)
            bbox_reg.append(F.relu(reg))
        
        return clss, bbox_reg, agn_hms

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/dense_heads/utils.py
================================================
import cv2
import torch
from torch import nn
from detectron2.utils.comm import get_world_size
from detectron2.structures import pairwise_iou, Boxes
# from .data import CenterNetCrop
import torch.nn.functional as F
import numpy as np
from detectron2.structures import Boxes, ImageList, Instances

# Names exported by a star-import of this module.
__all__ = ['reduce_sum', '_transpose']

# Large integer constant; the two functions defined in this module do not
# reference it.
INF = 1000000000

def _transpose(training_targets, num_loc_list):
    '''
    Regroup image-first training targets into level-first ones.

    Every per-image tensor is split along dim 0 into chunks of the sizes in
    num_loc_list, and the chunks belonging to the same level are then
    concatenated over images. Note that the entries of training_targets are
    replaced in place by their tuples of chunks.

    :return: level first training targets (one tensor per level)
    '''
    for idx, per_image in enumerate(training_targets):
        training_targets[idx] = torch.split(per_image, num_loc_list, dim=0)

    return [
        torch.cat(level_chunks, dim=0)
        for level_chunks in zip(*training_targets)
    ]


def reduce_sum(tensor):
    '''
    Sum a tensor over all distributed workers.

    With fewer than two workers the input tensor itself is returned.
    Otherwise a clone is all-reduced with SUM and returned, so the input is
    left unmodified.
    '''
    if get_world_size() >= 2:
        summed = tensor.clone()
        torch.distributed.all_reduce(
            summed, op=torch.distributed.ReduceOp.SUM)
        return summed
    return tensor

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/layers/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/layers/deform_conv.py
================================================
import torch
from torch import nn

from detectron2.layers import Conv2d


class _NewEmptyTensorOp(torch.autograd.Function):
    """Autograd function returning an uninitialised tensor of a given shape.

    The output shares dtype and device with the input. The backward pass
    returns an uninitialised tensor shaped like the original input.
    """

    @staticmethod
    def forward(ctx, x, new_shape):
        # remember the input shape for the backward pass
        ctx.shape = x.shape
        result = x.new_empty(new_shape)
        return result

    @staticmethod
    def backward(ctx, grad):
        grad_input = _NewEmptyTensorOp.apply(grad, ctx.shape)
        # new_shape is not a tensor, so it receives no gradient
        return grad_input, None


class DFConv2d(nn.Module):
    """Deformable convolutional layer.

    A regular convolution (``self.offset``) predicts the sampling offsets
    (and, for the modulated variant, the modulation mask) which are then fed
    to a deformable convolution (``self.conv``).

    Args:
        in_channels, out_channels: channel counts of the deformable conv.
        with_modulated_dcn: use ``ModulatedDeformConv`` (offsets + mask)
            instead of ``DeformConv`` (offsets only).
        kernel_size, stride, dilation: ints, or 2-element lists/tuples. When
            ``kernel_size`` is a list/tuple, ``stride`` and ``dilation``
            must be too.
        groups, deformable_groups, bias: forwarded to the deformable conv.
        padding: accepted for signature compatibility only. The padding is
            always derived as ``dilation * (kernel_size - 1) // 2``.
    """
    def __init__(
            self,
            in_channels,
            out_channels,
            with_modulated_dcn=True,
            kernel_size=3,
            stride=1,
            groups=1,
            dilation=1,
            deformable_groups=1,
            bias=False,
            padding=None
    ):
        super(DFConv2d, self).__init__()
        if isinstance(kernel_size, (list, tuple)):
            assert isinstance(stride, (list, tuple))
            assert isinstance(dilation, (list, tuple))
            assert len(kernel_size) == 2
            assert len(stride) == 2
            assert len(dilation) == 2
            padding = (
                dilation[0] * (kernel_size[0] - 1) // 2,
                dilation[1] * (kernel_size[1] - 1) // 2
            )
            offset_base_channels = kernel_size[0] * kernel_size[1]
        else:
            padding = dilation * (kernel_size - 1) // 2
            offset_base_channels = kernel_size * kernel_size
        if with_modulated_dcn:
            from detectron2.layers.deform_conv import ModulatedDeformConv
            # 2 offset values + 1 mask value per kernel position
            offset_channels = offset_base_channels * 3  # default: 27
            conv_block = ModulatedDeformConv
        else:
            from detectron2.layers.deform_conv import DeformConv
            # 2 offset values per kernel position
            offset_channels = offset_base_channels * 2  # default: 18
            conv_block = DeformConv
        self.offset = Conv2d(
            in_channels,
            deformable_groups * offset_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=1,
            dilation=dilation
        )
        # zero-initialised offset branch
        nn.init.constant_(self.offset.weight, 0)
        nn.init.constant_(self.offset.bias, 0)
        self.conv = conv_block(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            deformable_groups=deformable_groups,
            bias=bias
        )
        self.with_modulated_dcn = with_modulated_dcn
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        # number of leading channels of the offset branch holding offsets;
        # the remaining channels (modulated variant) hold the mask
        self.offset_split = offset_base_channels * deformable_groups * 2

    @staticmethod
    def _as_pair(value):
        """Return ``value`` as a (height, width) tuple, duplicating scalars."""
        if isinstance(value, (list, tuple)):
            return tuple(value)
        return (value, value)

    def forward(self, x, return_offset=False):
        """Apply the deformable convolution.

        Args:
            x: input tensor whose last two dimensions are spatial.
            return_offset: for non-empty inputs, also return the raw output
                of the offset branch.

        Returns:
            The convolved tensor (and the offset branch output when
            ``return_offset`` is set). For an input with zero elements an
            uninitialised tensor of the expected output shape is returned.
        """
        if x.numel() > 0:
            if not self.with_modulated_dcn:
                offset_mask = self.offset(x)
                x = self.conv(x, offset_mask)
            else:
                offset_mask = self.offset(x)
                offset = offset_mask[:, :self.offset_split, :, :]
                mask = offset_mask[:, self.offset_split:, :, :].sigmoid()
                x = self.conv(x, offset, mask)
            if return_offset:
                return x, offset_mask
            return x
        # Empty input: compute the output shape by hand. The hyper-parameters
        # may be stored as plain ints (the default), so they are expanded to
        # (height, width) pairs before being zipped with the spatial dims.
        output_shape = [
            (i + 2 * p - (di * (k - 1) + 1)) // d + 1
            for i, p, di, k, d in zip(
                x.shape[-2:],
                self._as_pair(self.padding),
                self._as_pair(self.dilation),
                self._as_pair(self.kernel_size),
                self._as_pair(self.stride)
            )
        ]
        output_shape = [x.shape[0], self.conv.weight.shape[0]] + output_shape
        return _NewEmptyTensorOp.apply(x, output_shape)

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/layers/heatmap_focal_loss.py
================================================
import torch
from torch.nn import functional as F

# TODO: merge these two function
def heatmap_focal_loss(
    inputs,
    targets,
    pos_inds,
    labels,
    alpha: float = -1,
    beta: float = 4,
    gamma: float = 2,
    reduction: str = 'sum',
    sigmoid_clamp: float = 1e-4,
    ignore_high_fp: float = -1.,
):
    """
    Focal loss on multi-class heatmaps, split into a positive and a negative
    term.

    The sigmoid is applied to `inputs` IN PLACE, then clamped to
    [sigmoid_clamp, 1 - sigmoid_clamp]. The positive term only looks at the
    (pos_inds, labels) entries; the negative term covers every entry and is
    down-weighted by (1 - targets) ** beta.

    Args:
        inputs:  (sum_l N*Hl*Wl, C) logits, overwritten with their sigmoid
        targets: (sum_l N*Hl*Wl, C)
        pos_inds: N row indices of the positive locations
        labels: N class indices of the positive locations
        alpha: if >= 0, weight the positive term by alpha and the negative
            term by (1 - alpha)
        reduction: 'sum' sums both terms; any other value leaves them
            unreduced
        ignore_high_fp: if > 0, negatives whose probability is not below
            this value contribute nothing
    Returns:
        (pos_loss, neg_loss)
    """
    prob = torch.clamp(
        inputs.sigmoid_(), min=sigmoid_clamp, max=1 - sigmoid_clamp)

    # positive term: probability of the labelled class at each positive row
    pos_prob = torch.gather(prob[pos_inds], 1, labels.unsqueeze(1))
    pos_term = torch.log(pos_prob) * torch.pow(1 - pos_prob, gamma)

    # negative term over the whole heatmap
    neg_scale = torch.pow(1 - targets, beta)
    neg_term = torch.log(1 - prob) * torch.pow(prob, gamma) * neg_scale

    if ignore_high_fp > 0:
        keep = (prob < ignore_high_fp).float()
        neg_term = keep * neg_term

    if reduction == "sum":
        pos_term = pos_term.sum()
        neg_term = neg_term.sum()

    if alpha >= 0:
        pos_term = alpha * pos_term
        neg_term = (1 - alpha) * neg_term

    return - pos_term, - neg_term

# TorchScript-compiled variant of heatmap_focal_loss, built at import time.
heatmap_focal_loss_jit = torch.jit.script(heatmap_focal_loss)
# Eager (non-scripted) alternative:
# heatmap_focal_loss_jit = heatmap_focal_loss

def binary_heatmap_focal_loss(
    inputs,
    targets,
    pos_inds,
    alpha: float = -1,
    beta: float = 4,
    gamma: float = 2,
    sigmoid_clamp: float = 1e-4,
    ignore_high_fp: float = -1.,
):
    """
    Focal loss on a single-channel (class-agnostic) heatmap, split into a
    positive and a negative term. Both terms are always summed.

    The sigmoid is applied to `inputs` IN PLACE, then clamped to
    [sigmoid_clamp, 1 - sigmoid_clamp]. Entries of `pos_inds` that point
    past the end of `inputs` are reported on stdout and clamped IN PLACE to
    the last valid index.

    Args:
        inputs:  (sum_l N*Hl*Wl,) logits, overwritten with their sigmoid
        targets: (sum_l N*Hl*Wl,)
        pos_inds: N integer tensor with the indices of positive locations
        alpha: if >= 0, weight the positive term by alpha and the negative
            term by (1 - alpha)
        beta: exponent of the (1 - targets) weight of the negative term
        gamma: focusing exponent
        ignore_high_fp: if > 0, negatives whose probability is not below
            this value contribute nothing
    Returns:
        (pos_loss, neg_loss) as summed scalar tensors
    """
    pred = torch.clamp(inputs.sigmoid_(), min=sigmoid_clamp, max=1-sigmoid_clamp)
    neg_weights = torch.pow(1 - targets, beta)

    # Vectorised bounds check: one comparison for all positives instead of
    # a Python-level loop with one tensor comparison per index.
    num_locs = pred.shape[0]
    out_of_range = pos_inds >= num_locs
    if out_of_range.any():
        print('%'*100)
        print(pred.shape, pos_inds[out_of_range], pos_inds)
        pos_inds[out_of_range] = num_locs - 1

    pos_pred = pred[pos_inds] # N
    pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, gamma)
    neg_loss = torch.log(1 - pred) * torch.pow(pred, gamma) * neg_weights
    if ignore_high_fp > 0:
        not_high_fp = (pred < ignore_high_fp).float()
        neg_loss = not_high_fp * neg_loss

    pos_loss = - pos_loss.sum()
    neg_loss = - neg_loss.sum()

    if alpha >= 0:
        pos_loss = alpha * pos_loss
        neg_loss = (1 - alpha) * neg_loss

    return pos_loss, neg_loss

# binary_heatmap_focal_loss_jit = torch.jit.script(binary_heatmap_focal_loss)

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/layers/iou_loss.py
================================================
import torch
from torch import nn


class IOULoss(nn.Module):
    """IoU-based regression loss for boxes given as four side distances.

    Both `pred` and `target` hold (left, top, right, bottom) distances in
    columns 0..3, so widths are left + right and heights are top + bottom.
    `loc_loss_type` selects the variant: 'iou' (-log IoU), 'linear_iou'
    (1 - IoU) or 'giou' (1 - GIoU).
    """

    def __init__(self, loc_loss_type='iou'):
        super().__init__()
        self.loc_loss_type = loc_loss_type

    def forward(self, pred, target, weight=None, reduction='sum'):
        """Compute the loss.

        Args:
            pred, target: tensors indexed as [:, 0..3]
            weight: optional per-row multiplier
            reduction: 'sum', 'batch' or 'none'

        Raises:
            NotImplementedError: unknown loss type or reduction.
        """
        p_left, p_top = pred[:, 0], pred[:, 1]
        p_right, p_bottom = pred[:, 2], pred[:, 3]
        t_left, t_top = target[:, 0], target[:, 1]
        t_right, t_bottom = target[:, 2], target[:, 3]

        area_target = (t_left + t_right) * (t_top + t_bottom)
        area_pred = (p_left + p_right) * (p_top + p_bottom)

        inter_w = torch.min(p_left, t_left) + torch.min(p_right, t_right)
        inter_h = torch.min(p_bottom, t_bottom) + torch.min(p_top, t_top)
        area_inter = inter_w * inter_h
        area_union = area_target + area_pred - area_inter

        # the +1 terms are part of the original formulation
        ious = (area_inter + 1.0) / (area_union + 1.0)

        kind = self.loc_loss_type
        if kind == 'iou':
            losses = -torch.log(ious)
        elif kind == 'linear_iou':
            losses = 1 - ious
        elif kind == 'giou':
            # smallest box enclosing both prediction and target
            hull_w = torch.max(p_left, t_left) + torch.max(p_right, t_right)
            hull_h = torch.max(p_bottom, t_bottom) + torch.max(p_top, t_top)
            area_hull = hull_w * hull_h
            gious = ious - (area_hull - area_union) / area_hull
            losses = 1 - gious
        else:
            raise NotImplementedError

        if weight is not None:
            losses = losses * weight

        if reduction == 'sum':
            return losses.sum()
        if reduction == 'batch':
            # NOTE(review): losses is 1-D when pred/target are 2-D, so
            # summing over dim 1 looks like it would fail -- confirm intent.
            return losses.sum(dim=[1])
        if reduction == 'none':
            return losses
        raise NotImplementedError


def giou_loss(
    boxes1: torch.Tensor,
    boxes2: torch.Tensor,
    reduction: str = "none",
    eps: float = 1e-7,
) -> torch.Tensor:
    """
    Generalized Intersection over Union Loss (Hamid Rezatofighi et. al)
    https://arxiv.org/abs/1902.09630
    Gradient-friendly IoU loss with an additional penalty that is non-zero when the
    boxes do not overlap and scales with the size of their smallest enclosing box.
    This loss is symmetric, so the boxes1 and boxes2 arguments are interchangeable.
    Args:
        boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,).
        reduction: 'none' | 'mean' | 'sum'
                 'none': No reduction will be applied to the output.
                 'mean': The output will be averaged. For an empty set of
                         boxes the result is 0 (not NaN).
                 'sum': The output will be summed.
                 Any other value leaves the loss unreduced.
        eps (float): small number to prevent division by zero
    Returns:
        Tensor: per-box loss ``1 - GIoU`` (shape (N,) or scalar for a (4,)
        input), or a scalar after 'mean' / 'sum' reduction.
    Raises:
        AssertionError: if any box in ``boxes1`` has x2 < x1 or y2 < y1.
    """

    x1, y1, x2, y2 = boxes1.unbind(dim=-1)
    x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)

    assert (x2 >= x1).all(), "bad box: x1 larger than x2"
    assert (y2 >= y1).all(), "bad box: y1 larger than y2"

    # Intersection keypoints
    xkis1 = torch.max(x1, x1g)
    ykis1 = torch.max(y1, y1g)
    xkis2 = torch.min(x2, x2g)
    ykis2 = torch.min(y2, y2g)

    # Only pairs that genuinely overlap contribute a non-zero intersection.
    intsctk = torch.zeros_like(x1)
    mask = (ykis2 > ykis1) & (xkis2 > xkis1)
    intsctk[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
    unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsctk
    iouk = intsctk / (unionk + eps)

    # smallest enclosing box
    xc1 = torch.min(x1, x1g)
    yc1 = torch.min(y1, y1g)
    xc2 = torch.max(x2, x2g)
    yc2 = torch.max(y2, y2g)

    area_c = (xc2 - xc1) * (yc2 - yc1)
    miouk = iouk - ((area_c - unionk) / (area_c + eps))

    loss = 1 - miouk

    if reduction == "mean":
        # mean() of an empty tensor is NaN, which would poison the total loss
        # on batches without boxes; 0.0 * sum() yields a zero that is still
        # connected to the autograd graph.
        loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
    elif reduction == "sum":
        loss = loss.sum()

    return loss

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/layers/ml_nms.py
================================================
from detectron2.layers import batched_nms


def ml_nms(boxlist, nms_thresh, max_proposals=-1,
           score_field="scores", label_field="labels"):
    """
    Run class-aware (batched) non-maximum suppression on a boxlist.

    Boxes and labels come from ``pred_boxes`` / ``pred_classes`` when the
    boxlist has a ``pred_boxes`` field; otherwise ``proposal_boxes`` is used
    with an all-zero label vector, so every box competes in one group.
    Scores are always read from ``boxlist.scores``.

    Arguments:
        boxlist: container providing ``has``, the fields above and indexing
        nms_thresh (float): NMS threshold; values <= 0 disable NMS and the
            boxlist is returned untouched
        max_proposals (int): if > 0, then only the top max_proposals are kept
            after non-maximum suppression
        score_field (str): accepted for interface compatibility; not read
        label_field (str): accepted for interface compatibility; not read
    """
    if nms_thresh <= 0:
        return boxlist

    if boxlist.has('pred_boxes'):
        box_tensor = boxlist.pred_boxes.tensor
        class_ids = boxlist.pred_classes
    else:
        box_tensor = boxlist.proposal_boxes.tensor
        class_ids = box_tensor.new_zeros(len(box_tensor))

    kept = batched_nms(box_tensor, boxlist.scores, class_ids, nms_thresh)
    if max_proposals > 0:
        kept = kept[:max_proposals]
    return boxlist[kept]


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/meta_arch/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/meta_arch/centernet_detector.py
================================================
import math
import json
import numpy as np
import torch
from torch import nn

from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling import build_backbone, build_proposal_generator
from detectron2.modeling import detector_postprocess
from detectron2.structures import ImageList

@META_ARCH_REGISTRY.register()
class CenterNetDetector(nn.Module):
    """
    Detector made of a backbone followed by a proposal generator.

    In training mode ``forward`` returns the proposal generator's losses; in
    eval mode it delegates to ``inference``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.mean, self.std = cfg.MODEL.PIXEL_MEAN, cfg.MODEL.PIXEL_STD
        # Buffers (not parameters) so they follow the module across devices.
        for buffer_name, values in (
            ("pixel_mean", cfg.MODEL.PIXEL_MEAN),
            ("pixel_std", cfg.MODEL.PIXEL_STD),
        ):
            self.register_buffer(buffer_name, torch.Tensor(values).view(-1, 1, 1))

        self.backbone = build_backbone(cfg)
        # TODO: change to a more precise name
        self.proposal_generator = build_proposal_generator(
            cfg, self.backbone.output_shape())

    def forward(self, batched_inputs):
        """
        Arguments:
            batched_inputs: sequence of dicts, each with an "image" entry and,
                in training mode, an "instances" entry
        Returns:
            the proposal generator's losses when training, otherwise the
            output of ``inference``
        """
        if self.training:
            images = self.preprocess_image(batched_inputs)
            features = self.backbone(images.tensor)
            gt_instances = [
                item["instances"].to(self.device) for item in batched_inputs]
            _, proposal_losses = self.proposal_generator(
                images, features, gt_instances)
            return proposal_losses
        return self.inference(batched_inputs)

    @property
    def device(self):
        """Device on which the ``pixel_mean`` buffer lives."""
        return self.pixel_mean.device

    @torch.no_grad()
    def inference(self, batched_inputs, do_postprocess=True):
        """
        Run backbone and proposal generator without ground truth.

        Arguments:
            batched_inputs: sequence of dicts with an "image" entry; optional
                "height" / "width" entries give the output resolution
            do_postprocess (bool): when True each result goes through
                ``detector_postprocess`` and is wrapped as {"instances": r};
                when False the raw per-image results are returned
        """
        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)
        proposals, _ = self.proposal_generator(images, features, None)

        outputs = []
        per_image = zip(proposals, batched_inputs, images.image_sizes)
        for instances, inputs, image_size in per_image:
            if not do_postprocess:
                outputs.append(instances)
                continue
            # Fall back to the network-input size when no target size is given.
            out_h = inputs.get("height", image_size[0])
            out_w = inputs.get("width", image_size[1])
            outputs.append(
                {"instances": detector_postprocess(instances, out_h, out_w)})
        return outputs

    def preprocess_image(self, batched_inputs):
        """
        Normalize, pad and batch the input images.
        """
        normalized = [
            (item["image"].to(self.device) - self.pixel_mean) / self.pixel_std
            for item in batched_inputs
        ]
        return ImageList.from_tensors(normalized, self.backbone.size_divisibility)


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/roi_heads/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/roi_heads/custom_fast_rcnn.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Part of the code is from https://github.com/tztztztztz/eql.detectron2/blob/master/projects/EQL/eql/fast_rcnn.py
import logging
import math
import json
from typing import Dict, Union
import torch
from fvcore.nn import giou_loss, smooth_l1_loss
from torch import nn
from torch.nn import functional as F

from detectron2.config import configurable
from detectron2.layers import Linear, ShapeSpec, batched_nms, cat, nonzero_tuple
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.structures import Boxes, Instances
from detectron2.utils.events import get_event_storage
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers
from detectron2.modeling.roi_heads.fast_rcnn import fast_rcnn_inference
from detectron2.modeling.roi_heads.fast_rcnn import _log_classification_stats
from detectron2.utils.comm import get_world_size
from .fed_loss import load_class_freq, get_fed_loss_inds

__all__ = ["CustomFastRCNNOutputLayers"]

class CustomFastRCNNOutputLayers(FastRCNNOutputLayers):
    """
    ``FastRCNNOutputLayers`` variant that keeps the config around, uses its
    own classification-loss helpers and can scale class scores by the
    proposal scores at inference time.
    """

    def __init__(
        self,
        cfg,
        input_shape: ShapeSpec,
        **kwargs
    ):
        super().__init__(cfg, input_shape, **kwargs)
        # Kept because ``inference`` reads MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE.
        self.cfg = cfg

    def losses(self, predictions, proposals):
        """
        Compute classification and box-regression losses.

        Arguments:
            predictions: pair ``(scores, proposal_deltas)``
            proposals: list of per-image proposals carrying ``gt_classes``,
                ``proposal_boxes`` and optionally ``gt_boxes``
        Returns:
            dict with keys "loss_cls" and "loss_box_reg"
        """
        scores, proposal_deltas = predictions
        if len(proposals):
            gt_classes = cat([p.gt_classes for p in proposals], dim=0)
        else:
            gt_classes = torch.empty(0)
        _log_classification_stats(scores, gt_classes)

        if len(proposals):
            proposal_boxes = cat(
                [p.proposal_boxes.tensor for p in proposals], dim=0)  # Nx4
            assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
            # Proposals without ground-truth boxes regress onto themselves.
            gt_boxes = cat(
                [
                    (p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor
                    for p in proposals
                ],
                dim=0,
            )
        else:
            proposal_boxes = gt_boxes = torch.empty(
                (0, 4), device=proposal_deltas.device)

        loss_cls = self.softmax_cross_entropy_loss(scores, gt_classes)
        loss_box_reg = self.box_reg_loss(
            proposal_boxes, gt_boxes, proposal_deltas, gt_classes)
        return {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg}

    def sigmoid_cross_entropy_loss(self, pred_class_logits, gt_classes):
        """
        Binary cross-entropy over all columns except the last one, summed and
        divided by the number of rows. Returns a zero scalar for empty logits.
        """
        if pred_class_logits.numel() == 0:
            # This is more robust than .sum() * 0.
            return pred_class_logits.new_zeros([1])[0]

        num_rows, num_cols = pred_class_logits.shape[0], pred_class_logits.shape[1]

        # One-hot over all columns first, then drop the last column so rows
        # labelled with the last index end up all-zero.
        one_hot = pred_class_logits.new_zeros(num_rows, num_cols)
        one_hot[range(len(gt_classes)), gt_classes] = 1
        one_hot = one_hot[:, :num_cols - 1]

        per_element = F.binary_cross_entropy_with_logits(
            pred_class_logits[:, :-1], one_hot, reduction='none')
        return torch.sum(per_element) / num_rows

    def softmax_cross_entropy_loss(self, pred_class_logits, gt_classes):
        """
        Mean softmax cross-entropy; returns a zero scalar for empty logits.
        """
        if pred_class_logits.numel() == 0:
            return pred_class_logits.new_zeros([1])[0]
        return F.cross_entropy(pred_class_logits, gt_classes, reduction="mean")

    def inference(self, predictions, proposals):
        """
        Predict boxes and scores and run ``fast_rcnn_inference`` on them.

        When ``cfg.MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE`` is set, each class
        score is replaced by the square root of its product with the
        proposal's ``objectness_logits`` value.
        """
        boxes = self.predict_boxes(predictions, proposals)
        scores = self.predict_probs(predictions, proposals)
        if self.cfg.MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE:
            proposal_scores = [p.get('objectness_logits') for p in proposals]
            scores = [
                (cls_score * prop_score[:, None]) ** 0.5
                for cls_score, prop_score in zip(scores, proposal_scores)
            ]
        image_shapes = [p.image_size for p in proposals]
        return fast_rcnn_inference(
            boxes,
            scores,
            image_shapes,
            self.test_score_thresh,
            self.test_nms_thresh,
            self.test_topk_per_image,
        )

    def predict_probs(self, predictions, proposals):
        """
        Softmax the class scores and split them back into per-image chunks.
        """
        scores, _ = predictions
        sizes = [len(p) for p in proposals]
        return F.softmax(scores, dim=-1).split(sizes, dim=0)


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/roi_heads/custom_roi_heads.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import numpy as np
import json
import math
import torch
from torch import nn
from torch.autograd.function import Function
from typing import Dict, List, Optional, Tuple, Union

from detectron2.layers import ShapeSpec
from detectron2.structures import Boxes, Instances, pairwise_iou
from detectron2.utils.events import get_event_storage

from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.roi_heads.fast_rcnn import fast_rcnn_inference
from detectron2.modeling.roi_heads.roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads
from detectron2.modeling.roi_heads.cascade_rcnn import CascadeROIHeads
from detectron2.modeling.roi_heads.box_head import build_box_head
from .custom_fast_rcnn import CustomFastRCNNOutputLayers


@ROI_HEADS_REGISTRY.register()
class CustomROIHeads(StandardROIHeads):
    """
    ``StandardROIHeads`` that uses ``CustomFastRCNNOutputLayers`` as its box
    predictor and can visualise second-stage output when ``cfg.DEBUG`` is set.
    """

    @classmethod
    def _init_box_head(cls, cfg, input_shape):
        """
        Build the box head arguments, swapping in the custom box predictor.

        Because this is a classmethod, the debug settings assigned below are
        stored on the class itself rather than on an instance.
        """
        ret = super()._init_box_head(cfg, input_shape)
        ret.pop('box_predictor')
        ret['box_predictor'] = CustomFastRCNNOutputLayers(
            cfg, ret['box_head'].output_shape)

        cls.debug = cfg.DEBUG
        if cls.debug:
            cls.debug_show_name = cfg.DEBUG_SHOW_NAME
            cls.save_debug = cfg.SAVE_DEBUG
            cls.vis_thresh = cfg.VIS_THRESH
            cls.pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(
                torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1)
            cls.pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(
                torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1)
        return ret

    def forward(self, images, features, proposals, targets=None):
        """
        Run the ROI heads.

        Returns:
            ``(proposals, losses)`` in training mode, ``(pred_instances, {})``
            otherwise. ``images`` is only used by the debug visualisation.
        """
        if not self.debug:
            del images

        if self.training:
            assert targets
            proposals = self.label_and_sample_proposals(proposals, targets)
            del targets
            losses = self._forward_box(features, proposals)
            losses.update(self._forward_mask(features, proposals))
            losses.update(self._forward_keypoint(features, proposals))
            return proposals, losses

        del targets
        pred_instances = self._forward_box(features, proposals)
        pred_instances = self.forward_with_given_boxes(features, pred_instances)
        if self.debug:
            from ..debug import debug_second_stage

            def denormalizer(x):
                # Invert the (x - mean) / std input normalisation.
                return x * self.pixel_std + self.pixel_mean

            debug_second_stage(
                [denormalizer(images[0].clone())],
                pred_instances, proposals=proposals,
                debug_show_name=self.debug_show_name)
        return pred_instances, {}


@ROI_HEADS_REGISTRY.register()
class CustomCascadeROIHeads(CascadeROIHeads):
    """
    ``CascadeROIHeads`` that uses ``CustomFastRCNNOutputLayers`` for every
    cascade stage and can scale the averaged class scores by the proposal
    scores at test time.
    """

    @classmethod
    def _init_box_head(cls, cfg, input_shape):
        """
        Build the box head arguments with one custom predictor per stage.

        Because this is a classmethod, ``mult_proposal_score`` and the debug
        settings are stored on the class itself rather than on an instance.
        """
        cls.mult_proposal_score = cfg.MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE
        ret = super()._init_box_head(cfg, input_shape)
        ret.pop('box_predictors')

        stage_reg_weights = cfg.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS
        ret['box_predictors'] = [
            CustomFastRCNNOutputLayers(
                cfg, head.output_shape,
                box2box_transform=Box2BoxTransform(weights=reg_weights)
            )
            for head, reg_weights in zip(ret['box_heads'], stage_reg_weights)
        ]

        cls.debug = cfg.DEBUG
        if cls.debug:
            cls.debug_show_name = cfg.DEBUG_SHOW_NAME
            cls.save_debug = cfg.SAVE_DEBUG
            cls.vis_thresh = cfg.VIS_THRESH
            cls.pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(
                torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1)
            cls.pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(
                torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1)
        return ret

    def _forward_box(self, features, proposals, targets=None):
        """
        Run all cascade stages.

        Returns:
            a dict of per-stage losses (keys suffixed with "_stage{k}") in
            training mode, otherwise the predicted instances. At test time,
            when ``mult_proposal_score`` is set, the stage-averaged scores are
            replaced by the square root of their product with the proposal
            scores.
        """
        if (not self.training) and self.mult_proposal_score:
            # Must be captured now: ``proposals`` is rebuilt by later stages.
            if len(proposals) > 0 and proposals[0].has('scores'):
                score_key = 'scores'
            else:
                score_key = 'objectness_logits'
            proposal_scores = [p.get(score_key) for p in proposals]

        features = [features[name] for name in self.box_in_features]
        image_sizes = [p.image_size for p in proposals]
        head_outputs = []  # (predictor, predictions, proposals)
        prev_pred_boxes = None
        for stage_idx in range(self.num_cascade_stages):
            if stage_idx > 0:
                # Later stages refine the boxes predicted by the previous one.
                proposals = self._create_proposals_from_boxes(
                    prev_pred_boxes, image_sizes)
                if self.training:
                    proposals = self._match_and_label_boxes(
                        proposals, stage_idx, targets)
            predictor = self.box_predictor[stage_idx]
            predictions = self._run_stage(features, proposals, stage_idx)
            prev_pred_boxes = predictor.predict_boxes(predictions, proposals)
            head_outputs.append((self.box_predictor[stage_idx], predictions, proposals))

        if self.training:
            losses = {}
            storage = get_event_storage()
            for stage_idx, (predictor, stage_preds, stage_props) in enumerate(head_outputs):
                with storage.name_scope("stage{}".format(stage_idx)):
                    stage_losses = predictor.losses(stage_preds, stage_props)
                for loss_name, loss_value in stage_losses.items():
                    losses[loss_name + "_stage{}".format(stage_idx)] = loss_value
            return losses

        # Each is a list[Tensor] of length #image. Each tensor is Ri x (K+1)
        scores_per_stage = [
            predictor.predict_probs(stage_preds, stage_props)
            for predictor, stage_preds, stage_props in head_outputs
        ]
        scores = [
            sum(list(scores_per_image)) * (1.0 / self.num_cascade_stages)
            for scores_per_image in zip(*scores_per_stage)
        ]

        if self.mult_proposal_score:
            scores = [
                (cls_score * prop_score[:, None]) ** 0.5
                for cls_score, prop_score in zip(scores, proposal_scores)
            ]

        # Boxes come from the last stage only.
        predictor, predictions, proposals = head_outputs[-1]
        boxes = predictor.predict_boxes(predictions, proposals)
        pred_instances, _ = fast_rcnn_inference(
            boxes,
            scores,
            image_sizes,
            predictor.test_score_thresh,
            predictor.test_nms_thresh,
            predictor.test_topk_per_image,
        )
        return pred_instances

    def forward(self, images, features, proposals, targets=None):
        '''
        Run the cascade ROI heads.

        Returns ``(proposals, losses)`` in training mode and
        ``(pred_instances, {})`` otherwise. ``images`` is only used by the
        debug visualisation.
        '''
        if not self.debug:
            del images

        if self.training:
            proposals = self.label_and_sample_proposals(proposals, targets)
            losses = self._forward_box(features, proposals, targets)
            losses.update(self._forward_mask(features, proposals))
            losses.update(self._forward_keypoint(features, proposals))
            return proposals, losses

        pred_instances = self._forward_box(features, proposals)
        pred_instances = self.forward_with_given_boxes(features, pred_instances)
        if self.debug:
            from ..debug import debug_second_stage

            def denormalizer(x):
                # Invert the (x - mean) / std input normalisation.
                return x * self.pixel_std + self.pixel_mean

            debug_second_stage(
                [denormalizer(x.clone()) for x in images],
                pred_instances, proposals=proposals,
                save_debug=self.save_debug,
                debug_show_name=self.debug_show_name,
                vis_thresh=self.vis_thresh)
        return pred_instances, {}


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/grit_src/centernet2/centernet/modeling/roi_heads/fed_loss.py
================================================
import torch
import json
import numpy as np
from torch.nn import functional as F

def load_class_freq(
    path='datasets/lvis/lvis_v1_train_cat_info.json',
    freq_weight=0.5):
    """
    Load per-category image counts and raise them to a power.

    Arguments:
        path (str): JSON file holding a list of dicts, each with an 'id' and
            an 'image_count' entry
        freq_weight (float): exponent applied to the image counts
    Returns:
        Tensor: float tensor of ``image_count ** freq_weight``, ordered by
        ascending category 'id'
    """
    # Context manager so the file handle is closed deterministically.
    with open(path, 'r') as f:
        cat_info = json.load(f)
    image_counts = torch.tensor(
        [c['image_count'] for c in sorted(cat_info, key=lambda x: x['id'])])
    return image_counts.float() ** freq_weight

def get_fed_loss_inds(
    gt_classes, num_sample_cats=50, C=1203,
    weight=None, fed_cls_inds=-1):
    """
    Return the class indices present in ``gt_classes``, padded with randomly
    sampled extra classes until ``num_sample_cats`` indices are available.

    Arguments:
        gt_classes (Tensor): class indices; index ``C`` is never sampled
        num_sample_cats (int): target number of returned indices; when at
            least this many distinct classes are present nothing is sampled
        C (int): number of sampleable classes
        weight (Tensor or None): optional sampling weights for classes 0..C-1;
            uniform when None
        fed_cls_inds (int): if > 0, classes with index >= this value are
            excluded from sampling
    Returns:
        Tensor: the distinct classes of ``gt_classes`` followed by the sampled
        ones (drawn without replacement via ``torch.multinomial``)
    """
    present = torch.unique(gt_classes)  # C'
    sampling_prob = present.new_ones(C + 1).float()
    sampling_prob[-1] = 0

    shortfall = num_sample_cats - len(present)
    if shortfall <= 0:
        return present

    if weight is not None:
        sampling_prob[:C] = weight.float().clone()
    # Classes that already appear must not be drawn again.
    sampling_prob[present] = 0
    if fed_cls_inds > 0:
        sampling_prob[fed_cls_inds:] = 0

    extra = torch.multinomial(sampling_prob, shortfall, replacement=False)
    return torch.cat([present, extra])

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/tag2Text/__init__.py
================================================
import sys
# Import-time side effect: append the relative directory
# 'third_party/grit_src' to the module search path. A relative sys.path entry
# is resolved against the process's current working directory.
# NOTE(review): this package's own directory is spelled 'third_pary' in the
# file listing -- confirm that 'third_party/grit_src' is the intended target.
sys.path.append('third_party/grit_src')


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/tag2Text/med.py
================================================
'''
 * Copyright (c) 2022, salesforce.com, inc.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
 * By Junnan Li
 * Based on huggingface code base
 * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert
'''

import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
from torch import Tensor, device, dtype, nn
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F

from transformers.activations import ACT2FN
from transformers.file_utils import (
    ModelOutput,
)
from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    NextSentencePredictorOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from transformers.modeling_utils import (
    PreTrainedModel,
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from transformers.utils import logging
from transformers.models.bert.configuration_bert import BertConfig


logger = logging.get_logger(__name__)


class BertEmbeddings_nopos(nn.Module):
    """Construct the embeddings from word embeddings only (no position embeddings)."""

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        self.word_embeddings = nn.Embedding(
            config.vocab_size, hidden_size, padding_idx=config.pad_token_id)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Unlike BertEmbeddings, no position embedding table or position_ids
        # buffer is created here.
        self.config = config

    def forward(
        self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        """
        Embed ``input_ids`` (or take ``inputs_embeds`` as given), then apply
        LayerNorm and dropout.

        ``position_ids`` and ``past_key_values_length`` are accepted for
        signature compatibility with ``BertEmbeddings`` and are not used.
        """
        if input_ids is None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            input_shape = input_ids.size()

        # Not used further; the lookup still fails for inputs that lack a
        # second (sequence) dimension.
        seq_length = input_shape[1]

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        return self.dropout(self.LayerNorm(inputs_embeds))


class BertEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        self.config = config

    def forward(
        self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        """
        Build the input embeddings.

        Args:
            input_ids: token ids, looked up in ``word_embeddings`` when
                ``inputs_embeds`` is not given
            position_ids: optional position indices; defaults to the slice
                ``[past_key_values_length, past_key_values_length + seq_length)``
                of the ``position_ids`` buffer
            inputs_embeds: optional precomputed embeddings used instead of
                ``input_ids``; never modified by this method
            past_key_values_length (int): offset of the first position
        Returns:
            Tensor: embeddings (plus position embeddings when
            ``position_embedding_type`` is "absolute") after LayerNorm and
            dropout
        """
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds

        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            # Out-of-place add: ``embeddings`` may alias a tensor passed in by
            # the caller, and an in-place ``+=`` would overwrite it (and fails
            # outright for a leaf tensor that requires grad).
            embeddings = embeddings + position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class BertSelfAttention(nn.Module):
    """
    Multi-head attention used for both self- and cross-attention.

    When built with ``is_cross_attention=True`` the key/value projections take
    inputs of width ``config.encoder_width``; otherwise ``config.hidden_size``.
    """

    def __init__(self, config, is_cross_attention):
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        if is_cross_attention:
            self.key = nn.Linear(config.encoder_width, self.all_head_size)
            self.value = nn.Linear(config.encoder_width, self.all_head_size)
        else:
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
        # When True, cross-attention maps and their gradients are recorded.
        self.save_attention = False

    def save_attn_gradients(self, attn_gradients):
        """Store the gradient of the attention map (used as a tensor hook)."""
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        """Return the gradient stored by ``save_attn_gradients``."""
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        """Store the attention probabilities of the last recorded forward pass."""
        self.attention_map = attention_map

    def get_attention_map(self):
        """Return the map stored by ``save_attention_map``."""
        return self.attention_map

    def transpose_for_scores(self, x):
        """Reshape (batch, seq, all_head_size) into (batch, heads, seq, head_size)."""
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """
        Compute attention.

        Args:
            hidden_states: query input
            attention_mask: additive mask for self-attention
            head_mask: optional multiplicative mask on the attention probabilities
            encoder_hidden_states: if given, keys/values come from this tensor
                (cross-attention) and ``encoder_attention_mask`` is used as mask
            encoder_attention_mask: additive mask for cross-attention
            past_key_value: optional cached ``(key, value)`` prepended along
                the sequence axis in self-attention
            output_attentions (bool): also return the attention probabilities
        Returns:
            tuple: ``(context,)`` or ``(context, attention_probs)``, followed
            by the ``(key, value)`` pair that was used
        """
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Keys/values may carry more batch entries than the queries; keep only
        # the leading entries that line up with the query batch.
        query_batch = query_layer.shape[0]
        if key_layer.shape[0] > query_batch:
            key_layer = key_layer[:query_batch]
            value_layer = value_layer[:query_batch]
            # The mask is optional, so only slice it when one was supplied.
            if attention_mask is not None:
                attention_mask = attention_mask[:query_batch]

        past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        if is_cross_attention and self.save_attention:
            self.save_attention_map(attention_probs)
            attention_probs.register_hook(self.save_attn_gradients)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs_dropped = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs_dropped = attention_probs_dropped * head_mask

        context_layer = torch.matmul(attention_probs_dropped, value_layer)

        # Merge the heads back: (batch, heads, seq, head_size) -> (batch, seq, all_head_size)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        # The returned probabilities are the pre-dropout ones.
        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        outputs = outputs + (past_key_value,)
        return outputs


class BertSelfOutput(nn.Module):
    """Post-attention projection: dense -> dropout -> residual add -> LayerNorm."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states``, apply dropout, then layer-normalise the sum with ``input_tensor``."""
        projected = self.dropout(self.dense(hidden_states))
        # ``input_tensor`` is added as a residual before normalisation.
        return self.LayerNorm(projected + input_tensor)


class BertAttention(nn.Module):
    """Attention sub-layer: a ``BertSelfAttention`` core followed by a ``BertSelfOutput`` projection."""

    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.self = BertSelfAttention(config, is_cross_attention)
        self.output = BertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Prune the given heads from the query/key/value and output projections.

        Does nothing when ``heads`` is empty. Otherwise the linear layers are replaced
        by pruned versions and the head bookkeeping on the attention core is updated.
        """
        if len(heads) == 0:
            return

        core = self.self
        heads, index = find_pruneable_heads_and_indices(
            heads, core.num_attention_heads, core.attention_head_size, self.pruned_heads
        )

        # Replace each input projection, then the output projection (pruned along dim 1).
        for proj_name in ("query", "key", "value"):
            setattr(core, proj_name, prune_linear_layer(getattr(core, proj_name), index))
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Keep the hyper-parameters consistent with the remaining heads.
        remaining_heads = core.num_attention_heads - len(heads)
        core.num_attention_heads = remaining_heads
        core.all_head_size = core.attention_head_size * remaining_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """Run the attention core and project its first output with a residual on ``hidden_states``.

        Returns a tuple whose first element is the projected attention output; every
        further element returned by the attention core is passed through unchanged.
        """
        core_args = (
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        self_outputs = self.self(*core_args)
        projected = self.output(self_outputs[0], hidden_states)
        return (projected,) + self_outputs[1:]


class BertIntermediate(nn.Module):
    """First half of the feed-forward block: expand to ``intermediate_size`` and apply the activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        activation = config.hidden_act
        # A string names an entry in ACT2FN; anything else is used as the activation directly.
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states):
        """Return ``activation(dense(hidden_states))``."""
        return self.intermediate_act_fn(self.dense(hidden_states))


class BertOutput(nn.Module):
    """Second half of the feed-forward block: project back to ``hidden_size`` with a residual LayerNorm."""

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, hidden)
        self.LayerNorm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states``, apply dropout, then layer-normalise the sum with ``input_tensor``."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)


class BertLayer(nn.Module):
    """One transformer layer: self-attention, optional cross-attention, and a feed-forward block.

    The ``mode`` argument of :meth:`forward` selects the attention path:

    * ``'mlr'``        -- cross-attention only (self-attention is skipped);
    * ``'multimodal'`` -- self-attention followed by cross-attention;
    * anything else    -- self-attention only.

    The cross-attention module only exists when ``config.add_cross_attention`` is truthy.
    """

    def __init__(self, config, layer_num):
        super().__init__()
        self.config = config
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BertAttention(config)
        self.layer_num = layer_num
        if self.config.add_cross_attention:
            self.crossattention = BertAttention(config, is_cross_attention=self.config.add_cross_attention)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def _cross_attend(
        self,
        query_states,
        attention_mask,
        head_mask,
        encoder_hidden_states,
        encoder_attention_mask,
        output_attentions,
    ):
        """Run the cross-attention module on ``query_states``; encoder states are mandatory."""
        assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers"
        return self.crossattention(
            query_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            output_attentions=output_attentions,
        )

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        mode=None,
    ):
        """Apply the layer and return ``(layer_output, *attention_extras, present_key_value)``.

        ``attention_extras`` are the middle elements returned by the attention modules
        (everything between their first and last outputs); in ``'multimodal'`` mode the
        cross-attention extras follow the self-attention extras.
        """
        if mode == 'mlr':
            cross_out = self._cross_attend(
                hidden_states,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                output_attentions,
            )
            attention_output = cross_out[0]
            extras = cross_out[1:-1]
            present_key_value = cross_out[-1]
        else:
            # Only the first two cached entries belong to self-attention.
            cached_self_kv = None if past_key_value is None else past_key_value[:2]
            self_out = self.attention(
                hidden_states,
                attention_mask,
                head_mask,
                output_attentions=output_attentions,
                past_key_value=cached_self_kv,
            )
            attention_output = self_out[0]
            extras = self_out[1:-1]
            present_key_value = self_out[-1]

            if mode == 'multimodal':
                cross_out = self._cross_attend(
                    attention_output,
                    attention_mask,
                    head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    output_attentions,
                )
                attention_output = cross_out[0]
                extras = extras + cross_out[1:-1]

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        return (layer_output,) + extras + (present_key_value,)

    def feed_forward_chunk(self, attention_output):
        """Feed-forward block applied to (a chunk of) the attention output."""
        return self.output(self.intermediate(attention_output), attention_output)


class BertEncoder(nn.Module):
    """Sequential stack of ``config.num_hidden_layers`` :class:`BertLayer` modules."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([BertLayer(config, i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        mode='multimodal',
    ):
        """Run ``hidden_states`` through every layer in order.

        Args:
            hidden_states: input to the first layer.
            attention_mask: mask forwarded unchanged to every layer.
            head_mask: optional per-layer head masks, indexed by layer number.
            encoder_hidden_states / encoder_attention_mask: forwarded to every layer
                for cross-attention.
            past_key_values: optional per-layer cached states, indexed by layer number.
            use_cache: when truthy, the last output of every layer is collected.
                Forced to ``False`` (with a warning) under gradient checkpointing.
            output_attentions: when truthy, ``layer_outputs[1]`` of every layer is collected.
            output_hidden_states: when truthy, the input of every layer and the final
                output are collected.
            return_dict: return a ``BaseModelOutputWithPastAndCrossAttentions`` when
                truthy, otherwise a tuple of the non-``None`` results.
            mode: attention routing forwarded to every layer.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        # NOTE(review): this tuple is initialised here but nothing in this method
        # appends to it, so it is returned empty whenever it is requested.
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        next_decoder_cache = () if use_cache else None

        def create_custom_forward(module, layer_past):
            # The per-layer cache is bound as an argument so that a recomputation
            # during backward sees this layer's value, not the last loop iteration's.
            def custom_forward(*inputs):
                # ``mode`` must be forwarded here: the checkpoint call below only
                # takes positional tensors for the wrapped function.
                return module(*inputs, layer_past, output_attentions, mode=mode)

            return custom_forward

        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:

                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module, past_key_value),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    mode=mode,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class BertPooler(nn.Module):
    """Pools a sequence into one vector using the hidden state at position 0."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return ``tanh(dense(hidden_states[:, 0]))``."""
        # "Pooling" here is just selecting the first token's hidden state.
        leading_token = hidden_states[:, 0]
        return self.activation(self.dense(leading_token))


class BertPredictionHeadTransform(nn.Module):
    """Transform applied before the vocabulary projection: dense -> activation -> LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        activation = config.hidden_act
        # A string names an entry in ACT2FN; anything else is used as the activation directly.
        self.transform_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        """Return ``LayerNorm(activation(dense(hidden_states)))``."""
        activated = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(activated)


class BertLMPredictionHead(nn.Module):
    """Language-model head: a hidden-state transform followed by a projection to ``vocab_size`` logits."""

    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The projection itself is created without a bias; a separate per-token
        # bias parameter is owned by this module instead.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Share the parameter with the decoder so both names refer to the same
        # tensor (needed for the bias to be resized by `resize_token_embeddings`).
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        """Return the decoder projection of the transformed hidden states."""
        return self.decoder(self.transform(hidden_states))


class BertOnlyMLMHead(nn.Module):
    """Thin wrapper exposing a :class:`BertLMPredictionHead` under the ``predictions`` attribute."""

    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        """Return the prediction scores computed from ``sequence_output``."""
        return self.predictions(sequence_output)


class BertPreTrainedModel(PreTrainedModel):
    """
    Abstract base that supplies weight initialisation and the ``PreTrainedModel`` interface for downloading and
    loading pretrained models.
    """

    config_class = BertConfig
    base_model_prefix = "bert"
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _init_weights(self, module):
        """ Initialize the weights of a single sub-module. """
        is_linear = isinstance(module, nn.Linear)
        if is_linear or isinstance(module, nn.Embedding):
            # Plain normal initialisation; the TF version uses truncated_normal instead
            # (cf https://github.com/pytorch/pytorch/pull/5617).
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        # Linear biases, when present, always start at zero.
        if is_linear and module.bias is not None:
            module.bias.data.zero_()


class BertModel(BertPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in `Attention is
    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    Causal (decoder) masking is requested per call through the :obj:`is_decoder` argument of :meth:`forward`. When the
    configuration has :obj:`add_cross_attention` set to :obj:`True`, an :obj:`encoder_hidden_states` is expected as an
    input to the forward pass for the modes that use cross-attention.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = BertEmbeddings(config)

        self.encoder = BertEncoder(config)

        self.pooler = BertPooler(config) if add_pooling_layer else None

        self.init_weights()

    def get_input_embeddings(self):
        """Return the word-embedding module."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word-embedding module with ``value``."""
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool) -> Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (:obj:`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (:obj:`Tuple[int]`):
                The shape of the input to the model.
            device: (:obj:`torch.device`):
                The device of the input to the model.
            is_decoder (:obj:`bool`):
                When :obj:`True` and a 2D mask is given, a causal mask is combined with the padding mask.

        Returns:
            :obj:`torch.Tensor` The extended attention mask, cast to the model dtype, with 0.0 at positions to attend
            to and -10000.0 at masked positions.

        Raises:
            ValueError: if :obj:`attention_mask` is neither 2D nor 3D.
        """
        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]
            # - if the model is a decoder, apply a causal mask in addition to the padding mask
            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            if is_decoder:
                batch_size, seq_length = input_shape

                seq_ids = torch.arange(seq_length, device=device)
                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
                # in case past_key_values are used we need to add a prefix ones mask to the causal mask
                # causal and attention masks must have same type with pytorch version < 1.3
                causal_mask = causal_mask.to(attention_mask.dtype)

                if causal_mask.shape[1] < attention_mask.shape[1]:
                    prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
                    causal_mask = torch.cat(
                        [
                            torch.ones((batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype),
                            causal_mask,
                        ],
                        dim=-1,
                    )

                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                    input_shape, attention_mask.shape
                )
            )

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        is_decoder=False,
        mode='multimodal',
    ):
        r"""
        encoder_embeds (`optional`):
            Pre-computed embeddings fed to the encoder directly; when given, the embedding layer is skipped.
        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder. A list of such tensors is also accepted.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            A list of masks is also accepted; each one is inverted separately.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`). Ignored (forced to :obj:`False`) unless :obj:`is_decoder` is set.
        is_decoder (:obj:`bool`, `optional`):
            Whether to build a causal self-attention mask and honour :obj:`use_cache`.
        mode (:obj:`str`, `optional`):
            Attention routing forwarded unchanged to the encoder layers.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
            device = input_ids.device
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
            device = inputs_embeds.device
        elif encoder_embeds is not None:
            input_shape = encoder_embeds.size()[:-1]
            batch_size, seq_length = input_shape
            device = encoder_embeds.device
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds")

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape,
                                                                                 device, is_decoder)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if encoder_hidden_states is not None:
            # A list of encoder states is sized from its first entry.
            if isinstance(encoder_hidden_states, list):
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
            else:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

            if isinstance(encoder_attention_mask, list):
                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
            elif encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        if encoder_embeds is None:
            embedding_output = self.embeddings(
                input_ids=input_ids,
                position_ids=position_ids,
                inputs_embeds=inputs_embeds,
                past_key_values_length=past_key_values_length,
            )
        else:
            embedding_output = encoder_embeds

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            mode=mode,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


class BertLMHeadModel(BertPreTrainedModel):
    """:class:`BertModel` (without pooler) topped with a language-modelling head for next-token prediction."""

    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        """Return the vocabulary projection layer of the LM head."""
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the vocabulary projection layer of the LM head."""
        self.cls.predictions.decoder = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        return_logits=False,
        is_decoder=True,
        reduction='mean',
        mode='multimodal',
    ):
        r"""
        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
            ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
            Passing labels forces :obj:`use_cache` to :obj:`False`.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        return_logits (:obj:`bool`, `optional`):
            When :obj:`True`, return only the prediction scores with the last position dropped, and skip the loss.
        reduction (:obj:`str`, `optional`):
            Reduction passed to the cross-entropy loss. With ``'none'`` the per-token losses are summed per sequence.
        is_decoder, mode:
            Forwarded unchanged to the underlying :class:`BertModel`.
        Returns:
        Example::
            >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
            >>> import torch
            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
            >>> config = BertConfig.from_pretrained("bert-base-cased")
            >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)
            >>> prediction_logits = outputs.logits
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            is_decoder=is_decoder,
            mode=mode,
        )

        prediction_scores = self.cls(outputs[0])

        if return_logits:
            return prediction_scores[:, :-1, :].contiguous()

        lm_loss = None
        if labels is not None:
            # Next-token prediction: scores at position t are matched with the label at t + 1.
            shifted_scores = prediction_scores[:, :-1, :].contiguous()
            targets = labels[:, 1:].contiguous()
            criterion = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
            lm_loss = criterion(shifted_scores.view(-1, self.config.vocab_size), targets.view(-1))
            if reduction == 'none':
                # Fold the flat per-token losses back to one summed loss per sequence.
                lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)

        if return_dict:
            return CausalLMOutputWithCrossAttentions(
                loss=lm_loss,
                logits=prediction_scores,
                past_key_values=outputs.past_key_values,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
                cross_attentions=outputs.cross_attentions,
            )

        output = (prediction_scores,) + outputs[2:]
        return output if lm_loss is None else (lm_loss,) + output

    def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs):
        """Assemble the keyword arguments for one generation step."""
        # Without an explicit mask, attend to every token of the (uncut) input.
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_ids.shape)

        # With cached states only the newest token has to be fed.
        if past is not None:
            input_ids = input_ids[:, -1:]

        step_inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": past,
            "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
            "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
            "is_decoder": True,
        }
        return step_inputs

    def _reorder_cache(self, past, beam_idx):
        """Re-index every cached state along dim 0 with ``beam_idx``."""
        return tuple(
            tuple(state.index_select(0, beam_idx) for state in layer_past)
            for layer_past in past
        )


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/tag2Text/swin_transformer.py
================================================
# --------------------------------------------------------
# Swin Transformer
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu
# --------------------------------------------------------

import numpy as np
from scipy import interpolate

import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_


class Mlp(nn.Module):
    """Feed-forward block: linear -> activation -> dropout -> linear -> dropout.

    Args:
        in_features (int): Size of the last input dimension.
        hidden_features (int, optional): Hidden width; ``in_features`` is used when falsy.
        out_features (int, optional): Output width; ``in_features`` is used when falsy.
        act_layer: Zero-argument callable that builds the activation module. Default: nn.GELU
        drop (float): Probability of the single dropout module applied after both
            linear layers. Default: 0.0
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        hidden_width = hidden_features if hidden_features else in_features
        output_width = out_features if out_features else in_features
        # Registration order (fc1, act, fc2, drop) is kept stable on purpose.
        self.fc1 = nn.Linear(in_features, hidden_width)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_width, output_width)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the block to ``x`` and return the projected tensor."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))


def window_partition(x, window_size):
    """
    Cut a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): side length of each window

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    batch, height, width, channels = x.shape
    rows, cols = height // window_size, width // window_size
    tiled = x.view(batch, rows, window_size, cols, window_size, channels)
    # Move the two window-grid axes in front of the two in-window axes.
    grouped = tiled.permute(0, 1, 3, 2, 4, 5).contiguous()
    return grouped.view(-1, window_size, window_size, channels)


def window_reverse(windows, window_size, H, W):
    """
    Stitch windows back into full feature maps (inverse of the window split).

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    windows_per_image = H * W / window_size / window_size
    batch = int(windows.shape[0] / windows_per_image)
    grid = windows.view(batch, H // window_size, W // window_size, window_size, window_size, -1)
    # Interleave grid rows with in-window rows (and likewise for columns).
    return grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch, H, W, -1)


class WindowAttention(nn.Module):
    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both shifted and non-shifted windows.

    A learnable table holds one bias per attention head for every possible
    relative (row, column) offset inside a window; the bias is added to the
    attention logits before the softmax.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
            (any falsy value, including 0, selects the default).
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        # scale the row offset by the number of possible column offsets so that
        # the sum below is a unique flat index into the bias table
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        # stored as a buffer, not a parameter: it is an index, not a learnable weight
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: additive mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None;
                it is added to the attention logits before the softmax
                (SwinTransformerBlock in this file builds it with 0 / -100.0 entries)

        Returns:
            tensor with shape of (num_windows*B, N, C)
        """
        B_, N, C = x.shape
        # one projection produces q, k and v; result layout: 3, B_, nH, N, C // nH
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        # look up the bias of every (query token, key token) pair
        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            # split the leading axis into (batch, window) so the per-window mask
            # broadcasts over batch and heads, then fold it back
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        # merge the heads back into the channel dimension
        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    def extra_repr(self) -> str:
        """Return the summary of dim, window_size and num_heads shown in the module repr."""
        return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'

    def flops(self, N):
        """Return the FLOP estimate for one window holding ``N`` tokens (softmax and bias add are not counted)."""
        # calculate flops for 1 window with token length of N
        flops = 0
        # qkv = self.qkv(x)
        flops += N * self.dim * 3 * self.dim
        # attn = (q @ k.transpose(-2, -1))
        flops += self.num_heads * N * (self.dim // self.num_heads) * N
        #  x = (attn @ v)
        flops += self.num_heads * N * N * (self.dim // self.num_heads)
        # x = self.proj(x)
        flops += N * self.dim * self.dim
        return flops


class SwinTransformerBlock(nn.Module):
    r""" Swin Transformer Block.

    Runs windowed self-attention (optionally on a cyclically shifted feature
    map) followed by an MLP, each wrapped in a residual connection.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        if min(self.input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = 0
            self.window_size = min(self.input_resolution)
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)

        # stochastic depth is only instantiated when it would actually drop something
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        if self.shift_size > 0:
            # calculate attention mask for SW-MSA
            H, W = self.input_resolution
            img_mask = torch.zeros((1, H, W, 1))  # 1 H W 1
            # three bands per axis: the untouched part, the last window minus the
            # shifted strip, and the shifted strip itself
            h_slices = (slice(0, -self.window_size),
                        slice(-self.window_size, -self.shift_size),
                        slice(-self.shift_size, None))
            w_slices = (slice(0, -self.window_size),
                        slice(-self.window_size, -self.shift_size),
                        slice(-self.shift_size, None))
            # label each of the 3 x 3 regions with its own id
            cnt = 0
            for h in h_slices:
                for w in w_slices:
                    img_mask[:, h, w, :] = cnt
                    cnt += 1

            mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            # a non-zero difference means the two tokens carry different region ids
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            # such pairs get -100.0 (added to the attention logits), all others 0.0
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None

        # registered even when None so that self.attn_mask always exists
        self.register_buffer("attn_mask", attn_mask)

    def forward(self, x):
        """Apply the block.

        Args:
            x: tensor of shape (B, H*W, C) where (H, W) equals ``input_resolution``.

        Returns:
            tensor of shape (B, H*W, C).
        """
        H, W = self.input_resolution
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        shortcut = x
        x = self.norm1(x)
        x = x.view(B, H, W, C)

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_x = x

        # partition windows
        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C

        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask=self.attn_mask)  # nW*B, window_size*window_size, C

        # merge windows
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C

        # reverse cyclic shift
        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            x = shifted_x
        x = x.view(B, H * W, C)

        # residual around attention, then residual around the FFN
        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x

    def extra_repr(self) -> str:
        """Return the block configuration summary shown in the module repr."""
        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
               f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"

    def flops(self):
        """Return the FLOP estimate of one forward pass (a float, because of the true divisions below)."""
        flops = 0
        H, W = self.input_resolution
        # norm1
        flops += self.dim * H * W
        # W-MSA/SW-MSA
        nW = H * W / self.window_size / self.window_size
        flops += nW * self.attn.flops(self.window_size * self.window_size)
        # mlp
        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
        # norm2
        flops += self.dim * H * W
        return flops


class PatchMerging(nn.Module):
    r""" Patch Merging Layer.

    Concatenates the channels of every 2x2 neighbourhood, normalizes the
    4*dim result and projects it to 2*dim channels with a bias-free linear layer.

    Args:
        input_resolution (tuple[int]): Resolution of input feature.
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        merged_channels = 4 * dim
        self.reduction = nn.Linear(merged_channels, 2 * dim, bias=False)
        self.norm = norm_layer(merged_channels)

    def forward(self, x):
        """
        Args:
            x: B, H*W, C

        Returns:
            tensor of shape B, H/2*W/2, 2*dim
        """
        H, W = self.input_resolution
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"
        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."

        grid = x.view(B, H, W, C)
        # (row offset, column offset) of the four members of each 2x2 cell,
        # in the order their channels are concatenated
        cell_offsets = ((0, 0), (1, 0), (0, 1), (1, 1))
        members = [grid[:, row::2, col::2, :] for row, col in cell_offsets]  # each B H/2 W/2 C
        merged = torch.cat(members, -1).view(B, -1, 4 * C)  # B H/2*W/2 4*C

        return self.reduction(self.norm(merged))

    def extra_repr(self) -> str:
        """Return the summary of input_resolution and dim shown in the module repr."""
        return f"input_resolution={self.input_resolution}, dim={self.dim}"

    def flops(self):
        """Return the FLOP estimate: normalization plus the 4*dim -> 2*dim projection."""
        H, W = self.input_resolution
        norm_cost = H * W * self.dim
        projection_cost = (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
        return norm_cost + projection_cost


class BasicLayer(nn.Module):
    """ A basic Swin Transformer layer for one stage.

    Builds ``depth`` SwinTransformerBlocks (even-indexed blocks use no shift,
    odd-indexed blocks shift by ``window_size // 2``) and an optional
    downsampling module applied after them.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate,
            either one value shared by all blocks or one value per block. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False):

        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # build blocks
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
                                 num_heads=num_heads, window_size=window_size,
                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
                                 mlp_ratio=mlp_ratio,
                                 qkv_bias=qkv_bias, qk_scale=qk_scale,
                                 drop=drop, attn_drop=attn_drop,
                                 drop_path=self._drop_path_for_block(drop_path, i),
                                 norm_layer=norm_layer)
            for i in range(depth)])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    @staticmethod
    def _drop_path_for_block(drop_path, index):
        """Return the stochastic depth rate of block ``index``.

        A list or tuple is indexed per block; any other value is shared by all
        blocks. Tuples used to be forwarded whole, which made block
        construction fail although the class documents them as accepted.
        """
        if isinstance(drop_path, (list, tuple)):
            return drop_path[index]
        return drop_path

    def forward(self, x):
        """Run all blocks (optionally under activation checkpointing), then the downsample module if any."""
        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)
        if self.downsample is not None:
            x = self.downsample(x)
        return x

    def extra_repr(self) -> str:
        """Return the summary of dim, input_resolution and depth shown in the module repr."""
        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"

    def flops(self):
        """Return the summed FLOP estimate of all blocks plus the downsample module."""
        flops = sum(blk.flops() for blk in self.blocks)
        if self.downsample is not None:
            flops += self.downsample.flops()
        return flops


class PatchEmbed(nn.Module):
    r""" Image to Patch Embedding

    Splits the image into non-overlapping patches with a strided convolution
    and returns them as a token sequence.

    Args:
        img_size (int): Image size.  Default: 224.
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        self.img_size = to_2tuple(img_size)
        self.patch_size = to_2tuple(patch_size)

        grid_h = self.img_size[0] // self.patch_size[0]
        grid_w = self.img_size[1] // self.patch_size[1]
        self.patches_resolution = [grid_h, grid_w]
        self.num_patches = grid_h * grid_w

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        # kernel == stride, so every patch is projected exactly once
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size)
        self.norm = None if norm_layer is None else norm_layer(embed_dim)

    def forward(self, x):
        """Embed a batch of images (B, C, H, W) into tokens (B, Ph*Pw, embed_dim)."""
        H, W = x.shape[2], x.shape[3]
        # FIXME look at relaxing size constraints
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        tokens = self.proj(x).flatten(2).transpose(1, 2)  # B Ph*Pw C
        return tokens if self.norm is None else self.norm(tokens)

    def flops(self):
        """Return the FLOP estimate of the projection plus the optional normalization."""
        Ho, Wo = self.patches_resolution
        patch_area = self.patch_size[0] * self.patch_size[1]
        total = Ho * Wo * self.embed_dim * self.in_chans * patch_area
        if self.norm is not None:
            total += Ho * Wo * self.embed_dim
        return total


class SwinTransformer(nn.Module):
    r""" Swin Transformer
        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
          https://arxiv.org/pdf/2103.14030

    This variant has no classification head (the head line is commented out
    below): ``forward`` returns the normalized token sequence with an
    average-pooled token prepended.

    Args:
        img_size (int | tuple(int)): Input image size. Default 224
        patch_size (int | tuple(int)): Patch size. Default: 4
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes; only stored and used by ``flops``. Default: 1000
        embed_dim (int): Patch embedding dimension. Default: 96
        depths (sequence of int): Depth of each Swin Transformer layer. Default: (2, 2, 6, 2)
        num_heads (sequence of int): Number of attention heads in different layers. Default: (3, 6, 12, 24)
        window_size (int): Window size. Default: 7
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
        drop_rate (float): Dropout rate. Default: 0
        attn_drop_rate (float): Attention dropout rate. Default: 0
        drop_path_rate (float): Stochastic depth rate. Default: 0.1
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
        patch_norm (bool): If True, add normalization after patch embedding. Default: True
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
        **kwargs: accepted and ignored.
    """

    # Tuple defaults replace the former list defaults so that no mutable object
    # is shared between calls; lists passed by callers keep working.
    def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000,
                 embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24),
                 window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
                 norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
                 use_checkpoint=False, **kwargs):
        super().__init__()

        self.num_classes = num_classes
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        # the channel count doubles at every stage transition
        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
        self.mlp_ratio = mlp_ratio

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)
        num_patches = self.patch_embed.num_patches
        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution

        # absolute position embedding
        if self.ape:
            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
            trunc_normal_(self.absolute_pos_embed, std=.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule

        # build layers
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer),
                               input_resolution=(patches_resolution[0] // (2 ** i_layer),
                                                 patches_resolution[1] // (2 ** i_layer)),
                               depth=depths[i_layer],
                               num_heads=num_heads[i_layer],
                               window_size=window_size,
                               mlp_ratio=self.mlp_ratio,
                               qkv_bias=qkv_bias, qk_scale=qk_scale,
                               drop=drop_rate, attn_drop=attn_drop_rate,
                               # this stage's slice of the per-block rates (a list)
                               drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                               norm_layer=norm_layer,
                               # every stage but the last ends with patch merging
                               downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
                               use_checkpoint=use_checkpoint)
            self.layers.append(layer)

        self.norm = norm_layer(self.num_features)
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        # self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialize one submodule: truncated-normal Linear weights, zero biases, unit LayerNorm weights."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return the parameter names to exclude from weight decay."""
        return {'absolute_pos_embed'}

    @torch.jit.ignore
    def no_weight_decay_keywords(self):
        """Return the parameter-name keywords to exclude from weight decay."""
        return {'relative_position_bias_table'}

    def forward(self, x, idx_to_group_img=None, image_atts=None, **kwargs):
        """Encode a batch of images.

        Args:
            x: image batch accepted by the patch embedding.
            idx_to_group_img: optional index tensor; when given, rows of the
                encoded batch are gathered along dim 0 with it.
            image_atts: weights used for the pooled token of the gathered
                batch (its first column is skipped); only read when
                ``idx_to_group_img`` is given.
            **kwargs: accepted and ignored.

        Returns:
            Without ``idx_to_group_img``: a tensor (B, 1 + L, C) holding the
            average-pooled token followed by the L patch tokens.
            With it: a tuple ``(gathered, plain)`` where ``plain`` is the
            tensor above and ``gathered`` is built the same way from the
            gathered rows, using the ``image_atts``-weighted mean as pooled token.
        """
        x = self.patch_embed(x)
        if self.ape:
            x = x + self.absolute_pos_embed
        x = self.pos_drop(x)

        for layer in self.layers:
            x = layer(x)

        x = self.norm(x)  # B L C

        x_cls = self.avgpool(x.transpose(1, 2))  # B C 1

        if idx_to_group_img is None:
            return torch.cat([x_cls.transpose(1, 2), x], dim=1)
        else:
            x_bs = torch.gather(x, dim=0, index=idx_to_group_img.view(-1, 1, 1).expand(-1, x.shape[1], x.shape[2]))
            weights = image_atts[:, 1:].unsqueeze(2)  # B L 1
            x_bs_cls = torch.sum((weights * x_bs).transpose(1, 2), dim=-1, keepdim=True)   # B C 1
            x_bs_cls = x_bs_cls / torch.sum(weights.transpose(1, 2), dim=-1, keepdim=True)  # avgpool

            return torch.cat([x_bs_cls.transpose(1, 2), x_bs], dim=1), \
                   torch.cat([x_cls.transpose(1, 2), x], dim=1)

    def flops(self):
        """Return the FLOP estimate of the whole network.

        The last term still counts a ``num_features * num_classes``
        projection although no head module is created in ``__init__``; it is
        kept so the reported number does not change.
        """
        flops = 0
        flops += self.patch_embed.flops()
        for layer in self.layers:
            flops += layer.flops()
        flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers)
        flops += self.num_features * self.num_classes
        return flops


def interpolate_relative_pos_embed(rel_pos_bias, dst_num_pos, param_name=''):
    """Resize a relative position bias table to ``dst_num_pos`` positions.

    The table is treated as a square grid per attention head (side length
    ``int(sqrt(num_positions))``). Source samples are placed on a
    geometrically spaced axis whose extent matches the target grid and are
    resampled at integer target offsets with a bicubic spline.

    Args:
        rel_pos_bias: tensor of shape (src_num_pos, num_heads), i.e. a
            ``relative_position_bias_table``.
        dst_num_pos (int): number of positions of the target table.
        param_name (str): name used only in the progress message.

    Returns:
        ``rel_pos_bias`` itself when source and target side lengths match,
        otherwise a new float32 tensor of shape (dst_size * dst_size, num_heads)
        on the device of ``rel_pos_bias``.
    """
    # from: https://github.com/microsoft/unilm/blob/8a0a1c1f4e7326938ea7580a00d56d7f17d65612/beit/run_class_finetuning.py#L348

    # rel_pos_bias: relative_position_bias_table
    src_num_pos, num_attn_heads = rel_pos_bias.size()

    num_extra_tokens = 0
    src_size = int((src_num_pos - num_extra_tokens) ** 0.5)
    dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5)
    if src_size == dst_size:
        return rel_pos_bias

    print("Position interpolate %s from %dx%d to %dx%d" % (param_name, src_size, src_size, dst_size, dst_size))

    def geometric_progression(a, r, n):
        return a * (1.0 - r ** n) / (1.0 - r)

    # Bisect for the ratio q whose geometric series over src_size // 2 steps
    # spans dst_size // 2, so the source axis covers the target axis.
    left, right = 1.01, 1.5
    while right - left > 1e-6:
        q = (left + right) / 2.0
        gp = geometric_progression(1, q, src_size // 2)
        if gp > dst_size // 2:
            right = q
        else:
            left = q

    # Positive half of the source axis: 1, 1 + q, 1 + q + q**2, ...
    dis = []
    cur = 1
    for i in range(src_size // 2):
        dis.append(cur)
        cur += q ** (i + 1)

    r_ids = [-d for d in reversed(dis)]

    # Strictly increasing, symmetric source coordinates (same for both axes).
    x = r_ids + [0] + dis
    y = r_ids + [0] + dis

    # Integer target coordinates -t .. t.
    t = dst_size // 2.0
    dx = np.arange(-t, t + 0.1, 1.0)
    dy = np.arange(-t, t + 0.1, 1.0)

    all_rel_pos_bias = []

    for i in range(num_attn_heads):
        # detach + cpu so tables that require grad or live on an accelerator
        # can be converted to NumPy
        z = rel_pos_bias[:, i].detach().cpu().float().reshape(src_size, src_size).numpy()
        z = np.asarray(z, dtype=np.float64)
        # interp2d(kind='cubic') no longer exists in SciPy; for data on a
        # regular grid RectBivariateSpline is its documented replacement.
        # Rows of z run along y and columns along x, hence the (y, x) order.
        spline = interpolate.RectBivariateSpline(y, x, z, kx=3, ky=3)
        resized = spline(dy, dx)  # rows follow dy, columns follow dx
        all_rel_pos_bias.append(
            torch.Tensor(resized).contiguous().view(-1, 1).to(rel_pos_bias.device))

    return torch.cat(all_rel_pos_bias, dim=-1)

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/tag2Text/tag2text.py
================================================
'''
 * Tag2Text
 * Written by Xinyu Huang
'''
import warnings
warnings.filterwarnings("ignore")

from .vit import VisionTransformer, interpolate_pos_embed
from .swin_transformer import SwinTransformer, interpolate_relative_pos_embed
from .med import BertConfig, BertModel, BertLMHeadModel
from transformers import BertTokenizer

import torch
from torch import nn
import torch.nn.functional as F

import os
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
from urllib.parse import urlparse
from timm.models.hub import download_cached_file
from .tag_class import tra_array
import json
import math
import numpy as np

def read_json(rpath):
    """Load and return the JSON document stored at ``rpath``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default locale encoding.
    """
    with open(rpath, 'r', encoding='utf-8') as f:
        return json.load(f)

# Tag-class indices that Tag2Text_Caption.generate always sets to 0 in the
# predicted tag matrix before the remaining tags are joined into a text prompt.
delete_tag_index = [127, 3351, 3265, 3338, 3355, 3359]
        
class Tag2Text_Caption(nn.Module):
    def __init__(self,                 
                 med_config = f'{CUR_DIR}/med_config.json',  
                 image_size = 384,
                 vit = 'base',
                 vit_grad_ckpt = False,
                 vit_ckpt_layer = 0,
                 prompt = 'a picture of ',
                 threshold = 0.7,
                 ):
        """
        Args:
            med_config (str): path for the mixture of encoder-decoder model's configuration file
            image_size (int): input image size
            vit (str): model size of vision transformer
        """            
        super().__init__()

        if vit=='swin_b':
            if image_size == 224:
                vision_config_path = 'configs/swin/config_swinB_224.json'
            elif image_size == 384:
                vision_config_path = f'{CUR_DIR}/config_swinB_384.json'
            vision_config = read_json(vision_config_path)
            assert image_size == vision_config['image_res']

            vision_width = vision_config['vision_width']

            self.visual_encoder = SwinTransformer(img_size=vision_config['image_res'],
                                            patch_size=4,
                                            in_chans=3,
                                            embed_dim=vision_config['embed_dim'],
                                            depths=vision_config['depths'],
                                            num_heads=vision_config['num_heads'],
                                            window_size=vision_config['window_size'],
                                            mlp_ratio=4.,
                                            qkv_bias=True,
                                            drop_rate=0.0,
                                            drop_path_rate=0.1,
                                            ape=False,
                                            patch_norm=True,
                                            use_checkpoint=False)
        
        else:
            self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer)


        self.tokenizer = init_tokenizer()   

        # create the decoder
        decoder_config = BertConfig.from_json_file(med_config)
        decoder_config.encoder_width = 768
        self.text_decoder = BertLMHeadModel(config=decoder_config)     

        # create encoder
        encoder_config = BertConfig.from_json_file(med_config)
        encoder_config.encoder_width = vision_width
        self.tag_encoder = BertModel(config=encoder_config, add_pooling_layer=False)
        
        self.prompt = prompt
        self.prompt_length = len(self.tokenizer(self.prompt).input_ids)-1

        self.threshold = threshold
        num_features = 768
        self.num_class = 3429

        q2l_config = BertConfig.from_json_file(f'{CUR_DIR}/q2l_config.json')
        q2l_config.encoder_width = vision_width
        self.vision_multi = BertModel.from_pretrained('bert-base-uncased',config=q2l_config, add_pooling_layer=False)
        self.vision_multi.resize_token_embeddings(len(self.tokenizer)) 
        self.label_embed = nn.Embedding(self.num_class, q2l_config.hidden_size)
        self.fc =  GroupWiseLinear(self.num_class, num_features, bias=True)
        self.del_selfattention()

        tie_encoder_decoder_weights(self.tag_encoder,self.vision_multi,'',' ')
        self.tag_array = tra_array

    def del_selfattention(self):
        del self.vision_multi.embeddings
        for layer in self.vision_multi.encoder.layer:
            del layer.attention
        
    def generate(self, image, sample=False, num_beams=3, max_length=30, min_length=10, top_p=0.9, repetition_penalty=1.0, tag_input = None, return_tag_predict = False):
        """Caption a batch of images, conditioning the text decoder on tags.

        Args:
            image: batch of images fed to ``self.visual_encoder``; ``image.size(0)``
                is used as the batch size and ``image.device`` as the target device.
            sample: if true, decode with nucleus sampling; otherwise beam search.
            num_beams: beam width for beam search (not used when ``sample`` is true).
            max_length: forwarded to ``self.text_decoder.generate``.
            min_length: forwarded to ``self.text_decoder.generate``.
            top_p: nucleus threshold, only used when ``sample`` is true.
            repetition_penalty: forwarded to beam search only.
                NOTE(review): the sampling branch passes a fixed 1.1 instead of
                this argument; that long-standing behavior is kept unchanged.
            tag_input: optional sequence with one tag string per image. When
                ``None`` the tags are predicted from the image: sigmoid scores
                above ``self.threshold`` are kept, entries listed in the
                module-level ``delete_tag_index`` are zeroed, and the surviving
                names from ``self.tag_array`` are joined with ``' | '``.
            return_tag_predict: if true, also return the per-image tag strings.

        Returns:
            A list of caption strings with the leading ``self.prompt`` characters
            removed, or ``(captions, tags)`` when ``return_tag_predict`` is true,
            where ``tags`` holds exactly one tag string per input image.
        """
        image_embeds = self.visual_encoder(image)
        image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)

        #==============generate tag==============#
        if tag_input is None:
            bs = image_embeds.shape[0]
            label_embed = self.label_embed.weight.unsqueeze(0).repeat(bs,1,1)
            mlr_tagembedding = self.vision_multi(encoder_embeds = label_embed,
                                encoder_hidden_states = image_embeds,
                                encoder_attention_mask = image_atts,
                                return_dict = False,
                                mode = 'mlr',
                                )

            logits = self.fc(mlr_tagembedding[0])

            # Binarise the per-class scores: 1.0 where the score passes the threshold.
            targets = torch.where(torch.sigmoid(logits) > self.threshold , torch.tensor(1.0).to(image.device), torch.zeros(self.num_class).to(image.device))

            tag_matrix = targets.cpu().numpy()
            tag_matrix[:,delete_tag_index] = 0
            tag_input = []
            for b in range(image.size(0)):
                index = np.argwhere(tag_matrix[b] == 1)
                token = self.tag_array[index].squeeze(axis = 1)
                tag_input.append(' | '.join(token))
        #========================================#

        if not sample:
            # Beam search: every image (and its tag string) is repeated num_beams
            # times, so entry i of the original batch occupies positions
            # [i*num_beams, (i+1)*num_beams) afterwards.
            image_embeds = image_embeds.repeat_interleave(num_beams,dim=0)
            image_atts = image_atts.repeat_interleave(num_beams,dim=0)
            tag_input = [tags for tags in tag_input for _ in range(num_beams)]

        tag_input_tokenzier = self.tokenizer(tag_input, padding='max_length', truncation=True, max_length=40,
                              return_tensors="pt").to(image.device)

        # Replace the first token of every tag sequence with the [ENC] marker.
        encoder_input_ids = tag_input_tokenzier.input_ids
        encoder_input_ids[:,0] = self.tokenizer.enc_token_id
        output_tagembedding = self.tag_encoder(encoder_input_ids,
                                       attention_mask = tag_input_tokenzier.attention_mask,
                                       encoder_hidden_states = image_embeds,
                                       encoder_attention_mask = image_atts,
                                       return_dict = True,
                                      )

        # Decoder input: the prompt with its first token swapped for BOS and its
        # last token dropped.
        prompt = [self.prompt] * image.size(0)
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(image.device)
        input_ids[:,0] = self.tokenizer.bos_token_id
        input_ids = input_ids[:, :-1]

        model_kwargs = {"encoder_hidden_states": output_tagembedding.last_hidden_state, "encoder_attention_mask":None}
        if sample:
            #nucleus sampling
            outputs = self.text_decoder.generate(input_ids=input_ids,
                                                max_length=max_length,
                                                min_length=min_length,
                                                do_sample=True,
                                                top_p=top_p,
                                                num_return_sequences=1,
                                                eos_token_id=self.tokenizer.sep_token_id,
                                                pad_token_id=self.tokenizer.pad_token_id,
                                                repetition_penalty=1.1,
                                                **model_kwargs)
        else:
            #beam search
            outputs = self.text_decoder.generate(input_ids=input_ids,
                                                max_length=max_length,
                                                min_length=min_length,
                                                num_beams=num_beams,
                                                eos_token_id=self.tokenizer.sep_token_id,
                                                pad_token_id=self.tokenizer.pad_token_id,
                                                repetition_penalty=repetition_penalty,
                                                **model_kwargs)

        captions = []
        for output in outputs:
            caption = self.tokenizer.decode(output, skip_special_tokens=True)
            captions.append(caption[len(self.prompt):])

        if return_tag_predict:
            if sample:
                return captions, tag_input
            # Undo the beam expansion with a stride: taking the first
            # len/num_beams entries would return the first image's tags
            # repeated whenever the batch holds more than one image.
            return captions, tag_input[::num_beams]
        return captions


def tag2text_caption(pretrained='',**kwargs):
    """Build a ``Tag2Text_Caption`` model and optionally load pretrained weights.

    Args:
        pretrained: checkpoint URL or file path; when empty no weights are loaded.
        **kwargs: forwarded unchanged to ``Tag2Text_Caption``.  When
            ``kwargs['vit'] == 'swin_b'`` the Swin-specific loader is used (it
            additionally reads ``kwargs['image_size']``); otherwise the generic
            ``load_checkpoint`` is used.

    Returns:
        The constructed (and possibly weight-loaded) model.
    """
    model = Tag2Text_Caption(**kwargs)
    if pretrained:
        # `vit` is optional for the model constructor, so do not assume the
        # caller passed it: a missing key simply selects the generic loader
        # instead of raising KeyError.
        if kwargs.get('vit') == 'swin_b':
            model, _ = load_checkpoint_swinbase(model, pretrained, kwargs)
        else:
            model, _ = load_checkpoint(model, pretrained)
    return model


from typing import List
def tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, base_model_prefix: str, skip_key:str):
    """Make ``encoder`` share the ``weight``/``bias`` objects of matching ``decoder`` modules.

    Both module trees are walked in parallel, driven by the decoder's
    sub-module names.  Whenever a decoder module exposes a ``weight`` attribute
    (and ``skip_key`` does not occur in its path), the encoder module at the
    same position is re-pointed at the decoder's ``weight`` and, if present,
    ``bias``.

    Args:
        encoder: module whose parameters are replaced.
        decoder: module whose parameters are shared.
        base_model_prefix: initial path string used as the root of the
            ``/``-joined module names built during the walk.
        skip_key: modules whose path contains this substring are not tied
            directly; the walk descends into their children instead.

    Returns:
        None.  Encoder sub-module paths that had no decoder counterpart are
        collected in a local list, which this function neither returns nor logs.
    """
    uninitialized_encoder_weights: List[str] = []
    if decoder.__class__ != encoder.__class__:
        logger.info(
            f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized."
        )

    def tie_encoder_to_decoder_recursively(
        decoder_pointer: nn.Module,
        encoder_pointer: nn.Module,
        module_name: str,
        uninitialized_encoder_weights: List[str],
        skip_key: str,
        depth=0,
    ):
        assert isinstance(decoder_pointer, nn.Module) and isinstance(
            encoder_pointer, nn.Module
        ), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module"
        # Leaf case: a module that owns a weight is tied directly and not descended into.
        if hasattr(decoder_pointer, "weight") and skip_key not in module_name:
            assert hasattr(encoder_pointer, "weight")
            encoder_pointer.weight = decoder_pointer.weight
            if hasattr(decoder_pointer, "bias"):
                assert hasattr(encoder_pointer, "bias")
                encoder_pointer.bias = decoder_pointer.bias                
            # print(module_name+' is tied')    
            return

        encoder_modules = encoder_pointer._modules
        decoder_modules = decoder_pointer._modules
        if len(decoder_modules) > 0:
            assert (
                len(encoder_modules) > 0
            ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"

            # Start with every encoder child marked as unmatched; matched ones are removed below.
            all_encoder_weights = set([module_name + "/" + sub_name for sub_name in encoder_modules.keys()])
            # Offset applied to numeric child names so that encoder and decoder
            # list positions stay aligned after a decoder-only entry is skipped.
            encoder_layer_pos = 0
            for name, module in decoder_modules.items():
                if name.isdigit():
                    encoder_name = str(int(name) + encoder_layer_pos)
                    decoder_name = name
                    if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])) and len(
                        encoder_modules
                    ) != len(decoder_modules):
                        # this can happen if the name corresponds to the position in a list module list of layers
                        # in this case the decoder has added a cross-attention that the encoder does not have
                        # thus skip this step and subtract one layer pos from encoder
                        encoder_layer_pos -= 1
                        continue
                elif name not in encoder_modules:
                    # Decoder-only child: nothing on the encoder side to tie.
                    continue
                elif depth > 500:
                    # Note: this guard is only reached for non-numeric names that exist in the encoder.
                    raise ValueError(
                        "Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model."
                    )
                else:
                    decoder_name = encoder_name = name
                tie_encoder_to_decoder_recursively(
                    decoder_modules[decoder_name],
                    encoder_modules[encoder_name],
                    module_name + "/" + name,
                    uninitialized_encoder_weights,
                    skip_key,
                    depth=depth + 1,
                )
                all_encoder_weights.remove(module_name + "/" + encoder_name)

            # Whatever is left had no decoder counterpart at this level.
            uninitialized_encoder_weights += list(all_encoder_weights)

    # tie weights recursively
    tie_encoder_to_decoder_recursively(decoder, encoder, base_model_prefix, uninitialized_encoder_weights, skip_key)  


class GroupWiseLinear(nn.Module):
    """Linear scorer in which every class has its own weight vector.

    ``W`` has shape ``(1, num_class, hidden_dim)`` and the optional bias ``b``
    has shape ``(1, num_class)``.  ``forward`` multiplies ``W`` element-wise
    with the input, sums over the last dimension and adds ``b`` when the layer
    was built with ``bias=True``.
    """
    # could be changed to: 
    # output = torch.einsum('ijk,zjk->ij', x, self.W)
    # or output = torch.einsum('ijk,jk->ij', x, self.W[0])
    def __init__(self, num_class, hidden_dim, bias=True):
        super().__init__()
        self.num_class, self.hidden_dim, self.bias = num_class, hidden_dim, bias

        # Storage is allocated uninitialised and filled by reset_parameters().
        self.W = nn.Parameter(torch.Tensor(1, num_class, hidden_dim))
        if bias:
            self.b = nn.Parameter(torch.Tensor(1, num_class))
        self.reset_parameters()

    def reset_parameters(self):
        """Fill ``W`` (and ``b``) uniformly in ``[-1/sqrt(hidden_dim), 1/sqrt(hidden_dim)]``."""
        bound = 1. / math.sqrt(self.W.size(2))
        # One in-place draw per class row, in class order.
        for class_weights in self.W[0]:
            class_weights.data.uniform_(-bound, bound)
        if self.bias:
            for class_bias in self.b[0]:
                class_bias.data.uniform_(-bound, bound)

    def forward(self, x):
        # x: B,K,d
        scores = torch.sum(self.W * x, dim=-1)
        if not self.bias:
            return scores
        return scores + self.b


def init_tokenizer():
    """Return the ``bert-base-uncased`` tokenizer extended with ``[DEC]`` and ``[ENC]``.

    ``[DEC]`` is registered as the BOS token and ``[ENC]`` as an additional
    special token; the id of the first additional special token is stored on
    the tokenizer as ``enc_token_id``.
    """
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Registration order matters for the ids assigned to the new tokens.
    for special_tokens in ({'bos_token':'[DEC]'}, {'additional_special_tokens':['[ENC]']}):
        bert_tokenizer.add_special_tokens(special_tokens)
    bert_tokenizer.enc_token_id = bert_tokenizer.additional_special_tokens_ids[0]
    return bert_tokenizer


def create_vit(vit, image_size, use_grad_checkpointing=False, ckpt_layer=0, drop_path_rate=0):
    """Build a ViT image encoder and report its embedding width.

    Args:
        vit: ``'base'`` (width 768, depth 12, 12 heads) or ``'large'``
            (width 1024, depth 24, 16 heads).
        image_size: forwarded to ``VisionTransformer`` as ``img_size``.
        use_grad_checkpointing: forwarded to ``VisionTransformer``.
        ckpt_layer: forwarded to ``VisionTransformer``.
        drop_path_rate: forwarded to ``VisionTransformer``.  For ``'large'`` a
            falsy value (the default ``0``) selects 0.1.

    Returns:
        ``(visual_encoder, vision_width)``.

    Raises:
        AssertionError: if ``vit`` is neither ``'base'`` nor ``'large'``.
    """
    assert vit in ['base', 'large'], "vit parameter must be base or large"
    if vit=='base':
        vision_width, depth, num_heads = 768, 12, 12
        effective_drop_path = drop_path_rate
    elif vit=='large':
        vision_width, depth, num_heads = 1024, 24, 16
        # The previous `0.1 or drop_path_rate` always evaluated to 0.1 and
        # silently discarded the caller's value; honour an explicit rate and
        # fall back to 0.1 only when none was given.
        effective_drop_path = drop_path_rate or 0.1
    visual_encoder = VisionTransformer(img_size=image_size, patch_size=16, embed_dim=vision_width, depth=depth,
                                       num_heads=num_heads, use_grad_checkpointing=use_grad_checkpointing, ckpt_layer=ckpt_layer,
                                       drop_path_rate=effective_drop_path
                                      )
    return visual_encoder, vision_width

def is_url(url_or_filename):
    """Return True when ``url_or_filename`` parses with an ``http`` or ``https`` scheme."""
    scheme = urlparse(url_or_filename).scheme
    return scheme == "http" or scheme == "https"

def load_checkpoint(model,url_or_filename):
    """Load the ``'model'`` state dict of a checkpoint into ``model`` (non-strict).

    Args:
        model: module exposing ``visual_encoder`` (and optionally
            ``visual_encoder_m``) whose weights are to be loaded.
        url_or_filename: http(s) URL (fetched through ``download_cached_file``)
            or path of an existing local file.

    Returns:
        ``(model, msg)`` where ``msg`` is the result of
        ``model.load_state_dict(..., strict=False)``.

    Raises:
        RuntimeError: if ``url_or_filename`` is neither a URL nor an existing file.
    """
    if is_url(url_or_filename):
        checkpoint_path = download_cached_file(url_or_filename, check_hash=False, progress=True)
    elif os.path.isfile(url_or_filename):
        checkpoint_path = url_or_filename
    else:
        raise RuntimeError('checkpoint url or path is invalid')

    # NOTE(security): torch.load unpickles the file, and downloads are fetched
    # with check_hash=False -- only use checkpoints from trusted sources.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    state_dict = checkpoint['model']

    # Position embeddings are passed through interpolate_pos_embed together
    # with the target encoder (presumably to adapt them to its resolution).
    state_dict['visual_encoder.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder.pos_embed'],model.visual_encoder)

    # Build the model's state dict once instead of once per key.
    model_state = model.state_dict()
    if 'visual_encoder_m.pos_embed' in model_state:
        state_dict['visual_encoder_m.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder_m.pos_embed'],
                                                                         model.visual_encoder_m)

    # Drop checkpoint tensors whose shape disagrees with the model so the
    # non-strict load skips them instead of failing.
    for key, target in model_state.items():
        if key in state_dict and state_dict[key].shape != target.shape:
            del state_dict[key]

    msg = model.load_state_dict(state_dict,strict=False)
    return model,msg
    

def load_checkpoint_swinbase(model,url_or_filename,kwargs):
    """Load a Swin-B based checkpoint into ``model`` (non-strict).

    Args:
        model: module receiving the weights.
        url_or_filename: http(s) URL (fetched through ``download_cached_file``)
            or path of an existing local file.
        kwargs: mapping that must contain ``'image_size'``; it selects the Swin
            config whose ``window_size`` is used to resize the relative
            position bias tables.  Supported sizes: 224, 384, 480, 576, 608.

    Returns:
        ``(model, msg)`` where ``msg`` is the result of
        ``model.load_state_dict(..., strict=False)``.

    Raises:
        ValueError: if ``kwargs['image_size']`` is not a supported size.
        RuntimeError: if ``url_or_filename`` is neither a URL nor an existing file.
    """
    image_size = kwargs['image_size']
    # NOTE(review): only the 384 config is resolved next to this module
    # (CUR_DIR); the other paths are relative to the working directory --
    # confirm those files are actually shipped.
    if image_size == 384:
        vision_config_path = f'{CUR_DIR}/config_swinB_384.json'
    else:
        relative_configs = {
            224: 'configs/swin/config_swinB_224.json',
            480: 'configs/swin/config_swinB_480.json',
            576: 'configs/swin/config_swinB_576.json',
            608: 'configs/swin/config_swinB_608.json',
        }
        if image_size not in relative_configs:
            # Previously this fell through and crashed later with an
            # UnboundLocalError that did not mention the real cause.
            raise ValueError(
                f"unsupported image_size {image_size!r} for swin_b; "
                f"expected one of 224, 384, 480, 576, 608"
            )
        vision_config_path = relative_configs[image_size]
    window_size = read_json(vision_config_path)['window_size']

    if is_url(url_or_filename):
        checkpoint_path = download_cached_file(url_or_filename, check_hash=False, progress=True)
    elif os.path.isfile(url_or_filename):
        checkpoint_path = url_or_filename
    else:
        raise RuntimeError('checkpoint url or path is invalid')

    # NOTE(security): torch.load unpickles the file, and downloads are fetched
    # with check_hash=False -- only use checkpoints from trusted sources.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    state_dict = checkpoint['model']

    # Target number of entries of a relative position bias table for this window size.
    dst_num_pos = (2 * window_size - 1) ** 2
    for k in list(state_dict.keys()):
        if 'relative_position_bias_table' in k:
            state_dict[k] = interpolate_relative_pos_embed(state_dict[k], dst_num_pos, param_name=k)
        elif ('relative_position_index' in k) or ('attn_mask' in k):
            # These entries are removed from the checkpoint and therefore not loaded.
            del state_dict[k]

    msg = model.load_state_dict(state_dict,strict=False)
    print('load checkpoint from %s'%url_or_filename)
    return model,msg
    

if __name__=="__main__":
    # Developer entry point: build the model with its default arguments and
    # stop in the debugger so the instance can be inspected interactively.
    model = Tag2Text_Caption()
    import pdb
    pdb.set_trace()


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/tag2Text/tag_class.py
================================================
import numpy as np


tra_array = ['tennis',
'bear cub',
'observatory',
'bicycle',
'hillside',
'judge',
'watercolor illustration',
'granite',
'lobster',
'livery',
'stone',
'ceramic',
'ranch',
'cloth',
'smile',
'building',
'tattoo',
'cricketer',
'cheek',
'pear',
'source',
'winter',
'surface',
'spray',
'ceremony',
'magic',
'curve',
'container',
'fair',
'medicine',
'baby',
'tennis racquet',
'ornament',
'bamboo',
'duckling',
'song',
'safari',
'team presentation',
'daffodil',
'cross',
'toothpaste',
'shield',
'fashion model',
'capsule',
'map',
'creek',
'glass house',
'glass plate',
'siding',
'corner',
'water buffalo',
'bison',
'figure skater',
'diploma',
'tire',
'race',
'cable car',
'brain',
'gas stove',
'soap bubble',
'palette',
'snowboard',
'school child',
'trench coat',
'monk',
'fiber',
'kitchen window',
'sunglass',
'coffee',
'security',
'strawberry',
'penguin',
'tree root',
'loaf',
'engagement ring',
'lamb',
'vector cartoon illustration',
'sandwich',
'mountain village',
'shape',
'charm',
'fiction',
'knot',
'greenhouse',
'sushi',
'text',
'disaster',
'trophy',
'gang',
'strap',
'soccer game',
'cardinal',
'tee',
'turtle',
'water surface',
'grassland',
'dolphin',
'store',
'dirt',
'iceberg',
'pergola',
'farmer market',
'publicity portrait',
'tote bag',
'teenage girl',
'view mirror',
'session',
'commuter',
'dressing room',
'tricycle',
'christmas ball',
'headlight',
'police',
'armchair',
'chart',
'yacht',
'saw',
'printer',
'rock band',
'gingerbread house',
'tag',
'table lamp',
'hockey game',
'slope',
'font',
'wicker basket',
'jewelry',
'quarter',
'software',
'weapon',
'pin',
'worship',
'painter',
'goal',
'morning light',
'bike',
'baseball bat',
'elevator',
'cuisine',
'sausage',
'stunt',
'wrestler',
'statue',
'landing',
'pillar',
'willow tree',
'sea wave',
'chicken',
'peanut',
'muscle',
'bob',
'tv genre',
'bathroom window',
'radish',
'textile',
'pelican',
'marketplace',
'crest',
'elevation map',
'gift',
'parish',
'traffic light',
'campfire',
'fog',
'award winner',
'beach ball',
'mat',
'white house',
'plaster',
'moped',
'football team',
'solution',
'bicyclist',
'bit',
'playground',
'darkness',
'cake',
'maple leave',
'mold',
'cracker',
'blueberry',
'rubble',
'container ship',
'pedestrian bridge',
'snail',
'parrot',
'form',
'circuit',
'highlight',
'pickup truck',
'koala',
'rain',
'system',
'weather',
'raincoat',
'soccer team',
'windshield',
'thunderstorm',
'mike',
'bird house',
'bridge',
'grandfather',
'restroom',
'animation',
'wilderness',
'clown',
'banana',
'brown',
'braid',
'dining room',
'kindergarten',
'launch event',
'purple',
'school',
'stairwell',
'brooch',
'movie poster image',
'mountain river',
'shelf',
'wicket',
'headboard',
'buddha',
'flower field',
'dugout',
'cd',
'bald eagle',
'lagoon',
'seaweed',
'agriculture',
'emergency service',
'maple tree',
'parachute',
'continent',
'amusement park',
'remote',
'bun',
'tackle',
'hospital',
'garage door',
'birthday party',
'friendship',
'go',
'mausoleum',
'jeep',
'raccoon',
'step',
'ice hockey team',
'cigarette',
'lace dress',
'forest floor',
'mall',
'captain',
'milk',
'golf course',
'meal',
'picnic table',
'sail',
'volleyball',
'canal',
'terrace',
'computer desk',
'caravan',
'hotel',
'cheerleader',
'nurse',
'museum',
'marsh',
'fox',
'plateau',
'night',
'twin',
'letter logo',
'autumn tree',
'powder',
'convention',
'creature',
'lighthouse',
'shop window',
'jacket',
'stork',
'taxi',
'trade',
'blackboard',
'olive',
'road sign',
'resort',
'snowflake',
'cemetery',
'travel',
'evening dress',
'picnic',
'drink',
'winter morning',
'football player',
'snack',
'boxing glove',
'dinner party',
'airline',
'swing',
'port',
'wheelbarrow',
'bathroom sink',
'sweater',
'ambulance',
'gear',
'oil',
'wii controller',
'array',
'home office',
'car show',
'mixture',
'profession',
'tree frog',
'square',
'facility',
'coral reef',
'sea wall',
'pizza',
'exhibit',
'demolition',
'trout',
'ring',
'coffee shop',
'bracelet',
'bean',
'lip',
'fencing',
'landscape',
'sitting',
'package',
'metal',
'bust',
'king',
'hair',
'window seat',
'wildlife',
'trunk',
'greenery',
'stencil',
'fire hydrant',
'bridesmaid',
'plaza',
'alps',
'tower bridge',
'crop top',
'crossing',
'cinema',
'pedestrian crossing',
'family',
'shopping cart',
'stomach',
'church building',
'screen door',
'skater',
'soccer field',
'kettle',
'mussel',
'raindrop',
'candy cane',
'water lily',
'flower girl',
'desert',
'enclosure',
'christmas light',
'kitchen',
'caterpillar',
'plaid',
'bath',
'bush',
'mud',
'ballet',
'knee',
'adult',
'raft',
'sea view',
'cactus',
'office chair',
'overall',
'rim',
'scaffolding',
'pig',
'cover',
'poster page',
'sprinkle',
'chandelier',
'algae',
'traffic',
'surfboard',
'book',
'filming',
'flash',
'mansion',
'camouflage',
'trouser',
'ticket',
'weed',
'cab',
'trench',
'elephant',
'huddle',
'sphere',
'christmas decoration',
'city',
'launch',
'doll',
'christmas ornament',
'fabric',
'bikini',
'biplane',
'breakfast',
'neighbourhood',
'race track',
'foliage',
'avocado',
'school bus',
'footwear',
'highway',
'ocean view',
'art vector illustration',
'wall clock',
'curtain',
'teenager',
'kitchen area',
'robot',
'tusk',
'lounge chair',
'beam',
'paddle',
'camel',
'lid',
'world map',
'city view',
'newlywed',
'cargo ship',
'yellow',
'exhibition',
'bend',
'novel',
'wool',
'ontario',
'bread',
'campus',
'coastline',
'cutting board',
'booth',
'table top',
'carpet',
'beach chair',
'workout',
'street food',
'fun',
'costumer film designer',
'gadget',
'artist',
'fishing village',
'builder',
'violinist',
'iphone',
'spider web',
'traffic sign',
'ruin',
'rescue',
'clipboard',
'seal',
'film director',
'paw',
'nursery',
'intersection',
'tomato sauce',
'taste',
'paddy field',
'christmas tree',
'wave',
'stool',
'watering can',
'rug',
'daytime',
'subway station',
'craft',
'pine forest',
'black',
'planet',
'motif',
'christmas market',
'glass window',
'college',
'wheat',
'damage',
'rectangle',
'picture frame',
'chess',
'guest room',
'street corner',
'religion',
'seed',
'puzzle',
'freeway',
'beauty',
'ocean',
'watch',
'mother',
'garage',
'quote',
'dj',
'supporter',
'hip hop artist',
'muffin',
'eiffel tower',
'cash',
'firefighter',
'cauliflower',
'bunker',
'sled',
'manicure',
'shark',
'stall',
'jungle',
'family home',
'tour bus',
'chimney',
'touchdown',
'roundabout',
'coyote',
'street scene',
'tank',
'wedding dress',
'mantle',
'bedroom window',
'coconut',
'chapel',
'goat',
'living space',
'rock wall',
'polka dot',
'railway',
'mandala',
'mango',
'lesson',
'mountain landscape',
'team photo',
'bookshelf',
'meter',
'bulldog',
'evening sun',
'stick',
'card',
'pink',
'fish pond',
'paint',
'pill',
'cart',
'pea',
'van',
'album',
'football college game',
'mountain pass',
'doughnut',
'ski slope',
'match',
'official',
'shadow',
'organ',
'celebration',
'coin',
'log cabin',
'firework display',
'present',
'twig',
'chef',
'confetti',
'footpath',
'tour',
'ponytail',
'artwork',
'race car',
'club',
'season',
'hose',
'pencil',
'aircraft',
'rock formation',
'wardrobe',
'participant',
'politician',
'engineer',
'peace',
'filter',
'sailing boat',
'water bottle',
'service dog',
'poodle',
'loki',
'statesman',
'sleeping bag',
'outskirt',
'clock',
'factory',
'oak tree',
'physician',
'color',
'room',
'stairway',
'company',
'lady',
'graph',
'faucet',
'tablecloth',
'subway train',
'chocolate chip cookie',
'headquarters',
'screw',
'goggle',
'halloween',
'city street',
'swirl',
'cord',
'forward',
'bone',
'bedding',
'archway',
'wig',
'lobby',
'mask',
'attic',
'kitchen table',
'skylight',
'fire',
'exit',
'oil painting',
'passenger',
'meditation',
'salmon',
'fedora',
'rubber stamp',
'orange juice',
'arch',
'scientist',
'stroll',
'manhattan',
'float',
'baseball uniform',
'circle',
'church',
'decker bus',
'competitor',
'zoo',
'basketball team',
'tourist',
'daughter',
'silverware',
'ceiling fan',
'birth',
'vase',
'jack',
'mushroom',
'spiral',
'cage',
'limb',
'salad',
'ad',
'control',
'earth',
'party',
'bolt',
'tractor',
'barley',
'wedding photo',
'hawk',
'warehouse',
'vegetable garden',
'chocolate cake',
'cabbage',
'floor window',
'baby shower',
'magnifying glass',
'table',
'stethoscope',
'reading',
'mission',
'croissant',
'gift box',
'rocket',
'forest road',
'cooking',
'suite',
'hill country',
'motorcycle',
'baseball player',
'angle',
'drug',
'sport association',
'championship',
'family portrait',
'florist',
'softball',
'egret',
'office',
'plywood',
'jockey',
'mosque',
'brunch',
'beanie',
'office building',
'pattern',
'calendar',
'indoor',
'pepper',
'ledge',
'trail',
'fuel',
'laptop computer',
'tennis shoe',
'deck chair',
'guitarist',
'barn',
'surgery',
'cartoon illustration',
'nebula',
'railroad',
'mountain goat',
'goose',
'car door',
'cheer',
'liquid',
'hardwood floor',
'pathway',
'acorn',
'gull',
'airliner',
'couch',
'lake house',
'spaghetti',
'promenade',
'collection',
'garden',
'bank',
'robin',
'tennis ball',
'peony',
'gymnast',
'lavender',
'deck',
'test',
'riverside',
'rapper',
'domino',
'bride',
'mouse',
'basil',
'wedding couple',
'ocean wave',
'arm',
'kitchen floor',
'grove',
'family member',
'backyard',
'raspberry',
'forest fire',
'officer',
'hibiscus',
'canyon',
'composer',
'signature',
'olive oil',
'hibiscus flower',
'rose',
'vector icon',
'sunrise',
'horseback',
'motor scooter',
'office worker',
'tradition',
'ingredient',
'washing machine',
'lighting',
'bagel',
'sailboat',
'policeman',
'mare',
'graphic',
'halloween pumpkin',
'stock',
'pilot',
'education',
'team',
'body',
'horse',
'kimono',
'bazaar',
'bag',
'recording studio',
'parsley',
'entrance',
'denim',
'vet',
'horse farm',
'charcoal',
'architecture',
'glass vase',
'puppy',
'estuary',
'television show host',
'city bus',
'shoulder',
'beast',
'balance',
'golfer',
'roadside',
'denim jacket',
'stone wall',
'counter top',
'app icon',
'toast',
'head coach',
'ham',
'warrior',
'gem',
'refrigerator',
'snowman',
'construction worker',
'coal',
'website',
'morning fog',
'mustard',
'human',
'owl',
'puppy dog',
'piggy bank',
'vegetation',
'pirate',
'action film',
'marshmallow',
'thanksgiving',
'business',
'disease',
'signage',
'greeting',
'skate park',
'tile',
'mouth',
'spinach',
'vacation',
'leader',
'shrine',
'walker',
'science fiction film',
'bill',
'rabbit',
'motor boat',
'bar',
'radio',
'barge',
'tail',
'chainsaw',
'gallery',
'rainbow',
'pasta',
'padlock',
'web',
'pastry',
'ink',
'reef',
'school uniform',
'shawl',
'treasure',
'peach',
'dinner table',
'injury',
'harbor',
'witch',
'car dealership',
'litter',
'gesture',
'documentary',
'marriage',
'sea shell',
'priest',
'dome',
'kit',
'icon',
'seaside',
'bucket',
'entertainment',
'stable',
'hat',
'puddle',
'sock',
'shopper',
'technology',
'harbour',
'orbit',
'antler',
'tube',
'flag waving',
'cook',
'tight',
'commander',
'farmland',
'switch',
'hiker',
'wedding ceremony',
'award ceremony',
'champion',
'chopstick',
'farmhouse',
'performer',
'spike',
'accident',
'cruise ship',
'passenger train',
'attraction',
'entertainer',
'rear view',
'sidewalk',
'parade',
'racing',
'plane',
'ritual',
'peacock',
'pocket',
'plum',
'drop',
'carrot',
'floor',
'sunset',
'troop',
'architect',
'coffee table',
'dust',
'outline',
'leather',
'charity event',
'heat',
'whale',
'laundry',
'coconut tree',
'crosswalk',
'pony',
'ant',
'pipe',
'string',
'coat',
'angel',
'beef',
'church tower',
'dish',
'pitch',
'cupboard',
'thermometer',
'dirt field',
'fireworks',
'minute',
'cane',
'pajama',
'flower garden',
'autumn',
'trash can',
'dachshund',
'banana tree',
'tray',
'moose',
'roadway',
'carnival',
'antenna',
'pole',
'castle wall',
'ram',
'cattle',
'hay',
'cookie',
'swimmer',
'baseball team',
'strait',
'hedge',
'jet',
'fire pit',
'octopus',
'calf',
'cube',
'opera',
'cardboard box',
'tiara',
'kitchen sink',
'prairie',
'bowl',
'galaxy',
'straw hat',
'linen',
'ski resort',
'stitch',
'street lamp',
'motorist',
'icicle',
'stain',
'flora',
'drain',
'kitchen cabinet',
'decor',
'bouquet',
'pound',
'interior design',
'nail polish',
'figurine',
'tomb',
'disc',
'twist',
'blouse',
'ribbon',
'figure',
'burger',
'cork',
'soccer goalkeeper',
'train bridge',
'drinking water',
'dew',
'baker',
'storm cloud',
'tarmac',
'tv drama',
'sponge',
'magnet',
'sailor',
'entry',
'swan',
'exercise',
'sloth',
'jewel',
'scuba diver',
'bite',
'cat tree',
'tent',
'can',
'tennis match',
'ecosystem',
'picket fence',
'palm',
'train car',
'frying pan',
'rally',
'tablet pc',
'reindeer',
'image',
'wolf',
'chin',
'conservatory',
'flood water',
'cityscape',
'beach sand',
'car park',
'pavement',
'farm field',
'swimming',
'winter storm',
'stem',
'pillow',
'inning',
'gorilla',
'desk',
'avenue',
'fern',
'money',
'pearl',
'train station',
'skillet',
'nap',
'barber',
'library',
'freezer',
'label',
'rainforest',
'parking sign',
'mirror',
'wing',
'noodle',
'press room',
'sculpture',
'tablet',
'viewer',
'prayer',
'mini',
'mechanic',
'laugh',
'rice field',
'hand',
'mustache',
'mountain road',
'catwalk',
'conference',
'cape',
'installation',
'musician',
'stream',
'machine',
'speech',
'crocodile',
'soccer match',
'town square',
'passport',
'post box',
'point',
'stone building',
'motorway',
'mix',
'dentist',
'businessperson',
'happiness',
'boat',
'vineyard',
'treadmill',
'glass wall',
'water droplet',
'coffee mug',
'graduate',
'sunflower',
'parliament',
'shepherd',
'movie',
'wine',
'orchard',
'tulip',
'motherboard',
'cup',
'broom',
'spot',
'drawing',
'polo shirt',
'graduation',
'film producer',
'moonlight',
'glow',
'film format',
't shirt',
'rock face',
'sword',
'clinic',
'festival day',
'meadow',
'staple',
'pupil',
'training ground',
'rider',
'flower',
'foal',
'wharf',
'foot bridge',
'shooting',
'top',
'mast',
'police car',
'robe',
'wedding bouquet',
'stop sign',
'birthday cake',
'glitter',
'butter',
'scooter',
'tundra',
'superhero',
'pocket watch',
'inscription',
'youngster',
'fruit tree',
'movie poster',
'engine',
'foundation',
'motorcyclist',
'take',
'woman',
'antelope',
'country artist',
'road trip',
'typewriter',
'tuxedo',
'brand',
'pine',
'bathroom',
'paradise',
'texture',
'balloon',
'dining table',
'home',
'computer screen',
'actor',
'clip',
'tv tower',
'panorama',
'summit',
'cat',
'plot',
'eagle',
'dancer',
'pup',
'studio shot',
'tear',
'bird bath',
'classroom',
'bookstore',
'city wall',
'tv programme',
'blade',
'easel',
'buttercream',
'sweet',
'designer',
'diamond',
'handshake',
'herb',
'corn field',
'seafront',
'concrete',
'street artist',
'gas',
'stamp',
'window display',
'paper',
'note',
'pint',
'quarry',
'research',
'fixture',
'manager',
'soil',
'leopard',
'board game',
'ladder',
'stop light',
'island',
'ramp',
'football match',
'icing',
'drill',
'currency',
'summer evening',
'topping',
'pyramid',
'pomegranate',
'cell',
'ivy',
'squad',
'scenery',
'computer',
'locomotive',
'surf',
'mascot',
'dune',
'path',
'duck',
'twilight',
'wire',
'bow tie',
'strike',
'cormorant',
'car wash',
'crane',
'market',
'philosopher',
'alarm clock',
'camera',
'birch',
'greeting card',
'plain',
'clay',
'donut',
'lock',
'moth',
'laboratory',
'fan',
'violin',
'jazz fusion artist',
'mountain biker',
'terrain',
'magazine',
'pickup',
'comedy film',
'smartphone',
'film',
'bed',
'microwave oven',
'tournament',
'lawn',
'car window',
'alligator',
'screen',
'jetty',
'shopping bag',
'landscape view',
'cabinetry',
'friendly match',
'thing',
'petal',
'shopping center',
'transport',
'ballet dancer',
'shoreline',
'princess',
'car seat',
'parking meter',
'green',
'vodka',
'band',
'rock',
'costume',
'warning sign',
'strip',
'plaque',
'wheelchair',
'headband',
'ginger',
'dice',
'media',
'hairdresser',
'press',
'living room',
'stove',
'player',
'cherry',
'workshop',
'carving',
'embroidery',
'doodle',
'adventure',
'rugby player',
'monument',
'brush',
'marker',
'loft',
'postcard',
'collage',
'ball',
'professor',
'dresser',
'gig',
'festival',
'blackbird',
'makeup artist',
'video camera',
'sticker',
'peak',
'wildflower',
'santa hat',
'rodeo',
'wedding photographer',
'guy',
'staff',
'waterfall',
'operation',
'defender',
'falcon',
'haze',
'individual',
'gentleman',
'greyhound',
'rocking chair',
'rice',
'garbage',
'platter',
'chocolate',
'splash',
'business suit',
'cheetah',
'valley',
'maze',
'trampoline',
'garland',
'slalom',
'unicorn',
'tree stump',
'painting',
'romance',
'fight',
'alcohol',
'ghost',
'fondant',
'spa',
'shutter',
'death',
'demonstration',
'cotton',
'pier',
'flea market',
'history',
'savannah',
'fist',
'aisle',
'crew',
'jug',
'pose',
'anchor',
'teapot',
'boat house',
'business team',
'tripod',
'bee',
'pebble',
'mattress',
'canvas',
'hallway',
'campaign',
'pod',
'lake district',
'article',
'white',
'sofa',
'honey',
'marathon',
'pancake',
'tourist attraction',
'wedding gown',
'battle',
'shelving',
'sea',
'sheet music',
'pie',
'yarn',
'construction site',
'flyer',
'tie',
'star',
'lettuce',
'martial artist',
'dart',
'straw',
'reflection',
'conference room',
'temperature',
'rugby',
'mosquito',
'physicist',
'rock climber',
'crash',
'backdrop',
'toilet seat',
'sand castle',
'water park',
'toy car',
'waste',
'luxury',
'hangar',
'rv',
'tree trunk',
'board',
'gold',
'project picture',
'cap',
'cottage',
'relief',
'attire',
'microscope',
'battery',
'roll',
'line',
'parking garage',
'crystal',
'broadcasting',
'brick wall',
'lab',
'flooring',
'meeting',
'3d cg rendering',
'desktop computer',
'cowboy',
'sailing ship',
'junction',
'hairstyle',
'homework',
'profile',
'model',
'flower pot',
'street light',
'salt lake',
'maple',
'space',
'blizzard',
'throw',
'zebras',
'brochure',
'constellation',
'beak',
'kilt',
'pond',
'blue sky',
'sneaker',
'sand dune',
'morning sun',
'almond',
'grill',
'curl',
'basketball girl game',
'chameleon',
'toilet bowl',
'prince',
'keyboard',
'queen',
'computer monitor',
'writing',
'crown',
'basilica',
'kiss',
'house',
'parking',
'football competition',
'shell',
'sport equipment',
'comedy',
'baboon',
'vendor',
'rise building',
'wrap',
'food truck',
'cat bed',
'rickshaw',
'flare',
'teal',
'nectar',
'eclipse',
'vehicle',
'steam locomotive',
'gorge',
'cow',
'christmas card',
'demonstrator',
'memorial',
'towel',
'jewellery',
'train',
'frisbee',
'baseball game',
'fur',
'afternoon sun',
'community',
'sparkler',
'bandage',
'firework',
'dollar',
'pasture',
'video',
'bus',
'tree house',
'seashore',
'field',
'hamburger',
'souvenir',
'hedgehog',
'worm',
'pine cone',
'osprey',
'dinosaur',
'vegetable',
'junk',
'poster',
'army',
'winger',
'bundle',
'stage',
'growth',
'wedding party',
'service',
'blanket',
'ruler',
'eye',
'credit card',
'castle',
'diner',
'hut',
'elk',
'hard rock artist',
'nun',
'dog breed',
'nest',
'drama film',
'number icon',
'water tank',
'giraffe',
'altar',
'pavilion',
'tv personality',
'suv',
'street vendor',
'street sign',
'ditch',
'debris',
'foam',
'takeoff',
'spice',
'mountain lake',
'tea',
'orchestra',
'spacecraft',
'counter',
'abbey',
'mountain',
'hydrangea',
'racer',
'orange tree',
'tide',
'cowboy hat',
'rapid',
'town',
'wild',
'herd',
'vein',
'driveway',
'jar',
'bark',
'illustration',
'horror film',
'corn',
'stroller',
'industry',
'mountain stream',
'gym',
'neckline',
'pan',
'client',
'spectator',
'eggplant',
'camper',
'fawn',
'hoodie',
'meat',
'lemonade',
'food market',
'slum',
'comic book character',
'flower market',
'love',
'palace',
'gun',
'heel',
'shopping street',
'shooting basketball guard',
'family photo',
'rooftop',
'laundry basket',
'airport runway',
'horn',
'face mask',
'flight',
'appetizer',
'violet',
'country lane',
'cement',
'instrument',
'tv actor',
'spark',
'celebrity',
'award',
'country house',
'standing',
'auction',
'date',
'engagement',
'puck',
'advertisement',
'chair',
'zebra',
'driftwood',
'bumblebee',
'maple leaf',
'bonnet',
'orange',
'water tower',
'door',
'singer',
'floor plan',
'discussion',
'theatre',
'pilgrim',
'mug',
'branch',
'window sill',
'baseball pitcher',
'bakery',
'lollipop',
'basketball player',
'toilet paper',
'chalkboard',
'cabin',
'sign',
'night sky',
'cannon',
'fishing net',
'submarine',
'suit',
'fur coat',
'wine bottle',
'folder',
'street art',
'suspension bridge',
'evening sky',
'billboard',
'postage stamp',
'newspaper',
'transportation',
'surgeon',
'light',
'park',
'horizon',
'road',
'sand bar',
'trumpet',
'lounge',
'cloud forest',
'birthday celebration',
'balcony',
'anime',
'beehive',
'umbrella',
'goldfish',
'baseball cap',
'waterhole',
'ceiling',
'carousel',
'backpack',
'plant pot',
'atmosphere',
'sunflower field',
'spire',
'vision',
'woodpecker',
'chip',
'pool table',
'lotus flower',
'cone',
'humpback whale',
'reservoir',
'hunt',
'piano',
'plate',
'dining area',
'luggage',
'skier',
'dance floor',
'crow',
'stair',
'overpass',
'opera house',
'bear',
'jazz artist',
'water',
'vessel',
'cast',
'yard',
'cathedral',
'basketball hoop',
'graveyard',
'sound',
'berry',
'onlooker',
'fauna',
'birch tree',
'retail',
'hill',
'skeleton',
'journalist',
'frost',
'basket',
'nail',
'dusk',
'trash',
'dawn',
'clover',
'hen',
'volcano',
'basketball coach',
'home decor',
'charge',
'haircut',
'sense',
'university',
'lizard',
'daisy',
'tablet computer',
'grass field',
'prison',
'metal artist',
'bathroom mirror',
'window frame',
'chest',
'flavor',
'pop country artist',
'market square',
'monkey',
'blog',
'deer',
'speech bubble',
'dog',
'independence day',
'girl',
'boy',
'tartan',
'furniture',
'appliance',
'office window',
'fish boat',
'sand box',
'tv sitcom',
'drama',
'sleigh',
'depression',
'paper towel',
'baseball',
'protestor',
'grape',
'wedding cake',
'invitation',
'accessory',
'pick',
'grandparent',
'racket',
'tea plantation',
'outdoors',
'egg',
'glass bowl',
'sun',
'organization',
'lion',
'panel',
'station',
'wallpaper',
'helicopter',
'salt',
'vanity',
'patio',
'lunch',
'street performer',
'mountain range',
'soup',
'bacon',
'power station',
'cantilever bridge',
'hummingbird',
'shirt',
'rope',
'hip',
'chalk',
'pendant',
'choir',
'tv',
'lichen',
'railway bridge',
'art gallery',
'bartender',
'wagon',
'baby elephant',
'accordion',
'horseshoe',
'building site',
'clutch',
'harvest',
'savanna',
'geranium',
'business woman',
'paddock',
'patch',
'beech tree',
'war',
'suburbs',
'hospital bed',
'motorcycle racer',
'moss',
'gravel',
'government agency',
'dollar bill',
'father',
'fjord',
'concert',
'nut',
'wedding photography',
'finish line',
'home plate',
'food',
'nose',
'thumb',
'village',
'dining room table',
'bumper',
'monster',
'blackberry',
'lime',
'conflict',
'gala',
'wallet',
'wrist',
'hug',
'mermaid',
'lava',
'lawyer',
'folk rock artist',
'arena',
'onion',
'toothbrush',
'fashion',
'perfume',
'flip',
'triangle',
'woodland',
'mail',
'grasshopper',
'studio',
'wood floor',
'den',
'racquet',
'cello',
'lemur',
'astronaut',
'glass table',
'blood',
'dvd',
'planter',
'silver',
'leash',
'master bedroom',
'forest',
'batter',
'shoe',
'engraving',
'opening',
'product',
'toe',
'cocktail',
'mallard duck',
'bike ride',
'oasis',
'wedding ring',
'cinematographer',
'holly',
'autograph',
'fence',
'ice cube',
'cove',
'pineapple',
'aurora',
'glass bead',
'produce',
'apartment building',
'cob',
'miniature',
'cockpit',
'flashlight',
'frog',
'sheep',
'groom',
'steel',
'watermelon',
'clip art',
'paper plate',
'ostrich',
'contour',
'mural',
'cub',
'paisley bandanna',
'winery',
'turn',
'handle',
'satellite',
'post',
'pork',
'child',
'asphalt',
'grocery store',
'vulture',
'trolley',
'nightclub',
'brick',
'trailer',
'compass',
'cereal',
'cafe',
'cartoon character',
'sugar',
'fiction book',
'glass floor',
'umpire',
'guitar',
'hamster',
'protester',
'airplane',
'garment',
'blazer',
'railway line',
'wedding',
'shoe box',
'parking lot',
'construction',
'graduation ceremony',
'tram',
'telescope',
'copper',
'pain',
'autumn forest',
'guest house',
'partner',
'crayon',
'dip',
'boot',
'corridor',
'computer keyboard',
'hockey player',
'chicken coop',
'bus station',
'gathering',
'ankle',
'bunk bed',
'wood table',
'football coach',
'monarch',
'pharmacy',
'legging',
'mannequin',
'female',
'train track',
'stack',
'canopy',
'design element',
'grandmother',
'symbol',
'beach hut',
'zucchini',
'bomb',
'businessman',
'skyscraper',
'tongue',
'case',
'sparkle',
'highland',
'ballroom',
'prom',
'estate',
'customer',
'archipelago',
'cheese',
'debate',
'carriage',
'bulldozer',
'pumpkin',
'sitting room',
'gas station',
'wedding reception',
'camp',
'dog bed',
'tower',
'property',
'river bed',
'pop latin artist',
'fridge',
'wine glass',
'coast',
'beer',
'tow truck',
'fire truck',
'mountain bike',
'thigh',
'heron',
'boat ride',
'gondola',
'turquoise',
'lake',
'llama',
'kitty',
'tin',
'waiting room',
'coffee cup',
'socialite',
'guard',
'tap',
'waterway',
'forehead',
'list',
'erosion',
'box',
'sea lion',
'pollen',
'dam',
'wasp',
'salon',
'tennis tournament',
'flower box',
'aquarium',
'rain cloud',
'clothing store',
'lead singer',
'cupcake',
'tortoise',
'lettering',
'sport facility',
'dance',
'dog house',
'nature',
'football',
'rooster',
'footballer',
'railway track',
'crowd',
'fishing rod',
'silhouette',
'wind turbine',
'sari',
'bus window',
'cloud',
'charity',
'medal',
'yoga',
'event',
'veil',
'fashion menswear milan week',
'news',
'knife',
'print',
'screen tv',
'walnut',
'fungus',
'ice cream',
'computer mouse',
'play',
'tribe',
'picture',
'video game',
'business card',
'music festival',
'rack',
'envelope',
'shower',
'dirt road',
'mine',
'oyster',
'monarch butterfly',
'dude',
'fruit salad',
'podium',
'fork',
'lace',
'test match',
'boulder',
'cricket player',
'staircase',
'peninsula',
'shopping',
'popcorn',
'oak',
'market stall',
'pine tree',
'mountaineer',
'student',
'closet',
'hood',
'handstand',
'centerpiece',
'insect',
'patient',
'makeover',
'tennis player',
'sheet',
'park bench',
'apple',
'organism',
'hook',
'turkey',
'tangerine',
'sibling',
'shopping mall',
'bird',
'scarf',
'smoothie',
'net',
'grass',
'napkin',
'ray',
'eyebrow',
'laptop keyboard',
'motorbike',
'woman hand',
'oven',
'book cover',
'easter egg',
'microwave',
'sand',
'snapshot',
'soccer ball',
'makeup',
'knight',
'bowling ball',
'shower curtain',
'flame',
'lightning',
'running',
'power plant',
'crib',
'cartoon',
'moat',
'fashion girl',
'wedding invitation',
'bottle',
'cliff',
'monastery',
'file photo',
'apartment',
'casino',
'cream',
'sweatshirt',
'storm',
'cruise',
'teddy bear',
'shovel',
'wind farm',
'writer',
'dock',
'professional',
'hotel room',
'job',
'monitor',
'donkey',
'pass',
'interview',
'duchess',
'mark',
'plank',
'beard',
'zombie',
'trio',
'channel',
'cricket team',
'windmill',
'vest',
'diagram',
'cable',
'winter scene',
'golden gate bridge',
'buffalo',
'studio portrait',
'pagoda',
'whiskey',
'freight train',
'kite',
'future',
'steam train',
'phone box',
'headset',
'wood',
'snowboarder',
'paper bag',
'slide',
'grapefruit',
'seating',
'morning',
'bronze sculpture',
'theatre actor',
'stump',
'jean',
'landmark',
'jam',
'waist',
'watercolor',
'hammock',
'light fixture',
'ice',
'basin',
'beverage',
'shelter',
'premiere',
'mound',
'ear',
'bronze',
'sunlight',
'street',
'energy',
'barn door',
'hike',
'fleet',
'claw',
'beach',
'pepperoni',
'bin',
'trainer',
'buffet',
'archive',
'toddler',
'referee',
'bay window',
'dove',
'production company',
'evening light',
'gate',
'farm',
'reed',
'fruit stand',
'explorer',
'snow storm',
'throw pillow',
'button',
'display case',
'bookcase',
'lead',
'lipstick',
'basketball court',
'cargo',
'ensemble',
'pope',
'clock tower',
'teen',
'speaker',
'rat',
'laptop',
'ski',
'mess',
'stadium',
'ferry boat',
'bunny',
'waterfront',
'downtown',
'sink',
'press conference',
'dinner',
'condiment',
'thread',
'audience',
'grid',
'car',
'plastic',
'people',
'barbecue',
'pigeon',
'urinal',
'seagull',
'volunteer',
'hockey',
'fir tree',
'pollution',
'trial',
'collar',
'area',
'meeting room',
'circus',
'yogurt',
'orangutan',
'viaduct',
'comedian',
'drone',
'scissor',
'pop rock artist',
'biscuit',
'panda',
'water feature',
'air balloon',
'remote control',
'watercolor painting',
'show',
'walk',
'post office',
'bike path',
'rap gangsta artist',
'microphone',
'crack',
'sunset sky',
'glass',
'tv show',
'cartoon style',
'stripe',
'foyer',
'signal',
'calligraphy',
'bulb',
'gardener',
'coffee bean',
'spider',
'tapestry',
'city skyline',
'necklace',
'kitten',
'traveler',
'veteran',
'frosting',
'fry',
'tennis court',
'tank top',
'butterfly house',
'mist',
'drummer',
'water level',
'scale',
'baseball glove',
'music video performer',
'champagne',
'camping',
'clothing',
'water drop',
'telephone box',
'pen',
'morning mist',
'fire engine',
'porch',
'opening ceremony',
'style',
'palm tree',
'fashion show',
'universe',
'scratch',
'axe',
'ottoman',
'explosion',
'rib',
'boutique',
'game',
'cucumber',
'fruit',
'stone bridge',
'nature reserve',
'track',
'train window',
'punch',
'telephone pole',
'velvet',
'sauce',
'moon',
'contrast',
'flamingo',
'bat',
'vending machine',
'ship',
'equestrian',
'shade',
'comforter',
'pallet',
'sparrow',
'wii',
'glaze',
'grocery',
'steeple',
'soccer player',
'contract',
'advertising',
'runner',
'chimpanzee',
'world',
'seat',
'project',
'chihuahua',
'bubble',
'willow',
'pedestal',
'soul hip hop artist',
'curb',
'drawer',
'leaf',
'banner',
'launch party',
'coach',
'government',
'snowball',
'toy',
'portrait',
'doctor',
'whiteboard',
'electronic',
'tiger',
'graffiti',
'column',
'nightstand',
'whistle',
'maxi dress',
'bench',
'wetsuit',
'bird feeder',
'football game',
'basketball',
'class',
'bathroom door',
'store window',
'text message',
'wreath',
'street view',
'binocular',
'pet',
'facade',
'drought',
'lemon',
'new year',
'night view',
'airplane window',
'specie',
'rule',
'jaw',
'wheat field',
'diet',
'pop artist',
'habitat',
'screenshot',
'scoreboard',
'shore',
'mane',
'quilt',
'ski lift',
'orchid',
'turban',
'christmas',
'airport',
'marina',
'glass door',
'glass bottle',
'restaurant',
'conductor',
'logo',
'sleep',
'tape',
'tomato',
'river bank',
'lilac',
'tooth',
'training',
'pottery',
'shop',
'steam engine',
'mason jar',
'base',
'procession',
'border',
'shoot',
'footprint',
'hotdog',
'bull',
'stocking',
'recreation',
'automobile model',
'design',
'country pop artist',
'river',
'retriever',
'department store',
'auditorium',
'sport car',
'supermarket',
'belt',
'cricket',
'window box',
'dress shirt',
'letter',
'residence',
'megaphone',
'pant',
'wildfire',
'bird nest',
'crab',
'swimsuit',
'candle',
'funeral',
'mill',
'national park',
'plant',
'cop',
'power line',
'perch',
'blue',
'finger',
'ferris wheel',
'globe',
'skateboard',
'helmet',
'movie theater',
'uniform',
'hammer',
'material',
'kid',
'well',
'butterfly',
'sideline',
'fashion fall show',
'planet earth',
'lift',
'male',
'sauna',
'gray',
'flour',
'sand sculpture',
'program',
'cabinet',
'infant',
'wheel',
'aircraft model',
'dough',
'garlic',
'skate',
'arrow',
'wrapping paper',
'ripple',
'lamp',
'iron',
'banknote',
'beaver',
'ferry',
'courtyard',
'bassist',
'countryside',
'steak',
'comfort',
'boxer',
'laundry room',
'campsite',
'brick building',
'golf',
'subway',
'headphone',
'fort',
'handbag',
'drum',
'flood',
'saddle',
'bass',
'labyrinth',
'needle',
'sun ray',
'app',
'menu',
'president',
'cardigan',
'dandelion',
'wetland',
'ice hockey player',
'number',
'city hall',
'fishing',
'portrait session',
'pug',
'key',
'art print',
'minister',
'hurdle',
'emergency',
'painting artist',
'flag pole',
'evening',
'purse',
'recipe',
'golf ball',
'coloring book',
'mountain peak',
'senior',
'holiday',
'bud',
'cousin',
'pantry',
'lap',
'skin',
'flag',
'tissue paper',
'ridge',
'wire fence',
'surfer',
'climber',
'photograph',
'sewing machine',
'cooler',
'actress',
'apple tree',
'cancer',
'starfish',
'automobile make',
'dumbbell',
'brace',
'tunnel',
'window',
'paint artist',
'composition',
'school student',
'condo',
'convertible',
'cushion',
'selfie',
'territory',
'guide',
'tree',
'court',
'shrimp',
'stone house',
'dress',
'eyelash',
'juice',
'broccoli',
'chain',
'tourism',
'mountain top',
'concept car',
'film premiere',
'light bulb',
'cafeteria',
'badge',
'flower bed',
'theater',
'root',
'racecar driver',
'basketball boy game',
'glove',
'skyline',
'wall',
'glacier',
'airport terminal',
'bug',
'trim',
'railway station',
'briefcase',
'flat',
'fountain',
'person',
'lane',
'asparagus',
'art',
'lantern',
'dishwasher',
'director',
'snake',
'lecture',
'game controller',
'tree branch',
'pub',
'bathing suit',
'queue',
'belly',
'poppy',
'bow',
'pitcher',
'ice cream cone',
'cave',
'candy',
'road bridge',
'host',
'traffic jam',
'earring',
'file',
'foot',
'watermark overlay stamp',
'mailbox',
'supercar',
'railing',
'bedroom',
'seafood',
'waffle',
'bronze statue',
'plan',
'flow',
'marble',
'basketball game',
'automobile',
'scene',
'cypress tree',
'soldier',
'skateboarder',
'glass building',
'cherry tree',
'pump',
'grain',
'wildebeest',
'loop',
'frame',
'bathtub',
'saxophone',
'diver',
'stalk',
'lily',
'bead',
'alley',
'flock',
'family room',
'manufacturing',
'pointer',
'worker',
'navy',
'potato',
'teacher',
'photography',
'dolly',
'boardwalk',
'water fountain',
'athlete',
'side dish',
'bay',
'ice hockey',
'phone',
'hero',
'face',
'gold medal',
'blind',
'swamp',
'researcher',
'swim',
'meatball',
'iguana',
'leather jacket',
'jellyfish',
'site',
'smoke',
'traffic signal',
'melon',
'beetle',
'calculator',
'skirt',
'plantation',
'sculptor',
'barrier',
'catcher',
'security guard',
'sketch',
'awning',
'steering wheel',
'mountain view',
'bus stop',
'pool',
'leg',
'spotlight',
'apron',
'mineral',
'inlet',
'sleeve',
'torch',
'emotion',
'march',
'police officer',
'performance',
'lamp post',
'fishing boat',
'summer',
'presentation',
'saucer',
'suitcase',
'supermodel',
'goalkeeper',
'shrub',
'rock artist',
'document',
'beach house',
'man',
'blue artist',
'cigar',
'railroad track',
'gown',
'mosaic',
'bungalow',
'alphabet',
'baseball field',
'shed',
'pedestrian',
'rail',
'soap',
'kitchen counter',
'dessert',
'dunk',
'blossom',
'conversation',
'fruit market',
'glass jar',
'military',
'beer bottle',
'photographer',
'tennis racket',
'competition',
'escalator',
'bell tower',
'stilt',
'ballerina',
'television',
'feather',
'fence post',
'rear',
'dahlia',
'red carpet',
'tub',
'hole',
'fortress',
'pack',
'telephone',
'cardboard',
'city park',
'platform',
'college student',
'arch bridge',
'wind',
'blender',
'bloom',
'ice rink',
'birthday',
'raven',
'fairy',
'embankment',
'hall',
'flower shop',
'suburb',
'barrel',
'biker',
'steam',
'dragonfly',
'formation',
'electricity',
'business people',
'symmetry',
'walkway',
'fisherman',
'gas mask',
'loch',
'youth',
'hanger',
'dot',
'fish',
'street market',
'animation film',
'crime fiction film',
'boar',
'emblem',
'halloween costume',
'kangaroo',
'couple',
'spoon',
'squirrel',
'neon sign',
'sky',
'office desk',
'beauty salon',
'breakwater',
'fashion look',
'toaster',
'author',
'news conference',
'outdoor',
'canoe',
'dragon',
'tool',
'shopping centre',
'ladybug',
'swimming pool',
'landscaping',
'ski pole',
'red',
'truck',
'fly',
'temple',
'level',
'sunday',
'railroad bridge',
'car mirror',
'lawn mower',
'flute',
'aircraft carrier',
'fashion menswear london week',
'sunshine',
'tile floor',
'skull',
'fossil',
'flower arrangement',
'diaper',
'sea turtle',
'cherry blossom',
'fireman',
'shack',
'lens',
'waiter',
'animal',
'basement',
'snow',
'autumn park',
'glass box',
'kick',
'head',
'anniversary',
'vine',
'back',
'paper lantern',
'fish tank',
'cellphone',
'silk',
'coral',
'notebook',
'photo',
'gazebo',
'ketchup',
'driver',
'farmer',
'bonfire',
'chestnut',
'photoshoot',
'football field',
'olive tree',
'pheasant',
'sandal',
'toilet',
'fireplace',
'music',
'deity',
'fish market',
'fig',
'bell',
'neck',
'grave',
'villa',
'cyclist',
'crate',
'grey',
'asphalt road',
'soccer',
'hostel',
'municipality',
'courthouse',
'roof',
'end table',
'pot',
'sedan',
'structure',
'folk artist',
'sport',
'sport team',
'protest',
'syringe',
'fashion designer',
'jersey',
'heart shape',
'kayak',
'stare',
'sit with',
'direct',
'read',
'photograph',
'spin',
'teach',
'laugh',
'carve',
'grow on',
'warm',
'watch',
'stretch',
'smell',
'decorate',
'shine',
'light',
'dance',
'send',
'park',
'chase',
'collect',
'lead',
'kiss',
'lead to',
'lick',
'smile',
'cheer',
'sit',
'point',
'block',
'rock',
'drop',
'cut',
'ski',
'wrap',
'lose',
'serve',
'provide',
'sleep',
'dress',
'embrace',
'burn',
'pack',
'stir',
'create',
'touch',
'wash',
'stick',
'reveal',
'shop',
'train',
'paint',
'groom',
'hunt',
'bloom',
'play',
'pay',
'brush',
'shoot',
'hold',
'picture',
'carry',
'sip',
'contain',
'turn',
'pour',
'pitch',
'give',
'add',
'blow',
'look in',
'show',
'walk',
'illuminate',
'kneel',
'cover',
'drag',
'post',
'present',
'fit',
'operate',
'fish',
'race',
'write',
'deliver',
'peel',
'push',
'run',
'sit around',
'buy',
'jump',
'walk on',
'attend',
'clean',
'sell',
'ride on',
'mount',
'host',
'dry',
'plant',
'sing',
'row',
'shake',
'perch',
'ride',
'fight',
'skateboard',
'live',
'call',
'surround',
'practice',
'play on',
'work on',
'step',
'relax',
'hit',
'fall in',
'flow',
'greet',
'launch',
'wear',
'hang on',
'drive',
'sit in',
'break',
'learn',
'fly',
'connect',
'display',
'locate',
'compete',
'go for',
'sail',
'lift',
'toast',
'help',
'run on',
'reflect',
'pose',
'scratch',
'frame',
'dribble',
'herd',
'enter',
'exit',
'place',
'inspect',
'build',
'pick',
'fill',
'grind',
'skate',
'offer',
'float',
'sit by',
'stand',
'release',
'rest',
'singe',
'climb',
'tie',
'mark',
'lay',
'stand around',
'capture',
'set',
'land',
'swinge',
'run in',
'kick',
'lean',
'head',
'sign',
'approach',
'swim',
'close',
'crash',
'control',
'fall',
'remove',
'repair',
'open',
'appear',
'travel',
'load',
'miss',
'check',
'surf',
'moor',
'smoke',
'drink',
'board',
'seat',
'feed',
'rise',
'sit on',
'swing',
'grow',
'strike',
'date',
'slide',
'share',
'graze',
'jump in',
'lie',
'extrude',
'roll',
'move',
'gather',
'eat',
'pull',
'run through',
'squeeze',
'lay on',
'draw',
'play with',
'wave',
'assemble',
'perform',
'march',
'score',
'attach',
'adjust',
'hang',
'hug',
'sleep on',
'throw',
'live in',
'talk',
'pet',
'work',
'run with',
'see',
'flip',
'catch',
'cook',
'receive',
'celebrate',
'look',
'classic',
'bridal',
'indoor',
'industrial',
'teenage',
'mini',
'grassy',
'aged',
'long',
'warm',
'light',
'handsome',
'happy',
'three',
'pregnant',
'circular',
'urban',
'silver',
'ceramic',
'3d',
'green',
'blonde',
'golden',
'dark',
'tropical',
'ripe',
'deep',
'fat',
'musical',
'giant',
'medical',
'medieval',
'bare',
'stunning',
'bold',
'geographical',
'huge',
'plastic',
'foggy',
'stormy',
'gothic',
'biological',
'empty',
'clear',
'antique',
'pink',
'steep',
'brown',
'striped',
'aerial',
'rainy',
'cool',
'flying',
'commercial',
'purple',
'trendy',
'blank',
'haired',
'dead',
'wooden',
'flat',
'high',
'beige',
'panoramic',
'angry',
'dozen',
'rural',
'solar',
'big',
'small',
'stained',
'thick',
'many',
'fresh',
'clean',
'strong',
'abstract',
'crowded',
'retro',
'dry',
'gorgeous',
'martial',
'modern',
'blue',
'cloudy',
'low',
'four',
'outdoor',
'single',
'much',
'beautiful',
'snowy',
'pretty',
'new',
'short',
'sunny',
'closed',
'rocky',
'red',
'two',
'double',
'male',
'gray',
'five',
'colorful',
'automotive',
'various',
'one',
'old',
'rusty',
'tall',
'wild',
'narrow',
'natural',
'several',
'frozen',
'textured',
'lush',
'young',
'hot',
'mixed',
'white',
'float',
'quiet',
'round',
'bright',
'religious',
'female',
'historical',
'shiny',
'traditional',
'tourist',
'yellow',
'bald',
'coastal',
'lovely',
'little',
'broken',
'romantic',
'wide',
'royal',
'rich',
'open',
'cute',
'ancient',
'cold',
'political',
'elderly',
'gold',
'full',
'rustic',
'metallic',
'floral',
'sad',
'wet',
'fancy',
'senior',
'tiny',
'stylish',
'large',
'frosty',
'orange',
'transparent',
'electronic',
'shallow',
'scared',
'armed',
'dirty',
'historic',
'black',
'few',
'windy',
'some',
'square',
'ornamental',
'sandy',
'thin']


# Rebind `tra_array` to a NumPy array built from its current contents.
tra_array = np.array(tra_array)


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/tag2Text/vit.py
================================================
'''
 * Copyright (c) 2022, salesforce.com, inc.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
 * By Junnan Li
 * Based on timm code base
 * https://github.com/rwightman/pytorch-image-models/tree/master/timm
'''

import torch
import torch.nn as nn
import torch.nn.functional as F
from functools import partial

from timm.models.vision_transformer import _cfg, PatchEmbed
from timm.models.registry import register_model
from timm.models.layers import trunc_normal_, DropPath
from timm.models.helpers import named_apply, adapt_input_conv

from fairscale.nn.checkpoint.checkpoint_activations import checkpoint_wrapper

class Mlp(nn.Module):
    """ Two-layer feed-forward network: Linear -> activation -> dropout -> Linear -> dropout.

    Args:
        in_features: width of the input features.
        hidden_features: width of the hidden layer; a falsy value (None / 0) falls back to ``in_features``.
        out_features: width of the output; a falsy value (None / 0) falls back to ``in_features``.
        act_layer: zero-argument callable producing the activation module.
        drop: dropout probability shared by both dropout applications.
    """
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        # Resolve optional widths before building the layers.
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        # A single Dropout module is reused after each linear stage.
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the MLP to ``x`` and return the projected result."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))


class Attention(nn.Module):
    """ Multi-head self-attention that can record its attention map and the map's gradients.

    Args:
        dim: feature width of the input tokens (also the output width).
        num_heads: number of attention heads; each head gets ``dim // num_heads`` channels.
        qkv_bias: whether the fused query/key/value projection has a bias.
        qk_scale: explicit scaling for the q.k logits; a falsy value uses ``head_dim ** -0.5``.
        attn_drop: dropout probability applied to the attention weights.
        proj_drop: dropout probability applied after the output projection.
    """
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim ** -0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        # Filled in by forward(..., register_hook=True) and the subsequent backward pass.
        self.attn_gradients = None
        self.attention_map = None

    def save_attn_gradients(self, attn_gradients):
        """Store the gradient of the attention map (used as a tensor hook)."""
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        """Return the last stored attention-map gradient, or None if none was stored."""
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        """Store the attention weights of the current forward pass."""
        self.attention_map = attention_map

    def get_attention_map(self):
        """Return the last stored attention map, or None if none was stored."""
        return self.attention_map

    def forward(self, x, register_hook=False):
        """Run self-attention over ``x``.

        Args:
            x: tensor unpacked as ``(B, N, C)``.
            register_hook: when true, keep the attention map on the module and,
                if the map takes part in autograd, also capture its gradient
                during backward.

        Returns:
            Tensor of shape ``(B, N, C)``.
        """
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]   # make torchscript happy (cannot use tensor as tuple)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        if register_hook:
            self.save_attention_map(attn)
            # Tensor.register_hook raises on tensors that do not require grad
            # (e.g. inside torch.no_grad()); in that case only the map is kept.
            if attn.requires_grad:
                attn.register_hook(self.save_attn_gradients)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Block(nn.Module):
    """ Transformer encoder block: norm -> attention -> residual, then norm -> MLP -> residual.

    Both residual branches pass through the same ``drop_path`` module, which is
    ``DropPath`` when ``drop_path > 0`` and an identity otherwise. When
    ``use_grad_checkpointing`` is set, the attention and MLP sub-modules are
    wrapped with ``checkpoint_wrapper``.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_grad_checkpointing=False):
        super().__init__()
        hidden_width = int(dim * mlp_ratio)

        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=hidden_width, act_layer=act_layer, drop=drop)

        if use_grad_checkpointing:
            self.attn = checkpoint_wrapper(self.attn)
            self.mlp = checkpoint_wrapper(self.mlp)

    def forward(self, x, register_hook=False):
        """Apply the block; ``register_hook`` is forwarded to the attention module."""
        attn_branch = self.attn(self.norm1(x), register_hook=register_hook)
        x = x + self.drop_path(attn_branch)
        mlp_branch = self.mlp(self.norm2(x))
        return x + self.drop_path(mlp_branch)

    
class VisionTransformer(nn.Module):
    """ Vision Transformer encoder
    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`  -
        https://arxiv.org/abs/2010.11929

    ``forward`` returns the normalized token sequence with the class token first;
    this class builds no classification head.
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=None, 
                 use_grad_checkpointing=False, ckpt_layer=0):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            num_classes (int): accepted for signature compatibility; not used in this class
            embed_dim (int): embedding dimension
            depth (int): number of transformer blocks
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            qk_scale (float): override default qk scale of head_dim ** -0.5 if set
            representation_size (Optional[int]): accepted for signature compatibility; not used in this class
            drop_rate (float): dropout rate
            attn_drop_rate (float): attention dropout rate
            drop_path_rate (float): stochastic depth rate of the last block (linearly ramped from 0)
            norm_layer: (nn.Module): normalization layer; defaults to LayerNorm with eps=1e-6
            use_grad_checkpointing (bool): enable checkpointing on the trailing blocks
            ckpt_layer (int): how many of the last blocks are checkpointed when enabled
        """
        super().__init__()
        self.num_features = embed_dim  # num_features for consistency with other models
        self.embed_dim = embed_dim
        if not norm_layer:
            norm_layer = partial(nn.LayerNorm, eps=1e-6)

        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        patch_count = self.patch_embed.num_patches

        # One extra position is reserved for the class token.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, patch_count + 1, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule: one rate per block, rising linearly
        drop_path_rates = torch.linspace(0, drop_path_rate, depth).tolist()
        first_ckpt_idx = depth - ckpt_layer
        encoder_blocks = []
        for idx, rate in enumerate(drop_path_rates):
            encoder_blocks.append(Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=rate, norm_layer=norm_layer,
                use_grad_checkpointing=(use_grad_checkpointing and idx >= first_ckpt_idx),
            ))
        self.blocks = nn.ModuleList(encoder_blocks)
        self.norm = norm_layer(embed_dim)

        trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear layers (truncated normal, zero bias) and LayerNorm (unit weight, zero bias)."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Names of parameters that should be excluded from weight decay."""
        return {'pos_embed', 'cls_token'}

    def forward(self, x, register_blk=-1):
        """Encode images into a token sequence.

        Args:
            x: image batch fed to the patch embedding.
            register_blk: index of the block that should record its attention
                map; the default -1 matches no block.
        """
        batch = x.shape[0]
        tokens = self.patch_embed(x)

        # stole cls_tokens impl from Phil Wang, thanks
        cls_tokens = self.cls_token.expand(batch, -1, -1)
        tokens = torch.cat((cls_tokens, tokens), dim=1)

        # Slice the positional table to the actual sequence length.
        tokens = tokens + self.pos_embed[:, :tokens.size(1), :]
        tokens = self.pos_drop(tokens)

        for idx, blk in enumerate(self.blocks):
            tokens = blk(tokens, register_blk == idx)
        return self.norm(tokens)

    @torch.jit.ignore()
    def load_pretrained(self, checkpoint_path, prefix=''):
        """Load weights from an .npz checkpoint via ``_load_weights``."""
        _load_weights(self, checkpoint_path, prefix)
        

@torch.no_grad()
def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = ''):
    """ Load weights from .npz checkpoints for official Google Brain Flax implementation

    Args:
        model: the VisionTransformer whose parameters are overwritten in place.
        checkpoint_path: path passed to ``np.load``.
        prefix: key prefix inside the checkpoint; when empty and the checkpoint
            contains ``opt/target/embedding/kernel``, ``opt/target/`` is used.
    """
    import numpy as np

    def _n2p(w, t=True):
        # Convert a numpy array to a torch tensor, optionally permuting axes
        # according to the array rank.
        if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
            w = w.flatten()
        if t:
            if w.ndim == 4:
                w = w.transpose([3, 2, 0, 1])
            elif w.ndim == 3:
                w = w.transpose([2, 0, 1])
            elif w.ndim == 2:
                w = w.transpose([1, 0])
        return torch.from_numpy(w)

    w = np.load(checkpoint_path)
    if not prefix and 'opt/target/embedding/kernel' in w:
        prefix = 'opt/target/'

    if hasattr(model.patch_embed, 'backbone'):
        # hybrid
        backbone = model.patch_embed.backbone
        stem_only = not hasattr(backbone, 'stem')
        stem = backbone if stem_only else backbone.stem
        stem.conv.weight.copy_(adapt_input_conv(stem.conv.weight.shape[1], _n2p(w[f'{prefix}conv_root/kernel'])))
        stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale']))
        stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias']))
        if not stem_only:
            for i, stage in enumerate(backbone.stages):
                for j, block in enumerate(stage.blocks):
                    bp = f'{prefix}block{i + 1}/unit{j + 1}/'
                    for r in range(3):
                        getattr(block, f'conv{r + 1}').weight.copy_(_n2p(w[f'{bp}conv{r + 1}/kernel']))
                        getattr(block, f'norm{r + 1}').weight.copy_(_n2p(w[f'{bp}gn{r + 1}/scale']))
                        getattr(block, f'norm{r + 1}').bias.copy_(_n2p(w[f'{bp}gn{r + 1}/bias']))
                    if block.downsample is not None:
                        block.downsample.conv.weight.copy_(_n2p(w[f'{bp}conv_proj/kernel']))
                        block.downsample.norm.weight.copy_(_n2p(w[f'{bp}gn_proj/scale']))
                        block.downsample.norm.bias.copy_(_n2p(w[f'{bp}gn_proj/bias']))
        embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
    else:
        embed_conv_w = adapt_input_conv(
            model.patch_embed.proj.weight.shape[1], _n2p(w[f'{prefix}embedding/kernel']))
    model.patch_embed.proj.weight.copy_(embed_conv_w)
    model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
    model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
    pos_embed_w = _n2p(w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False)
    if pos_embed_w.shape != model.pos_embed.shape:
        # resize_pos_embed is not imported at module level in this file, so it
        # is brought into scope here; without this the branch raises NameError.
        from timm.models.vision_transformer import resize_pos_embed
        pos_embed_w = resize_pos_embed(  # resize pos embedding when different size from pretrained weights
            pos_embed_w, model.pos_embed, getattr(model, 'num_tokens', 1), model.patch_embed.grid_size)
    model.pos_embed.copy_(pos_embed_w)
    model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
    model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
    # Head / pre-logits loading is intentionally disabled (kept for reference):
#     if isinstance(model.head, nn.Linear) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]:
#         model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
#         model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
#     if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
#         model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
#         model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
    for i, block in enumerate(model.blocks.children()):
        block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
        mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/'
        block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
        block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
        # query/key/value are stored separately in the checkpoint and are
        # concatenated to fill the fused qkv projection.
        block.attn.qkv.weight.copy_(torch.cat([
            _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')]))
        block.attn.qkv.bias.copy_(torch.cat([
            _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')]))
        block.attn.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
        block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
        for r in range(2):
            getattr(block.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel']))
            getattr(block.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias']))
        block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale']))
        block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias']))

            
def interpolate_pos_embed(pos_embed_checkpoint, visual_encoder):
    """Resize a checkpoint position embedding to match ``visual_encoder``.

    The number of non-patch ("extra") tokens is taken as the difference
    between the encoder's ``pos_embed`` length and its
    ``patch_embed.num_patches``. The remaining tokens are treated as a
    square grid; when the checkpoint grid side differs from the encoder's,
    the grid is bicubically resampled while the extra tokens are kept as-is.

    Args:
        pos_embed_checkpoint: position embedding tensor from the checkpoint,
            indexed as ``[:, token, channel]``.
        visual_encoder: object exposing ``patch_embed.num_patches`` and
            ``pos_embed``.

    Returns:
        The input tensor itself when the grid sides already agree, otherwise
        a new tensor with the resampled patch tokens appended to the extra
        tokens.
    """
    dim = pos_embed_checkpoint.shape[-1]
    patch_count = visual_encoder.patch_embed.num_patches
    extra_count = visual_encoder.pos_embed.shape[-2] - patch_count
    # Side length of the (assumed square) grid in the checkpoint ...
    src_side = int((pos_embed_checkpoint.shape[-2] - extra_count) ** 0.5)
    # ... and in the target encoder.
    dst_side = int(patch_count ** 0.5)

    # Nothing to do when both grids have the same side length.
    if src_side == dst_side:
        return pos_embed_checkpoint

    # Leading extra tokens are carried over untouched.
    prefix_tokens = pos_embed_checkpoint[:, :extra_count]
    # Only the patch tokens are resampled: tokens -> (N, C, H, W) image.
    grid = pos_embed_checkpoint[:, extra_count:]
    grid = grid.reshape(-1, src_side, src_side, dim).permute(0, 3, 1, 2)
    grid = torch.nn.functional.interpolate(
        grid, size=(dst_side, dst_side), mode='bicubic', align_corners=False)
    # Back to a flat token sequence: (N, H, W, C) -> (N, H*W, C).
    grid = grid.permute(0, 2, 3, 1).flatten(1, 2)
    resized = torch.cat((prefix_tokens, grid), dim=1)
    print('reshape position embedding from %d to %d'%(src_side ** 2,dst_side ** 2))

    return resized

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/__init__.py
================================================


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/__init__.py
================================================
from .build import build_dataset, build_pretraining_dataset

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/build.py
================================================
import os
from torchvision import transforms
from .transforms import *
from .masking_generator import TubeMaskingGenerator, RandomMaskingGenerator
from .mae import VideoMAE
from .kinetics import VideoClsDataset
from .kinetics_sparse import VideoClsDataset_sparse
from .ssv2 import SSVideoClsDataset, SSRawFrameClsDataset


class DataAugmentationForVideoMAE(object):
    """Training-time augmentation plus mask generation for pre-training.

    Builds a ``transforms.Compose`` pipeline (multi-scale crop, optional
    colour jitter, random horizontal flip, stacking, tensor conversion,
    ImageNet normalisation) and a masked-position generator selected by
    ``args.mask_type``.

    Args:
        args: namespace providing ``input_size``, ``color_jitter``, ``flip``,
            ``mask_type`` and, for the ``'tube'`` / ``'random'`` mask types,
            ``window_size`` and ``mask_ratio``.

    Raises:
        ValueError: if ``args.mask_type`` is not one of ``'tube'``,
            ``'random'`` or ``'attention'``.
    """

    def __init__(self, args):
        self.input_mean = [0.485, 0.456, 0.406]  # IMAGENET_DEFAULT_MEAN
        self.input_std = [0.229, 0.224, 0.225]  # IMAGENET_DEFAULT_STD
        normalize = GroupNormalize(self.input_mean, self.input_std)
        self.train_augmentation = GroupMultiScaleCrop(args.input_size, [1, .875, .75, .66])

        # The two pipelines only differ by the optional colour-jitter stage,
        # which sits right after the multi-scale crop.
        pipeline = [self.train_augmentation]
        if args.color_jitter > 0:
            pipeline.append(GroupColorJitter(args.color_jitter))
        pipeline.extend([
            GroupRandomHorizontalFlip(flip=args.flip),
            Stack(roll=False),
            ToTorchFormatTensor(div=True),
            normalize,
        ])
        self.transform = transforms.Compose(pipeline)

        if args.mask_type == 'tube':
            self.masked_position_generator = TubeMaskingGenerator(
                args.window_size, args.mask_ratio
            )
        elif args.mask_type == 'random':
            self.masked_position_generator = RandomMaskingGenerator(
                args.window_size, args.mask_ratio
            )
        elif args.mask_type == 'attention':
            # Exact comparison: the previous ``in 'attention'`` was a
            # substring test and silently accepted '', 'a', 'ten', ...
            # No generator here; __call__ returns -1 in place of a mask.
            self.masked_position_generator = None
        else:
            # Fail at construction time instead of leaving the attribute
            # undefined and crashing later inside __call__ / __repr__.
            raise ValueError(
                "Unsupported mask_type %r; expected 'tube', 'random' or 'attention'"
                % (args.mask_type,)
            )

    def __call__(self, images):
        """Transform ``images`` and return ``(data, mask)``.

        ``mask`` is ``-1`` when no masked-position generator is configured,
        otherwise the result of calling the generator.
        """
        # The composed pipeline yields a pair; only its first item is used.
        process_data, _ = self.transform(images)
        if self.masked_position_generator is None:
            return process_data, -1
        else:
            return process_data, self.masked_position_generator()

    def __repr__(self):
        # Local renamed from ``repr`` to avoid shadowing the builtin.
        text = "(DataAugmentationForVideoMAE,\n"
        text += "  transform = %s,\n" % str(self.transform)
        text += "  Masked position generator = %s,\n" % str(self.masked_position_generator)
        text += ")"
        return text


def build_pretraining_dataset(args):
    """Create the ``VideoMAE`` pre-training dataset described by ``args``.

    The augmentation/masking transform is built from ``args`` first and then
    handed to ``VideoMAE`` together with the data location and sampling
    settings read from ``args``. The transform's string form is printed once
    the dataset has been constructed.

    Returns:
        The constructed ``VideoMAE`` dataset.
    """
    augmentation = DataAugmentationForVideoMAE(args)
    mae_kwargs = dict(
        root=None,
        setting=args.data_path,
        prefix=args.prefix,
        split=args.split,
        video_ext='mp4',
        is_color=True,
        modality='rgb',
        num_segments=args.num_segments,
        new_length=args.num_frames,
        new_step=args.sampling_rate,
        transform=augmentation,
        temporal_jitter=False,
        video_loader=True,
        use_decord=args.use_decord,
        lazy_init=False,
        num_sample=args.num_sample,
    )
    pretrain_set = VideoMAE(**mae_kwargs)
    print("Data Aug = %s" % str(augmentation))
    return pretrain_set


def _resolve_split(is_train, test_mode, data_path):
    """Return ``(mode, anno_path)`` for the requested split.

    ``is_train`` wins over ``test_mode``; when neither is exactly ``True``
    the validation split is selected.
    """
    if is_train is True:
        return 'train', os.path.join(data_path, 'train.csv')
    if test_mode is True:
        return 'test', os.path.join(data_path, 'test.csv')
    return 'validation', os.path.join(data_path, 'val.csv')


def build_dataset(is_train, test_mode, args):
    """Build the fine-tuning / evaluation dataset named by ``args.data_set``.

    Supported names are ``'Kinetics'``, ``'Kinetics_sparse'``,
    ``'mitv1_sparse'``, ``'SSV2'``, ``'UCF101'`` and ``'HMDB51'``.

    Args:
        is_train: ``True`` selects the training split (``train.csv``).
        test_mode: ``True`` (when not training) selects the test split
            (``test.csv``); otherwise the validation split (``val.csv``) is
            used. Also switches ``num_crop`` from 1 to 3.
        args: namespace with the dataset name, paths and sampling settings.

    Returns:
        ``(dataset, nb_classes)``.

    Raises:
        NotImplementedError: for an unknown ``args.data_set``.
        AssertionError: if the dataset's class count differs from
            ``args.nb_classes``.
    """
    print(f'Use Dataset: {args.data_set}')
    data_set = args.data_set

    # Pick the dataset class and its fixed class count (None means "use
    # whatever args.nb_classes says").
    if data_set in ('Kinetics', 'Kinetics_sparse', 'mitv1_sparse'):
        dataset_cls = VideoClsDataset_sparse if 'sparse' in data_set else VideoClsDataset
        fixed_nb_classes = None
    elif data_set == 'SSV2':
        dataset_cls = SSVideoClsDataset if args.use_decord else SSRawFrameClsDataset
        fixed_nb_classes = 174
    elif data_set == 'UCF101':
        dataset_cls = VideoClsDataset
        fixed_nb_classes = 101
    elif data_set == 'HMDB51':
        dataset_cls = VideoClsDataset
        fixed_nb_classes = 51
    else:
        print(f'Wrong: {args.data_set}')
        raise NotImplementedError(f'Unsupported dataset: {args.data_set}')

    mode, anno_path = _resolve_split(is_train, test_mode, args.data_path)

    # SSV2 is built with one frame per segment and num_frames segments; all
    # other datasets use a single segment of num_frames strided frames.
    if data_set == 'SSV2':
        temporal_kwargs = dict(clip_len=1, num_segment=args.num_frames)
    else:
        temporal_kwargs = dict(
            clip_len=args.num_frames,
            frame_sample_rate=args.sampling_rate,
            num_segment=1,
        )

    dataset = dataset_cls(
        anno_path=anno_path,
        prefix=args.prefix,
        split=args.split,
        mode=mode,
        test_num_segment=args.test_num_segment,
        test_num_crop=args.test_num_crop,
        num_crop=1 if not test_mode else 3,
        keep_aspect_ratio=True,
        crop_size=args.input_size,
        short_side_size=args.short_side_size,
        new_height=256,
        new_width=320,
        args=args,
        **temporal_kwargs)

    nb_classes = args.nb_classes if fixed_nb_classes is None else fixed_nb_classes
    assert nb_classes == args.nb_classes, (
        f'{data_set} has {nb_classes} classes but args.nb_classes is {args.nb_classes}'
    )
    print("Number of the class = %d" % args.nb_classes)

    return dataset, nb_classes


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/kinetics.py
================================================
import os
import os
import io
import numpy as np
from numpy.lib.function_base import disp
import torch
from torchvision import transforms
import warnings
from decord import VideoReader, cpu
from torch.utils.data import Dataset
from .random_erasing import RandomErasing
from .video_transforms import (
    Compose, Resize, CenterCrop, Normalize,
    create_random_augment, random_short_side_scale_jitter, 
    random_crop, random_resized_crop_with_shift, random_resized_crop,
    horizontal_flip, random_short_side_scale_jitter, uniform_crop, 
)
from .volume_transforms import ClipToTensor

# petrel_client is an optional dependency: remember whether it imported so
# the dataset only creates a Client when one is available.
try:
    from petrel_client.client import Client
    has_client = True
except ImportError:
    has_client = False

class VideoClsDataset(Dataset):
    """Video classification dataset decoded with decord.

    The annotation file at ``anno_path`` is read with pandas (no header,
    ``split`` as delimiter): column 0 is the video path (joined onto
    ``prefix``), column 1 the label.

    What ``__getitem__`` returns depends on ``mode``:

    * ``'train'``: ``(frames, label, index, {})``; when
      ``args.num_sample > 1`` the first three items are lists holding one
      entry per augmented sample.
    * ``'validation'``: ``(frames, label, video_name)``.
    * ``'test'``: ``(frames, label, video_name, chunk_nb, split_nb)``. The
      sample list is expanded to ``test_num_segment * test_num_crop`` views
      per video.

    Videos that fail to load are replaced by a randomly drawn other sample
    (a warning is emitted).
    """

    def __init__(self, anno_path, prefix='', split=' ', mode='train', clip_len=8,
                 frame_sample_rate=2, crop_size=224, short_side_size=256,
                 new_height=256, new_width=340, keep_aspect_ratio=True,
                 num_segment=1, num_crop=1, test_num_segment=10, test_num_crop=3,
                 args=None):
        self.anno_path = anno_path
        self.prefix = prefix
        self.split = split
        self.mode = mode
        self.clip_len = clip_len
        self.frame_sample_rate = frame_sample_rate
        self.crop_size = crop_size
        self.short_side_size = short_side_size
        self.new_height = new_height
        self.new_width = new_width
        self.keep_aspect_ratio = keep_aspect_ratio
        self.num_segment = num_segment
        self.test_num_segment = test_num_segment
        self.num_crop = num_crop
        self.test_num_crop = test_num_crop
        self.args = args
        self.aug = False
        self.rand_erase = False
        assert num_segment == 1
        if self.mode in ['train']:
            self.aug = True
            # Random erasing is only enabled for training, and only when a
            # positive probability is configured.
            if self.args.reprob > 0:
                self.rand_erase = True
        if VideoReader is None:
            raise ImportError("Unable to import `decord` which is required to read videos.")

        import pandas as pd
        cleaned = pd.read_csv(self.anno_path, header=None, delimiter=self.split)
        self.dataset_samples = list(cleaned.values[:, 0])
        self.label_array = list(cleaned.values[:, 1])

        # A client is only created when petrel_client could be imported; it
        # is used for paths starting with 's3:'.
        self.client = None
        if has_client:
            self.client = Client('~/petreloss.conf')

        # Training builds its augmentation per call in _aug_frame, so there is
        # nothing to prepare for mode == 'train'.
        if mode == 'validation':
            self.data_transform = Compose([
                Resize(self.short_side_size, interpolation='bilinear'),
                CenterCrop(size=(self.crop_size, self.crop_size)),
                ClipToTensor(),
                Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
            ])
        elif mode == 'test':
            self.data_resize = Compose([
                Resize(size=(short_side_size), interpolation='bilinear')
            ])
            self.data_transform = Compose([
                ClipToTensor(),
                Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
            ])
            # One entry per (temporal chunk, spatial crop, video), in that
            # nesting order.
            self.test_seg = []
            self.test_dataset = []
            self.test_label_array = []
            for ck in range(self.test_num_segment):
                for cp in range(self.test_num_crop):
                    for idx in range(len(self.label_array)):
                        self.test_label_array.append(self.label_array[idx])
                        self.test_dataset.append(self.dataset_samples[idx])
                        self.test_seg.append((ck, cp))

    def __getitem__(self, index):
        """Load, transform and return the sample at ``index`` (see class doc)."""
        if self.mode == 'train':
            args = self.args
            scale_t = 1

            sample = self.dataset_samples[index]
            buffer = self.loadvideo_decord(sample, sample_rate_scale=scale_t)  # T H W C
            # Fall back to random other samples until one decodes.
            while len(buffer) == 0:
                warnings.warn("video {} not correctly loaded during training".format(sample))
                index = np.random.randint(self.__len__())
                sample = self.dataset_samples[index]
                buffer = self.loadvideo_decord(sample, sample_rate_scale=scale_t)

            if args.num_sample > 1:
                # Repeated augmentation: several independent augmentations of
                # the same decoded clip.
                frame_list = []
                label_list = []
                index_list = []
                for _ in range(args.num_sample):
                    new_frames = self._aug_frame(buffer, args)
                    label = self.label_array[index]
                    frame_list.append(new_frames)
                    label_list.append(label)
                    index_list.append(index)
                return frame_list, label_list, index_list, {}

            buffer = self._aug_frame(buffer, args)
            return buffer, self.label_array[index], index, {}

        elif self.mode == 'validation':
            sample = self.dataset_samples[index]
            buffer = self.loadvideo_decord(sample)
            while len(buffer) == 0:
                warnings.warn("video {} not correctly loaded during validation".format(sample))
                index = np.random.randint(self.__len__())
                sample = self.dataset_samples[index]
                buffer = self.loadvideo_decord(sample)
            buffer = self.data_transform(buffer)
            return buffer, self.label_array[index], sample.split("/")[-1].split(".")[0]

        elif self.mode == 'test':
            sample = self.test_dataset[index]
            chunk_nb, split_nb = self.test_seg[index]
            buffer = self.loadvideo_decord(sample, chunk_nb=chunk_nb)

            while len(buffer) == 0:
                warnings.warn("video {}, temporal {}, spatial {} not found during testing".format(\
                    str(self.test_dataset[index]), chunk_nb, split_nb))
                index = np.random.randint(self.__len__())
                sample = self.test_dataset[index]
                chunk_nb, split_nb = self.test_seg[index]
                buffer = self.loadvideo_decord(sample, chunk_nb=chunk_nb)

            buffer = self.data_resize(buffer)
            if isinstance(buffer, list):
                buffer = np.stack(buffer, 0)

            # Slide a short_side_size window along the longer spatial axis:
            # centred for a single crop, otherwise the split_nb-th of
            # test_num_crop evenly spaced positions.
            long_side = max(buffer.shape[1], buffer.shape[2])
            if self.test_num_crop == 1:
                spatial_step = 1.0 * (long_side - self.short_side_size) / 2
                spatial_start = int(spatial_step)
            else:
                spatial_step = 1.0 * (long_side - self.short_side_size) \
                                    / (self.test_num_crop - 1)
                spatial_start = int(split_nb * spatial_step)
            if buffer.shape[1] >= buffer.shape[2]:
                buffer = buffer[:, spatial_start:spatial_start + self.short_side_size, :, :]
            else:
                buffer = buffer[:, :, spatial_start:spatial_start + self.short_side_size, :]

            buffer = self.data_transform(buffer)
            return buffer, self.test_label_array[index], sample.split("/")[-1].split(".")[0], \
                   chunk_nb, split_nb
        else:
            raise NameError('mode {} unkown'.format(self.mode))

    def _aug_frame(
        self,
        buffer,
        args,
    ):
        """Apply the training augmentation to a decoded clip.

        ``buffer`` is iterated frame by frame; the result is the tensor
        produced by ``spatial_sampling`` (optionally random-erased).
        """
        aug_transform = create_random_augment(
            input_size=(self.crop_size, self.crop_size),
            auto_augment=args.aa,
            interpolation=args.train_interpolation,
        )

        buffer = [
            transforms.ToPILImage()(frame) for frame in buffer
        ]

        buffer = aug_transform(buffer)

        buffer = [transforms.ToTensor()(img) for img in buffer]
        buffer = torch.stack(buffer)  # T C H W
        buffer = buffer.permute(0, 2, 3, 1)  # T H W C

        # Normalise with the ImageNet statistics while channels are last.
        buffer = tensor_normalize(
            buffer, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
        )
        # T H W C -> C T H W.
        buffer = buffer.permute(3, 0, 1, 2)
        # Scale and aspect-ratio ranges for the random resized crop.
        scl, asp = (
            [0.08, 1.0],
            [0.75, 1.3333],
        )

        buffer = spatial_sampling(
            buffer,
            spatial_idx=-1,
            min_scale=256,
            max_scale=320,
            crop_size=self.crop_size,
            # No horizontal flip for SSV2.
            random_horizontal_flip=args.data_set != 'SSV2',
            inverse_uniform_sampling=False,
            aspect_ratio=asp,
            scale=scl,
            motion_shift=False
        )

        if self.rand_erase:
            erase_transform = RandomErasing(
                args.reprob,
                mode=args.remode,
                max_count=args.recount,
                num_splits=args.recount,
                device="cpu",
            )
            # Swap the first two axes for the erasing transform, then restore.
            buffer = buffer.permute(1, 0, 2, 3)
            buffer = erase_transform(buffer)
            buffer = buffer.permute(1, 0, 2, 3)

        return buffer

    def _open_reader(self, fname):
        """Open ``fname`` with decord and return the ``VideoReader``.

        Paths starting with ``'s3:'`` are fetched through ``self.client`` and
        decoded from memory. When ``keep_aspect_ratio`` is false the reader
        is asked to resize frames to ``new_width`` x ``new_height``.
        """
        reader_kwargs = dict(num_threads=1, ctx=cpu(0))
        if not self.keep_aspect_ratio:
            reader_kwargs.update(width=self.new_width, height=self.new_height)
        # One prefix for both branches (the original mixed 's3' and 's3:').
        if fname.startswith('s3:'):
            video_bytes = self.client.get(fname)
            return VideoReader(io.BytesIO(video_bytes), **reader_kwargs)
        return VideoReader(fname, **reader_kwargs)

    def _test_indices(self, num_frames, chunk_nb):
        """Frame indices for temporal chunk ``chunk_nb`` of a test video.

        Chunks are spread evenly so that the first starts at frame 0 and the
        last ends at the final frame; a too-short selection is padded by
        repeating its last index up to ``clip_len`` entries.
        """
        converted_len = int(self.clip_len * self.frame_sample_rate)
        # Clamp the divisor: with a single test segment the original divided
        # by zero, which made every load fail.
        gaps = max(self.test_num_segment - 1, 1)
        temporal_step = max(1.0 * (num_frames - converted_len) / gaps, 0)
        temporal_start = int(chunk_nb * temporal_step)

        bound = min(temporal_start + converted_len, num_frames)
        all_index = list(range(temporal_start, bound, self.frame_sample_rate))
        while len(all_index) < self.clip_len:
            all_index.append(all_index[-1])
        return all_index

    def _clip_indices(self, num_frames):
        """Frame indices for a train / validation clip.

        Segments not longer than the clip span are covered from their start
        and padded with the last frame. Otherwise a window of
        ``clip_len * frame_sample_rate`` frames is chosen: centred for
        validation, random otherwise.
        """
        converted_len = int(self.clip_len * self.frame_sample_rate)
        seg_len = num_frames // self.num_segment

        all_index = []
        for i in range(self.num_segment):
            if seg_len <= converted_len:
                index = np.linspace(0, seg_len, num=seg_len // self.frame_sample_rate)
                index = np.concatenate((index, np.ones(self.clip_len - seg_len // self.frame_sample_rate) * seg_len))
                index = np.clip(index, 0, seg_len - 1).astype(np.int64)
            else:
                if self.mode == 'validation':
                    # End of the centred window. The original used
                    # (seg_len - converted_len) // 2, i.e. the window start,
                    # which pushed str_idx below zero for short segments.
                    end_idx = (seg_len + converted_len) // 2
                else:
                    end_idx = np.random.randint(converted_len, seg_len)
                str_idx = end_idx - converted_len
                index = np.linspace(str_idx, end_idx, num=self.clip_len)
                index = np.clip(index, str_idx, end_idx - 1).astype(np.int64)
            index = index + i * seg_len
            all_index.extend(list(index))
        return all_index

    def loadvideo_decord(self, sample, sample_rate_scale=1, chunk_nb=0):
        """Load video content using Decord.

        Args:
            sample: video path relative to ``self.prefix``.
            sample_rate_scale: keep every n-th selected index (train and
                validation only).
            chunk_nb: temporal chunk to read (test mode only).

        Returns:
            The decoded frames as a numpy array, or ``[]`` if anything went
            wrong while opening or decoding (a message is printed).
        """
        fname = os.path.join(self.prefix, sample)

        try:
            vr = self._open_reader(fname)

            if self.mode == 'test':
                all_index = self._test_indices(len(vr), chunk_nb)
            else:
                all_index = self._clip_indices(len(vr))
                all_index = all_index[::int(sample_rate_scale)]

            vr.seek(0)
            buffer = vr.get_batch(all_index).asnumpy()
            return buffer
        except Exception:
            # Best effort: callers retry with another sample on []. Only
            # Exception is caught so Ctrl-C / SystemExit still propagate.
            print("video cannot be loaded by decord: ", fname)
            return []

    def __len__(self):
        """Number of items: expanded test views in test mode, else videos."""
        if self.mode != 'test':
            return len(self.dataset_samples)
        else:
            return len(self.test_dataset)


def spatial_sampling(
    frames,
    spatial_idx=-1,
    min_scale=256,
    max_scale=320,
    crop_size=224,
    random_horizontal_flip=True,
    inverse_uniform_sampling=False,
    aspect_ratio=None,
    scale=None,
    motion_shift=False,
):
    """
    Spatially sample a clip, either randomly or at a fixed crop position.

    With ``spatial_idx == -1`` the clip is randomly rescaled and cropped
    (short-side jitter + random crop when neither ``aspect_ratio`` nor
    ``scale`` is given, otherwise a random resized crop) and optionally
    flipped horizontally with probability 0.5. With ``spatial_idx`` in
    ``{0, 1, 2}`` the clip is rescaled and a uniform crop at that position
    is taken.
    Args:
        frames (tensor): the clip to sample from; it is passed unchanged to
            the transform helpers.
            NOTE(review): the expected axis order is whatever those helpers
            require - confirm against `video_transforms`.
        spatial_idx (int): -1 for random sampling, or 0, 1, 2 to pick one of
            the three uniform crop positions.
        min_scale (int): lower bound of the short-side scale.
        max_scale (int): upper bound of the short-side scale.
        crop_size (int): height and width of the output crop.
        random_horizontal_flip (bool): whether to apply the random flip in
            the random-sampling case.
        inverse_uniform_sampling (bool): forwarded to the short-side scale
            jitter in the random-sampling case.
        aspect_ratio (list): aspect-ratio range for the random resized crop.
        scale (list): scale range for the random resized crop.
        motion_shift (bool): use the "with shift" variant of the random
            resized crop.
    Returns:
        frames (tensor): the spatially sampled clip.
    """
    assert spatial_idx in [-1, 0, 1, 2]

    if spatial_idx != -1:
        # Deterministic path: all three sizes are expected to coincide, so
        # the "jitter" call cannot actually vary the scale.
        assert len({min_scale, max_scale, crop_size}) == 1
        frames, _ = random_short_side_scale_jitter(
            frames, min_scale, max_scale
        )
        frames, _ = uniform_crop(frames, crop_size, spatial_idx)
        return frames

    if aspect_ratio is None and scale is None:
        # Short-side jitter followed by a plain random crop.
        frames, _ = random_short_side_scale_jitter(
            images=frames,
            min_size=min_scale,
            max_size=max_scale,
            inverse_uniform_sampling=inverse_uniform_sampling,
        )
        frames, _ = random_crop(frames, crop_size)
    else:
        # Random resized crop, optionally the shifting variant.
        if motion_shift:
            resized_crop = random_resized_crop_with_shift
        else:
            resized_crop = random_resized_crop
        frames = resized_crop(
            images=frames,
            target_height=crop_size,
            target_width=crop_size,
            scale=scale,
            ratio=aspect_ratio,
        )

    if random_horizontal_flip:
        frames, _ = horizontal_flip(0.5, frames)
    return frames


def tensor_normalize(tensor, mean, std):
    """
    Normalize a given tensor by subtracting the mean and dividing the std.

    ``uint8`` input is first converted to float and scaled to ``[0, 1]``.
    ``mean`` / ``std`` broadcast against ``tensor`` under the usual rules.
    Args:
        tensor (tensor): tensor to normalize.
        mean (tensor, list or tuple): mean value to subtract.
        std (tensor, list or tuple): std to divide.
    Returns:
        tensor: the normalized tensor (the input is not modified in place).
    """
    if tensor.dtype == torch.uint8:
        tensor = tensor.float()
        tensor = tensor / 255.0
    # isinstance also accepts tuples and list subclasses (the original
    # ``type(x) == list`` rejected them); the stats are created on the
    # input's device so non-CPU tensors work too.
    if isinstance(mean, (list, tuple)):
        mean = torch.tensor(mean, device=tensor.device)
    if isinstance(std, (list, tuple)):
        std = torch.tensor(std, device=tensor.device)
    tensor = tensor - mean
    tensor = tensor / std
    return tensor


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/kinetics_sparse.py
================================================
import os
import os
import io
import random
import numpy as np
from numpy.lib.function_base import disp
import torch
from torchvision import transforms
import warnings
from decord import VideoReader, cpu
from torch.utils.data import Dataset
from .random_erasing import RandomErasing
from .video_transforms import (
    Compose, Resize, CenterCrop, Normalize,
    create_random_augment, random_short_side_scale_jitter, 
    random_crop, random_resized_crop_with_shift, random_resized_crop,
    horizontal_flip, random_short_side_scale_jitter, uniform_crop, 
)
from .volume_transforms import ClipToTensor

# petrel_client is an optional dependency: remember whether it imported so
# the dataset only creates a Client when one is available.
try:
    from petrel_client.client import Client
    has_client = True
except ImportError:
    has_client = False

class VideoClsDataset_sparse(Dataset):
    """Video classification dataset that decodes sparsely sampled clips with Decord.

    The annotation file is read with pandas (no header, `split` as the
    delimiter); column 0 holds the video path and column 1 the label.

    Args:
        anno_path (str): path of the annotation file.
        prefix (str): joined in front of every video path before loading.
        split (str): delimiter used in the annotation file.
        mode (str): one of 'train', 'validation' or 'test'.
        clip_len (int): number of frames sampled from each video.
        frame_sample_rate (int): stored on the instance; not read by the
            methods of this class.
        crop_size (int): side length of the training / validation crop.
        short_side_size (int): size passed to `Resize` for validation / test
            and the side length of the test-time crop.
        new_height (int): decode height used when `keep_aspect_ratio` is False.
        new_width (int): decode width used when `keep_aspect_ratio` is False.
        keep_aspect_ratio (bool): decode at native size when True.
        num_segment (int): must be 1.
        num_crop (int): stored on the instance; not read by the methods of
            this class.
        test_num_segment (int): temporal views generated per video in test mode.
        test_num_crop (int): spatial crops generated per video in test mode.
        args: option namespace. Train mode reads `reprob`, `num_sample`, `aa`,
            `train_interpolation`, `data_set`, `remode` and `recount` from it.
    """

    def __init__(self, anno_path, prefix='', split=' ', mode='train', clip_len=8,
                 frame_sample_rate=2, crop_size=224, short_side_size=256,
                 new_height=256, new_width=340, keep_aspect_ratio=True,
                 num_segment=1, num_crop=1, test_num_segment=10, test_num_crop=3,
                 args=None):
        self.anno_path = anno_path
        self.prefix = prefix
        self.split = split
        self.mode = mode
        self.clip_len = clip_len
        self.frame_sample_rate = frame_sample_rate
        self.crop_size = crop_size
        self.short_side_size = short_side_size
        self.new_height = new_height
        self.new_width = new_width
        self.keep_aspect_ratio = keep_aspect_ratio
        self.num_segment = num_segment
        self.test_num_segment = test_num_segment
        self.num_crop = num_crop
        self.test_num_crop = test_num_crop
        self.args = args
        self.aug = False
        self.rand_erase = False
        assert num_segment == 1
        if self.mode in ['train']:
            self.aug = True
            if self.args.reprob > 0:
                self.rand_erase = True
        if VideoReader is None:
            raise ImportError("Unable to import `decord` which is required to read videos.")

        import pandas as pd
        cleaned = pd.read_csv(self.anno_path, header=None, delimiter=self.split)
        self.dataset_samples = list(cleaned.values[:, 0])
        self.label_array = list(cleaned.values[:, 1])

        # The petrel client is only available when `petrel_client` imported.
        self.client = None
        if has_client:
            self.client = Client('~/petreloss.conf')

        # Train mode builds its augmentation per sample in `_aug_frame`, so
        # only validation and test need transforms prepared here.
        if mode == 'validation':
            self.data_transform = Compose([
                Resize(self.short_side_size, interpolation='bilinear'),
                CenterCrop(size=(self.crop_size, self.crop_size)),
                ClipToTensor(),
                Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
            ])
        elif mode == 'test':
            self.data_resize = Compose([
                Resize(size=(short_side_size), interpolation='bilinear')
            ])
            self.data_transform = Compose([
                ClipToTensor(),
                Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
            ])
            # Expand every video into one entry per (temporal view, crop).
            self.test_seg = []
            self.test_dataset = []
            self.test_label_array = []
            for ck in range(self.test_num_segment):
                for cp in range(self.test_num_crop):
                    for idx in range(len(self.label_array)):
                        self.test_label_array.append(self.label_array[idx])
                        self.test_dataset.append(self.dataset_samples[idx])
                        self.test_seg.append((ck, cp))

    def __getitem__(self, index):
        """Load one item.

        Returns:
            train: `(frames, label, index, {})`, or lists of frames / labels /
                indices plus `{}` when `args.num_sample > 1`.
            validation: `(frames, label, video_name)`.
            test: `(frames, label, video_name, chunk_nb, split_nb)`.
            `video_name` is the file name without directory and extension.

        Raises:
            NameError: if `self.mode` is not one of the three known modes.
        """
        if self.mode == 'train':
            args = self.args

            sample = self.dataset_samples[index]
            buffer = self.loadvideo_decord(sample, chunk_nb=-1)  # T H W C
            # An unreadable video is replaced by a randomly drawn one.
            while len(buffer) == 0:
                warnings.warn("video {} not correctly loaded during training".format(sample))
                index = np.random.randint(self.__len__())
                sample = self.dataset_samples[index]
                buffer = self.loadvideo_decord(sample, chunk_nb=-1)

            if args.num_sample > 1:
                # Several independent augmentations of the same decoded clip.
                frame_list = []
                label_list = []
                index_list = []
                for _ in range(args.num_sample):
                    new_frames = self._aug_frame(buffer, args)
                    label = self.label_array[index]
                    frame_list.append(new_frames)
                    label_list.append(label)
                    index_list.append(index)
                return frame_list, label_list, index_list, {}

            buffer = self._aug_frame(buffer, args)
            return buffer, self.label_array[index], index, {}

        elif self.mode == 'validation':
            sample = self.dataset_samples[index]
            buffer = self.loadvideo_decord(sample, chunk_nb=0)
            while len(buffer) == 0:
                warnings.warn("video {} not correctly loaded during validation".format(sample))
                index = np.random.randint(self.__len__())
                sample = self.dataset_samples[index]
                buffer = self.loadvideo_decord(sample, chunk_nb=0)
            buffer = self.data_transform(buffer)
            return buffer, self.label_array[index], sample.split("/")[-1].split(".")[0]

        elif self.mode == 'test':
            sample = self.test_dataset[index]
            chunk_nb, split_nb = self.test_seg[index]
            buffer = self.loadvideo_decord(sample, chunk_nb=chunk_nb)

            while len(buffer) == 0:
                warnings.warn("video {}, temporal {}, spatial {} not found during testing".format(\
                    str(self.test_dataset[index]), chunk_nb, split_nb))
                index = np.random.randint(self.__len__())
                sample = self.test_dataset[index]
                chunk_nb, split_nb = self.test_seg[index]
                buffer = self.loadvideo_decord(sample, chunk_nb=chunk_nb)

            buffer = self.data_resize(buffer)
            if isinstance(buffer, list):
                buffer = np.stack(buffer, 0)
            # Slide a `short_side_size` window along the longer spatial side;
            # `split_nb` selects which of the `test_num_crop` positions to use.
            long_side = max(buffer.shape[1], buffer.shape[2])
            if self.test_num_crop == 1:
                spatial_step = 1.0 * (long_side - self.short_side_size) / 2
                spatial_start = int(spatial_step)
            else:
                spatial_step = 1.0 * (long_side - self.short_side_size) \
                                    / (self.test_num_crop - 1)
                spatial_start = int(split_nb * spatial_step)
            if buffer.shape[1] >= buffer.shape[2]:
                buffer = buffer[:, spatial_start:spatial_start + self.short_side_size, :, :]
            else:
                buffer = buffer[:, :, spatial_start:spatial_start + self.short_side_size, :]

            buffer = self.data_transform(buffer)
            return buffer, self.test_label_array[index], sample.split("/")[-1].split(".")[0], \
                   chunk_nb, split_nb
        else:
            raise NameError('mode {} unkown'.format(self.mode))

    def _aug_frame(
        self,
        buffer,
        args,
    ):
        """Apply the training augmentation pipeline to one decoded clip.

        Args:
            buffer: decoded frames as returned by `loadvideo_decord`.
            args: option namespace (see the class docstring).
        Returns:
            The augmented clip produced by `spatial_sampling`, optionally
            followed by random erasing.
        """
        aug_transform = create_random_augment(
            input_size=(self.crop_size, self.crop_size),
            auto_augment=args.aa,
            interpolation=args.train_interpolation,
        )

        buffer = [
            transforms.ToPILImage()(frame) for frame in buffer
        ]

        buffer = aug_transform(buffer)

        buffer = [transforms.ToTensor()(img) for img in buffer]
        buffer = torch.stack(buffer)  # T C H W
        buffer = buffer.permute(0, 2, 3, 1)  # T H W C

        # T H W C
        buffer = tensor_normalize(
            buffer, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
        )
        # T H W C -> C T H W.
        buffer = buffer.permute(3, 0, 1, 2)
        # Perform data augmentation.
        scl, asp = (
            [0.08, 1.0],
            [0.75, 1.3333],
        )

        buffer = spatial_sampling(
            buffer,
            spatial_idx=-1,
            min_scale=256,
            max_scale=320,
            crop_size=self.crop_size,
            random_horizontal_flip=False if args.data_set == 'SSV2' else True,
            inverse_uniform_sampling=False,
            aspect_ratio=asp,
            scale=scl,
            motion_shift=False
        )

        if self.rand_erase:
            erase_transform = RandomErasing(
                args.reprob,
                mode=args.remode,
                max_count=args.recount,
                num_splits=args.recount,
                device="cpu",
            )
            # The first two axes are swapped for erasing and swapped back.
            buffer = buffer.permute(1, 0, 2, 3)
            buffer = erase_transform(buffer)
            buffer = buffer.permute(1, 0, 2, 3)

        return buffer

    def _get_seq_frames(self, video_size, num_frames, clip_idx=-1):
        """Pick `num_frames` frame indices, one per equally sized segment.

        Args:
            video_size (int): number of frames in the video.
            num_frames (int): number of indices to return.
            clip_idx (int): -1 draws a random frame inside every segment;
                any other value picks a fixed offset inside every segment
                that grows with `clip_idx`.
        Returns:
            list[int]: frame indices, each clamped to `video_size - 1`.
        """
        seg_size = max(0., float(video_size - 1) / num_frames)
        max_frame = int(video_size) - 1
        seq = []
        # index from 1, must add 1
        if clip_idx == -1:
            for i in range(num_frames):
                start = int(np.round(seg_size * i))
                end = int(np.round(seg_size * (i + 1)))
                idx = min(random.randint(start, end), max_frame)
                seq.append(idx)
        else:
            # Only test mode spreads several temporal views over a segment.
            num_segment = 1
            if self.mode == 'test':
                num_segment = self.test_num_segment
            duration = seg_size / (num_segment + 1)
            for i in range(num_frames):
                start = int(np.round(seg_size * i))
                frame_index = start + int(duration * (clip_idx + 1))
                idx = min(frame_index, max_frame)
                seq.append(idx)
        return seq

    def loadvideo_decord(self, sample, chunk_nb=0):
        """Load video content using Decord.

        Args:
            sample (str): video path relative to `self.prefix`.
            chunk_nb (int): forwarded to `_get_seq_frames` as `clip_idx`.
        Returns:
            The sampled frames as a numpy array, or an empty list when the
            video could not be opened or decoded.
        """
        fname = os.path.join(self.prefix, sample)

        try:
            if self.keep_aspect_ratio:
                # NOTE(review): this branch tests for 's3' while the resizing
                # branch below tests for 's3:' - confirm which one is intended.
                if fname.startswith('s3'):
                    video_bytes = self.client.get(fname)
                    vr = VideoReader(io.BytesIO(video_bytes),
                                     num_threads=1,
                                     ctx=cpu(0))
                else:
                    vr = VideoReader(fname, num_threads=1, ctx=cpu(0))
            else:
                if fname.startswith('s3:'):
                    video_bytes = self.client.get(fname)
                    vr = VideoReader(io.BytesIO(video_bytes),
                                     width=self.new_width,
                                     height=self.new_height,
                                     num_threads=1,
                                     ctx=cpu(0))
                else:
                    vr = VideoReader(fname, width=self.new_width, height=self.new_height,
                                    num_threads=1, ctx=cpu(0))

            all_index = self._get_seq_frames(len(vr), self.clip_len, clip_idx=chunk_nb)
            vr.seek(0)
            buffer = vr.get_batch(all_index).asnumpy()
            return buffer
        except Exception:
            # Best effort: callers retry with another sample on an empty
            # result. KeyboardInterrupt / SystemExit are deliberately not
            # swallowed here.
            print("video cannot be loaded by decord: ", fname)
            return []

    def __len__(self):
        """Number of items: expanded test entries in test mode, videos otherwise."""
        if self.mode != 'test':
            return len(self.dataset_samples)
        else:
            return len(self.test_dataset)


def spatial_sampling(
    frames,
    spatial_idx=-1,
    min_scale=256,
    max_scale=320,
    crop_size=224,
    random_horizontal_flip=True,
    inverse_uniform_sampling=False,
    aspect_ratio=None,
    scale=None,
    motion_shift=False,
):
    """
    Spatially sample video frames, randomly or at a fixed crop position.

    With `spatial_idx == -1` the frames are randomly rescaled and cropped
    and, optionally, randomly flipped. With `spatial_idx` in {0, 1, 2} the
    frames are resized and then cropped uniformly at the requested position.
    Args:
        frames (tensor): frames of images sampled from the video.
            NOTE(review): the original documentation gives the layout as
            `num frames` x `height` x `width` x `channel` - confirm against
            the transform helpers.
        spatial_idx (int): -1 for random sampling; 0, 1 or 2 to select the
            uniform crop position.
        min_scale (int): the minimal size of scaling.
        max_scale (int): the maximal size of scaling.
        crop_size (int): height and width of the crop.
        random_horizontal_flip (bool): flip with probability 0.5 in the
            random path.
        inverse_uniform_sampling (bool): forwarded to the short-side scale
            jitter when neither `aspect_ratio` nor `scale` is given.
        aspect_ratio (list): aspect ratio range for the random resized crop.
        scale (list): scale range for the random resized crop.
        motion_shift (bool): use the shifted variant of the random resized
            crop.
    Returns:
        frames (tensor): spatially sampled frames.
    """
    assert spatial_idx in [-1, 0, 1, 2]

    if spatial_idx != -1:
        # The testing is deterministic and no jitter should be performed.
        # min_scale, max_scale, and crop_size are expected to be the same.
        assert len({min_scale, max_scale, crop_size}) == 1
        resized, _ = random_short_side_scale_jitter(
            frames, min_scale, max_scale
        )
        cropped, _ = uniform_crop(resized, crop_size, spatial_idx)
        return cropped

    if aspect_ratio is None and scale is None:
        # Plain scale jitter followed by a random square crop.
        frames, _ = random_short_side_scale_jitter(
            images=frames,
            min_size=min_scale,
            max_size=max_scale,
            inverse_uniform_sampling=inverse_uniform_sampling,
        )
        frames, _ = random_crop(frames, crop_size)
    else:
        if motion_shift:
            resized_crop = random_resized_crop_with_shift
        else:
            resized_crop = random_resized_crop
        frames = resized_crop(
            images=frames,
            target_height=crop_size,
            target_width=crop_size,
            scale=scale,
            ratio=aspect_ratio,
        )

    if random_horizontal_flip:
        frames, _ = horizontal_flip(0.5, frames)
    return frames


def tensor_normalize(tensor, mean, std):
    """
    Normalize a given tensor by subtracting the mean and dividing the std.

    A `torch.uint8` input is first converted to float and divided by 255.
    Args:
        tensor (tensor): tensor to normalize.
        mean (tensor, list or tuple): mean value to subtract.
        std (tensor, list or tuple): std to divide.
    Returns:
        tensor: the normalized tensor.
    """
    if tensor.dtype == torch.uint8:
        tensor = tensor.float() / 255.0
    # Python sequences are turned into tensors on the input's device so the
    # arithmetic below works for tuples and for non-CPU inputs as well.
    if isinstance(mean, (list, tuple)):
        mean = torch.tensor(mean, device=tensor.device)
    if isinstance(std, (list, tuple)):
        std = torch.tensor(std, device=tensor.device)
    return (tensor - mean) / std


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/mae.py
================================================
import os
import cv2
import io
import numpy as np
import torch
import decord
from PIL import Image
from decord import VideoReader, cpu
import random

try:
    from petrel_client.client import Client
    has_client = True
except ImportError:
    has_client = False


class VideoMAE(torch.utils.data.Dataset):
    """Load your own video classification dataset.
    Parameters
    ----------
    root : str, required.
        Path to the root folder storing the dataset.
    setting : str, required.
        A text file describing the dataset, each line per video sample.
        With ``use_decord=True`` each line holds (1) video path and (2) video label.
        Otherwise each line holds (1) frame folder, (2) number of frames and (3) video label.
    prefix : str, required.
        The prefix for loading data.
    split : str, required.
        The split character for metadata.
    train : bool, default True.
        Whether to load the training or validation set.
    test_mode : bool, default False.
        Whether to perform evaluation on the test set.
        Usually there is three-crop or ten-crop evaluation strategy involved.
    name_pattern : str, default 'img_%05d.jpg'.
        The naming pattern of the decoded video frames.
        For example, img_00012.jpg. Both printf-style ('img_%05d.jpg') and
        ``str.format``-style ('img_{:05d}.jpg') patterns are accepted.
    video_ext : str, default 'mp4'.
        If video_loader is set to True, please specify the video format accordingly.
    is_color : bool, default True.
        Whether the loaded image is color or grayscale.
    modality : str, default 'rgb'.
        Input modalities, we support only rgb video frames for now.
        Will add support for rgb difference image and optical flow image later.
    num_segments : int, default 1.
        Number of segments to evenly divide the video into clips.
        A useful technique to obtain global video-level information.
        Limin Wang, etal, Temporal Segment Networks: Towards Good Practices for Deep Action Recognition, ECCV 2016.
    num_crop : int, default 1.
        Number of crops for each image. default is 1.
        Common choices are three crops and ten crops during evaluation.
    new_length : int, default 1.
        The length of input video clip. Default is a single image, but it can be multiple video frames.
        For example, new_length=16 means we will extract a video clip of consecutive 16 frames.
    new_step : int, default 1.
        Temporal sampling rate. For example, new_step=1 means we will extract a video clip of consecutive frames.
        new_step=2 means we will extract a video clip of every other frame.
    temporal_jitter : bool, default False.
        Whether to temporally jitter if new_step > 1.
    video_loader : bool, default False.
        Whether to use video loader to load data.
        Must be True whenever use_decord is True.
    use_decord : bool, default True.
        Whether to use Decord video loader to load data. Otherwise load image.
    transform : function, default None.
        A function that takes data and label and transforms them.
    lazy_init : bool, default False.
        If set to True, build a dataset instance without loading any dataset.
    num_sample : int, default 1.
        Number of independently transformed copies returned per item.
    """
    def __init__(self,
                 root,
                 setting,
                 prefix='',
                 split=' ',
                 train=True,
                 test_mode=False,
                 name_pattern='img_%05d.jpg',
                 video_ext='mp4',
                 is_color=True,
                 modality='rgb',
                 num_segments=1,
                 num_crop=1,
                 new_length=1,
                 new_step=1,
                 transform=None,
                 temporal_jitter=False,
                 video_loader=False,
                 use_decord=True,
                 lazy_init=False,
                 num_sample=1,
                 ):

        super(VideoMAE, self).__init__()
        self.root = root
        self.setting = setting
        self.prefix = prefix
        self.split = split
        self.train = train
        self.test_mode = test_mode
        self.is_color = is_color
        self.modality = modality
        self.num_segments = num_segments
        self.num_crop = num_crop
        self.new_length = new_length
        self.new_step = new_step
        self.skip_length = self.new_length * self.new_step
        self.temporal_jitter = temporal_jitter
        self.name_pattern = name_pattern
        self.video_loader = video_loader
        self.video_ext = video_ext
        self.use_decord = use_decord
        self.transform = transform
        self.lazy_init = lazy_init
        self.num_sample = num_sample

        # sparse sampling, num_segments != 1
        if self.num_segments != 1:
            print('Use sparse sampling, change frame and stride')
            self.new_length = self.num_segments
            self.skip_length = 1

        # The petrel client is only available when `petrel_client` imported.
        self.client = None
        if has_client:
            self.client = Client('~/petreloss.conf')

        if not self.lazy_init:
            self.clips = self._make_dataset(root, setting)
            if len(self.clips) == 0:
                raise(RuntimeError("Found 0 video clips in subfolders of: " + root + "\n"
                                   "Check your data directory (opt.data-dir)."))

    def __getitem__(self, index):
        """Load, sample and transform one clip.

        A sample that fails to load is reported and replaced by a randomly
        drawn one until loading succeeds.

        Returns:
            `(process_data, mask)`, or `(process_data_list, mask_list)` when
            `num_sample > 1`.

        Raises:
            RuntimeError: if `use_decord` is set without `video_loader`; in
                that configuration no video reader is ever created.
        """
        if self.use_decord and not self.video_loader:
            # Previously this surfaced as an UnboundLocalError raised from
            # inside the error handler below.
            raise RuntimeError(
                "VideoMAE with use_decord=True requires video_loader=True; "
                "otherwise no video reader is created.")

        while True:
            try:
                images = None
                if self.use_decord:
                    directory, _ = self.clips[index]
                    if '.' in directory.split('/')[-1]:
                        # data in the "setting" file already have extension, e.g., demo.mp4
                        video_name = directory
                    else:
                        # data in the "setting" file do not have extension, e.g., demo
                        # So we need to provide extension (i.e., .mp4) to complete the file name.
                        video_name = '{}.{}'.format(directory, self.video_ext)

                    video_name = os.path.join(self.prefix, video_name)
                    if video_name.startswith('s3'):
                        video_bytes = self.client.get(video_name)
                        decord_vr = VideoReader(io.BytesIO(video_bytes),
                                                num_threads=1,
                                                ctx=cpu(0))
                    else:
                        decord_vr = decord.VideoReader(video_name, num_threads=1, ctx=cpu(0))
                    duration = len(decord_vr)

                    segment_indices, skip_offsets = self._sample_train_indices(duration)
                    images = self._video_TSN_decord_batch_loader(directory, decord_vr, duration, segment_indices, skip_offsets)

                else:
                    video_name, total_frame, _ = self.clips[index]
                    video_name = os.path.join(self.prefix, video_name)

                    segment_indices, skip_offsets = self._sample_train_indices(total_frame)
                    frame_id_list = self._get_frame_id_list(total_frame, segment_indices, skip_offsets)
                    images = []
                    for idx in frame_id_list:
                        frame_fname = os.path.join(video_name, self._frame_file_name(idx))
                        img_bytes = self.client.get(frame_fname)
                        img_np = np.frombuffer(img_bytes, np.uint8)
                        img = cv2.imdecode(img_np, cv2.IMREAD_COLOR)
                        # In-place BGR -> RGB conversion.
                        cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
                        images.append(Image.fromarray(img))
                if images is not None:
                    break
            except Exception as e:
                print("Failed to load video from {} with error {}".format(
                    video_name, e))
            index = random.randint(0, len(self.clips) - 1)

        if self.num_sample > 1:
            process_data_list = []
            mask_list = []
            for _ in range(self.num_sample):
                process_data, mask = self._transform_clip(images)
                process_data_list.append(process_data)
                mask_list.append(mask)
            return process_data_list, mask_list
        return self._transform_clip(images)

    def __len__(self):
        """Number of entries read from the setting file."""
        return len(self.clips)

    def _transform_clip(self, images):
        """Run `self.transform` on the frames and reshape T*C,H,W -> C,T,H,W."""
        process_data, mask = self.transform((images, None))  # T*C,H,W
        # T*C,H,W -> T,C,H,W -> C,T,H,W
        process_data = process_data.view((self.new_length, 3) + process_data.size()[-2:]).transpose(0, 1)
        return process_data, mask

    def _frame_file_name(self, frame_id):
        """Render `name_pattern` for one frame id.

        The default pattern is printf-style, which `str.format` leaves
        untouched; `%` formatting is used whenever the pattern contains '%',
        and `str.format` otherwise.
        """
        if '%' in self.name_pattern:
            return self.name_pattern % frame_id
        return self.name_pattern.format(frame_id)

    def _make_dataset(self, directory, setting):
        """Parse the setting file into a list of clip tuples.

        Returns `(path, label)` tuples when `use_decord` is set and
        `(path, total_frame, label)` tuples otherwise.

        Raises:
            RuntimeError: if the file is missing or a line has too few fields.
        """
        if not os.path.exists(setting):
            raise(RuntimeError("Setting file %s doesn't exist. Check opt.train-list and opt.val-list. " % (setting)))
        clips = []
        # Frame-folder mode needs an extra "number of frames" column.
        required_fields = 2 if self.use_decord else 3

        print(f'Load dataset using decord: {self.use_decord}')
        with open(setting) as split_f:
            for line in split_f:
                line_info = line.split(self.split)
                if len(line_info) < required_fields:
                    raise(RuntimeError('Video input format is not correct, missing one or more element. %s' % line))
                if self.use_decord:
                    # line format: video_path, video_label
                    clip_path = os.path.join(line_info[0])
                    target = int(line_info[1])
                    item = (clip_path, target)
                else:
                    # line format: video_path, video_duration, video_label
                    clip_path = os.path.join(line_info[0])
                    total_frame = int(line_info[1])
                    target = int(line_info[2])
                    item = (clip_path, total_frame, target)
                clips.append(item)
        return clips

    def _sample_train_indices(self, num_frames):
        """Draw 1-based segment start offsets and per-frame jitter offsets.

        Returns:
            `(offsets + 1, skip_offsets)` where `offsets` has one entry per
            segment and `skip_offsets` has `skip_length // new_step` entries
            (all zero unless `temporal_jitter` is set).
        """
        average_duration = (num_frames - self.skip_length + 1) // self.num_segments
        if average_duration > 0:
            # One random start inside each equally sized segment.
            offsets = np.multiply(list(range(self.num_segments)),
                                  average_duration)
            offsets = offsets + np.random.randint(average_duration,
                                                  size=self.num_segments)
        elif num_frames > max(self.num_segments, self.skip_length):
            offsets = np.sort(np.random.randint(
                num_frames - self.skip_length + 1,
                size=self.num_segments))
        else:
            # Video too short: every segment starts at the first frame.
            offsets = np.zeros((self.num_segments,))

        if self.temporal_jitter:
            skip_offsets = np.random.randint(
                self.new_step, size=self.skip_length // self.new_step)
        else:
            skip_offsets = np.zeros(
                self.skip_length // self.new_step, dtype=int)
        return offsets + 1, skip_offsets

    def _get_frame_id_list(self, duration, indices, skip_offsets):
        """Expand 1-based segment offsets into 0-based frame ids.

        For every segment, `skip_length // new_step` ids are produced; the
        running offset advances by `new_step` only while it stays below
        `duration`, so ids repeat near the end of short videos.
        """
        frame_id_list = []
        for seg_ind in indices:
            offset = int(seg_ind)
            for i, _ in enumerate(range(0, self.skip_length, self.new_step)):
                if offset + skip_offsets[i] <= duration:
                    frame_id = offset + skip_offsets[i] - 1
                else:
                    frame_id = offset - 1
                frame_id_list.append(frame_id)
                if offset + self.new_step < duration:
                    offset += self.new_step
        return frame_id_list

    def _video_TSN_decord_batch_loader(self, directory, video_reader, duration, indices, skip_offsets):
        """Decode the sampled frames with Decord and return them as RGB PIL images.

        Raises:
            RuntimeError: if the frames cannot be read or converted.
        """
        frame_id_list = self._get_frame_id_list(duration, indices, skip_offsets)
        try:
            video_data = video_reader.get_batch(frame_id_list).asnumpy()
            sampled_list = [Image.fromarray(video_data[vid, :, :, :]).convert('RGB') for vid, _ in enumerate(frame_id_list)]
        except Exception as e:
            raise RuntimeError('Error occured in reading frames {} from video {} of duration {}.'.format(frame_id_list, directory, duration)) from e
        return sampled_list

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/masking_generator.py
================================================
import numpy as np


class TubeMaskingGenerator:
    """Generate a mask whose per-frame pattern is repeated over all frames.

    `input_size` is unpacked as (frames, height, width) and `mask_ratio` is
    the fraction of the `height * width` patches of one frame that is masked.
    """

    def __init__(self, input_size, mask_ratio):
        self.frames, self.height, self.width = input_size
        patches_per_frame = self.height * self.width
        masks_per_frame = int(mask_ratio * patches_per_frame)
        self.num_patches_per_frame = patches_per_frame
        self.total_patches = self.frames * patches_per_frame
        self.num_masks_per_frame = masks_per_frame
        self.total_masks = self.frames * masks_per_frame

    def __repr__(self):
        return "Maks: total patches {}, mask patches {}".format(
            self.total_patches, self.total_masks
        )

    def __call__(self):
        """Return a flat array of zeros and ones of length `total_patches`."""
        num_visible = self.num_patches_per_frame - self.num_masks_per_frame
        frame_mask = np.concatenate(
            [np.zeros(num_visible), np.ones(self.num_masks_per_frame)]
        )
        # Shuffle one frame's pattern, then repeat it for every frame.
        np.random.shuffle(frame_mask)
        return np.tile(frame_mask, (self.frames, 1)).ravel()


class RandomMaskingGenerator:
    """Generate a mask that is shuffled over all patches of the whole clip.

    `input_size` is either a (frames, height, width) tuple or a single value
    that is used for all three dimensions; `mask_ratio` is the fraction of
    all patches that is masked.
    """

    def __init__(self, input_size, mask_ratio):
        dims = input_size if isinstance(input_size, tuple) else (input_size, ) * 3
        self.frames, self.height, self.width = dims

        self.num_patches = self.frames * self.height * self.width  # 8x14x14
        self.num_mask = int(mask_ratio * self.num_patches)

    def __repr__(self):
        return "Maks: total patches {}, mask patches {}".format(
            self.num_patches, self.num_mask)

    def __call__(self):
        """Return a shuffled flat array with `num_mask` ones among `num_patches` entries."""
        num_visible = self.num_patches - self.num_mask
        mask = np.concatenate([np.zeros(num_visible), np.ones(self.num_mask)])
        np.random.shuffle(mask)
        return mask  # [196*8]


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/mixup.py
================================================
""" Mixup and Cutmix

Papers:
mixup: Beyond Empirical Risk Minimization (https://arxiv.org/abs/1710.09412)

CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features (https://arxiv.org/abs/1905.04899)

Code Reference:
CutMix: https://github.com/clovaai/CutMix-PyTorch

Hacked together by / Copyright 2019, Ross Wightman
"""
import numpy as np
import torch


def one_hot(x, num_classes, on_value=1., off_value=0., device='cuda'):
    """Encode class indices `x` as rows of `off_value` with `on_value` at the class column.

    The result has one row per element of `x` and `num_classes` columns and
    is created on `device`.
    """
    class_ids = x.long().view(-1, 1)
    encoded = torch.full((class_ids.size()[0], num_classes), off_value, device=device)
    return encoded.scatter_(1, class_ids, on_value)


def mixup_target(target, num_classes, lam=1., smoothing=0.0, device='cuda'):
    """Build the mixed soft target for a batch and its batch-reversed partner.

    Both label sets are one-hot encoded with label smoothing and blended as
    `lam * own + (1 - lam) * partner`, where the partner labels are `target`
    flipped along dimension 0.
    """
    background = smoothing / num_classes
    foreground = 1. - smoothing + background
    own, partner = [
        one_hot(labels, num_classes, on_value=foreground, off_value=background, device=device)
        for labels in (target, target.flip(0))
    ]
    return own * lam + partner * (1. - lam)


def rand_bbox(img_shape, lam, margin=0., count=None):
    """ Standard CutMix bounding-box
    Draws a random box centre and returns a box whose sides are
    `sqrt(1 - lam)` times the image sides, clipped to the image. A margin,
    given as a fraction of the box dimensions, restricts where the centre
    may fall.

    Args:
        img_shape (tuple): Image shape as tuple; the last two entries are height and width
        lam (float): Cutmix lambda value
        margin (float): Percentage of bbox dimension to enforce as margin (reduce amount of box outside image)
        count (int): Number of bbox to generate

    Returns:
        yl, yh, xl, xh: lower / upper box bounds along height and width
    """
    side_fraction = np.sqrt(1 - lam)
    img_h, img_w = img_shape[-2:]
    cut_h = int(img_h * side_fraction)
    cut_w = int(img_w * side_fraction)
    margin_y = int(margin * cut_h)
    margin_x = int(margin * cut_w)
    # Box centres; the y draw precedes the x draw.
    cy = np.random.randint(0 + margin_y, img_h - margin_y, size=count)
    cx = np.random.randint(0 + margin_x, img_w - margin_x, size=count)
    half_h = cut_h // 2
    half_w = cut_w // 2
    yl = np.clip(cy - half_h, 0, img_h)
    yh = np.clip(cy + half_h, 0, img_h)
    xl = np.clip(cx - half_w, 0, img_w)
    xh = np.clip(cx + half_w, 0, img_w)
    return yl, yh, xl, xh


def rand_bbox_minmax(img_shape, minmax, count=None):
    """ Min-Max CutMix bounding-box
    Inspired by Darknet cutmix impl, generates a random rectangular bbox
    whose height and width are drawn between the min and max fractions of
    the corresponding image dimension.

    Typical defaults for minmax are usually in the  .2-.3 for min and .8-.9 range for max.

    Args:
        img_shape (tuple): Image shape as tuple; the last two entries are height and width
        minmax (tuple or list): Min and max bbox ratios (as percent of image size)
        count (int): Number of bbox to generate

    Returns:
        yl, yu, xl, xu: lower / upper box bounds along height and width
    """
    assert len(minmax) == 2
    low_frac, high_frac = minmax[0], minmax[1]
    img_h, img_w = img_shape[-2:]
    # Box sizes first (height, then width), then the top-left corner.
    cut_h = np.random.randint(int(img_h * low_frac), int(img_h * high_frac), size=count)
    cut_w = np.random.randint(int(img_w * low_frac), int(img_w * high_frac), size=count)
    yl = np.random.randint(0, img_h - cut_h, size=count)
    xl = np.random.randint(0, img_w - cut_w, size=count)
    return yl, yl + cut_h, xl, xl + cut_w


def cutmix_bbox_and_lam(img_shape, lam, ratio_minmax=None, correct_lam=True, count=None):
    """ Generate a CutMix bbox and, where requested, the lambda matching its area.

    Args:
        img_shape (tuple): Image shape as tuple; the last two entries are (H, W)
        lam (float): Cutmix lambda value used to size the box when ratio_minmax is None
        ratio_minmax (tuple or list): If given, the box is sampled with rand_bbox_minmax
            and lambda is always recomputed from the box area
        correct_lam (bool): Recompute lambda from the (possibly clipped) box area
        count (int): Number of bbox to generate

    Returns:
        ``((yl, yu, xl, xu), lam)``
    """
    use_minmax = ratio_minmax is not None
    if use_minmax:
        box = rand_bbox_minmax(img_shape, ratio_minmax, count=count)
    else:
        box = rand_bbox(img_shape, lam, count=count)
    top, bottom, left, right = box
    if use_minmax or correct_lam:
        # lambda is the fraction of the image NOT covered by the pasted box
        box_area = (bottom - top) * (right - left)
        lam = 1. - box_area / float(img_shape[-2] * img_shape[-1])
    return (top, bottom, left, right), lam


class Mixup:
    """ Mixup/Cutmix that applies different params to each element or whole batch

    Args:
        mixup_alpha (float): mixup alpha value, mixup is active if > 0.
        cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0.
        cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None.
        prob (float): probability of applying mixup or cutmix per batch or element
        switch_prob (float): probability of switching to cutmix instead of mixup when both are active
        mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element)
        correct_lam (bool): apply lambda correction when cutmix bbox clipped by image borders
        label_smoothing (float): apply label smoothing to the mixed target tensor
        num_classes (int): number of classes for target
    """
    def __init__(self, mixup_alpha=1., cutmix_alpha=0., cutmix_minmax=None, prob=1.0, switch_prob=0.5,
                 mode='batch', correct_lam=True, label_smoothing=0.1, num_classes=1000):
        self.mixup_alpha = mixup_alpha
        self.cutmix_alpha = cutmix_alpha
        self.cutmix_minmax = cutmix_minmax
        if self.cutmix_minmax is not None:
            assert len(self.cutmix_minmax) == 2
            # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe
            self.cutmix_alpha = 1.0
        self.mix_prob = prob
        self.switch_prob = switch_prob
        self.label_smoothing = label_smoothing
        self.num_classes = num_classes
        self.mode = mode
        self.correct_lam = correct_lam  # correct lambda based on clipped area for cutmix
        self.mixup_enabled = True  # set to false to disable mixing (intended to be set by train loop)

    def _params_per_elem(self, batch_size):
        """Sample one lambda and one cutmix flag per element.

        Returns:
            (lam, use_cutmix): float32 array and bool array, both of length ``batch_size``.
            ``lam == 1`` means "leave this element unmixed".
        """
        lam = np.ones(batch_size, dtype=np.float32)
        # NOTE: builtin `bool` is used instead of the `np.bool` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        use_cutmix = np.zeros(batch_size, dtype=bool)
        if self.mixup_enabled:
            if self.mixup_alpha > 0. and self.cutmix_alpha > 0.:
                use_cutmix = np.random.rand(batch_size) < self.switch_prob
                lam_mix = np.where(
                    use_cutmix,
                    np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size),
                    np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size))
            elif self.mixup_alpha > 0.:
                lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size)
            elif self.cutmix_alpha > 0.:
                use_cutmix = np.ones(batch_size, dtype=bool)
                lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size)
            else:
                assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true."
            # elements that lose the mix_prob draw keep lam == 1 (no mixing)
            lam = np.where(np.random.rand(batch_size) < self.mix_prob, lam_mix.astype(np.float32), lam)
        return lam, use_cutmix

    def _params_per_batch(self):
        """Sample a single lambda / cutmix flag shared by the whole batch.

        Returns:
            (lam, use_cutmix): ``(1., False)`` when mixing is disabled or skipped by ``mix_prob``.
        """
        lam = 1.
        use_cutmix = False
        if self.mixup_enabled and np.random.rand() < self.mix_prob:
            if self.mixup_alpha > 0. and self.cutmix_alpha > 0.:
                use_cutmix = np.random.rand() < self.switch_prob
                lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) if use_cutmix else \
                    np.random.beta(self.mixup_alpha, self.mixup_alpha)
            elif self.mixup_alpha > 0.:
                lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha)
            elif self.cutmix_alpha > 0.:
                use_cutmix = True
                lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha)
            else:
                assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true."
            lam = float(lam_mix)
        return lam, use_cutmix

    def _mix_elem(self, x):
        """Mix every element ``i`` in place with element ``batch_size - i - 1``.

        Returns:
            Tensor of per-element lambdas with shape ``(batch_size, 1)``.
        """
        batch_size = len(x)
        lam_batch, use_cutmix = self._params_per_elem(batch_size)
        x_orig = x.clone()  # need to keep an unmodified original for mixing source
        for i in range(batch_size):
            j = batch_size - i - 1
            lam = lam_batch[i]
            if lam != 1.:
                if use_cutmix[i]:
                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                        x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
                    x[i][..., yl:yh, xl:xh] = x_orig[j][..., yl:yh, xl:xh]
                    lam_batch[i] = lam  # store the area-corrected lambda
                else:
                    x[i] = x[i] * lam + x_orig[j] * (1 - lam)
        return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1)

    def _mix_pair(self, x):
        """Mix elements in mirrored pairs ``(i, batch_size - i - 1)`` sharing one lambda.

        Returns:
            Tensor of per-element lambdas with shape ``(batch_size, 1)``.
        """
        batch_size = len(x)
        lam_batch, use_cutmix = self._params_per_elem(batch_size // 2)
        x_orig = x.clone()  # need to keep an unmodified original for mixing source
        for i in range(batch_size // 2):
            j = batch_size - i - 1
            lam = lam_batch[i]
            if lam != 1.:
                if use_cutmix[i]:
                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                        x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
                    # `...` (as in _mix_elem) always targets the last two axes; for
                    # 3-D (C, H, W) elements it is identical to the previous `:` index.
                    x[i][..., yl:yh, xl:xh] = x_orig[j][..., yl:yh, xl:xh]
                    x[j][..., yl:yh, xl:xh] = x_orig[i][..., yl:yh, xl:xh]
                    lam_batch[i] = lam
                else:
                    x[i] = x[i] * lam + x_orig[j] * (1 - lam)
                    x[j] = x[j] * lam + x_orig[i] * (1 - lam)
        # second half of the batch mirrors the first, so its lambdas are the reversed list
        lam_batch = np.concatenate((lam_batch, lam_batch[::-1]))
        return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1)

    def _mix_batch(self, x):
        """Mix the whole batch in place with its flipped self using one lambda.

        Returns:
            The lambda that was applied (``1.`` when nothing was mixed).
        """
        lam, use_cutmix = self._params_per_batch()
        if lam == 1.:
            return 1.
        if use_cutmix:
            (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                x.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
            x[..., yl:yh, xl:xh] = x.flip(0)[..., yl:yh, xl:xh]
        else:
            # flip() returns a new tensor, so scaling it in place leaves x untouched
            x_flipped = x.flip(0).mul_(1. - lam)
            x.mul_(lam).add_(x_flipped)
        return lam

    def __call__(self, x, target):
        """Mix ``x`` in place according to ``self.mode`` and build the mixed target.

        Returns:
            ``(x, target)`` where target comes from ``mixup_target``.
        """
        assert len(x) % 2 == 0, 'Batch size should be even when using this'
        if self.mode == 'elem':
            lam = self._mix_elem(x)
        elif self.mode == 'pair':
            lam = self._mix_pair(x)
        else:
            lam = self._mix_batch(x)
        target = mixup_target(target, self.num_classes, lam, self.label_smoothing, x.device)
        return x, target


class FastCollateMixup(Mixup):
    """ Fast Collate w/ Mixup/Cutmix that applies different params to each element or whole batch

    A Mixup impl that's performed while collating the batches.

    Each item of `batch` is indexed as `batch[i][0]` (image) and `batch[i][1]` (label).
    Images are handled with numpy-style methods (`.copy()`, `.astype`) and written
    into a zero-initialised uint8 `output` tensor allocated in `__call__`.
    NOTE(review): images are presumably uint8 numpy arrays of shape (C, H, W) -- confirm against the dataset.
    """

    def _mix_elem_collate(self, output, batch, half=False):
        """Mix element `i` with element `batch_size - i - 1` and write it to `output[i]`.

        With `half=True` only the first half of the batch is produced; the returned
        lambda vector is then padded with ones for the dropped second half.

        Returns:
            Tensor of lambdas with one column (`unsqueeze(1)`).
        """
        batch_size = len(batch)
        num_elem = batch_size // 2 if half else batch_size
        assert len(output) == num_elem
        lam_batch, use_cutmix = self._params_per_elem(num_elem)
        for i in range(num_elem):
            j = batch_size - i - 1
            lam = lam_batch[i]
            mixed = batch[i][0]
            if lam != 1.:
                if use_cutmix[i]:
                    if not half:
                        # in full mode element i is read again later as a mixing source,
                        # so it must not be modified in place; in half mode no copy is made
                        mixed = mixed.copy()
                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                        output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
                    mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh]
                    lam_batch[i] = lam  # keep the area-corrected lambda
                else:
                    mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam)
                    np.rint(mixed, out=mixed)  # round in place before the uint8 cast below
            # output starts as zeros (see __call__), so += stores the mixed image
            output[i] += torch.from_numpy(mixed.astype(np.uint8))
        if half:
            lam_batch = np.concatenate((lam_batch, np.ones(num_elem)))
        return torch.tensor(lam_batch).unsqueeze(1)

    def _mix_pair_collate(self, output, batch):
        """Mix mirrored pairs `(i, batch_size - i - 1)` with a shared lambda.

        Returns:
            Tensor of lambdas with one column; the second half is the reversed first half.
        """
        batch_size = len(batch)
        lam_batch, use_cutmix = self._params_per_elem(batch_size // 2)
        for i in range(batch_size // 2):
            j = batch_size - i - 1
            lam = lam_batch[i]
            mixed_i = batch[i][0]
            mixed_j = batch[j][0]
            assert 0 <= lam <= 1.0
            if lam < 1.:
                if use_cutmix[i]:
                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                        output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
                    # swap the patch between the two images; the copy preserves
                    # image i's patch before it is overwritten in place
                    patch_i = mixed_i[:, yl:yh, xl:xh].copy()
                    mixed_i[:, yl:yh, xl:xh] = mixed_j[:, yl:yh, xl:xh]
                    mixed_j[:, yl:yh, xl:xh] = patch_i
                    lam_batch[i] = lam
                else:
                    # mixed_temp holds the new i so the j blend still sees the original i
                    mixed_temp = mixed_i.astype(np.float32) * lam + mixed_j.astype(np.float32) * (1 - lam)
                    mixed_j = mixed_j.astype(np.float32) * lam + mixed_i.astype(np.float32) * (1 - lam)
                    mixed_i = mixed_temp
                    np.rint(mixed_j, out=mixed_j)
                    np.rint(mixed_i, out=mixed_i)
            output[i] += torch.from_numpy(mixed_i.astype(np.uint8))
            output[j] += torch.from_numpy(mixed_j.astype(np.uint8))
        lam_batch = np.concatenate((lam_batch, lam_batch[::-1]))
        return torch.tensor(lam_batch).unsqueeze(1)

    def _mix_batch_collate(self, output, batch):
        """Mix every element with its mirror using one lambda (and one bbox) for the batch.

        Returns:
            The scalar lambda that was applied.
        """
        batch_size = len(batch)
        lam, use_cutmix = self._params_per_batch()
        if use_cutmix:
            # a single bbox is sampled once and reused for every element
            (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
        for i in range(batch_size):
            j = batch_size - i - 1
            mixed = batch[i][0]
            if lam != 1.:
                if use_cutmix:
                    mixed = mixed.copy()  # don't want to modify the original while iterating
                    mixed[..., yl:yh, xl:xh] = batch[j][0][..., yl:yh, xl:xh]
                else:
                    mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam)
                    np.rint(mixed, out=mixed)
            output[i] += torch.from_numpy(mixed.astype(np.uint8))
        return lam

    def __call__(self, batch, _=None):
        """Collate `batch` into a mixed uint8 image tensor and a mixed target tensor.

        The mode is dispatched on `self.mode`: 'elem' or 'half' use per-element mixing,
        'pair' uses paired mixing, anything else uses per-batch mixing. When the mode
        string contains 'half', only the first half of the batch is returned.

        Returns:
            `(output, target)`; target is built by `mixup_target` on CPU and truncated
            to the number of returned images.
        """
        batch_size = len(batch)
        assert batch_size % 2 == 0, 'Batch size should be even when using this'
        half = 'half' in self.mode
        if half:
            batch_size //= 2
        output = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8)
        if self.mode == 'elem' or self.mode == 'half':
            lam = self._mix_elem_collate(output, batch, half=half)
        elif self.mode == 'pair':
            lam = self._mix_pair_collate(output, batch)
        else:
            lam = self._mix_batch_collate(output, batch)
        target = torch.tensor([b[1] for b in batch], dtype=torch.int64)
        target = mixup_target(target, self.num_classes, lam, self.label_smoothing, device='cpu')
        target = target[:batch_size]
        return output, target


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/rand_augment.py
================================================
"""
This implementation is based on
https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py
published under an Apache License 2.0.

COMMENT FROM ORIGINAL:
AutoAugment, RandAugment, and AugMix for PyTorch
This code implements the searched ImageNet policies with various tweaks and
improvements and does not include any of the search code. AA and RA
Implementation adapted from:
    https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py
AugMix adapted from:
    https://github.com/google-research/augmix
Papers:
    AutoAugment: Learning Augmentation Policies from Data
    https://arxiv.org/abs/1805.09501
    Learning Data Augmentation Strategies for Object Detection
    https://arxiv.org/abs/1906.11172
    RandAugment: Practical automated data augmentation...
    https://arxiv.org/abs/1909.13719
    AugMix: A Simple Data Processing Method to Improve Robustness and
    Uncertainty https://arxiv.org/abs/1912.02781

Hacked together by / Copyright 2020 Ross Wightman
"""

import math
import numpy as np
import random
import re
import PIL
from PIL import Image, ImageEnhance, ImageOps

# (major, minor) of the installed Pillow, used below to select version-specific code paths.
_PIL_VER = tuple([int(x) for x in PIL.__version__.split(".")[:2]])

# Default fill colour (mid grey) used as "fillcolor" when hparams carries no "img_mean".
_FILL = (128, 128, 128)

# This signifies the max integer that the controller RNN could predict for the
# augmentation scheme.
_MAX_LEVEL = 10.0

# Hyper-parameters used when the caller passes none.
_HPARAMS_DEFAULT = {
    "translate_const": 250,
    "img_mean": _FILL,
}

# Candidate resampling filters; one is picked at random per call when a
# list/tuple is supplied as "resample" (see _interpolation).
_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC)


def _interpolation(kwargs):
    """Pop "resample" from ``kwargs`` and resolve it to a single filter.

    A list/tuple value is treated as a set of candidates and one is chosen at
    random; any other value is returned unchanged. Defaults to Image.BILINEAR.
    """
    method = kwargs.pop("resample", Image.BILINEAR)
    if not isinstance(method, (list, tuple)):
        return method
    return random.choice(method)


def _check_args_tf(kwargs):
    """Normalise transform kwargs in place for the installed Pillow version.

    Drops "fillcolor" on Pillow < 5.0 and replaces "resample" with a single
    concrete filter chosen by _interpolation.
    """
    fill_unsupported = _PIL_VER < (5, 0)
    if fill_unsupported and "fillcolor" in kwargs:
        kwargs.pop("fillcolor")
    resolved = _interpolation(kwargs)
    kwargs["resample"] = resolved


def shear_x(img, factor, **kwargs):
    """Apply an affine transform to ``img`` with ``factor`` as the x-row shear coefficient."""
    _check_args_tf(kwargs)
    coeffs = (1, factor, 0, 0, 1, 0)
    return img.transform(img.size, Image.AFFINE, coeffs, **kwargs)


def shear_y(img, factor, **kwargs):
    """Apply an affine transform to ``img`` with ``factor`` as the y-row shear coefficient."""
    _check_args_tf(kwargs)
    coeffs = (1, 0, 0, factor, 1, 0)
    return img.transform(img.size, Image.AFFINE, coeffs, **kwargs)


def translate_x_rel(img, pct, **kwargs):
    """Translate ``img`` horizontally by ``pct`` times its width (``img.size[0]``)."""
    shift = pct * img.size[0]
    _check_args_tf(kwargs)
    coeffs = (1, 0, shift, 0, 1, 0)
    return img.transform(img.size, Image.AFFINE, coeffs, **kwargs)


def translate_y_rel(img, pct, **kwargs):
    """Translate ``img`` vertically by ``pct`` times its height (``img.size[1]``)."""
    shift = pct * img.size[1]
    _check_args_tf(kwargs)
    coeffs = (1, 0, 0, 0, 1, shift)
    return img.transform(img.size, Image.AFFINE, coeffs, **kwargs)


def translate_x_abs(img, pixels, **kwargs):
    """Translate ``img`` horizontally using ``pixels`` as the affine x offset."""
    _check_args_tf(kwargs)
    coeffs = (1, 0, pixels, 0, 1, 0)
    return img.transform(img.size, Image.AFFINE, coeffs, **kwargs)


def translate_y_abs(img, pixels, **kwargs):
    """Translate ``img`` vertically using ``pixels`` as the affine y offset."""
    _check_args_tf(kwargs)
    coeffs = (1, 0, 0, 0, 1, pixels)
    return img.transform(img.size, Image.AFFINE, coeffs, **kwargs)


def rotate(img, degrees, **kwargs):
    """Rotate ``img`` by ``degrees``, choosing the code path by Pillow version.

    * Pillow >= 5.2: ``img.rotate`` receives all kwargs.
    * Pillow 5.0 / 5.1: the rotation about the image centre is built by hand as
      an affine matrix and applied with ``img.transform``.
    * older Pillow: ``img.rotate`` receives only the resample filter.
    """
    _check_args_tf(kwargs)
    if _PIL_VER < (5, 0):
        return img.rotate(degrees, resample=kwargs["resample"])
    if _PIL_VER >= (5, 2):
        return img.rotate(degrees, **kwargs)

    # Pillow 5.0 / 5.1: rotate about the centre via an explicit affine matrix.
    width, height = img.size
    center_x, center_y = width / 2.0, height / 2.0
    theta = -math.radians(degrees)
    cos_t = round(math.cos(theta), 15)
    sin_t = round(math.sin(theta), 15)
    neg_sin_t = round(-math.sin(theta), 15)

    # Map the negated centre through the pure rotation, then shift back by the centre.
    offset_x = cos_t * -center_x + sin_t * -center_y + 0.0
    offset_y = neg_sin_t * -center_x + cos_t * -center_y + 0.0
    affine = [
        cos_t,
        sin_t,
        offset_x + center_x,
        neg_sin_t,
        cos_t,
        offset_y + center_y,
    ]
    return img.transform(img.size, Image.AFFINE, affine, **kwargs)


def auto_contrast(img, **__):
    """Return ``ImageOps.autocontrast(img)``; extra keyword arguments are ignored."""
    result = ImageOps.autocontrast(img)
    return result


def invert(img, **__):
    """Return ``ImageOps.invert(img)``; extra keyword arguments are ignored."""
    result = ImageOps.invert(img)
    return result


def equalize(img, **__):
    """Return ``ImageOps.equalize(img)``; extra keyword arguments are ignored."""
    result = ImageOps.equalize(img)
    return result


def solarize(img, thresh, **__):
    """Return ``ImageOps.solarize(img, thresh)``; extra keyword arguments are ignored."""
    result = ImageOps.solarize(img, thresh)
    return result


def solarize_add(img, add, thresh=128, **__):
    """Add ``add`` (capped at 255) to every pixel value below ``thresh``.

    Only images in mode "L" or "RGB" are modified (through ``img.point``);
    any other mode is returned unchanged.
    """
    if img.mode not in ("L", "RGB"):
        return img
    table = [min(255, value + add) if value < thresh else value for value in range(256)]
    if img.mode == "RGB" and len(table) == 256:
        # one copy of the 256-entry table per colour band
        table = table + table + table
    return img.point(table)


def posterize(img, bits_to_keep, **__):
    """Posterize ``img`` with ``ImageOps.posterize``; 8 or more bits is a no-op."""
    if bits_to_keep < 8:
        return ImageOps.posterize(img, bits_to_keep)
    return img


def contrast(img, factor, **__):
    """Return ``img`` enhanced by ``ImageEnhance.Contrast`` with the given ``factor``."""
    enhancer = ImageEnhance.Contrast(img)
    return enhancer.enhance(factor)


def color(img, factor, **__):
    """Return ``img`` enhanced by ``ImageEnhance.Color`` with the given ``factor``."""
    enhancer = ImageEnhance.Color(img)
    return enhancer.enhance(factor)


def brightness(img, factor, **__):
    """Return ``img`` enhanced by ``ImageEnhance.Brightness`` with the given ``factor``."""
    enhancer = ImageEnhance.Brightness(img)
    return enhancer.enhance(factor)


def sharpness(img, factor, **__):
    """Return ``img`` enhanced by ``ImageEnhance.Sharpness`` with the given ``factor``."""
    enhancer = ImageEnhance.Sharpness(img)
    return enhancer.enhance(factor)


def _randomly_negate(v):
    """Return ``-v`` when a uniform draw exceeds 0.5, otherwise ``v``."""
    if random.random() > 0.5:
        return -v
    return v


def _rotate_level_to_arg(level, _hparams):
    """Map ``level`` to a rotation angle in [-30, 30] with a random sign."""
    degrees = (level / _MAX_LEVEL) * 30.0
    return (_randomly_negate(degrees),)


def _enhance_level_to_arg(level, _hparams):
    """Map ``level`` linearly to an enhancement factor in [0.1, 1.9]."""
    fraction = level / _MAX_LEVEL
    factor = fraction * 1.8 + 0.1
    return (factor,)


def _enhance_increasing_level_to_arg(level, _hparams):
    """Map ``level`` to an enhancement factor in [0.1, 1.9] centred on 1.0.

    1.0 is the 'no change' factor; higher levels move further away from it,
    in a randomly chosen direction.
    """
    offset = (level / _MAX_LEVEL) * 0.9
    factor = 1.0 + _randomly_negate(offset)
    return (factor,)


def _shear_level_to_arg(level, _hparams):
    """Map ``level`` to a shear factor in [-0.3, 0.3] with a random sign."""
    shear = (level / _MAX_LEVEL) * 0.3
    return (_randomly_negate(shear),)


def _translate_abs_level_to_arg(level, hparams):
    """Map ``level`` to a signed offset of at most ``hparams["translate_const"]``."""
    max_shift = float(hparams["translate_const"])
    shift = (level / _MAX_LEVEL) * max_shift
    return (_randomly_negate(shift),)


def _translate_rel_level_to_arg(level, hparams):
    """Map ``level`` to a signed relative shift.

    The maximum magnitude is ``hparams["translate_pct"]`` (0.45 when absent),
    giving a default range of [-0.45, 0.45].
    """
    max_pct = hparams.get("translate_pct", 0.45)
    pct = (level / _MAX_LEVEL) * max_pct
    return (_randomly_negate(pct),)


def _posterize_level_to_arg(level, _hparams):
    """Map ``level`` to a bit count in [0, 4] (keep 0 up to 4 MSB of the image).

    As per the Tensorflow TPU EfficientNet impl; augmentation severity
    decreases as the level grows.
    """
    bits = int((level / _MAX_LEVEL) * 4)
    return (bits,)


def _posterize_increasing_level_to_arg(level, hparams):
    """Map ``level`` to a bit count in [4, 0] (keep 4 down to 0 MSB of the image).

    As per the Tensorflow models research and UDA impl; augmentation severity
    increases with the level.
    """
    (kept_bits,) = _posterize_level_to_arg(level, hparams)
    return (4 - kept_bits,)


def _posterize_original_level_to_arg(level, _hparams):
    """Map ``level`` to a bit count in [4, 8] (keep 4 up to 8 MSB of the image).

    As per the original AutoAugment paper description; augmentation severity
    decreases as the level grows.
    """
    bits = int((level / _MAX_LEVEL) * 4) + 4
    return (bits,)


def _solarize_level_to_arg(level, _hparams):
    """Map ``level`` to a solarize threshold in [0, 256].

    Augmentation severity decreases as the level grows.
    """
    threshold = int((level / _MAX_LEVEL) * 256)
    return (threshold,)


def _solarize_increasing_level_to_arg(level, _hparams):
    """Map ``level`` to a solarize threshold in [0, 256], severity increasing with level."""
    (threshold,) = _solarize_level_to_arg(level, _hparams)
    return (256 - threshold,)


def _solarize_add_level_to_arg(level, _hparams):
    """Map ``level`` to the additive amount for solarize_add, in [0, 110]."""
    amount = int((level / _MAX_LEVEL) * 110)
    return (amount,)


# Maps an op name to the function converting a magnitude level into that op's
# positional argument tuple. None marks ops that take no level-derived argument.
LEVEL_TO_ARG = {
    "AutoContrast": None,
    "Equalize": None,
    "Invert": None,
    "Rotate": _rotate_level_to_arg,
    # There are several variations of the posterize level scaling in various Tensorflow/Google repositories/papers
    "Posterize": _posterize_level_to_arg,
    "PosterizeIncreasing": _posterize_increasing_level_to_arg,
    "PosterizeOriginal": _posterize_original_level_to_arg,
    "Solarize": _solarize_level_to_arg,
    "SolarizeIncreasing": _solarize_increasing_level_to_arg,
    "SolarizeAdd": _solarize_add_level_to_arg,
    "Color": _enhance_level_to_arg,
    "ColorIncreasing": _enhance_increasing_level_to_arg,
    "Contrast": _enhance_level_to_arg,
    "ContrastIncreasing": _enhance_increasing_level_to_arg,
    "Brightness": _enhance_level_to_arg,
    "BrightnessIncreasing": _enhance_increasing_level_to_arg,
    "Sharpness": _enhance_level_to_arg,
    "SharpnessIncreasing": _enhance_increasing_level_to_arg,
    "ShearX": _shear_level_to_arg,
    "ShearY": _shear_level_to_arg,
    "TranslateX": _translate_abs_level_to_arg,
    "TranslateY": _translate_abs_level_to_arg,
    "TranslateXRel": _translate_rel_level_to_arg,
    "TranslateYRel": _translate_rel_level_to_arg,
}


# Maps an op name to the image function implementing it. The "...Increasing" and
# "...Original" variants share the base implementation and differ only in their
# LEVEL_TO_ARG entry.
NAME_TO_OP = {
    "AutoContrast": auto_contrast,
    "Equalize": equalize,
    "Invert": invert,
    "Rotate": rotate,
    "Posterize": posterize,
    "PosterizeIncreasing": posterize,
    "PosterizeOriginal": posterize,
    "Solarize": solarize,
    "SolarizeIncreasing": solarize,
    "SolarizeAdd": solarize_add,
    "Color": color,
    "ColorIncreasing": color,
    "Contrast": contrast,
    "ContrastIncreasing": contrast,
    "Brightness": brightness,
    "BrightnessIncreasing": brightness,
    "Sharpness": sharpness,
    "SharpnessIncreasing": sharpness,
    "ShearX": shear_x,
    "ShearY": shear_y,
    "TranslateX": translate_x_abs,
    "TranslateY": translate_y_abs,
    "TranslateXRel": translate_x_rel,
    "TranslateYRel": translate_y_rel,
}


class AugmentOp:
    """
    A single named augmentation applied with some probability.

    Apply for video: when called with a list of images, the same sampled
    arguments are applied to every image in the list.
    """

    def __init__(self, name, prob=0.5, magnitude=10, hparams=None):
        hparams = hparams or _HPARAMS_DEFAULT
        self.aug_fn = NAME_TO_OP[name]
        self.level_fn = LEVEL_TO_ARG[name]
        self.prob = prob
        self.magnitude = magnitude
        self.hparams = hparams.copy()

        fill = hparams["img_mean"] if "img_mean" in hparams else _FILL
        if "interpolation" in hparams:
            resample = hparams["interpolation"]
        else:
            resample = _RANDOM_INTERPOLATION
        self.kwargs = {"fillcolor": fill, "resample": resample}

        # If magnitude_std is > 0, we introduce some randomness
        # in the usually fixed policy and sample magnitude from a normal distribution
        # with mean `magnitude` and std-dev of `magnitude_std`.
        # NOTE This is my own hack, being tested, not in papers or reference impls.
        self.magnitude_std = self.hparams.get("magnitude_std", 0)

    def __call__(self, img_list):
        """Apply the op to an image or to every image of a list; may be skipped by ``prob``."""
        # Short-circuit keeps the RNG untouched when prob >= 1.0.
        skipped = self.prob < 1.0 and random.random() > self.prob
        if skipped:
            return img_list

        level = self.magnitude
        if self.magnitude_std and self.magnitude_std > 0:
            level = random.gauss(level, self.magnitude_std)
        level = min(_MAX_LEVEL, max(0, level))  # clip to valid range

        if self.level_fn is None:
            op_args = ()
        else:
            op_args = self.level_fn(level, self.hparams)

        if not isinstance(img_list, list):
            return self.aug_fn(img_list, *op_args, **self.kwargs)
        return [self.aug_fn(frame, *op_args, **self.kwargs) for frame in img_list]


# Default op names used by RandAugment (keys of NAME_TO_OP / LEVEL_TO_ARG).
_RAND_TRANSFORMS = [
    "AutoContrast",
    "Equalize",
    "Invert",
    "Rotate",
    "Posterize",
    "Solarize",
    "SolarizeAdd",
    "Color",
    "Contrast",
    "Brightness",
    "Sharpness",
    "ShearX",
    "ShearY",
    "TranslateXRel",
    "TranslateYRel",
]


# Same op list, but using the "...Increasing" variants where they exist;
# selected by the 'inc' config section of rand_augment_transform.
_RAND_INCREASING_TRANSFORMS = [
    "AutoContrast",
    "Equalize",
    "Invert",
    "Rotate",
    "PosterizeIncreasing",
    "SolarizeIncreasing",
    "SolarizeAdd",
    "ColorIncreasing",
    "ContrastIncreasing",
    "BrightnessIncreasing",
    "SharpnessIncreasing",
    "ShearX",
    "ShearY",
    "TranslateXRel",
    "TranslateYRel",
]


# These experimental weights are based loosely on the relative improvements mentioned in paper.
# They may not result in increased performance, but could likely be tuned to so.
# Keyed by the op names of _RAND_TRANSFORMS; the "...Increasing" names have no entry here.
_RAND_CHOICE_WEIGHTS_0 = {
    "Rotate": 0.3,
    "ShearX": 0.2,
    "ShearY": 0.2,
    "TranslateXRel": 0.1,
    "TranslateYRel": 0.1,
    "Color": 0.025,
    "Sharpness": 0.025,
    "AutoContrast": 0.025,
    "Solarize": 0.005,
    "SolarizeAdd": 0.005,
    "Contrast": 0.005,
    "Brightness": 0.005,
    "Equalize": 0.005,
    "Posterize": 0,
    "Invert": 0,
}


def _select_rand_weights(weight_idx=0, transforms=None):
    """Return the normalised selection probabilities for ``transforms``.

    Only weight set 0 exists. Every name in ``transforms`` (default:
    _RAND_TRANSFORMS) must be a key of _RAND_CHOICE_WEIGHTS_0.
    """
    op_names = transforms or _RAND_TRANSFORMS
    assert weight_idx == 0  # only one set of weights currently
    weights = np.asarray([_RAND_CHOICE_WEIGHTS_0[op_name] for op_name in op_names])
    return weights / np.sum(weights)


def rand_augment_ops(magnitude=10, hparams=None, transforms=None):
    """Build one AugmentOp (prob 0.5) per name in ``transforms``.

    ``hparams`` defaults to _HPARAMS_DEFAULT and ``transforms`` to _RAND_TRANSFORMS.
    """
    op_hparams = hparams or _HPARAMS_DEFAULT
    op_names = transforms or _RAND_TRANSFORMS
    ops = []
    for op_name in op_names:
        ops.append(
            AugmentOp(op_name, prob=0.5, magnitude=magnitude, hparams=op_hparams)
        )
    return ops


class RandAugment:
    """Apply ``num_layers`` randomly chosen ops from ``ops`` in sequence."""

    def __init__(self, ops, num_layers=2, choice_weights=None):
        self.ops = ops
        self.num_layers = num_layers
        self.choice_weights = choice_weights

    def __call__(self, img):
        """Sample ops (weighted if ``choice_weights`` is set) and chain them over ``img``."""
        # no replacement when using weighted choice
        with_replacement = self.choice_weights is None
        chosen = np.random.choice(
            self.ops,
            self.num_layers,
            replace=with_replacement,
            p=self.choice_weights,
        )
        result = img
        for augment in chosen:
            result = augment(result)
        return result


def rand_augment_transform(config_str, hparams):
    """
    RandAugment: Practical automated data augmentation... - https://arxiv.org/abs/1909.13719

    Create a RandAugment transform
    :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by
    dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining
    sections, not order specific, determine
        'm' - integer magnitude of rand augment
        'n' - integer num layers (number of transform ops selected per image)
        'w' - integer probability weight index (index of a set of weights to influence choice of op)
        'mstd' -  float std deviation of magnitude noise applied
        'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0)
    Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5
    'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2
    :param hparams: Other hparams (kwargs) for the RandAugmentation scheme. NOTE: mutated in place --
    'magnitude_std' is added when an 'mstd' section is present and the key is not already set.
    :return: A PyTorch compatible Transform
    :raises NotImplementedError: if a section with an unknown key is encountered
    """
    magnitude = _MAX_LEVEL  # default to _MAX_LEVEL for magnitude (currently 10)
    num_layers = 2  # default to 2 ops per image
    weight_idx = None  # default to no probability weights for op choice
    transforms = _RAND_TRANSFORMS
    config = config_str.split("-")
    assert config[0] == "rand"
    config = config[1:]
    for c in config:
        # split "<key><number>" at the first digit, e.g. "mstd0.5" -> ["mstd", "0.5", ""]
        cs = re.split(r"(\d.*)", c)
        if len(cs) < 2:
            # section without a numeric value: nothing to parse
            continue
        key, val = cs[:2]
        if key == "mstd":
            # noise param injected via hparams for now
            hparams.setdefault("magnitude_std", float(val))
        elif key == "inc":
            # val is a string, so bool(val) would be True even for "0";
            # parse it as the documented integer flag instead.
            if int(val):
                transforms = _RAND_INCREASING_TRANSFORMS
        elif key == "m":
            magnitude = int(val)
        elif key == "n":
            num_layers = int(val)
        elif key == "w":
            weight_idx = int(val)
        else:
            # the former `assert NotImplementedError` was always truthy and never fired
            raise NotImplementedError(
                "Unknown RandAugment config section: %r" % (c,)
            )
    ra_ops = rand_augment_ops(
        magnitude=magnitude, hparams=hparams, transforms=transforms
    )
    choice_weights = (
        None if weight_idx is None else _select_rand_weights(weight_idx)
    )
    return RandAugment(ra_ops, num_layers, choice_weights=choice_weights)


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/random_erasing.py
================================================
"""
This implementation is based on
https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/random_erasing.py
published under an Apache License 2.0.
"""
import math
import random
import torch


def _get_pixels(
    per_pixel, rand_color, patch_size, dtype=torch.float32, device="cuda"
):
    """Create the fill values for an erased patch.

    * ``per_pixel``: normal noise with the full ``patch_size`` shape.
    * ``rand_color`` (and not per_pixel): one normal value per leading-axis
      entry, shape ``(patch_size[0], 1, 1)``.
    * otherwise: zeros of shape ``(patch_size[0], 1, 1)``.
    """
    # NOTE I've seen CUDA illegal memory access errors being caused by the normal_()
    # paths, flip the order so normal is run on CPU if this becomes a problem
    # Issue has been fixed in master https://github.com/pytorch/pytorch/issues/19508
    broadcast_shape = (patch_size[0], 1, 1)
    if not per_pixel and not rand_color:
        return torch.zeros(broadcast_shape, dtype=dtype, device=device)
    noise_shape = patch_size if per_pixel else broadcast_shape
    return torch.empty(noise_shape, dtype=dtype, device=device).normal_()


class RandomErasing:
    """Erase randomly chosen rectangles from an image tensor.

    Follows 'Random Erasing Data Augmentation' by Zhong et al.
    (https://arxiv.org/pdf/1708.04896.pdf).  This variant is meant to be
    applied to a single image tensor or a batch of them after normalization
    by dataset mean and std.  Erasing is performed in place.

    Args:
        probability: probability that erasing is performed at all.
        min_area: minimum fraction of the image area covered by the boxes.
        max_area: maximum fraction of the image area covered by the boxes.
        min_aspect: minimum aspect ratio of an erased box.
        max_aspect: maximum aspect ratio; defaults to ``1 / min_aspect``.
        mode: fill mode, one of 'const', 'rand' or 'pixel'
            'const' - boxes are filled with 0 for all channels
            'rand'  - boxes get one random (normal) value per channel
            'pixel' - boxes get a random (normal) value per pixel
        min_count: minimum number of boxes.
        max_count: maximum number of boxes (defaults to ``min_count``); the
            count is drawn between the two and the area per box is divided
            by the count.
        num_splits: when greater than 1, the first
            ``batch_size // num_splits`` entries of a batch are left untouched.
        device: device on which fill values are created.
        cube: for batched input, erase the same box location in every
            processed batch entry instead of sampling a box per entry.
    """

    def __init__(
        self,
        probability=0.5,
        min_area=0.02,
        max_area=1 / 3,
        min_aspect=0.3,
        max_aspect=None,
        mode="const",
        min_count=1,
        max_count=None,
        num_splits=0,
        device="cuda",
        cube=True,
    ):
        self.probability = probability
        self.min_area = min_area
        self.max_area = max_area
        upper_aspect = max_aspect or 1 / min_aspect
        # Aspect ratios are sampled uniformly in log space.
        self.log_aspect_ratio = (math.log(min_aspect), math.log(upper_aspect))
        self.min_count = min_count
        self.max_count = max_count or min_count
        self.num_splits = num_splits
        fill_mode = mode.lower()
        self.rand_color = False
        self.per_pixel = False
        self.cube = cube
        if fill_mode == "rand":
            # one random normal value per channel of the block
            self.rand_color = True
        elif fill_mode == "pixel":
            # one random normal value per pixel of the block
            self.per_pixel = True
        else:
            assert not fill_mode or fill_mode == "const"
        self.device = device

    def _draw_count(self):
        """Return how many boxes to erase for the current call."""
        if self.min_count == self.max_count:
            return self.min_count
        return random.randint(self.min_count, self.max_count)

    def _propose_box(self, img_h, img_w, count, attempts):
        """Try up to ``attempts`` times to sample a box that fits the image.

        Returns ``(top, left, h, w)`` or ``None`` if no sampled box was
        strictly smaller than the image in both dimensions.
        """
        area = img_h * img_w
        for _ in range(attempts):
            target_area = (
                random.uniform(self.min_area, self.max_area) * area / count
            )
            aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
            h = int(round(math.sqrt(target_area * aspect_ratio)))
            w = int(round(math.sqrt(target_area / aspect_ratio)))
            if w < img_w and h < img_h:
                top = random.randint(0, img_h - h)
                left = random.randint(0, img_w - w)
                return top, left, h, w
        return None

    def _erase(self, img, chan, img_h, img_w, dtype):
        """Erase boxes from a single ``(chan, img_h, img_w)`` image in place."""
        if random.random() > self.probability:
            return
        count = self._draw_count()
        for _ in range(count):
            box = self._propose_box(img_h, img_w, count, attempts=10)
            if box is None:
                continue
            top, left, h, w = box
            img[:, top : top + h, left : left + w] = _get_pixels(
                self.per_pixel,
                self.rand_color,
                (chan, h, w),
                dtype=dtype,
                device=self.device,
            )

    def _erase_cube(
        self,
        img,
        batch_start,
        batch_size,
        chan,
        img_h,
        img_w,
        dtype,
    ):
        """Erase the same box locations from entries ``batch_start..batch_size-1``.

        The box is sampled once per erased block; the fill values are
        generated separately for every batch entry.
        """
        if random.random() > self.probability:
            return
        count = self._draw_count()
        for _ in range(count):
            box = self._propose_box(img_h, img_w, count, attempts=100)
            if box is None:
                continue
            top, left, h, w = box
            for i in range(batch_start, batch_size):
                entry = img[i]
                entry[:, top : top + h, left : left + w] = _get_pixels(
                    self.per_pixel,
                    self.rand_color,
                    (chan, h, w),
                    dtype=dtype,
                    device=self.device,
                )

    def __call__(self, input):
        """Apply random erasing to a 3-D image or a 4-D batch and return it."""
        if len(input.size()) == 3:
            self._erase(input, *input.size(), input.dtype)
            return input

        batch_size, chan, img_h, img_w = input.size()
        # When num_splits is set, the first slice of the batch is kept clean.
        if self.num_splits > 1:
            batch_start = batch_size // self.num_splits
        else:
            batch_start = 0

        if self.cube:
            self._erase_cube(
                input,
                batch_start,
                batch_size,
                chan,
                img_h,
                img_w,
                input.dtype,
            )
        else:
            for i in range(batch_start, batch_size):
                self._erase(input[i], chan, img_h, img_w, input.dtype)
        return input


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/ssv2.py
================================================
import os
import io
import cv2
import numpy as np
import torch
from torchvision import transforms
import warnings
from decord import VideoReader, cpu
from torch.utils.data import Dataset
from .random_erasing import RandomErasing
from .video_transforms import (
    Compose, Resize, CenterCrop, Normalize,
    create_random_augment, random_short_side_scale_jitter, 
    random_crop, random_resized_crop_with_shift, random_resized_crop,
    horizontal_flip, random_short_side_scale_jitter, uniform_crop, 
)
from .volume_transforms import ClipToTensor

# petrel_client is an optional dependency: `has_client` records whether the
# import succeeded so the dataset classes below only construct a Client when
# the package is available.
try:
    from petrel_client.client import Client
    has_client = True
except ImportError:
    has_client = False


class SSRawFrameClsDataset(Dataset):
    """Classification dataset over videos stored as directories of frame images.

    The annotation file is parsed with ``pandas.read_csv`` (no header row,
    ``split`` as delimiter): column 0 is the frame directory, column 1 the
    number of frames and the last column the label.  Frame files are named
    with ``filename_tmpl`` using 1-based indices.

    ``mode`` selects what ``__getitem__`` returns:
        'train'      - randomly sampled frames passed through ``_aug_frame``.
        'validation' - deterministically sampled frames, resized and
                       center-cropped.
        'test'       - each annotation row is expanded into
                       ``test_num_segment * test_num_crop`` views, one per
                       (temporal chunk, spatial crop) pair.
    """

    def __init__(self, anno_path, prefix='', split=' ', mode='train', clip_len=8,
                 crop_size=224, short_side_size=256, new_height=256, new_width=340,
                 keep_aspect_ratio=True, num_segment=1, num_crop=1, test_num_segment=10,
                 test_num_crop=3, filename_tmpl='img_{:05}.jpg', args=None):
        """Read the annotation file and set up the transforms for ``mode``.

        Args:
            anno_path: path of the annotation file.
            prefix: path joined in front of every sample path.
            split: delimiter of the annotation file.
            mode: 'train', 'validation' or 'test'.
            crop_size: side length of the square crop.
            short_side_size: target size of the resize step in validation/test.
            num_segment: number of frames sampled per video.
            test_num_segment: number of temporal chunks per video in test mode.
            test_num_crop: number of spatial crops per video in test mode.
            filename_tmpl: format string for frame file names.
            args: namespace with augmentation settings; ``args.reprob`` is
                read here in train mode, further fields in ``_aug_frame``.
            clip_len, new_height, new_width, keep_aspect_ratio, num_crop:
                stored on the instance; not otherwise used by this class.
        """
        self.anno_path = anno_path
        self.prefix = prefix
        self.split = split
        self.mode = mode
        self.clip_len = clip_len
        self.crop_size = crop_size
        self.short_side_size = short_side_size
        self.new_height = new_height
        self.new_width = new_width
        self.keep_aspect_ratio = keep_aspect_ratio
        self.num_segment = num_segment
        self.test_num_segment = test_num_segment
        self.num_crop = num_crop
        self.test_num_crop = test_num_crop
        self.filename_tmpl = filename_tmpl
        self.args = args
        self.aug = False
        self.rand_erase = False

        # The petrel client is optional; without it frames are read from disk.
        self.client = None
        if has_client:
            self.client = Client('~/petreloss.conf')

        if self.mode in ['train']:
            self.aug = True
            if self.args.reprob > 0:
                self.rand_erase = True
        if VideoReader is None:
            raise ImportError(
                "Unable to import `decord` which is required to read videos.")

        import pandas as pd
        cleaned = pd.read_csv(self.anno_path, header=None, delimiter=self.split)
        self.dataset_samples = list(cleaned.values[:, 0])
        self.total_frames = list(cleaned.values[:, 1])
        self.label_array = list(cleaned.values[:, -1])

        if mode == 'train':
            # Training transforms are built per call in _aug_frame.
            pass

        elif mode == 'validation':
            self.data_transform = Compose([
                Resize(self.short_side_size, interpolation='bilinear'),
                CenterCrop(size=(self.crop_size, self.crop_size)),
                ClipToTensor(),
                Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
            ])
        elif mode == 'test':
            self.data_resize = Compose([
                Resize(size=(short_side_size), interpolation='bilinear')
            ])
            self.data_transform = Compose([
                ClipToTensor(),
                Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
            ])
            # Expand every sample into one entry per (chunk, crop) view.
            self.test_seg = []
            self.test_dataset = []
            self.test_total_frames = []
            self.test_label_array = []
            for ck in range(self.test_num_segment):
                for cp in range(self.test_num_crop):
                    for idx in range(len(self.label_array)):
                        self.test_seg.append((ck, cp))
                        self.test_dataset.append(self.dataset_samples[idx])
                        self.test_total_frames.append(self.total_frames[idx])
                        self.test_label_array.append(self.label_array[idx])

    def __getitem__(self, index):
        """Return the sample at ``index`` in the layout of the current mode.

        Returns:
            train: ``(frames, label, index, {})``; when
                ``args.num_sample > 1`` the first three entries are lists
                with one element per augmented copy.
            validation: ``(frames, label, sample_name)``.
            test: ``(frames, label, sample_name, chunk_nb, split_nb)``.

        Raises:
            NameError: if ``self.mode`` is not one of the three known modes.

        A sample whose frames come back empty is replaced by a randomly
        drawn one (with a warning) until loading succeeds.
        """
        if self.mode == 'train':
            args = self.args
            scale_t = 1

            sample = self.dataset_samples[index]
            total_frame = self.total_frames[index]
            buffer = self.load_frame(sample,
                                     total_frame,
                                     sample_rate_scale=scale_t)  # T H W C
            while len(buffer) == 0:
                warnings.warn(
                    "video {} not correctly loaded during training".format(
                        sample))
                index = np.random.randint(self.__len__())
                sample = self.dataset_samples[index]
                total_frame = self.total_frames[index]
                buffer = self.load_frame(sample,
                                         total_frame,
                                         sample_rate_scale=scale_t)

            if args.num_sample > 1:
                frame_list = []
                label_list = []
                index_list = []
                for _ in range(args.num_sample):
                    new_frames = self._aug_frame(buffer, args)
                    label = self.label_array[index]
                    frame_list.append(new_frames)
                    label_list.append(label)
                    index_list.append(index)
                return frame_list, label_list, index_list, {}

            buffer = self._aug_frame(buffer, args)
            return buffer, self.label_array[index], index, {}

        elif self.mode == 'validation':
            sample = self.dataset_samples[index]
            total_frame = self.total_frames[index]
            buffer = self.load_frame(sample, total_frame)
            while len(buffer) == 0:
                warnings.warn(
                    "video {} not correctly loaded during validation".
                    format(sample))
                index = np.random.randint(self.__len__())
                sample = self.dataset_samples[index]
                # Refresh the frame count too: the replacement sample must not
                # be loaded with the frame count of the sample that failed.
                total_frame = self.total_frames[index]
                buffer = self.load_frame(sample, total_frame)
            buffer = self.data_transform(buffer)
            return buffer, self.label_array[index], sample.split(
                "/")[-1].split(".")[0]

        elif self.mode == 'test':
            sample = self.test_dataset[index]
            total_frame = self.test_total_frames[index]
            chunk_nb, split_nb = self.test_seg[index]
            buffer = self.load_frame(sample, total_frame)

            while len(buffer) == 0:
                warnings.warn("video {}, temporal {}, spatial {} not found during testing".format(\
                    str(self.test_dataset[index]), chunk_nb, split_nb))
                index = np.random.randint(self.__len__())
                sample = self.test_dataset[index]
                total_frame = self.test_total_frames[index]
                chunk_nb, split_nb = self.test_seg[index]
                buffer = self.load_frame(sample, total_frame)

            buffer = self.data_resize(buffer)
            if isinstance(buffer, list):
                buffer = np.stack(buffer, 0)

            # Crops are spread evenly along the longer spatial side.
            longer_side = max(buffer.shape[1], buffer.shape[2])
            if self.test_num_crop == 1:
                # A single crop has no step to divide by; take the centre.
                spatial_start = int(
                    1.0 * (longer_side - self.short_side_size) / 2)
            else:
                spatial_step = 1.0 * (longer_side - self.short_side_size) \
                                    / (self.test_num_crop - 1)
                spatial_start = int(split_nb * spatial_step)
            temporal_start = chunk_nb
            if buffer.shape[1] >= buffer.shape[2]:
                buffer = buffer[temporal_start::self.test_num_segment, \
                       spatial_start:spatial_start + self.short_side_size, :, :]
            else:
                buffer = buffer[temporal_start::self.test_num_segment, \
                       :, spatial_start:spatial_start + self.short_side_size, :]

            buffer = self.data_transform(buffer)
            return buffer, self.test_label_array[index], sample.split("/")[-1].split(".")[0], \
                   chunk_nb, split_nb
        else:
            raise NameError('mode {} unkown'.format(self.mode))

    def _aug_frame(
        self,
        buffer,
        args,
    ):
        """Apply the training augmentation pipeline to the loaded frames.

        Reads ``args.aa``, ``args.train_interpolation`` and ``args.data_set``,
        plus ``args.reprob``, ``args.remode`` and ``args.recount`` when random
        erasing is enabled.  Returns the augmented tensor in C T H W layout.
        """
        aug_transform = create_random_augment(
            input_size=(self.crop_size, self.crop_size),
            auto_augment=args.aa,
            interpolation=args.train_interpolation,
        )

        buffer = [transforms.ToPILImage()(frame) for frame in buffer]

        buffer = aug_transform(buffer)

        buffer = [transforms.ToTensor()(img) for img in buffer]
        buffer = torch.stack(buffer)  # T C H W
        buffer = buffer.permute(0, 2, 3, 1)  # T H W C

        # T H W C
        buffer = tensor_normalize(buffer, [0.485, 0.456, 0.406],
                                  [0.229, 0.224, 0.225])
        # T H W C -> C T H W.
        buffer = buffer.permute(3, 0, 1, 2)
        # Perform data augmentation.
        scl, asp = (
            [0.08, 1.0],
            [0.75, 1.3333],
        )

        buffer = spatial_sampling(
            buffer,
            spatial_idx=-1,
            min_scale=256,
            max_scale=320,
            crop_size=self.crop_size,
            random_horizontal_flip=False if args.data_set == 'SSV2' else True,
            inverse_uniform_sampling=False,
            aspect_ratio=asp,
            scale=scl,
            motion_shift=False)

        if self.rand_erase:
            erase_transform = RandomErasing(
                args.reprob,
                mode=args.remode,
                max_count=args.recount,
                num_splits=args.recount,
                device="cpu",
            )
            # RandomErasing expects the frame axis first: C T H W -> T C H W.
            buffer = buffer.permute(1, 0, 2, 3)
            buffer = erase_transform(buffer)
            buffer = buffer.permute(1, 0, 2, 3)

        return buffer

    def _read_frames(self, fname, indices):
        """Decode the frames at the 0-based ``indices`` of directory ``fname``.

        Bytes come from the petrel client when one is configured, otherwise
        from the local filesystem.  Returns the RGB frames stacked with
        ``np.array``.
        """
        imgs = []
        for idx in indices:
            # Frame files are numbered from 1.
            frame_fname = os.path.join(fname, self.filename_tmpl.format(idx + 1))
            if self.client is not None:
                img_bytes = self.client.get(frame_fname)
            else:
                with open(frame_fname, 'rb') as frame_file:
                    img_bytes = frame_file.read()
            img_np = np.frombuffer(img_bytes, np.uint8)
            img = cv2.imdecode(img_np, cv2.IMREAD_COLOR)
            # Convert BGR -> RGB in place.
            cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
            imgs.append(img)
        return np.array(imgs)

    def load_frame(self, sample, num_frames, sample_rate_scale=1):
        """Load the frames of ``sample`` selected for the current mode.

        Args:
            sample: frame directory, relative to ``self.prefix``.
            num_frames: number of frames available for the sample.
            sample_rate_scale: accepted for interface compatibility; unused.

        Returns:
            Array of decoded RGB frames.
        """
        fname = sample
        fname = os.path.join(self.prefix, fname)

        if self.mode == 'test':
            # Collect the frames of every temporal chunk; __getitem__ later
            # picks one chunk by strided slicing.
            tick = num_frames / float(self.num_segment)
            all_index = []
            for t_seg in range(self.test_num_segment):
                tmp_index = [
                    int(t_seg * tick / self.test_num_segment + tick * x)
                    for x in range(self.num_segment)
                ]
                all_index.extend(tmp_index)
            all_index = list(np.sort(np.array(all_index)))
            return self._read_frames(fname, all_index)

        # handle temporal segments
        average_duration = num_frames // self.num_segment
        all_index = []
        if average_duration > 0:
            if self.mode == 'validation':
                # Middle frame of every segment.
                all_index = list(
                    np.multiply(list(range(self.num_segment)),
                                average_duration) +
                    np.ones(self.num_segment, dtype=int) *
                    (average_duration // 2))
            else:
                # Random frame inside every segment.
                all_index = list(
                    np.multiply(list(range(self.num_segment)),
                                average_duration) +
                    np.random.randint(average_duration, size=self.num_segment))
        elif num_frames > self.num_segment:
            if self.mode == 'validation':
                all_index = list(range(self.num_segment))
            else:
                all_index = list(
                    np.sort(
                        np.random.randint(num_frames, size=self.num_segment)))
        else:
            # Too few frames: repeat frame 0 in front of the available ones.
            all_index = [0] * (self.num_segment - num_frames) + list(
                range(num_frames))
        all_index = list(np.array(all_index))
        return self._read_frames(fname, all_index)

    def __len__(self):
        """Number of entries: expanded test views in test mode, else samples."""
        if self.mode != 'test':
            return len(self.dataset_samples)
        else:
            return len(self.test_dataset)


class SSVideoClsDataset(Dataset):
    """Classification dataset over video files decoded with Decord.

    The annotation file is parsed with ``pandas.read_csv`` (no header row,
    ``split`` as delimiter): column 0 is the video path and column 1 the
    label.

    ``mode`` selects what ``__getitem__`` returns:
        'train'      - randomly sampled frames passed through ``_aug_frame``.
        'validation' - deterministically sampled frames, resized and
                       center-cropped.
        'test'       - each annotation row is expanded into
                       ``test_num_segment * test_num_crop`` views, one per
                       (temporal chunk, spatial crop) pair.
    """

    def __init__(self, anno_path, prefix='', split=' ', mode='train', clip_len=8,
                crop_size=224, short_side_size=256, new_height=256,
                new_width=340, keep_aspect_ratio=True, num_segment=1,
                num_crop=1, test_num_segment=10, test_num_crop=3, args=None):
        """Read the annotation file and set up the transforms for ``mode``.

        Args:
            anno_path: path of the annotation file.
            prefix: path joined in front of every sample path.
            split: delimiter of the annotation file.
            mode: 'train', 'validation' or 'test'.
            crop_size: side length of the square crop.
            short_side_size: target size of the resize step in validation/test.
            new_height, new_width: decode size passed to Decord when
                ``keep_aspect_ratio`` is false.
            keep_aspect_ratio: decode at native size when true.
            num_segment: number of frames sampled per video.
            test_num_segment: number of temporal chunks per video in test mode.
            test_num_crop: number of spatial crops per video in test mode.
            args: namespace with augmentation settings; ``args.reprob`` is
                read here in train mode, further fields in ``_aug_frame``.
            clip_len, num_crop: stored on the instance; not otherwise used by
                this class.
        """
        self.anno_path = anno_path
        self.prefix = prefix
        self.split = split
        self.mode = mode
        self.clip_len = clip_len
        self.crop_size = crop_size
        self.short_side_size = short_side_size
        self.new_height = new_height
        self.new_width = new_width
        self.keep_aspect_ratio = keep_aspect_ratio
        self.num_segment = num_segment
        self.test_num_segment = test_num_segment
        self.num_crop = num_crop
        self.test_num_crop = test_num_crop
        self.args = args
        self.aug = False
        self.rand_erase = False

        # The petrel client is optional and only used for 's3' paths.
        self.client = None
        if has_client:
            self.client = Client('~/petreloss.conf')

        if self.mode in ['train']:
            self.aug = True
            if self.args.reprob > 0:
                self.rand_erase = True
        if VideoReader is None:
            raise ImportError("Unable to import `decord` which is required to read videos.")

        import pandas as pd
        cleaned = pd.read_csv(self.anno_path, header=None, delimiter=self.split)
        self.dataset_samples = list(cleaned.values[:, 0])
        self.label_array = list(cleaned.values[:, 1])

        if mode == 'train':
            # Training transforms are built per call in _aug_frame.
            pass

        elif mode == 'validation':
            self.data_transform = Compose([
                Resize(self.short_side_size, interpolation='bilinear'),
                CenterCrop(size=(self.crop_size, self.crop_size)),
                ClipToTensor(),
                Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
            ])
        elif mode == 'test':
            self.data_resize = Compose([
                Resize(size=(short_side_size), interpolation='bilinear')
            ])
            self.data_transform = Compose([
                ClipToTensor(),
                Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
            ])
            # Expand every sample into one entry per (chunk, crop) view.
            self.test_seg = []
            self.test_dataset = []
            self.test_label_array = []
            for ck in range(self.test_num_segment):
                for cp in range(self.test_num_crop):
                    for idx in range(len(self.label_array)):
                        sample_label = self.label_array[idx]
                        self.test_label_array.append(sample_label)
                        self.test_dataset.append(self.dataset_samples[idx])
                        self.test_seg.append((ck, cp))

    def __getitem__(self, index):
        """Return the sample at ``index`` in the layout of the current mode.

        Returns:
            train: ``(frames, label, index, {})``; when
                ``args.num_sample > 1`` the first three entries are lists
                with one element per augmented copy.
            validation: ``(frames, label, sample_name)``.
            test: ``(frames, label, sample_name, chunk_nb, split_nb)``.

        Raises:
            NameError: if ``self.mode`` is not one of the three known modes.

        A video that fails to load is replaced by a randomly drawn one (with
        a warning) until loading succeeds.
        """
        if self.mode == 'train':
            args = self.args
            scale_t = 1

            sample = self.dataset_samples[index]
            buffer = self.loadvideo_decord(sample, sample_rate_scale=scale_t) # T H W C
            while len(buffer) == 0:
                warnings.warn("video {} not correctly loaded during training".format(sample))
                index = np.random.randint(self.__len__())
                sample = self.dataset_samples[index]
                buffer = self.loadvideo_decord(sample, sample_rate_scale=scale_t)

            if args.num_sample > 1:
                frame_list = []
                label_list = []
                index_list = []
                for _ in range(args.num_sample):
                    new_frames = self._aug_frame(buffer, args)
                    label = self.label_array[index]
                    frame_list.append(new_frames)
                    label_list.append(label)
                    index_list.append(index)
                return frame_list, label_list, index_list, {}

            buffer = self._aug_frame(buffer, args)
            return buffer, self.label_array[index], index, {}

        elif self.mode == 'validation':
            sample = self.dataset_samples[index]
            buffer = self.loadvideo_decord(sample)
            while len(buffer) == 0:
                warnings.warn("video {} not correctly loaded during validation".format(sample))
                index = np.random.randint(self.__len__())
                sample = self.dataset_samples[index]
                buffer = self.loadvideo_decord(sample)
            buffer = self.data_transform(buffer)
            return buffer, self.label_array[index], sample.split("/")[-1].split(".")[0]

        elif self.mode == 'test':
            sample = self.test_dataset[index]
            chunk_nb, split_nb = self.test_seg[index]
            buffer = self.loadvideo_decord(sample)

            while len(buffer) == 0:
                warnings.warn("video {}, temporal {}, spatial {} not found during testing".format(\
                    str(self.test_dataset[index]), chunk_nb, split_nb))
                index = np.random.randint(self.__len__())
                sample = self.test_dataset[index]
                chunk_nb, split_nb = self.test_seg[index]
                buffer = self.loadvideo_decord(sample)

            buffer = self.data_resize(buffer)
            if isinstance(buffer, list):
                buffer = np.stack(buffer, 0)

            # Crops are spread evenly along the longer spatial side.
            longer_side = max(buffer.shape[1], buffer.shape[2])
            if self.test_num_crop == 1:
                # A single crop has no step to divide by; take the centre.
                spatial_start = int(
                    1.0 * (longer_side - self.short_side_size) / 2)
            else:
                spatial_step = 1.0 * (longer_side - self.short_side_size) \
                                    / (self.test_num_crop - 1)
                spatial_start = int(split_nb * spatial_step)
            temporal_start = chunk_nb # 0/1
            if buffer.shape[1] >= buffer.shape[2]:
                buffer = buffer[temporal_start::2, \
                       spatial_start:spatial_start + self.short_side_size, :, :]
            else:
                buffer = buffer[temporal_start::2, \
                       :, spatial_start:spatial_start + self.short_side_size, :]

            buffer = self.data_transform(buffer)
            return buffer, self.test_label_array[index], sample.split("/")[-1].split(".")[0], \
                   chunk_nb, split_nb
        else:
            raise NameError('mode {} unkown'.format(self.mode))

    def _aug_frame(
        self,
        buffer,
        args,
    ):
        """Apply the training augmentation pipeline to the loaded frames.

        Reads ``args.aa``, ``args.train_interpolation`` and ``args.data_set``,
        plus ``args.reprob``, ``args.remode`` and ``args.recount`` when random
        erasing is enabled.  Returns the augmented tensor in C T H W layout.
        """
        aug_transform = create_random_augment(
            input_size=(self.crop_size, self.crop_size),
            auto_augment=args.aa,
            interpolation=args.train_interpolation,
        )

        buffer = [
            transforms.ToPILImage()(frame) for frame in buffer
        ]

        buffer = aug_transform(buffer)

        buffer = [transforms.ToTensor()(img) for img in buffer]
        buffer = torch.stack(buffer) # T C H W
        buffer = buffer.permute(0, 2, 3, 1) # T H W C

        # T H W C
        buffer = tensor_normalize(
            buffer, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
        )
        # T H W C -> C T H W.
        buffer = buffer.permute(3, 0, 1, 2)
        # Perform data augmentation.
        scl, asp = (
            [0.08, 1.0],
            [0.75, 1.3333],
        )

        buffer = spatial_sampling(
            buffer,
            spatial_idx=-1,
            min_scale=256,
            max_scale=320,
            crop_size=self.crop_size,
            random_horizontal_flip=False if args.data_set == 'SSV2' else True,
            inverse_uniform_sampling=False,
            aspect_ratio=asp,
            scale=scl,
            motion_shift=False
        )

        if self.rand_erase:
            erase_transform = RandomErasing(
                args.reprob,
                mode=args.remode,
                max_count=args.recount,
                num_splits=args.recount,
                device="cpu",
            )
            # RandomErasing expects the frame axis first: C T H W -> T C H W.
            buffer = buffer.permute(1, 0, 2, 3)
            buffer = erase_transform(buffer)
            buffer = buffer.permute(1, 0, 2, 3)

        return buffer


    def loadvideo_decord(self, sample, sample_rate_scale=1):
        """Load the frames of ``sample`` selected for the current mode.

        Args:
            sample: video path, relative to ``self.prefix``.
            sample_rate_scale: accepted for interface compatibility; unused.

        Returns:
            The selected frames as a numpy array, or an empty list when the
            video could not be opened.
        """
        fname = sample
        fname = os.path.join(self.prefix, fname)

        try:
            if self.keep_aspect_ratio:
                # NOTE(review): this branch matches the prefix 's3' while the
                # resizing branch below matches 's3:' - confirm which one is
                # intended before unifying them.
                if fname.startswith('s3'):
                    video_bytes = self.client.get(fname)
                    vr = VideoReader(io.BytesIO(video_bytes),
                                     num_threads=1,
                                     ctx=cpu(0))
                else:
                    vr = VideoReader(fname, num_threads=1, ctx=cpu(0))
            else:
                if fname.startswith('s3:'):
                    video_bytes = self.client.get(fname)
                    vr = VideoReader(io.BytesIO(video_bytes),
                                     width=self.new_width,
                                     height=self.new_height,
                                     num_threads=1,
                                     ctx=cpu(0))
                else:
                    vr = VideoReader(fname, width=self.new_width, height=self.new_height,
                                    num_threads=1, ctx=cpu(0))
        except Exception:
            # Best effort: report and let the caller draw another sample.
            # `Exception` (not a bare except) so Ctrl-C and SystemExit still
            # propagate.
            print("video cannot be loaded by decord: ", fname)
            return []

        if self.mode == 'test':
            # Two interleaved index sets (segment centres and segment starts);
            # __getitem__ selects one of them with a stride-2 slice.
            tick = len(vr) / float(self.num_segment)
            all_index = list(np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segment)] +
                               [int(tick * x) for x in range(self.num_segment)]))
            # Pad with the last index up to the expected number of frames.
            while len(all_index) < (self.num_segment * self.test_num_segment):
                all_index.append(all_index[-1])
            all_index = np.sort(np.array(all_index))
            vr.seek(0)
            buffer = vr.get_batch(all_index).asnumpy()
            return buffer
        elif self.mode == 'validation':
            # Centre frame of every segment.
            tick = len(vr) / float(self.num_segment)
            all_index = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segment)])
            vr.seek(0)
            buffer = vr.get_batch(all_index).asnumpy()
            return buffer

        # handle temporal segments
        average_duration = len(vr) // self.num_segment
        if average_duration > 0:
            # Random frame inside every segment.
            all_index = list(
                np.multiply(list(range(self.num_segment)), average_duration)
                + np.random.randint(average_duration, size=self.num_segment))
        elif len(vr) > self.num_segment:
            all_index = list(np.sort(np.random.randint(len(vr), size=self.num_segment)))
        else:
            all_index = list(np.zeros((self.num_segment,)))
        vr.seek(0)
        buffer = vr.get_batch(all_index).asnumpy()
        return buffer

    def __len__(self):
        """Number of entries: expanded test views in test mode, else samples."""
        if self.mode != 'test':
            return len(self.dataset_samples)
        else:
            return len(self.test_dataset)


def spatial_sampling(
    frames,
    spatial_idx=-1,
    min_scale=256,
    max_scale=320,
    crop_size=224,
    random_horizontal_flip=True,
    inverse_uniform_sampling=False,
    aspect_ratio=None,
    scale=None,
    motion_shift=False,
):
    """
    Spatially sample the given video frames, either randomly or uniformly.
    With spatial_idx == -1 the frames are randomly rescaled, cropped and
    optionally flipped; with spatial_idx 0, 1 or 2 a deterministic uniform
    crop at that position is taken.
    Args:
        frames (tensor): frames sampled from the video. The layout is whatever
            the video_transforms helpers expect; callers in this file pass
            `channel` x `num frames` x `height` x `width` (TODO confirm).
        spatial_idx (int): -1 for random sampling, or 0, 1, 2 to select the
            position handed to `uniform_crop`.
        min_scale (int): the minimal size of scaling.
        max_scale (int): the maximal size of scaling.
        crop_size (int): height and width of the crop.
        random_horizontal_flip (bool): in random mode, whether to flip the
            frames horizontally with probability 0.5.
        inverse_uniform_sampling (bool): forwarded to
            `random_short_side_scale_jitter` in random mode when neither
            aspect_ratio nor scale is given.
        aspect_ratio (list): aspect ratio range for the random resized crop.
        scale (list): scale range for the random resized crop.
        motion_shift (bool): use `random_resized_crop_with_shift` instead of
            `random_resized_crop` for the random resized crop.
    Returns:
        frames (tensor): spatially sampled frames.
    """
    assert spatial_idx in [-1, 0, 1, 2]

    if spatial_idx != -1:
        # Deterministic (test-time) path: no jitter, so min_scale, max_scale
        # and crop_size are expected to be identical.
        assert len({min_scale, max_scale, crop_size}) == 1
        frames, _ = random_short_side_scale_jitter(
            frames, min_scale, max_scale
        )
        frames, _ = uniform_crop(frames, crop_size, spatial_idx)
        return frames

    # Random (training-time) path.
    if aspect_ratio is None and scale is None:
        frames, _ = random_short_side_scale_jitter(
            images=frames,
            min_size=min_scale,
            max_size=max_scale,
            inverse_uniform_sampling=inverse_uniform_sampling,
        )
        frames, _ = random_crop(frames, crop_size)
    else:
        if motion_shift:
            resized_crop = random_resized_crop_with_shift
        else:
            resized_crop = random_resized_crop
        frames = resized_crop(
            images=frames,
            target_height=crop_size,
            target_width=crop_size,
            scale=scale,
            ratio=aspect_ratio,
        )

    if random_horizontal_flip:
        frames, _ = horizontal_flip(0.5, frames)
    return frames


def tensor_normalize(tensor, mean, std):
    """
    Normalize a given tensor by subtracting the mean and dividing the std.
    Args:
        tensor (tensor): tensor to normalize. A uint8 tensor is first
            converted to float and divided by 255.
        mean (tensor, list or tuple): mean value to subtract.
        std (tensor, list or tuple): std to divide.
    Returns:
        tensor (tensor): the normalized tensor.
    """
    if tensor.dtype == torch.uint8:
        tensor = tensor.float()
        tensor = tensor / 255.0
    # Plain Python sequences are turned into tensors living on the same device
    # as the input so the arithmetic below does not mix devices.
    if isinstance(mean, (list, tuple)):
        mean = torch.tensor(mean, device=tensor.device)
    if isinstance(std, (list, tuple)):
        std = torch.tensor(std, device=tensor.device)
    tensor = tensor - mean
    tensor = tensor / std
    return tensor


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/transforms.py
================================================
import torch
import torchvision.transforms.functional as F
import warnings
import random
import numpy as np
import torchvision
from PIL import Image, ImageOps
import numbers


class GroupRandomCrop(object):
    """Crop the same randomly placed window out of every image in a group.

    ``size`` is either a single number (square crop) or a pair that is
    unpacked as ``(height, width)``.
    """

    def __init__(self, size):
        is_scalar = isinstance(size, numbers.Number)
        self.size = (int(size), int(size)) if is_scalar else size

    def __call__(self, img_tuple):
        img_group, label = img_tuple

        w, h = img_group[0].size
        th, tw = self.size

        # One window is sampled for the whole group: left edge first, then top.
        left = random.randint(0, w - tw)
        top = random.randint(0, h - th)
        window = (left, top, left + tw, top + th)
        already_target_size = w == tw and h == th

        cropped = []
        for img in img_group:
            # Every image of the group must share the size of the first one.
            assert(img.size[0] == w and img.size[1] == h)
            cropped.append(img if already_target_size else img.crop(window))

        return (cropped, label)


class GroupCenterCrop(object):
    """Apply one shared ``torchvision.transforms.CenterCrop`` to each image of a group."""

    def __init__(self, size):
        self.worker = torchvision.transforms.CenterCrop(size)

    def __call__(self, img_tuple):
        images, label = img_tuple
        cropped = list(map(self.worker, images))
        return (cropped, label)


class GroupRandomHorizontalFlip(object):
    """Mirror all images of a group left-to-right with probability 0.5.

    Flipping only happens when ``flip`` is truthy; otherwise the input tuple
    is handed back untouched.
    """

    def __init__(self, flip=False):
        self.flip = flip

    def __call__(self, img_tuple):
        # The random draw is made on every call, even when flipping is disabled.
        draw = random.random()
        if not (self.flip and draw < 0.5):
            return img_tuple

        images, label = img_tuple
        mirrored = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in images]
        return (mirrored, label)


class GroupNormalize(object):
    """Normalize a stacked tensor in place, slice by slice along dim 0.

    ``mean`` and ``std`` are repeated with ``*`` so that they line up with the
    first dimension of the tensor, hence they must be sequences (e.g. lists).
    """

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor_tuple):
        tensor, label = tensor_tuple
        n_slices = tensor.size()[0]
        means = self.mean * (n_slices // len(self.mean))
        stds = self.std * (n_slices // len(self.std))

        # TODO: make efficient
        for plane, m, s in zip(tensor, means, stds):
            # In-place update; the input tensor itself is modified.
            plane.sub_(m)
            plane.div_(s)

        return (tensor, label)


class GroupGrayScale(object):
    """Run one shared ``torchvision.transforms.Grayscale`` over each image of a group.

    ``size`` is forwarded unchanged as the first argument of ``Grayscale``.
    """

    def __init__(self, size):
        self.worker = torchvision.transforms.Grayscale(size)

    def __call__(self, img_tuple):
        images, label = img_tuple
        converted = list(map(self.worker, images))
        return (converted, label)


class GroupColorJitter(object):
    """Run one shared ``torchvision.transforms.ColorJitter`` over each image of a group.

    The single ``size`` value is used as the brightness, contrast and
    saturation setting of the jitter. The worker is called once per image.
    """

    def __init__(self, size):
        strength = size
        self.worker = torchvision.transforms.ColorJitter(
            brightness=strength, contrast=strength, saturation=strength
        )

    def __call__(self, img_tuple):
        images, label = img_tuple
        jittered = list(map(self.worker, images))
        return (jittered, label)

    
class GroupScale(object):
    """ Rescale every PIL.Image of a group to the given 'size'.
    'size' is the target length of the smaller edge.
    For example, if height > width, the image is
    rescaled to (size * height / width, size).
    size: size of the smaller edge
    interpolation: Default: PIL.Image.BILINEAR
    """

    def __init__(self, size, interpolation=Image.BILINEAR):
        self.worker = torchvision.transforms.Resize(size, interpolation)

    def __call__(self, img_tuple):
        images, label = img_tuple
        resized = list(map(self.worker, images))
        return (resized, label)


class GroupMultiScaleCrop(object):
    """Multi-scale crop applied identically to every image of a group.

    A crop width/height pair is sampled from ``scales`` (fractions of the
    shorter image side), an offset is sampled either freely or from a fixed
    grid of positions, and each cropped image is resized to ``input_size``.

    Args:
        input_size (int or sequence): output (width, height); an int means a
            square output.
        scales (list): crop scales relative to the shorter image side.
            Defaults to [1, .875, .75, .66].
        max_distort (int): maximal index distance between the scale chosen
            for the width and the one chosen for the height.
        fix_crop (bool): if True, offsets come from a fixed grid of positions;
            otherwise they are sampled uniformly.
        more_fix_crop (bool): if True, the fixed grid has 13 positions instead
            of 5.
    """

    def __init__(self, input_size, scales=None, max_distort=1, fix_crop=True, more_fix_crop=True):
        # The second default scale is .875; a literal 875 would request crops
        # hundreds of times larger than the image itself.
        self.scales = scales if scales is not None else [1, .875, .75, .66]
        self.max_distort = max_distort
        self.fix_crop = fix_crop
        self.more_fix_crop = more_fix_crop
        self.input_size = input_size if not isinstance(input_size, int) else [input_size, input_size]
        self.interpolation = Image.BILINEAR

    def __call__(self, img_tuple):
        img_group, label = img_tuple

        # The crop window is derived from the first image and reused for all.
        im_size = img_group[0].size

        crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(im_size)
        crop_img_group = [img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) for img in img_group]
        ret_img_group = [img.resize((self.input_size[0], self.input_size[1]), self.interpolation) for img in crop_img_group]
        return (ret_img_group, label)

    def _sample_crop_size(self, im_size):
        """Return (crop_w, crop_h, w_offset, h_offset) for an image of size (w, h)."""
        image_w, image_h = im_size[0], im_size[1]

        # find a crop size
        base_size = min(image_w, image_h)
        crop_sizes = [int(base_size * x) for x in self.scales]
        # Candidates within 3 pixels of the output size snap to the output size.
        crop_h = [self.input_size[1] if abs(x - self.input_size[1]) < 3 else x for x in crop_sizes]
        crop_w = [self.input_size[0] if abs(x - self.input_size[0]) < 3 else x for x in crop_sizes]

        # Width and height may use different scales as long as their indices
        # are at most max_distort apart.
        pairs = []
        for i, h in enumerate(crop_h):
            for j, w in enumerate(crop_w):
                if abs(i - j) <= self.max_distort:
                    pairs.append((w, h))

        crop_pair = random.choice(pairs)
        if not self.fix_crop:
            w_offset = random.randint(0, image_w - crop_pair[0])
            h_offset = random.randint(0, image_h - crop_pair[1])
        else:
            w_offset, h_offset = self._sample_fix_offset(image_w, image_h, crop_pair[0], crop_pair[1])

        return crop_pair[0], crop_pair[1], w_offset, h_offset

    def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h):
        """Pick one offset at random from the fixed grid of positions."""
        offsets = self.fill_fix_offset(self.more_fix_crop, image_w, image_h, crop_w, crop_h)
        return random.choice(offsets)

    @staticmethod
    def fill_fix_offset(more_fix_crop, image_w, image_h, crop_w, crop_h):
        """List the fixed (w_offset, h_offset) positions: 5, or 13 with more_fix_crop."""
        w_step = (image_w - crop_w) // 4
        h_step = (image_h - crop_h) // 4

        ret = list()
        ret.append((0, 0))  # upper left
        ret.append((4 * w_step, 0))  # upper right
        ret.append((0, 4 * h_step))  # lower left
        ret.append((4 * w_step, 4 * h_step))  # lower right
        ret.append((2 * w_step, 2 * h_step))  # center

        if more_fix_crop:
            ret.append((0, 2 * h_step))  # center left
            ret.append((4 * w_step, 2 * h_step))  # center right
            ret.append((2 * w_step, 4 * h_step))  # lower center
            ret.append((2 * w_step, 0 * h_step))  # upper center

            ret.append((1 * w_step, 1 * h_step))  # upper left quarter
            ret.append((3 * w_step, 1 * h_step))  # upper right quarter
            ret.append((1 * w_step, 3 * h_step))  # lower left quarter
            ret.append((3 * w_step, 3 * h_step))  # lower right quarter
        return ret


class Stack(object):
    """Concatenate the images of a group along the channel axis (axis 2).

    Mode 'L' images get a trailing channel axis first. Mode 'RGB' images are
    concatenated as they are, or with their channel order reversed when
    ``roll`` is set. Any other mode yields ``None``.
    """

    def __init__(self, roll=False):
        self.roll = roll

    def __call__(self, img_tuple):
        img_group, label = img_tuple
        mode = img_group[0].mode

        if mode == 'L':
            planes = [np.expand_dims(x, 2) for x in img_group]
        elif mode == 'RGB':
            if self.roll:
                # Reverse the last axis, i.e. RGB -> BGR.
                planes = [np.array(x)[:, :, ::-1] for x in img_group]
            else:
                planes = img_group
        else:
            return None

        return (np.concatenate(planes, axis=2), label)


class ToTorchFormatTensor(object):
    """ Turns a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255]
    into a torch.FloatTensor of shape (C x H x W). Values are divided by 255
    (range [0.0, 1.0]) unless ``div`` is False. """
    def __init__(self, div=True):
        self.div = div

    def __call__(self, pic_tuple):
        pic, label = pic_tuple

        if isinstance(pic, np.ndarray):
            # numpy input: already H x W x C, only the axes are reordered
            hwc = torch.from_numpy(pic)
        else:
            # PIL input: read the raw bytes and view them as H x W x C
            raw = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
            hwc = raw.view(pic.size[1], pic.size[0], len(pic.mode))
        # HWC -> CHW
        chw = hwc.permute(2, 0, 1).contiguous()

        as_float = chw.float()
        if self.div:
            as_float = as_float.div(255.)
        return (as_float, label)


class IdentityTransform(object):
    """No-op transform: calling it returns its argument unchanged."""

    def __call__(self, data):
        return data


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/video_transforms.py
================================================
#!/usr/bin/env python3
import math
import numpy as np
import random
import torch
import torchvision.transforms.functional as F
from PIL import Image
from torchvision import transforms

from .rand_augment import rand_augment_transform
from .random_erasing import RandomErasing

import numbers
import PIL
import torchvision

import vbench.third_party.umt.functional as FF

# Printable names for the PIL resampling constants.
_pil_interpolation_to_str = {
    Image.NEAREST: "PIL.Image.NEAREST",
    Image.BILINEAR: "PIL.Image.BILINEAR",
    Image.BICUBIC: "PIL.Image.BICUBIC",
    Image.LANCZOS: "PIL.Image.LANCZOS",
    Image.HAMMING: "PIL.Image.HAMMING",
    Image.BOX: "PIL.Image.BOX",
}


# Interpolation modes stored by RandomResizedCropAndInterpolation when its
# interpolation argument is "random".
_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC)


def _pil_interp(method):
    """
    Translate an interpolation name into the matching PIL resampling constant.
    Args:
        method (str): "bicubic", "lanczos" or "hamming"; every other value
            selects bilinear interpolation.
    Returns:
        the PIL.Image resampling constant for `method`.
    """
    if method == "bicubic":
        return Image.BICUBIC
    if method == "lanczos":
        return Image.LANCZOS
    if method == "hamming":
        return Image.HAMMING
    # Anything unrecognized falls back to bilinear.
    return Image.BILINEAR


def random_short_side_scale_jitter(
    images, min_size, max_size, boxes=None, inverse_uniform_sampling=False
):
    """
    Randomly rescale the images so that their shorter side matches a size
    sampled between `min_size` and `max_size`; boxes are scaled alongside.
    Args:
        images (tensor): images to perform scale jitter. Dimension is
            `num frames` x `channel` x `height` x `width`.
        min_size (int): the minimal size to scale the frames.
        max_size (int): the maximal size to scale the frames.
        boxes (ndarray): optional. Corresponding boxes to images.
            Dimension is `num boxes` x 4.
        inverse_uniform_sampling (bool): if True, sample uniformly in
            [1 / max_scale, 1 / min_scale] and take a reciprocal to get the
            scale. If False, take a uniform sample from [min_scale, max_scale].
    Returns:
        (tensor): the scaled images with dimension of
            `num frames` x `channel` x `new height` x `new width`.
        (ndarray or None): the scaled boxes with dimension of
            `num boxes` x 4.
    """
    if inverse_uniform_sampling:
        inverse = np.random.uniform(1.0 / max_size, 1.0 / min_size)
        size = int(round(1.0 / inverse))
    else:
        size = int(round(np.random.uniform(min_size, max_size)))

    height, width = images.shape[2], images.shape[3]

    # The shorter side already has the sampled size: nothing to do.
    if min(width, height) == size:
        return images, boxes

    if width < height:
        new_width = size
        new_height = int(math.floor((float(height) / width) * size))
        numerator, denominator = float(new_height), height
    else:
        new_height = size
        new_width = int(math.floor((float(width) / height) * size))
        numerator, denominator = float(new_width), width
    if boxes is not None:
        boxes = boxes * numerator / denominator

    resized = torch.nn.functional.interpolate(
        images,
        size=(new_height, new_width),
        mode="bilinear",
        align_corners=False,
    )
    return resized, boxes


def crop_boxes(boxes, x_offset, y_offset):
    """
    Shift bounding boxes into the coordinate frame of a crop.
    Args:
        boxes (ndarray): bounding boxes to perform crop. The dimension
            is `num boxes` x 4.
        x_offset (int): cropping offset in the x axis.
        y_offset (int): cropping offset in the y axis.
    Returns:
        cropped_boxes (ndarray): the cropped boxes with dimension of
            `num boxes` x 4. The input array is left untouched.
    """
    shifted = boxes.copy()
    # Columns 0 and 2 move by the x offset, columns 1 and 3 by the y offset.
    for columns, offset in (([0, 2], x_offset), ([1, 3], y_offset)):
        shifted[:, columns] = boxes[:, columns] - offset
    return shifted


def random_crop(images, size, boxes=None):
    """
    Perform random spatial crop on the given images and corresponding boxes.
    Args:
        images (tensor): images to perform random crop. The dimension is
            `num frames` x `channel` x `height` x `width`.
        size (int): the size of height and width to crop on the image.
        boxes (ndarray or None): optional. Corresponding boxes to images.
            Dimension is `num boxes` x 4.
    Returns:
        cropped (tensor): cropped images with dimension of
            `num frames` x `channel` x `size` x `size`.
        cropped_boxes (ndarray or None): the cropped boxes with dimension of
            `num boxes` x 4.
    """
    height = images.shape[2]
    width = images.shape[3]
    if height == size and width == size:
        # Nothing to crop. Still return an (images, boxes) pair so callers
        # can always unpack two values.
        return images, boxes
    # A side that is not larger than `size` keeps offset 0 and is left
    # uncropped.
    y_offset = 0
    if height > size:
        y_offset = int(np.random.randint(0, height - size))
    x_offset = 0
    if width > size:
        x_offset = int(np.random.randint(0, width - size))
    cropped = images[
        :, :, y_offset : y_offset + size, x_offset : x_offset + size
    ]

    cropped_boxes = (
        crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
    )

    return cropped, cropped_boxes


def horizontal_flip(prob, images, boxes=None):
    """
    Flip the images (and their boxes) horizontally with probability `prob`.
    Args:
        prob (float): probability to flip the images.
        images (tensor): images to perform horizontal flip, the dimension is
            `num frames` x `channel` x `height` x `width`.
        boxes (ndarray or None): optional. Corresponding boxes to images.
            Dimension is `num boxes` x 4.
    Returns:
        images (tensor): images with dimension of
            `num frames` x `channel` x `height` x `width`.
        flipped_boxes (ndarray or None): the flipped boxes with dimension of
            `num boxes` x 4.
    """
    # The boxes are always copied, whether or not a flip happens.
    flipped_boxes = None if boxes is None else boxes.copy()

    if not np.random.uniform() < prob:
        return images, flipped_boxes

    images = images.flip((-1))

    rank = len(images.shape)
    if rank not in (3, 4):
        raise NotImplementedError("Dimension does not supported")
    # Width is the last axis for both supported layouts.
    width = images.shape[rank - 1]
    if boxes is not None:
        # Mirror the x coordinates and swap them so x1 <= x2 still holds.
        flipped_boxes[:, [0, 2]] = width - boxes[:, [2, 0]] - 1

    return images, flipped_boxes


def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None):
    """
    Take one of three deterministic square crops along the longer side of the
    images, and crop the boxes accordingly.
    Args:
        images (tensor): images to perform uniform crop. The dimension is
            `num frames` x `channel` x `height` x `width`; a 3-dimensional
            input is treated as a single frame.
        size (int): size of height and width to crop the images.
        spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
            is larger than height. Or 0, 1, or 2 for top, center, and bottom
            crop if height is larger than width.
        boxes (ndarray or None): optional. Corresponding boxes to images.
            Dimension is `num boxes` x 4.
        scale_size (int): optional. If not None, resize the images so that the
            shorter side equals scale_size before performing any crop.
    Returns:
        cropped (tensor): images with dimension of
            `num frames` x `channel` x `size` x `size`.
        cropped_boxes (ndarray or None): the cropped boxes with dimension of
            `num boxes` x 4.
    """
    assert spatial_idx in [0, 1, 2]
    single_frame = len(images.shape) == 3
    if single_frame:
        # Add a temporary frame axis; it is removed again before returning.
        images = images.unsqueeze(0)
    height, width = images.shape[2], images.shape[3]

    if scale_size is not None:
        if width <= height:
            width, height = scale_size, int(height / width * scale_size)
        else:
            width, height = int(width / height * scale_size), scale_size
        images = torch.nn.functional.interpolate(
            images,
            size=(height, width),
            mode="bilinear",
            align_corners=False,
        )

    # Start from the centered window, then slide it along the longer side.
    y_offset = int(math.ceil((height - size) / 2))
    x_offset = int(math.ceil((width - size) / 2))
    if spatial_idx != 1:
        at_start = spatial_idx == 0
        if height > width:
            y_offset = 0 if at_start else height - size
        else:
            x_offset = 0 if at_start else width - size

    cropped = images[
        :, :, y_offset : y_offset + size, x_offset : x_offset + size
    ]
    cropped_boxes = None
    if boxes is not None:
        cropped_boxes = crop_boxes(boxes, x_offset, y_offset)
    if single_frame:
        cropped = cropped.squeeze(0)
    return cropped, cropped_boxes


def clip_boxes_to_image(boxes, height, width):
    """
    Clamp box coordinates so they lie inside an image of the given size.
    Args:
        boxes (ndarray): bounding boxes to perform clipping.
            Dimension is `num boxes` x 4.
        height (int): given image height.
        width (int): given image width.
    Returns:
        clipped_boxes (ndarray): the clipped boxes with dimension of
            `num boxes` x 4. The input array is left untouched.
    """
    clipped = boxes.copy()
    # x coordinates (columns 0, 2) are limited to [0, width - 1],
    # y coordinates (columns 1, 3) to [0, height - 1].
    for columns, upper in (([0, 2], width - 1.0), ([1, 3], height - 1.0)):
        clipped[:, columns] = np.minimum(
            upper, np.maximum(0.0, boxes[:, columns])
        )
    return clipped


def blend(images1, images2, alpha):
    """
    Compute the weighted sum `images1 * alpha + images2 * (1 - alpha)`.
    Args:
        images1 (tensor): the first images to be blended, the dimension is
            `num frames` x `channel` x `height` x `width`.
        images2 (tensor): the second images to be blended, the dimension is
            `num frames` x `channel` x `height` x `width`.
        alpha (float): the blending weight applied to `images1`.
    Returns:
        (tensor): blended images, the dimension is
            `num frames` x `channel` x `height` x `width`.
    """
    first_part = images1 * alpha
    second_part = images2 * (1 - alpha)
    return first_part + second_part


def grayscale(images):
    """
    Get the grayscale for the input images. The channels of images should be
    in order BGR.
    Args:
        images (tensor): the input images for getting grayscale. Dimension is
            `num frames` x `channel` x `height` x `width`.
    Returns:
        img_gray (tensor): grayscale images in which the first three channels
            all hold the same gray value, the dimension is
            `num frames` x `channel` x `height` x `width`.
    """
    # Work on a copy so the caller's images are not modified. For a tensor,
    # clone().detach() makes the same copy as torch.tensor(tensor) but does
    # not trigger the copy-construct UserWarning.
    if isinstance(images, torch.Tensor):
        img_gray = images.clone().detach()
    else:
        img_gray = torch.tensor(images)
    # R -> 0.299, G -> 0.587, B -> 0.114.
    gray_channel = (
        0.299 * images[:, 2] + 0.587 * images[:, 1] + 0.114 * images[:, 0]
    )
    img_gray[:, 0] = gray_channel
    img_gray[:, 1] = gray_channel
    img_gray[:, 2] = gray_channel
    return img_gray


def color_jitter(images, img_brightness=0, img_contrast=0, img_saturation=0):
    """
    Apply the enabled color jitters (brightness, contrast, saturation) to the
    input images in a random order. The channels of images should be in
    order BGR.
    Args:
        images (tensor): images to perform color jitter. Dimension is
            `num frames` x `channel` x `height` x `width`.
        img_brightness (float): jitter ratio for brightness.
        img_contrast (float): jitter ratio for contrast.
        img_saturation (float): jitter ratio for saturation.
    Returns:
        images (tensor): the jittered images, the dimension is
            `num frames` x `channel` x `height` x `width`.
    """
    # Collect only the jitters whose ratio is non-zero.
    steps = []
    if img_brightness != 0:
        steps.append((brightness_jitter, img_brightness))
    if img_contrast != 0:
        steps.append((contrast_jitter, img_contrast))
    if img_saturation != 0:
        steps.append((saturation_jitter, img_saturation))

    if not steps:
        return images

    # Shuffle the application order of the enabled jitters.
    order = np.random.permutation(np.arange(len(steps)))
    for position in order:
        jitter_fn, ratio = steps[position]
        images = jitter_fn(ratio, images)
    return images


def brightness_jitter(var, images):
    """
    Randomly scale the brightness of the input images by blending them with
    an all-zero image. The channels of images should be in order BGR.
    Args:
        var (float): jitter ratio for brightness.
        images (tensor): images to perform color jitter. Dimension is
            `num frames` x `channel` x `height` x `width`.
    Returns:
        images (tensor): the jittered images, the dimension is
            `num frames` x `channel` x `height` x `width`.
    """
    # Blend weight drawn uniformly from [1 - var, 1 + var).
    weight = 1.0 + np.random.uniform(-var, var)
    black = torch.zeros(images.shape)
    return blend(images, black, weight)


def contrast_jitter(var, images):
    """
    Randomly change the contrast of the input images by blending each frame
    with its own mean gray level. The channels of images should be in
    order BGR.
    Args:
        var (float): jitter ratio for contrast.
        images (tensor): images to perform color jitter. Dimension is
            `num frames` x `channel` x `height` x `width`.
    Returns:
        images (tensor): the jittered images, the dimension is
            `num frames` x `channel` x `height` x `width`.
    """
    # Blend weight drawn uniformly from [1 - var, 1 + var).
    weight = 1.0 + np.random.uniform(-var, var)

    flat_gray = grayscale(images)
    # Replace every value of a frame by that frame's mean gray value.
    per_frame_mean = torch.mean(flat_gray, dim=(1, 2, 3), keepdim=True)
    flat_gray[:] = per_frame_mean
    return blend(images, flat_gray, weight)


def saturation_jitter(var, images):
    """
    Randomly change the saturation of the input images by blending them with
    their grayscale version. The channels of images should be in order BGR.
    Args:
        var (float): jitter ratio for saturation.
        images (tensor): images to perform color jitter. Dimension is
            `num frames` x `channel` x `height` x `width`.
    Returns:
        images (tensor): the jittered images, the dimension is
            `num frames` x `channel` x `height` x `width`.
    """
    # Blend weight drawn uniformly from [1 - var, 1 + var).
    weight = 1.0 + np.random.uniform(-var, var)
    gray_version = grayscale(images)
    return blend(images, gray_version, weight)


def lighting_jitter(images, alphastd, eigval, eigvec):
    """
    Perform AlexNet-style PCA jitter on the given images.
    Args:
        images (tensor): images to perform lighting jitter. Dimension is
            `num frames` x `channel` x `height` x `width`, or
            `channel` x `height` x `width` for a single image.
        alphastd (float): jitter ratio for PCA jitter. 0 disables the jitter
            and returns the input unchanged.
        eigval (list): eigenvalues for PCA jitter.
        eigvec (list[list]): eigenvectors for PCA jitter.
    Returns:
        out_images (tensor): the jittered images, with the same dimension as
            the input.
    """
    if alphastd == 0:
        return images
    # generate alpha1, alpha2, alpha3.
    alpha = np.random.normal(0, alphastd, size=(1, 3))
    vectors = np.array(eigvec)
    values = np.reshape(eigval, (1, 3))
    weighted = (
        vectors * np.repeat(alpha, 3, axis=0) * np.repeat(values, 3, axis=0)
    )
    rgb = np.sum(weighted, axis=1)

    out_images = torch.zeros_like(images)
    rank = len(images.shape)
    if rank not in (3, 4):
        raise NotImplementedError(f"Unsupported dimension {len(images.shape)}")
    # Channels sit on axis 0 for C H W and on axis 1 for T C H W.
    channel_dim = rank - 3

    for idx in range(images.shape[channel_dim]):
        # The offsets are read in reverse order: channel idx gets rgb[2 - idx].
        shift = rgb[2 - idx]
        if rank == 3:
            out_images[idx] = images[idx] + shift
        else:
            out_images[:, idx] = images[:, idx] + shift

    return out_images


def color_normalization(images, mean, stddev):
    """
    Normalize every channel of the given images with its own mean and
    standard deviation.
    Args:
        images (tensor): images to perform color normalization. Dimension is
            `num frames` x `channel` x `height` x `width`, or
            `channel` x `height` x `width` for a single image.
        mean (list): mean values for normalization, one per channel.
        stddev (list): standard deviations for normalization, one per channel.

    Returns:
        out_images (tensor): the normalized images, with the same dimension as
            the input.
    """
    rank = len(images.shape)
    if rank == 3:
        num_channels = images.shape[0]
    elif rank == 4:
        num_channels = images.shape[1]
    else:
        raise NotImplementedError(f"Unsupported dimension {len(images.shape)}")

    assert (
        len(mean) == num_channels
    ), "channel mean not computed properly"
    assert (
        len(stddev) == num_channels
    ), "channel stddev not computed properly"

    out_images = torch.zeros_like(images)
    for idx in range(len(mean)):
        if rank == 3:
            # C H W
            out_images[idx] = (images[idx] - mean[idx]) / stddev[idx]
        else:
            # T C H W
            out_images[:, idx] = (images[:, idx] - mean[idx]) / stddev[idx]
    return out_images


def _get_param_spatial_crop(
    scale, ratio, height, width, num_repeat=10, log_scale=True, switch_hw=False
):
    """
    Given scale, ratio, height and width, return sampled coordinates of the videos.
    """
    for _ in range(num_repeat):
        area = height * width
        target_area = random.uniform(*scale) * area
        if log_scale:
            log_ratio = (math.log(ratio[0]), math.log(ratio[1]))
            aspect_ratio = math.exp(random.uniform(*log_ratio))
        else:
            aspect_ratio = random.uniform(*ratio)

        w = int(round(math.sqrt(target_area * aspect_ratio)))
        h = int(round(math.sqrt(target_area / aspect_ratio)))

        if np.random.uniform() < 0.5 and switch_hw:
            w, h = h, w

        if 0 < w <= width and 0 < h <= height:
            i = random.randint(0, height - h)
            j = random.randint(0, width - w)
            return i, j, h, w

    # Fallback to central crop
    in_ratio = float(width) / float(height)
    if in_ratio < min(ratio):
        w = width
        h = int(round(w / min(ratio)))
    elif in_ratio > max(ratio):
        h = height
        w = int(round(h * max(ratio)))
    else:  # whole image
        w = width
        h = height
    i = (height - h) // 2
    j = (width - w) // 2
    return i, j, h, w


def random_resized_crop(
    images,
    target_height,
    target_width,
    scale=(0.8, 1.0),
    ratio=(3.0 / 4.0, 4.0 / 3.0),
):
    """
    Crop the given images to random size and aspect ratio. A crop of random
    size (default: of 0.8 to 1.0) of the original size and a random aspect
    ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This
    crop is finally resized to given size. This is popularly used to train the
    Inception networks.

    Args:
        images: Images to perform resizing and cropping.
        target_height: Desired height after cropping.
        target_width: Desired width after cropping.
        scale: Scale range of Inception-style area based random resizing.
        ratio: Aspect ratio range of Inception-style area based random resizing.
    """
    frame_height, frame_width = images.shape[2], images.shape[3]

    # One window is sampled and applied to all images.
    top, left, crop_h, crop_w = _get_param_spatial_crop(
        scale, ratio, frame_height, frame_width
    )
    window = images[:, :, top : top + crop_h, left : left + crop_w]
    resized = torch.nn.functional.interpolate(
        window,
        size=(target_height, target_width),
        mode="bilinear",
        align_corners=False,
    )
    return resized


def random_resized_crop_with_shift(
    images,
    target_height,
    target_width,
    scale=(0.8, 1.0),
    ratio=(3.0 / 4.0, 4.0 / 3.0),
):
    """
    This is similar to random_resized_crop. However, it samples two different
    boxes (for cropping) for the first and last frame. It then linearly
    interpolates the two boxes for other frames.

    Args:
        images: Images to perform resizing and cropping. Dimension 1 is used
            as the frame axis, dimensions 2 and 3 as height and width.
        target_height: Desired height after cropping.
        target_width: Desired width after cropping.
        scale: Scale range of Inception-style area based random resizing.
        ratio: Aspect ratio range of Inception-style area based random resizing.
    """
    t = images.shape[1]
    height = images.shape[2]
    width = images.shape[3]

    first_box = _get_param_spatial_crop(scale, ratio, height, width)
    last_box = _get_param_spatial_crop(scale, ratio, height, width)
    # Per-frame top / left / height / width, truncated to integers.
    tops, lefts, heights, widths = (
        [int(v) for v in torch.linspace(start, end, steps=t).tolist()]
        for start, end in zip(first_box, last_box)
    )

    # The output buffer is allocated with a fixed leading size of 3.
    out = torch.zeros((3, t, target_height, target_width))
    for ind, (top, left, box_h, box_w) in enumerate(
        zip(tops, lefts, heights, widths)
    ):
        frame = images[
            :,
            ind : ind + 1,
            top : top + box_h,
            left : left + box_w,
        ]
        out[:, ind : ind + 1, :, :] = torch.nn.functional.interpolate(
            frame,
            size=(target_height, target_width),
            mode="bilinear",
            align_corners=False,
        )
    return out


def create_random_augment(
    input_size,
    auto_augment=None,
    interpolation="bilinear",
):
    """
    Get video randaug transform.

    Args:
        input_size: The size of the input video in tuple.
        auto_augment: Parameters for randaug. An example:
            "rand-m7-n4-mstd0.5-inc1" (m is the magnitude and n is the number
            of operations to apply).
        interpolation: Interpolation method.

    Raises:
        NotImplementedError: if `auto_augment` is empty or does not start
            with "rand".
    """
    # Only the last two entries of a tuple size are used.
    img_size = input_size[-2:] if isinstance(input_size, tuple) else input_size

    if not auto_augment:
        raise NotImplementedError

    assert isinstance(auto_augment, str)
    img_size_min = min(img_size) if isinstance(img_size, tuple) else img_size
    aa_params = {"translate_const": int(img_size_min * 0.45)}
    if interpolation and interpolation != "random":
        aa_params["interpolation"] = _pil_interp(interpolation)

    if not auto_augment.startswith("rand"):
        raise NotImplementedError
    return transforms.Compose(
        [rand_augment_transform(auto_augment, aa_params)]
    )


def random_sized_crop_img(
    im,
    size,
    jitter_scale=(0.08, 1.0),
    jitter_aspect=(3.0 / 4.0, 4.0 / 3.0),
    max_iter=10,
):
    """
    Performs Inception-style cropping (used for training) on a single
    3-dimensional image and resizes the crop to `size` x `size`.
    """
    assert (
        len(im.shape) == 3
    ), "Currently only support image for random_sized_crop"
    full_h, full_w = im.shape[1:3]
    top, left, crop_h, crop_w = _get_param_spatial_crop(
        scale=jitter_scale,
        ratio=jitter_aspect,
        height=full_h,
        width=full_w,
        num_repeat=max_iter,
        log_scale=False,
        switch_hw=True,
    )
    window = im[:, top : top + crop_h, left : left + crop_w]
    # interpolate expects a batch axis; add it for the call and drop it after.
    resized = torch.nn.functional.interpolate(
        window.unsqueeze(0),
        size=(size, size),
        mode="bilinear",
        align_corners=False,
    )
    return resized.squeeze(0)


# The following code is adapted from the timm library; it will be replaced
# with a dependency on PyTorchVideo.
# https://github.com/facebookresearch/pytorchvideo
class RandomResizedCropAndInterpolation:
    """Random-area, random-aspect crop of a PIL image followed by a resize.

    A crop covering a random fraction of the image area (``scale``) with a
    random aspect ratio (``ratio``) is sampled and then resized to ``size``.
    The resize filter is either fixed or, for ``interpolation="random"``,
    picked at random on every call.

    Args:
        size: output size; a non-tuple value ``s`` is expanded to ``(s, s)``
        scale: (min, max) fraction of the source area to crop
        ratio: (min, max) aspect ratio of the crop
        interpolation: interpolation name, or "random"
    """

    def __init__(
        self,
        size,
        scale=(0.08, 1.0),
        ratio=(3.0 / 4.0, 4.0 / 3.0),
        interpolation="bilinear",
    ):
        self.size = size if isinstance(size, tuple) else (size, size)

        # Inverted ranges are only reported, not rejected.
        if scale[0] > scale[1] or ratio[0] > ratio[1]:
            print("range should be of kind (min, max)")

        self.interpolation = (
            _RANDOM_INTERPOLATION
            if interpolation == "random"
            else _pil_interp(interpolation)
        )
        self.scale = scale
        self.ratio = ratio

    @staticmethod
    def get_params(img, scale, ratio):
        """Sample crop parameters for a random sized crop.

        Args:
            img (PIL Image): Image to be cropped; ``img.size`` is read as
                (width, height).
            scale (tuple): range of the fraction of the area to crop
            ratio (tuple): range of the aspect ratio of the crop
        Returns:
            tuple: ``(i, j, h, w)`` -- top, left, height, width of the crop.
        """
        img_w = img.size[0]
        img_h = img.size[1]
        area = img_w * img_h

        # Up to ten attempts at a crop that fits inside the image.
        for _attempt in range(10):
            target_area = random.uniform(*scale) * area
            log_lo, log_hi = math.log(ratio[0]), math.log(ratio[1])
            # Aspect ratio is sampled uniformly in log space.
            aspect_ratio = math.exp(random.uniform(log_lo, log_hi))

            w = int(round(math.sqrt(target_area * aspect_ratio)))
            h = int(round(math.sqrt(target_area / aspect_ratio)))

            if w > img_w or h > img_h:
                continue
            i = random.randint(0, img_h - h)
            j = random.randint(0, img_w - w)
            return i, j, h, w

        # No attempt fitted: use a centred crop clamped to the ratio bounds.
        in_ratio = img_w / img_h
        lowest, highest = min(ratio), max(ratio)
        if in_ratio < lowest:
            w = img_w
            h = int(round(w / lowest))
        elif in_ratio > highest:
            h = img_h
            w = int(round(h * highest))
        else:
            # The whole image already satisfies the ratio bounds.
            w, h = img_w, img_h
        return (img_h - h) // 2, (img_w - w) // 2, h, w

    def __call__(self, img):
        """
        Args:
            img (PIL Image): Image to be cropped and resized.
        Returns:
            PIL Image: Randomly cropped and resized image.
        """
        top, left, height, width = self.get_params(img, self.scale, self.ratio)
        method = self.interpolation
        if isinstance(method, (tuple, list)):
            # Several candidate filters: choose one for this call.
            method = random.choice(method)
        return F.resized_crop(img, top, left, height, width, self.size, method)

    def __repr__(self):
        if isinstance(self.interpolation, (tuple, list)):
            names = [_pil_interpolation_to_str[x] for x in self.interpolation]
            interpolate_str = " ".join(names)
        else:
            interpolate_str = _pil_interpolation_to_str[self.interpolation]
        return "{0}(size={1}, scale={2}, ratio={3}, interpolation={4})".format(
            self.__class__.__name__,
            self.size,
            tuple(round(s, 4) for s in self.scale),
            tuple(round(r, 4) for r in self.ratio),
            interpolate_str,
        )


def transforms_imagenet_train(
    img_size=224,
    scale=None,
    ratio=None,
    hflip=0.5,
    vflip=0.0,
    color_jitter=0.4,
    auto_augment=None,
    interpolation="random",
    use_prefetcher=False,
    mean=(0.485, 0.456, 0.406),
    std=(0.229, 0.224, 0.225),
    re_prob=0.0,
    re_mode="const",
    re_count=1,
    re_num_splits=0,
    separate=False,
):
    """
    Build the training-time image transform pipeline.

    The pipeline has three stages:
     * primary: random resized crop plus optional horizontal/vertical flips
     * secondary: RandAugment (when ``auto_augment`` is set) or, otherwise,
       colour jitter (when ``color_jitter`` is not None)
     * final: tensor conversion, normalisation and optional random erasing

    If separate==True the three stages are returned as a tuple of three
    composed transforms (for a mixing dataset that sends all data through the
    first, a portion through the second, and finishes every branch with the
    third); otherwise a single composed transform is returned.

    ``use_prefetcher`` is accepted but not referenced in this function.
    """
    # Only the trailing two entries of a tuple size are used.
    if isinstance(img_size, tuple):
        img_size = img_size[-2:]

    # Fall back to the default ImageNet scale / ratio ranges.
    scale = tuple(scale or (0.08, 1.0))
    ratio = tuple(ratio or (3.0 / 4.0, 4.0 / 3.0))

    # ---- primary stage -------------------------------------------------
    primary_tfl = [
        RandomResizedCropAndInterpolation(
            img_size, scale=scale, ratio=ratio, interpolation=interpolation
        )
    ]
    if hflip > 0.0:
        primary_tfl.append(transforms.RandomHorizontalFlip(p=hflip))
    if vflip > 0.0:
        primary_tfl.append(transforms.RandomVerticalFlip(p=vflip))

    # ---- secondary stage -----------------------------------------------
    secondary_tfl = []
    if auto_augment:
        assert isinstance(auto_augment, str)
        img_size_min = min(img_size) if isinstance(img_size, tuple) else img_size
        aa_params = {
            "translate_const": int(img_size_min * 0.45),
            "img_mean": tuple(min(255, round(255 * x)) for x in mean),
        }
        if interpolation and interpolation != "random":
            aa_params["interpolation"] = _pil_interp(interpolation)

        if auto_augment.startswith("rand"):
            secondary_tfl.append(rand_augment_transform(auto_augment, aa_params))
        elif auto_augment.startswith("augmix"):
            raise NotImplementedError("Augmix not implemented")
        else:
            raise NotImplementedError("Auto aug not implemented")
    elif color_jitter is not None:
        # Colour jitter is only used when auto-augment is off.
        if isinstance(color_jitter, (list, tuple)):
            # 3 values: brightness/contrast/saturation; 4 values: plus hue.
            assert len(color_jitter) in (3, 4)
        else:
            # A scalar is replicated for brightness, contrast and saturation.
            color_jitter = (float(color_jitter),) * 3
        secondary_tfl.append(transforms.ColorJitter(*color_jitter))

    # ---- final stage ---------------------------------------------------
    final_tfl = [
        transforms.ToTensor(),
        transforms.Normalize(mean=torch.tensor(mean), std=torch.tensor(std)),
    ]
    if re_prob > 0.0:
        eraser = RandomErasing(
            re_prob,
            mode=re_mode,
            max_count=re_count,
            num_splits=re_num_splits,
            device="cpu",
            cube=False,
        )
        final_tfl.append(eraser)

    if not separate:
        return transforms.Compose(primary_tfl + secondary_tfl + final_tfl)
    return (
        transforms.Compose(primary_tfl),
        transforms.Compose(secondary_tfl),
        transforms.Compose(final_tfl),
    )

############################################################################################################
############################################################################################################

class Compose(object):
    """Chain several clip transforms into one callable.

    Args:
    transforms (list of ``Transform`` objects): transforms to apply, in order
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, clip):
        # Feed the output of each transform into the next one.
        result = clip
        for transform in self.transforms:
            result = transform(result)
        return result


class RandomHorizontalFlip(object):
    """Mirror every image of a clip left-to-right with probability 0.5.

    The coin is tossed once per clip, so all frames are flipped together.
    """

    def __call__(self, clip):
        """
        Args:
        clip (list of PIL.Image or numpy.ndarray): frames to flip; the type
        of the first frame selects the code path
        Returns:
        list of PIL.Image or numpy.ndarray: flipped frames, or the input
        clip object itself when no flip is drawn
        """
        if not random.random() < 0.5:
            return clip

        first = clip[0]
        if isinstance(first, np.ndarray):
            return [np.fliplr(frame) for frame in clip]
        if isinstance(first, PIL.Image.Image):
            return [
                frame.transpose(PIL.Image.FLIP_LEFT_RIGHT) for frame in clip
            ]
        raise TypeError(
            'Expected numpy.ndarray or PIL.Image but got list of {0}'.format(
                type(first)))


class RandomResize(object):
    """Rescale all frames of a clip by a single random scaling factor.

    One factor is drawn uniformly from ``ratio`` per call and applied to both
    the width and the height of the first frame to obtain the target size.

    Args:
    ratio (tuple): (min, max) range the scaling factor is drawn from
    interpolation (str): Can be one of 'nearest', 'bilinear'
    defaults to nearest
    """

    def __init__(self, ratio=(3. / 4., 4. / 3.), interpolation='nearest'):
        self.ratio = ratio
        self.interpolation = interpolation

    def __call__(self, clip):
        """
        Args:
        clip (list of PIL.Image or numpy.ndarray): frames to resize
        Returns:
        list of PIL.Image or numpy.ndarray: resized frames
        Raises:
        TypeError: if the frames are neither numpy arrays nor PIL images
        """
        scaling_factor = random.uniform(self.ratio[0], self.ratio[1])

        if isinstance(clip[0], np.ndarray):
            im_h, im_w, im_c = clip[0].shape
        elif isinstance(clip[0], PIL.Image.Image):
            im_w, im_h = clip[0].size
        else:
            # Previously fell through and failed later with an unbound-name
            # error; fail early with the same TypeError the sibling
            # transforms raise.
            raise TypeError(
                'Expected numpy.ndarray or PIL.Image but got list of '
                '{0}'.format(type(clip[0])))

        # NOTE(review): the target is built as (width, height); confirm that
        # FF.resize_clip interprets tuple sizes in that order for both numpy
        # and PIL inputs.
        new_size = (int(im_w * scaling_factor), int(im_h * scaling_factor))
        return FF.resize_clip(
            clip, new_size, interpolation=self.interpolation)


class Resize(object):
    """Resize every frame of a clip to a fixed target size.

    The work is delegated to ``FF.resize_clip``; larger source frames take
    longer to interpolate.

    Args:
    size: target size, passed unchanged to ``FF.resize_clip``
    interpolation (str): Can be one of 'nearest', 'bilinear'
    defaults to nearest
    """

    def __init__(self, size, interpolation='nearest'):
        self.interpolation = interpolation
        self.size = size

    def __call__(self, clip):
        return FF.resize_clip(
            clip, self.size, interpolation=self.interpolation)


class RandomCrop(object):
    """Extract random crop at the same location for a list of images
    Args:
    size (sequence or int): Desired output size for the
    crop in format (h, w); a single number yields a square crop
    """

    def __init__(self, size):
        if isinstance(size, numbers.Number):
            size = (size, size)

        self.size = size

    def __call__(self, clip):
        """
        Args:
        clip (list of PIL.Image or numpy.ndarray): List of images to be
        cropped, in format (h, w, c) for numpy.ndarray
        Returns:
        list of PIL.Image or numpy.ndarray: Cropped list of images
        Raises:
        TypeError: if the frames are neither numpy arrays nor PIL images
        ValueError: if the crop is larger than the frames
        """
        h, w = self.size
        if isinstance(clip[0], np.ndarray):
            im_h, im_w, im_c = clip[0].shape
        elif isinstance(clip[0], PIL.Image.Image):
            im_w, im_h = clip[0].size
        else:
            # The two message halves used to be joined without a space.
            raise TypeError(
                'Expected numpy.ndarray or PIL.Image but got list of '
                '{0}'.format(type(clip[0])))
        if w > im_w or h > im_h:
            error_msg = (
                'Initial image size should be larger then '
                'cropped size but got cropped sizes : ({w}, {h}) while '
                'initial image is ({im_w}, {im_h})'.format(
                    im_w=im_w, im_h=im_h, w=w, h=h))
            raise ValueError(error_msg)

        # One offset pair is drawn per clip so all frames share the window.
        x1 = random.randint(0, im_w - w)
        y1 = random.randint(0, im_h - h)
        cropped = FF.crop_clip(clip, y1, x1, h, w)

        return cropped


class ThreeCrop(object):
    """Extract three fixed-size crops from every frame of a clip.

    The three crop windows are spaced ``step`` pixels apart, sliding
    vertically when the frame height exceeds the crop height and
    horizontally otherwise.

    Args:
    size (sequence or int): Desired output size for the
    crop in format (h, w); a single number yields a square crop
    """

    def __init__(self, size):
        if isinstance(size, numbers.Number):
            size = (size, size)

        self.size = size

    @staticmethod
    def _frame_dims(frame):
        """Return ``(height, width)`` of one numpy or PIL frame."""
        if isinstance(frame, np.ndarray):
            im_h, im_w, _ = frame.shape
        elif isinstance(frame, PIL.Image.Image):
            im_w, im_h = frame.size
        else:
            raise TypeError(
                'Expected numpy.ndarray or PIL.Image but got list of '
                '{0}'.format(type(frame)))
        return im_h, im_w

    def __call__(self, clip):
        """
        Args:
        clip (list of PIL.Image or numpy.ndarray): List of images to be
        cropped, in format (h, w, c) for numpy.ndarray
        Returns:
        list of PIL.Image or numpy.ndarray: the crops of all frames for the
        first window, followed by those for the second and third window
        Raises:
        TypeError: if the frames are neither numpy arrays nor PIL images
        """
        h, w = self.size
        im_h, im_w = self._frame_dims(clip[0])

        # When neither side already matches the crop, resize to the crop
        # size first and re-read the dimensions (for PIL frames too, which
        # have no ``.shape``).
        if w != im_w and h != im_h:
            clip = FF.resize_clip(clip, self.size, interpolation="bilinear")
            im_h, im_w = self._frame_dims(clip[0])

        # Clamp at zero; the previous np.max(value, 0) treated 0 as an axis
        # argument and therefore never clamped.
        step = max((max(im_w, im_h) - self.size[0]) // 2, 0)
        slide_vertically = im_h > self.size[0]

        cropped = []
        for i in range(3):
            offset = i * step
            if slide_vertically:
                cropped.extend(FF.crop_clip(clip, offset, 0, h, w))
            else:
                cropped.extend(FF.crop_clip(clip, 0, offset, h, w))
        return cropped


class RandomRotation(object):
    """Rotate entire clip randomly by a random angle within
    given bounds
    Args:
    degrees (sequence or int): Range of degrees to select from
    If degrees is a number instead of sequence like (min, max),
    the range of degrees, will be (-degrees, +degrees).
    """

    def __init__(self, degrees):
        if isinstance(degrees, numbers.Number):
            if degrees < 0:
                raise ValueError('If degrees is a single number, '
                                 'must be positive')
            degrees = (-degrees, degrees)
        else:
            if len(degrees) != 2:
                raise ValueError('If degrees is a sequence, '
                                 'it must be of len 2.')

        self.degrees = degrees

    def __call__(self, clip):
        """
        Args:
        clip (list of PIL.Image or numpy.ndarray): List of images to be
        rotated, in format (h, w, c) for numpy.ndarray
        Returns:
        list of PIL.Image or numpy.ndarray: Rotated list of images
        Raises:
        TypeError: if the frames are neither numpy arrays nor PIL images
        """
        # A single angle is drawn per clip so all frames rotate together.
        angle = random.uniform(self.degrees[0], self.degrees[1])
        if isinstance(clip[0], np.ndarray):
            # Import the submodule explicitly: a bare ``import skimage`` does
            # not guarantee ``skimage.transform`` is loaded. Importing here
            # also keeps scikit-image optional for PIL clips.
            import skimage.transform
            rotated = [skimage.transform.rotate(img, angle) for img in clip]
        elif isinstance(clip[0], PIL.Image.Image):
            rotated = [img.rotate(angle) for img in clip]
        else:
            raise TypeError(
                'Expected numpy.ndarray or PIL.Image but got list of '
                '{0}'.format(type(clip[0])))

        return rotated


class CenterCrop(object):
    """Extract center crop at the same location for a list of images
    Args:
    size (sequence or int): Desired output size for the
    crop in format (h, w); a single number yields a square crop
    """

    def __init__(self, size):
        if isinstance(size, numbers.Number):
            size = (size, size)

        self.size = size

    def __call__(self, clip):
        """
        Args:
        clip (list of PIL.Image or numpy.ndarray): List of images to be
        cropped, in format (h, w, c) for numpy.ndarray
        Returns:
        list of PIL.Image or numpy.ndarray: Cropped list of images
        Raises:
        TypeError: if the frames are neither numpy arrays nor PIL images
        ValueError: if the crop is larger than the frames
        """
        h, w = self.size
        if isinstance(clip[0], np.ndarray):
            im_h, im_w, im_c = clip[0].shape
        elif isinstance(clip[0], PIL.Image.Image):
            im_w, im_h = clip[0].size
        else:
            # The two message halves used to be joined without a space.
            raise TypeError(
                'Expected numpy.ndarray or PIL.Image but got list of '
                '{0}'.format(type(clip[0])))
        if w > im_w or h > im_h:
            error_msg = (
                'Initial image size should be larger then '
                'cropped size but got cropped sizes : ({w}, {h}) while '
                'initial image is ({im_w}, {im_h})'.format(
                    im_w=im_w, im_h=im_h, w=w, h=h))
            raise ValueError(error_msg)

        # Top-left corner that centres the crop window in the frame.
        x1 = int(round((im_w - w) / 2.))
        y1 = int(round((im_h - h) / 2.))
        cropped = FF.crop_clip(clip, y1, x1, h, w)

        return cropped


class ColorJitter(object):
    """Randomly change the brightness, contrast and saturation and hue of the clip
    Args:
    brightness (float): How much to jitter brightness. brightness_factor
    is chosen uniformly from [max(0, 1 - brightness), 1 + brightness].
    contrast (float): How much to jitter contrast. contrast_factor
    is chosen uniformly from [max(0, 1 - contrast), 1 + contrast].
    saturation (float): How much to jitter saturation. saturation_factor
    is chosen uniformly from [max(0, 1 - saturation), 1 + saturation].
    hue(float): How much to jitter hue. hue_factor is chosen uniformly from
    [-hue, hue]. Should be >=0 and <= 0.5.
    """

    def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
        self.brightness = brightness
        self.contrast = contrast
        self.saturation = saturation
        self.hue = hue

    def get_params(self, brightness, contrast, saturation, hue):
        """Draw one factor per enabled adjustment.

        Returns:
        tuple: (brightness_factor, contrast_factor, saturation_factor,
        hue_factor); an entry is None when its strength is not positive.
        """
        if brightness > 0:
            brightness_factor = random.uniform(
                max(0, 1 - brightness), 1 + brightness)
        else:
            brightness_factor = None

        if contrast > 0:
            contrast_factor = random.uniform(
                max(0, 1 - contrast), 1 + contrast)
        else:
            contrast_factor = None

        if saturation > 0:
            saturation_factor = random.uniform(
                max(0, 1 - saturation), 1 + saturation)
        else:
            saturation_factor = None

        if hue > 0:
            hue_factor = random.uniform(-hue, hue)
        else:
            hue_factor = None
        return brightness_factor, contrast_factor, saturation_factor, hue_factor

    def __call__(self, clip):
        """
        Args:
        clip (list): list of PIL.Image
        Returns:
        list PIL.Image : list of transformed PIL.Image
        Raises:
        TypeError: if the frames are numpy arrays (not implemented) or any
        other non-PIL type
        """
        if isinstance(clip[0], np.ndarray):
            raise TypeError(
                'Color jitter not yet implemented for numpy arrays')
        elif isinstance(clip[0], PIL.Image.Image):
            brightness, contrast, saturation, hue = self.get_params(
                self.brightness, self.contrast, self.saturation, self.hue)

            # Create img transform function sequence
            img_transforms = []
            if brightness is not None:
                img_transforms.append(lambda img: torchvision.transforms.functional.adjust_brightness(img, brightness))
            if saturation is not None:
                img_transforms.append(lambda img: torchvision.transforms.functional.adjust_saturation(img, saturation))
            if hue is not None:
                img_transforms.append(lambda img: torchvision.transforms.functional.adjust_hue(img, hue))
            if contrast is not None:
                img_transforms.append(lambda img: torchvision.transforms.functional.adjust_contrast(img, contrast))
            # The same factors and the same order are used for every frame.
            random.shuffle(img_transforms)

            # Apply to all images
            jittered_clip = []
            for img in clip:
                # Chain the adjustments: each one works on the output of the
                # previous one. Starting from ``img`` also makes the no-op
                # configuration (all strengths 0) return the frame unchanged.
                jittered_img = img
                for func in img_transforms:
                    jittered_img = func(jittered_img)
                jittered_clip.append(jittered_img)

        else:
            raise TypeError(
                'Expected numpy.ndarray or PIL.Image but got list of '
                '{0}'.format(type(clip[0])))
        return jittered_clip


class Normalize(object):
    """Normalize a tensor clip channel-wise with a mean and a standard deviation.

    For ``n`` channels with mean ``(M1,...,Mn)`` and std ``(S1,..,Sn)`` each
    channel is mapped to ``(input[channel] - mean[channel]) / std[channel]``.
    The computation is delegated to ``FF.normalize``.
    .. note::
        ``FF.normalize`` is called without ``inplace``, so the input tensor
        is not mutated.
    Args:
        mean (sequence): Sequence of means for each channel.
        std (sequence): Sequence of standard deviations for each channel.
    """

    def __init__(self, mean, std):
        self.std = std
        self.mean = mean

    def __call__(self, clip):
        """
        Args:
            clip (Tensor): 4-D tensor clip to be normalized (presumably with
                the channel axis first -- confirm against FF.normalize).
        Returns:
            Tensor: Normalized Tensor clip.
        """
        return FF.normalize(clip, self.mean, self.std)

    def __repr__(self):
        return '{0}(mean={1}, std={2})'.format(
            self.__class__.__name__, self.mean, self.std)


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/datasets/volume_transforms.py
================================================
import numpy as np
from PIL import Image
import torch


def convert_img(img):
    """Move the channel axis of an image array to the front.

    A 3-D (H, W, C) array is transposed to (C, H, W); a 2-D (H, W) array
    gains a leading axis of length 1. Any other rank is returned unchanged.
    """
    rank = len(img.shape)
    if rank == 3:
        return img.transpose(2, 0, 1)
    if rank == 2:
        return np.expand_dims(img, 0)
    return img


class ClipToTensor(object):
    """Convert a list of m (H x W x C) numpy.ndarrays in the range [0, 255]
    to a torch.FloatTensor of shape (C x m x H x W) in the range [0, 1.0]

    Args:
    channel_nb (int): expected number of channels per frame
    div_255 (bool): divide the values by 255 when True
    numpy (bool): return a numpy.ndarray instead of a torch tensor
    """

    def __init__(self, channel_nb=3, div_255=True, numpy=False):
        self.channel_nb = channel_nb
        self.div_255 = div_255
        self.numpy = numpy

    def __call__(self, clip):
        """
        Args: clip (list of numpy.ndarray or PIL.Image): clip (list of images)
        to be converted to tensor.
        Returns: array or float tensor of shape (C x m x H x W).
        Raises: TypeError if a frame is neither a numpy array nor a PIL image.
        """
        # Retrieve shape from the first frame
        first = clip[0]
        if isinstance(first, np.ndarray):
            h, w, ch = first.shape
            assert ch == self.channel_nb, (
                'Got {0} instead of {1} channels'.format(ch, self.channel_nb))
        elif isinstance(first, Image.Image):
            w, h = first.size
        else:
            raise TypeError(
                'Expected numpy.ndarray or PIL.Image but got list of '
                '{0}'.format(type(first)))

        np_clip = np.zeros([self.channel_nb, len(clip), int(h), int(w)])

        # Convert frame by frame into the preallocated (C, m, H, W) buffer
        for img_idx, img in enumerate(clip):
            if not isinstance(img, np.ndarray):
                if isinstance(img, Image.Image):
                    # np.asarray copies only when required; under NumPy 2,
                    # np.array(..., copy=False) raises if a copy is needed.
                    img = np.asarray(img)
                else:
                    raise TypeError(
                        'Expected numpy.ndarray or PIL.Image but got list '
                        'of {0}'.format(type(img)))
            np_clip[:, img_idx, :, :] = convert_img(img)

        if self.numpy:
            if self.div_255:
                np_clip = np_clip / 255.0
            return np_clip

        # The buffer is float64; convert to float32 for the tensor output.
        tensor_clip = torch.from_numpy(np_clip).float()
        if self.div_255:
            tensor_clip = torch.div(tensor_clip, 255)
        return tensor_clip


# Note: this variant normalizes the data to the range [-1, 1].
class ClipToTensor_K(object):
    """Convert a list of m (H x W x C) numpy.ndarrays in the range [0, 255]
    to a torch.FloatTensor of shape (C x m x H x W) in the range [-1.0, 1.0]

    Args:
    channel_nb (int): expected number of channels per frame
    div_255 (bool): when True, map values with ``(x - 127.5) / 127.5``
    numpy (bool): return a numpy.ndarray instead of a torch tensor
    """

    def __init__(self, channel_nb=3, div_255=True, numpy=False):
        self.channel_nb = channel_nb
        self.div_255 = div_255
        self.numpy = numpy

    def __call__(self, clip):
        """
        Args: clip (list of numpy.ndarray or PIL.Image): clip (list of images)
        to be converted to tensor.
        Returns: array or float tensor of shape (C x m x H x W).
        Raises: TypeError if a frame is neither a numpy array nor a PIL image.
        """
        # Retrieve shape from the first frame
        first = clip[0]
        if isinstance(first, np.ndarray):
            h, w, ch = first.shape
            assert ch == self.channel_nb, (
                'Got {0} instead of {1} channels'.format(ch, self.channel_nb))
        elif isinstance(first, Image.Image):
            w, h = first.size
        else:
            raise TypeError(
                'Expected numpy.ndarray or PIL.Image but got list of '
                '{0}'.format(type(first)))

        np_clip = np.zeros([self.channel_nb, len(clip), int(h), int(w)])

        # Convert frame by frame into the preallocated (C, m, H, W) buffer
        for img_idx, img in enumerate(clip):
            if not isinstance(img, np.ndarray):
                if isinstance(img, Image.Image):
                    # np.asarray copies only when required; under NumPy 2,
                    # np.array(..., copy=False) raises if a copy is needed.
                    img = np.asarray(img)
                else:
                    raise TypeError(
                        'Expected numpy.ndarray or PIL.Image but got list '
                        'of {0}'.format(type(img)))
            np_clip[:, img_idx, :, :] = convert_img(img)

        if self.numpy:
            if self.div_255:
                # Centre on 127.5 and scale so that [0, 255] maps to [-1, 1].
                np_clip = (np_clip - 127.5) / 127.5
            return np_clip

        # The buffer is float64; convert to float32 for the tensor output.
        tensor_clip = torch.from_numpy(np_clip).float()
        if self.div_255:
            tensor_clip = torch.div(torch.sub(tensor_clip, 127.5), 127.5)
        return tensor_clip


class ToTensor(object):
    """Wrap a numpy array in a torch tensor via ``torch.from_numpy``.
    """

    def __call__(self, array):
        return torch.from_numpy(array)


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/functional.py
================================================
import numbers
import cv2
import numpy as np
import PIL
import torch


def _is_tensor_clip(clip):
    """Return True when ``clip`` is a torch tensor with exactly four dims."""
    if not isinstance(clip, torch.Tensor):
        return False
    return clip.dim() == 4


def crop_clip(clip, min_h, min_w, h, w):
    """Crop the same window out of every frame of a clip.

    Args:
        clip: list of numpy.ndarray frames (indexed as rows, columns,
            channels) or of PIL images; the first frame selects the path.
        min_h: first row of the window.
        min_w: first column of the window.
        h: window height.
        w: window width.

    Returns:
        list of cropped frames, same type as the input frames.

    Raises:
        TypeError: if the frames are neither numpy arrays nor PIL images.
    """
    if isinstance(clip[0], np.ndarray):
        cropped = [img[min_h:min_h + h, min_w:min_w + w, :] for img in clip]

    elif isinstance(clip[0], PIL.Image.Image):
        # PIL boxes are (left, upper, right, lower).
        cropped = [
            img.crop((min_w, min_h, min_w + w, min_h + h)) for img in clip
        ]
    else:
        # The two message halves used to be joined without a space.
        raise TypeError(
            'Expected numpy.ndarray or PIL.Image but got list of '
            '{0}'.format(type(clip[0])))
    return cropped


def resize_clip(clip, size, interpolation='bilinear'):
    """Resize every frame of a clip.

    Args:
        clip: list of numpy.ndarray frames (H x W x C) or of PIL images; the
            first frame selects the code path.
        size: a number (target length of the shorter side, aspect ratio
            kept) or a two-element sequence giving the target size.
        interpolation: 'bilinear' selects bilinear filtering; any other
            value selects nearest-neighbour.

    Returns:
        list of resized frames. When ``size`` is a number and the shorter
        side already equals it, the input ``clip`` is returned unchanged.

    Raises:
        TypeError: if the frames are neither numpy arrays nor PIL images.
    """
    if isinstance(clip[0], np.ndarray):
        if isinstance(size, numbers.Number):
            im_h, im_w, im_c = clip[0].shape
            # Min spatial dim already matches minimal size
            if (im_w <= im_h and im_w == size) or (im_h <= im_w
                                                   and im_h == size):
                return clip
            new_h, new_w = get_resize_sizes(im_h, im_w, size)
            size = (new_w, new_h)
        else:
            # NOTE(review): a tuple is forwarded in the given order here but
            # swapped in the PIL branch below, so the two branches read a
            # tuple size differently -- confirm which order callers intend.
            size = size[0], size[1]
        if interpolation == 'bilinear':
            np_inter = cv2.INTER_LINEAR
        else:
            np_inter = cv2.INTER_NEAREST
        scaled = [
            cv2.resize(img, size, interpolation=np_inter) for img in clip
        ]
    elif isinstance(clip[0], PIL.Image.Image):
        if isinstance(size, numbers.Number):
            im_w, im_h = clip[0].size
            # Min spatial dim already matches minimal size
            if (im_w <= im_h and im_w == size) or (im_h <= im_w
                                                   and im_h == size):
                return clip
            new_h, new_w = get_resize_sizes(im_h, im_w, size)
            size = (new_w, new_h)
        else:
            size = size[1], size[0]
        if interpolation == 'bilinear':
            pil_inter = PIL.Image.BILINEAR
        else:
            pil_inter = PIL.Image.NEAREST
        scaled = [img.resize(size, pil_inter) for img in clip]
    else:
        # The two message halves used to be joined without a space.
        raise TypeError(
            'Expected numpy.ndarray or PIL.Image but got list of '
            '{0}'.format(type(clip[0])))
    return scaled


def get_resize_sizes(im_h, im_w, size):
    """Compute the output (height, width) for a shorter-side resize.

    The shorter side becomes ``size``; the other side is scaled by the same
    factor and truncated to an int.
    """
    if im_w < im_h:
        # Width is the shorter side.
        return int(size * im_h / im_w), size
    # Height is the shorter side, or both sides are equal.
    return size, int(size * im_w / im_h)


def normalize(clip, mean, std, inplace=False):
    """Subtract ``mean`` and divide by ``std`` along the first axis of a clip.

    Args:
        clip: 4-D torch tensor; ``mean`` and ``std`` are broadcast along
            its first dimension.
        mean: per-entry means, converted to the dtype and device of the clip.
        std: per-entry standard deviations, converted likewise.
        inplace: when False (default) a clone is normalized and the input is
            left untouched; when True the input tensor itself is modified.

    Returns:
        The normalized tensor.

    Raises:
        TypeError: if ``clip`` is not a 4-D torch tensor.
    """
    if not _is_tensor_clip(clip):
        raise TypeError('tensor is not a torch clip.')

    target = clip if inplace else clip.clone()

    mean_t = torch.as_tensor(mean, dtype=target.dtype, device=target.device)
    std_t = torch.as_tensor(std, dtype=target.dtype, device=target.device)

    # Reshape the 1-D statistics to (n, 1, 1, 1) so they broadcast over the
    # three trailing axes.
    target.sub_(mean_t[:, None, None, None])
    target.div_(std_t[:, None, None, None])
    return target


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/models/__init__.py
================================================
from .clip import clip_b16, clip_l14, clip_l14_336
# from .modeling_finetune import vit_base_patch16_224, vit_base_patch16_384, vit_large_patch16_224, vit_large_patch16_384
from .modeling_finetune import vit_large_patch16_224
from .modeling_pretrain_umt import pretrain_umt_base_patch16_224, pretrain_umt_large_patch16_224 
from .modeling_pretrain import pretrain_videomae_base_patch16_224, pretrain_videomae_large_patch16_224, pretrain_videomae_huge_patch16_224 


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/models/clip.py
================================================
#!/usr/bin/env python
import os
from collections import OrderedDict

import torch
from torch import nn


# Directory holding the CLIP visual-encoder checkpoints. The value below looks
# like a placeholder that presumably has to be replaced with a real path
# before any checkpoint can be loaded -- TODO confirm against the loaders.
MODEL_PATH = 'your_model_path/clip_visual_encoder'
# Maps a CLIP variant name to the checkpoint file expected under MODEL_PATH.
_MODELS = {
    # extracted from OpenAI, see extract_clip
    "ViT-B/16": os.path.join(MODEL_PATH, "vit_b16.pth"),
    "ViT-L/14": os.path.join(MODEL_PATH, "vit_l14.pth"),
    "ViT-L/14_336": os.path.join(MODEL_PATH, "vit_l14_336.pth"),
}


class LayerNorm(nn.LayerNorm):
    """LayerNorm that computes in float32 and returns the input's dtype (fp16-safe)."""

    def forward(self, x):
        input_dtype = x.dtype
        # Run the normalization in full precision, then cast back.
        normed = super().forward(x.to(torch.float32))
        return normed.to(input_dtype)


class QuickGELU(nn.Module):
    """Sigmoid-gated activation: ``x * sigmoid(1.702 * x)``."""

    def forward(self, x):
        gate = torch.sigmoid(1.702 * x)
        return x * gate


class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention and an MLP, each with a residual.

    Args:
        d_model: embedding width.
        n_head: number of attention heads.
        attn_mask: optional attention mask handed to the attention module.
    """

    def __init__(self, d_model, n_head, attn_mask=None):
        super().__init__()

        # Submodules are created in a fixed order: attn, ln_1, mlp, ln_2.
        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)

        hidden = d_model * 4
        mlp_layers = OrderedDict()
        mlp_layers["c_fc"] = nn.Linear(d_model, hidden)
        mlp_layers["gelu"] = QuickGELU()
        mlp_layers["c_proj"] = nn.Linear(hidden, d_model)
        self.mlp = nn.Sequential(mlp_layers)

        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x, return_attn=False):
        """Self-attend over ``x``.

        Returns the attention module's full output when ``return_attn`` is
        True, otherwise only its first element.
        """
        # Keep the stored mask on the dtype/device of the current input.
        if self.attn_mask is not None:
            self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device)
        else:
            self.attn_mask = None

        if return_attn:
            return self.attn(x, x, x, need_weights=True, attn_mask=self.attn_mask)
        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

    def forward(self, x, return_attn=False):
        """Apply the block; also return attention weights if ``return_attn``."""
        attn = None
        if return_attn:
            attended, attn = self.attention(self.ln_1(x), return_attn=True)
        else:
            attended = self.attention(self.ln_1(x))

        x = x + attended
        x = x + self.mlp(self.ln_2(x))

        if return_attn:
            return x, attn
        return x


class Transformer(nn.Module):
    """Stack of residual attention blocks that returns selected layer outputs.

    Args:
        width: embedding width of every block.
        layers: number of blocks.
        heads: number of attention heads per block.
        return_attn: also return the attention weights of the last block.
        clip_return_layer: how many layer outputs to collect.
        clip_return_interval: spacing between collected layers, counted
            backwards from the last block.
    """

    def __init__(
            self, width, layers, heads, return_attn=False, 
            clip_return_layer=1, clip_return_interval=1,
        ):
        super().__init__()
        self.layers = layers
        self.return_attn = return_attn
        self.resblocks = nn.ModuleList(
            [ResidualAttentionBlock(width, heads) for _ in range(layers)]
        )
        # Indices of the blocks whose outputs are collected, last block first.
        self.return_index = [
            layers - int(i * clip_return_interval) - 1
            for i in range(clip_return_layer)
        ]
        print(f'Teacher return index: {self.return_index}')

    def forward(self, x):
        """Run all blocks.

        Returns:
            A tuple ``(stacked, attn)``: the collected block outputs stacked
            along a new leading axis, and the last block's attention weights
            (None unless ``return_attn`` was set).
        """
        attn = None
        collected = []
        last_idx = self.layers - 1
        for idx, blk in enumerate(self.resblocks):
            if self.return_attn and idx == last_idx:
                x, attn = blk(x, return_attn=True)
            else:
                x = blk(x)
            if idx in self.return_index:
                collected.append(x)
        return torch.stack(collected), attn


class VisionTransformer(nn.Module):
    """Vision encoder: 3D-conv patch embedding, class token, transformer, projection.

    ``forward`` returns the projected patch tokens of the layers selected by the
    inner ``Transformer`` (the class token is dropped before projection), and,
    when ``return_attn`` is set, a slice of the attention returned by it.
    """

    def __init__(
        self, input_resolution, patch_size, width, layers, heads, output_dim,
        clip_norm_type='l2', kernel_size=1,
        return_attn=False, clip_return_layer=1, clip_return_interval=1,
    ):
        super().__init__()
        self.clip_norm_type = clip_norm_type
        self.return_attn = return_attn
        print(f'Normalization Type: {clip_norm_type}')
        print(f'Return Attention: {return_attn}')
        print(f'Return Layer: {clip_return_layer}')
        print(f'Return Interval: {clip_return_interval}')

        self.output_dim = output_dim
        # Kernel and stride are identical, so patches do not overlap.
        patch_kernel = (kernel_size, patch_size, patch_size)
        self.conv1 = nn.Conv3d(3, width, patch_kernel, patch_kernel, (0, 0, 0), bias=False)

        scale = width ** -0.5
        num_tokens = (input_resolution // patch_size) ** 2 + 1  # grid patches + class token
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn(num_tokens, width))
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(
            width, layers, heads, return_attn=return_attn,
            clip_return_layer=clip_return_layer,
            clip_return_interval=clip_return_interval,
        )

        self.ln_post = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

    def forward(self, x, mask=None):
        """Encode ``x``; ``mask`` (optional) marks tokens to drop via ``x[~mask]``.

        Returns the projected tokens reshaped to ``[K, N, T * HW, C_out]`` where
        ``K`` is the leading dimension of the transformer output, plus
        ``attn[:, 0, 1:]`` when ``return_attn`` is set.
        """
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        N, C, T, H, W = x.shape
        x = x.permute(0, 2, 3, 4, 1).reshape(N * T, H * W, C)

        # Broadcast the class embedding to one token per (sample, frame) row.
        cls = self.class_embedding.to(x.dtype) + torch.zeros(
            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device
        )
        x = torch.cat([cls, x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = self.ln_pre(x + self.positional_embedding.to(x.dtype))

        HW = H * W
        if mask is not None:
            # Keep the class token aside, drop masked patch tokens, then re-attach it.
            cls_tokens, patches = x[:, :1, :], x[:, 1:]
            patches = patches.reshape(N, T * H * W, C)
            patches = patches[~mask].view(N * T, -1, C)
            HW = patches.shape[1]
            x = torch.cat([cls_tokens, patches], dim=1)

        x, attn = self.transformer(x.permute(1, 0, 2))  # NLD -> LND

        K = x.shape[0]
        x = self.ln_post(x[:, 1:, :, :])  # [HW, NT, C]
        x = x.view(K, HW, N, T, C).permute(0, 2, 3, 1, 4).reshape(K, N, T * HW, C)  # [K, N, THW, C]
        x = x @ self.proj

        if self.clip_norm_type == 'l2':
            x = x / x.norm(dim=-1, keepdim=True)
        elif self.clip_norm_type != 'none':
            raise NotImplementedError

        if self.return_attn:
            return x, attn[:, 0, 1:]
        return x


def inflate_weight(weight_2d, time_dim, center=True):
    """Inflate a 4-D weight into a 5-D one by inserting a temporal axis at dim 2.

    Args:
        weight_2d: 4-D tensor; the new axis is inserted after its second dim.
        time_dim: length of the inserted temporal axis.
        center: when true, the input is copied into the middle temporal slice
            (index ``time_dim // 2``) and every other slice is zero. Otherwise
            the input is repeated along the new axis and divided by
            ``time_dim``.

    Returns:
        The inflated tensor. In both modes it keeps the dtype and device of
        ``weight_2d`` (the zero buffer is allocated with ``new_zeros`` rather
        than a default float32 CPU ``torch.zeros``).
    """
    print(f'Init center: {center}')
    if center:
        # new_zeros keeps dtype/device in line with the ``center=False`` branch.
        weight_3d = weight_2d.new_zeros(weight_2d.shape)
        weight_3d = weight_3d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1)
        middle_idx = time_dim // 2
        weight_3d[:, :, middle_idx, :, :] = weight_2d
    else:
        weight_3d = weight_2d.unsqueeze(2).repeat(1, 1, time_dim, 1, 1)
        weight_3d = weight_3d / time_dim
    return weight_3d


def load_state_dict(model, state_dict, input_resolution=224, patch_size=16, center=True):
    """Adapt a checkpoint to ``model`` and load it strictly.

    Two adaptations are applied to ``state_dict`` in place:

    * entries whose shape differs from the model's and whose model tensor has
      more than two dims are inflated with ``inflate_weight`` (the temporal
      length is taken from dim 2 of the model tensor); mismatching entries with
      at most two dims are only reported;
    * ``positional_embedding`` is bicubically resized when the checkpoint grid
      differs from ``input_resolution // patch_size``; the first row is kept
      as is.
    """
    target = model.state_dict()
    for name in list(state_dict.keys()):
        if name not in target:
            continue
        src_shape = state_dict[name].shape
        dst_shape = target[name].shape
        if src_shape == dst_shape:
            continue
        if len(dst_shape) <= 2:
            print(f'Ignore: {name}')
            continue
        print(f'Inflate: {name}, {src_shape} => {dst_shape}')
        state_dict[name] = inflate_weight(state_dict[name], dst_shape[2], center=center)

    ckpt_pos = state_dict['positional_embedding']
    embedding_size = ckpt_pos.shape[-1]
    num_patches = (input_resolution // patch_size) ** 2
    # One row is excluded from the grid (it is kept unchanged below).
    orig_size = int((ckpt_pos.shape[-2] - 1) ** 0.5)
    new_size = int(num_patches ** 0.5)
    if orig_size != new_size:
        print(f'Pos_emb from {orig_size} to {new_size}')
        extra_tokens, grid_tokens = ckpt_pos[:1], ckpt_pos[1:]
        grid_tokens = grid_tokens.reshape(-1, orig_size, orig_size, embedding_size)
        grid_tokens = torch.nn.functional.interpolate(
            grid_tokens.permute(0, 3, 1, 2),
            size=(new_size, new_size), mode='bicubic', align_corners=False)
        grid_tokens = grid_tokens.permute(0, 2, 3, 1).flatten(0, 2)
        state_dict['positional_embedding'] = torch.cat((extra_tokens, grid_tokens), dim=0)

    model.load_state_dict(state_dict, strict=True)


def clip_b16(
    pretrained=True,
    clip_norm_type='l2', input_resolution=224, kernel_size=1,
    return_attn=False, center=True, clip_return_layer=1,
    clip_return_interval=1
):
    """Build the patch-16 ``VisionTransformer`` (width 768, 12 layers, 12 heads, output dim 512).

    With ``pretrained`` set, the checkpoint at ``_MODELS["ViT-B/16"]`` is read
    on CPU and loaded through this module's ``load_state_dict``. The model is
    returned in eval mode.
    """
    patch = 16
    vit = VisionTransformer(
        input_resolution=input_resolution, patch_size=patch,
        width=768, layers=12, heads=12, output_dim=512,
        clip_norm_type=clip_norm_type,
        kernel_size=kernel_size, return_attn=return_attn,
        clip_return_layer=clip_return_layer,
        clip_return_interval=clip_return_interval
    )
    if not pretrained:
        return vit.eval()

    print('load pretrained weights')
    weights = torch.load(_MODELS["ViT-B/16"], map_location='cpu')
    load_state_dict(vit, weights, input_resolution=input_resolution, patch_size=patch, center=center)
    return vit.eval()


def clip_l14(
    pretrained=True,
    clip_norm_type='l2', input_resolution=224, kernel_size=1,
    return_attn=False, center=True, clip_return_layer=1,
    clip_return_interval=1
):
    """Build the patch-14 ``VisionTransformer`` (width 1024, 24 layers, 16 heads, output dim 768).

    With ``pretrained`` set, the checkpoint at ``_MODELS["ViT-L/14"]`` is read
    on CPU and loaded through this module's ``load_state_dict``. The model is
    returned in eval mode.
    """
    patch = 14
    vit = VisionTransformer(
        input_resolution=input_resolution, patch_size=patch,
        width=1024, layers=24, heads=16, output_dim=768,
        clip_norm_type=clip_norm_type,
        kernel_size=kernel_size, return_attn=return_attn,
        clip_return_layer=clip_return_layer,
        clip_return_interval=clip_return_interval
    )
    if not pretrained:
        return vit.eval()

    print('load pretrained weights')
    weights = torch.load(_MODELS["ViT-L/14"], map_location='cpu')
    load_state_dict(vit, weights, input_resolution=input_resolution, patch_size=patch, center=center)
    return vit.eval()


def clip_l14_336(
    pretrained=True,
    clip_norm_type='l2', input_resolution=336, kernel_size=1,
    return_attn=False, center=True, clip_return_layer=1,
    clip_return_interval=1
):
    """Build the patch-14 ``VisionTransformer`` with a default input resolution of 336.

    Architecture: width 1024, 24 layers, 16 heads, output dim 768. With
    ``pretrained`` set, the checkpoint at ``_MODELS["ViT-L/14_336"]`` is read on
    CPU and loaded through this module's ``load_state_dict``. The model is
    returned in eval mode.
    """
    patch = 14
    vit = VisionTransformer(
        input_resolution=input_resolution, patch_size=patch,
        width=1024, layers=24, heads=16, output_dim=768,
        clip_norm_type=clip_norm_type,
        kernel_size=kernel_size, return_attn=return_attn,
        clip_return_layer=clip_return_layer,
        clip_return_interval=clip_return_interval,
    )
    if not pretrained:
        return vit.eval()

    print('load pretrained weights')
    weights = torch.load(_MODELS["ViT-L/14_336"], map_location='cpu')
    load_state_dict(vit, weights, input_resolution=input_resolution, patch_size=patch, center=center)
    return vit.eval()


if __name__ == '__main__':
    # Smoke test: build the ViT-B/16 encoder and print the output shape for a
    # random 8-frame clip.
    import time
    from fvcore.nn import FlopCountAnalysis
    from fvcore.nn import flop_count_table
    import numpy as np

    seed = 4217
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    num_frames = 8

    # Use the ViT-B/16 builder defined in this module (`clip_b16`).
    model = clip_b16(pretrained=True, kernel_size=1, return_attn=False, clip_return_layer=1)
    # print(model)

    # flops = FlopCountAnalysis(model, torch.rand(1, 3, num_frames, 224, 224))
    # s = time.time()
    # print(flop_count_table(flops, max_depth=1))
    # print(time.time()-s)
    print(model(torch.rand(1, 3, num_frames, 224, 224)).shape)

================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/models/modeling_finetune.py
================================================
from functools import partial
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.models.layers import drop_path, to_2tuple, trunc_normal_
from timm.models.registry import register_model
import torch.utils.checkpoint as checkpoint


def _cfg(url='', **kwargs):
    return {
        'url': url,
        'num_classes': 400, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': .9, 'interpolation': 'bicubic',
        'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
        **kwargs
    }


class DropPath(nn.Module):
    """Stochastic-depth wrapper: delegates to ``drop_path`` with the stored probability.

    Intended for the main path of residual blocks (per-sample drop).
    """

    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        # ``self.training`` is forwarded so ``drop_path`` can tell train from eval.
        return drop_path(x, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return f'p={self.drop_prob}'


class Mlp(nn.Module):
    """Two-layer feed-forward block: ``fc1`` -> activation -> ``fc2`` -> dropout.

    ``hidden_features`` and ``out_features`` fall back to ``in_features`` when
    they are falsy.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        # Dropout is applied only after fc2; the drop after the activation is
        # left out, following the original BERT implementation.
        hidden = self.act(self.fc1(x))
        return self.drop(self.fc2(hidden))


class Attention(nn.Module):
    """Multi-head self-attention with a fused, bias-free ``qkv`` projection.

    When ``qkv_bias`` is set, separate learnable biases are kept for the query
    and value parts only; the key part always receives a zero bias.
    ``attn_head_dim`` overrides the per-head width otherwise derived as
    ``dim // num_heads``.
    """

    def __init__(
            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
            proj_drop=0., attn_head_dim=None):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
        self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) if qkv_bias else None
        self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) if qkv_bias else None

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """Attend over the second dimension of a ``(B, N, C)`` input; returns ``(B, N, dim)``."""
        B, N, _ = x.shape

        if self.q_bias is None:
            fused_bias = None
        else:
            # Fused layout is [q | k | v]; the key slot is filled with zeros.
            key_bias = torch.zeros_like(self.v_bias, requires_grad=False)
            fused_bias = torch.cat((self.q_bias, key_bias, self.v_bias))

        qkv = F.linear(input=x, weight=self.qkv.weight, bias=fused_bias)
        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # Index rather than tuple-unpack the tensor (kept torchscript-friendly).
        q, k, v = qkv[0], qkv[1], qkv[2]

        scores = (q * self.scale) @ k.transpose(-2, -1)
        weights = self.attn_drop(scores.softmax(dim=-1))

        merged = (weights @ v).transpose(1, 2).reshape(B, N, -1)
        return self.proj_drop(self.proj(merged))


class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP sub-layers, each with a residual.

    Args:
        dim: token width.
        num_heads, qkv_bias, qk_scale, attn_drop, attn_head_dim: forwarded to
            ``Attention``.
        mlp_ratio: hidden width of the MLP as a multiple of ``dim``.
        drop: dropout used for the attention output projection and the MLP.
        drop_path: stochastic-depth probability; ``nn.Identity`` is used when
            it is not positive.
        init_values: when positive, both residual branches are scaled by
            learnable per-channel vectors (``gamma_1``/``gamma_2``) initialised
            to this value. ``None`` (the default) or a non-positive value
            disables the scaling.
        act_layer, norm_layer: layer factories.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 attn_head_dim=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
            attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        # ``init_values`` defaults to None, and ``None > 0`` raises TypeError,
        # so the None case has to be excluded before comparing.
        if init_values is not None and init_values > 0:
            self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
            self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
        else:
            self.gamma_1, self.gamma_2 = None, None

    def forward(self, x):
        if self.gamma_1 is None:
            x = x + self.drop_path(self.attn(self.norm1(x)))
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        else:
            x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))
            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Module):
    """Video to patch embedding using a strided 3-D convolution.

    ``num_patches`` is the number of spatial patches times
    ``num_frames // tubelet_size``.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, num_frames=16, tubelet_size=2):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.tubelet_size = int(tubelet_size)

        cols = img_size[1] // patch_size[1]
        rows = img_size[0] // patch_size[0]
        tubes = num_frames // self.tubelet_size

        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = cols * rows * tubes

        # Kernel equals stride: non-overlapping (tubelet, patch_h, patch_w) cells.
        cell = (self.tubelet_size, patch_size[0], patch_size[1])
        self.proj = nn.Conv3d(in_channels=in_chans, out_channels=embed_dim,
                              kernel_size=cell,
                              stride=cell)

    def forward(self, x, **kwargs):
        """Project a ``(B, C, T, H, W)`` input and return ``(B, num_tokens, embed_dim)``."""
        _, _, _, height, width = x.shape
        # FIXME look at relaxing size constraints
        expected_h, expected_w = self.img_size[0], self.img_size[1]
        assert height == expected_h and width == expected_w, \
            f"Input image size ({height}*{width}) doesn't match model ({expected_h}*{expected_w})."
        embedded = self.proj(x)
        return embedded.flatten(2).transpose(1, 2)
    
# sin-cos position encoding
# https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31
def get_sinusoid_encoding_table(n_position, d_hid, cur_frame=-1, pre_n_position=1568):
    """Build a sinusoid position table, resized to the current token layout if needed.

    A table with ``pre_n_position`` rows is generated first (the "checkpoint"
    layout, treated as 8 frames of a square grid). When ``cur_frame`` is given
    (not ``-1``) it is then
    resized in up to two steps:

    * spatially (bicubic) when ``n_position // cur_frame * 8`` differs from
      ``pre_n_position``;
    * temporally (linear) when ``cur_frame`` differs from 8.

    Args:
        n_position: number of positions required by the current model.
        d_hid: embedding width.
        cur_frame: number of temporal positions of the current model, or
            ``-1`` to skip any resizing.
        pre_n_position: number of positions of the checkpoint layout. The
            checkpoint grid side is derived from it as
            ``sqrt(pre_n_position // 8)`` (14 for the default 1568).

    Returns:
        A ``(1, positions, d_hid)`` float tensor when
        ``n_position == pre_n_position``; otherwise the table wrapped in a
        trainable ``nn.Parameter``.
    """
    # TODO: make it with torch instead of numpy
    # The frequency term depends only on the channel index, so compute it once
    # instead of once per (position, channel) pair.
    denominators = [np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

    # generate checkpoint position embedding
    sinusoid_table = np.array(
        [[pos_i / denom for denom in denominators] for pos_i in range(pre_n_position)]
    )
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
    sinusoid_table = torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
    print(f"n_position: {n_position}")
    print(f"pre_n_position: {pre_n_position}")
    if n_position // cur_frame * 8 != pre_n_position and cur_frame != -1:
        T = 8  # checkpoint frame
        # Checkpoint grid side, derived from the table size rather than fixed
        # at 14 so that other layouts (e.g. 2048 = 8 * 16 * 16) reshape correctly.
        P = int((pre_n_position // T) ** 0.5)
        C = d_hid
        new_P = int((n_position // cur_frame) ** 0.5)  # testing size
        print(f'Pretraining uses {P}x{P}, but current version is {new_P}x{new_P}')
        print(f'Interpolate the position embedding')
        sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
        sinusoid_table = sinusoid_table.reshape(-1, P, P, C).permute(0, 3, 1, 2)
        sinusoid_table = torch.nn.functional.interpolate(
            sinusoid_table, size=(new_P, new_P), mode='bicubic', align_corners=False)
        # BT, C, H, W -> BT, H, W, C ->  B, T, H, W, C
        sinusoid_table = sinusoid_table.permute(0, 2, 3, 1).reshape(-1, T, new_P, new_P, C)
        sinusoid_table = sinusoid_table.flatten(1, 3)  # B, THW, C
    if cur_frame != -1 and cur_frame != 8:
        print(f'Pretraining uses 8 frames, but current frame is {cur_frame}')
        print(f'Interpolate the position embedding')
        T = 8  # checkpoint frame
        new_T = cur_frame  # testing frame
        # interpolate
        P = int((n_position // cur_frame) ** 0.5)  # testing size
        C = d_hid
        sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
        sinusoid_table = sinusoid_table.permute(0, 2, 3, 4, 1).reshape(-1, C, T)  # BHW, C, T
        sinusoid_table = torch.nn.functional.interpolate(sinusoid_table, size=new_T, mode='linear')
        sinusoid_table = sinusoid_table.reshape(1, P, P, C, new_T).permute(0, 4, 1, 2, 3)  # B, T, H, W, C
        sinusoid_table = sinusoid_table.flatten(1, 3)  # B, THW, C
    if n_position == pre_n_position:
        return sinusoid_table
    else:
        print("Use learnable position embedding")
        return nn.Parameter(sinusoid_table, requires_grad=True)


class VisionTransformer(nn.Module):
    """Video Vision Transformer for classification.

    Pipeline: ``PatchEmbed`` -> (+ position embedding) -> ``depth`` ``Block``s
    -> norm / pooling -> dropout -> linear head.

    Notable arguments:
        num_classes: size of the linear head; a non-positive value replaces the
            head with ``nn.Identity`` (head initialisation is then skipped).
        use_learnable_pos_emb: use a zero-initialised ``nn.Parameter`` for the
            position embedding instead of the sinusoid table.
        init_scale: the head's weight and bias are multiplied by this value
            after initialisation (the default ``0.`` zeroes them).
        all_frames, tubelet_size: forwarded to ``PatchEmbed``;
            ``all_frames // tubelet_size`` is the temporal size passed to the
            sinusoid table.
        use_checkpoint, checkpoint_num: when enabled, the first
            ``checkpoint_num`` blocks run under ``torch.utils.checkpoint``.
        use_mean_pooling: mean-pool the tokens and apply ``fc_norm``; otherwise
            apply ``norm`` and return the first token.
    """
    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 in_chans=3,
                 num_classes=1000,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 fc_drop_rate=0.,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 norm_layer=nn.LayerNorm,
                 init_values=0.,
                 use_learnable_pos_emb=False,
                 init_scale=0.,
                 all_frames=16,
                 tubelet_size=2,
                 use_checkpoint=False,
                 checkpoint_num=0,
                 use_mean_pooling=True):
        super().__init__()
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.tubelet_size = tubelet_size
        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, num_frames=all_frames, tubelet_size=self.tubelet_size)
        num_patches = self.patch_embed.num_patches
        self.use_checkpoint = use_checkpoint
        self.checkpoint_num = checkpoint_num
        print(f'Use checkpoint: {use_checkpoint}')
        print(f'Checkpoint number: {checkpoint_num}')

        if use_learnable_pos_emb:
            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
        else:
            # sine-cosine positional embeddings is on the way
            if patch_size == 14:
                pre_n_position = 2048
            else:
                pre_n_position = 1568
            self.pos_embed = get_sinusoid_encoding_table(
                num_patches, embed_dim, all_frames // tubelet_size,
                pre_n_position=pre_n_position
            )

        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
                init_values=init_values)
            for i in range(depth)])
        self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
        self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
        self.fc_dropout = nn.Dropout(p=fc_drop_rate) if fc_drop_rate > 0 else nn.Identity()
        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        if use_learnable_pos_emb:
            trunc_normal_(self.pos_embed, std=.02)

        # With num_classes <= 0 the head is nn.Identity and has no weight/bias,
        # so head-specific initialisation only applies to a real linear head.
        has_linear_head = isinstance(self.head, nn.Linear)
        if has_linear_head:
            trunc_normal_(self.head.weight, std=.02)
        self.apply(self._init_weights)

        if has_linear_head:
            self.head.weight.data.mul_(init_scale)
            self.head.bias.data.mul_(init_scale)

    def _init_weights(self, m):
        """Truncated-normal init for linear weights; zero biases; unit LayerNorm weights."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def get_num_layers(self):
        return len(self.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed', 'cls_token'}

    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        """Replace the head with a fresh linear layer (or identity for ``num_classes <= 0``)."""
        self.num_classes = num_classes
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x):
        """Return the pooled feature for each sample (before dropout and head)."""
        x = self.patch_embed(x)
        B, _, _ = x.size()

        if self.pos_embed is not None:
            # The position embedding is added as a detached copy.
            x = x + self.pos_embed.expand(B, -1, -1).type_as(x).to(x.device).clone().detach()
        x = self.pos_drop(x)

        for idx, blk in enumerate(self.blocks):
            if self.use_checkpoint and idx < self.checkpoint_num:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)

        x = self.norm(x)
        if self.fc_norm is not None:
            return self.fc_norm(x.mean(1))
        else:
            return x[:, 0]

    def forward(self, x):
        x = self.forward_features(x)
        x = self.head(self.fc_dropout(x))
        return x


# @register_model
# def vit_base_patch16_224(pretrained=False, **kwargs):
#     model = VisionTransformer(
#         patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True,
#         norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
#     model.default_cfg = _cfg()
#     return model
# 
# 
# # @register_model
# def vit_base_patch16_384(pretrained=False, **kwargs):
#     model = VisionTransformer(
#         img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True,
#         norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
#     model.default_cfg = _cfg()
#     return model


@register_model
def vit_large_patch16_224(pretrained=False, **kwargs):
    """Build the large ``VisionTransformer`` (patch 16, width 1024, depth 24, 16 heads).

    ``pretrained`` is accepted but not used here; remaining ``kwargs`` are
    forwarded to ``VisionTransformer``.
    """
    # Dropped to accommodate timm=0.9.12 (added by Ziqi); the constructor does
    # not accept these keys.
    for injected_key in ('pretrained_cfg', 'pretrained_cfg_overlay'):
        kwargs.pop(injected_key, None)

    eps_norm = partial(nn.LayerNorm, eps=1e-6)
    model = VisionTransformer(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
        norm_layer=eps_norm, **kwargs)
    model.default_cfg = _cfg()
    return model


# @register_model
# def vit_large_patch16_384(pretrained=False, **kwargs):
#     model = VisionTransformer(
#         img_size=384, patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
#         norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
#     model.default_cfg = _cfg()
#     return model


if __name__ == '__main__':
    # Smoke test: build a model and print its FLOP table for a random clip.
    import time
    from fvcore.nn import FlopCountAnalysis
    from fvcore.nn import flop_count_table
    import numpy as np

    seed = 4217
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    num_frames = 8

    # model = vit_base_patch16_384(all_frames=num_frames, tubelet_size=1)
    # model = vit_large_patch16_384(all_frames=num_frames, tubelet_size=1)
    # The 384-px builders are commented out in this module, so use the
    # registered 224-px builder and a matching input size.
    model = vit_large_patch16_224(all_frames=num_frames, tubelet_size=1)
    # print(model)

    flops = FlopCountAnalysis(model, torch.rand(1, 3, num_frames, 224, 224))
    s = time.time()
    print(flop_count_table(flops, max_depth=1))
    print(time.time()-s)
    # print(model(torch.rand(1, 3, num_frames, 224, 224)).shape)


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/models/modeling_pretrain.py
================================================
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from functools import partial

from .modeling_finetune import Block, _cfg, PatchEmbed, get_sinusoid_encoding_table
from timm.models.registry import register_model
from timm.models.layers import trunc_normal_ as __call_trunc_normal_


def trunc_normal_(tensor, mean=0., std=1.):
    """Call timm's ``trunc_normal_`` with the cut-offs fixed at ``a=-std`` and ``b=std``."""
    cutoffs = {'a': -std, 'b': std}
    __call_trunc_normal_(tensor, mean=mean, std=std, **cutoffs)


class PretrainVisionTransformerEncoder(nn.Module):
    """Transformer encoder that processes only the tokens left visible by a mask.

    Tokens come from ``PatchEmbed``; a position embedding is added, the tokens
    selected by ``~mask`` are run through the blocks, normalised and passed to
    the head (``nn.Identity`` when ``num_classes <= 0``).
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
                 drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None,
                 num_frames=16, tubelet_size=2, use_checkpoint=False,
                 use_learnable_pos_emb=False):
        super().__init__()
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
            num_frames=num_frames, tubelet_size=tubelet_size
        )
        num_patches = self.patch_embed.num_patches
        self.use_checkpoint = use_checkpoint

        # TODO: Add the cls token
        if not use_learnable_pos_emb:
            # sine-cosine positional embeddings
            self.pos_embed = get_sinusoid_encoding_table(num_patches, embed_dim)
        else:
            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))

        # stochastic depth decay rule
        drop_path_rates = [rate.item() for rate in torch.linspace(0, drop_path_rate, depth)]
        layers = []
        for layer_drop_path in drop_path_rates:
            layers.append(Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=layer_drop_path, norm_layer=norm_layer,
                init_values=init_values))
        self.blocks = nn.ModuleList(layers)
        self.norm = norm_layer(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        if use_learnable_pos_emb:
            trunc_normal_(self.pos_embed, std=.02)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Xavier-uniform linear weights, zero biases, unit LayerNorm weights."""
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def get_num_layers(self):
        return len(self.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed', 'cls_token'}

    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        """Swap the head for a new linear layer (or identity for ``num_classes <= 0``)."""
        self.num_classes = num_classes
        if num_classes > 0:
            self.head = nn.Linear(self.embed_dim, num_classes)
        else:
            self.head = nn.Identity()

    def forward_features(self, x, mask):
        """Embed ``x``, keep the tokens where ``mask`` is false, and encode them."""
        # Unpacking requires a 5-D input; the unpacked sizes themselves are unused.
        _, _, _frames, _, _ = x.shape
        tokens = self.patch_embed(x)

        # The position embedding is added as a detached copy.
        tokens = tokens + self.pos_embed.type_as(tokens).to(tokens.device).clone().detach()

        batch, _, channels = tokens.shape
        visible = tokens[~mask].reshape(batch, -1, channels)  # ~mask means visible

        for blk in self.blocks:
            if self.use_checkpoint:
                visible = checkpoint.checkpoint(blk, visible)
            else:
                visible = blk(visible)

        return self.norm(visible)

    def forward(self, x, mask):
        return self.head(self.forward_features(x, mask))


class PretrainVisionTransformerDecoder(nn.Module):
    """Transformer decoder: blocks, then norm and a linear head over the tokens.

    ``num_classes`` must equal ``3 * tubelet_size * patch_size ** 2`` (checked
    with an ``assert`` in the constructor).
    """

    def __init__(self, patch_size=16, num_classes=768, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.,
                 qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.,
                 norm_layer=nn.LayerNorm, init_values=None, num_patches=196, tubelet_size=2, use_checkpoint=False
                 ):
        super().__init__()
        self.num_classes = num_classes
        assert num_classes == 3 * tubelet_size * patch_size ** 2
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.patch_size = patch_size
        self.use_checkpoint = use_checkpoint

        # stochastic depth decay rule
        drop_path_rates = [rate.item() for rate in torch.linspace(0, drop_path_rate, depth)]
        layers = []
        for layer_drop_path in drop_path_rates:
            layers.append(Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=layer_drop_path, norm_layer=norm_layer,
                init_values=init_values))
        self.blocks = nn.ModuleList(layers)
        self.norm = norm_layer(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Xavier-uniform linear weights, zero biases, unit LayerNorm weights."""
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def get_num_layers(self):
        return len(self.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed', 'cls_token'}

    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        """Swap the head for a new linear layer (or identity for ``num_classes <= 0``)."""
        self.num_classes = num_classes
        if num_classes > 0:
            self.head = nn.Linear(self.embed_dim, num_classes)
        else:
            self.head = nn.Identity()

    def forward(self, x, return_token_num):
        """Decode ``x``; with ``return_token_num > 0`` only the last that many tokens are projected."""
        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)

        if return_token_num > 0:
            # only return the mask tokens predict pixels
            x = x[:, -return_token_num:]
        return self.head(self.norm(x))


class PretrainVisionTransformer(nn.Module):
    """Encoder/decoder pair used for masked pre-training.

    The encoder processes the visible tokens, ``encoder_to_decoder`` projects
    them to the decoder width, a shared learnable ``mask_token`` stands in for
    every masked position, and the decoder returns predictions for the masked
    tokens only.
    """
    def __init__(self,
                 img_size=224, 
                 patch_size=16, 
                 encoder_in_chans=3, 
                 encoder_num_classes=0, 
                 encoder_embed_dim=768, 
                 encoder_depth=12,
                 encoder_num_heads=12, 
                 decoder_num_classes=1536, #  decoder_num_classes=768, 
                 decoder_embed_dim=512, 
                 decoder_depth=8,
                 decoder_num_heads=8, 
                 mlp_ratio=4., 
                 qkv_bias=False, 
                 qk_scale=None, 
                 drop_rate=0., 
                 attn_drop_rate=0.,
                 drop_path_rate=0., 
                 norm_layer=nn.LayerNorm, 
                 init_values=0.,
                 use_learnable_pos_emb=False,
                 use_checkpoint=False,
                 num_frames=16,
                 tubelet_size=2,
                 num_classes=0, # avoid the error from create_fn in timm
                 in_chans=0, # avoid the error from create_fn in timm
                 ):
        super().__init__()
        self.encoder = PretrainVisionTransformerEncoder(
            img_size=img_size, 
            patch_size=patch_size, 
            in_chans=encoder_in_chans, 
            num_classes=encoder_num_classes, 
            embed_dim=encoder_embed_dim, 
            depth=encoder_depth,
            num_heads=encoder_num_heads, 
            mlp_ratio=mlp_ratio, 
            qkv_bias=qkv_bias, 
            qk_scale=qk_scale, 
            drop_rate=drop_rate, 
            attn_drop_rate=attn_drop_rate,
            drop_path_rate=drop_path_rate, 
            norm_layer=norm_layer, 
            init_values=init_values,
            num_frames=num_frames,
            tubelet_size=tubelet_size,
            use_checkpoint=use_checkpoint,
            use_learnable_pos_emb=use_learnable_pos_emb)

        self.decoder = PretrainVisionTransformerDecoder(
            patch_size=patch_size, 
            num_patches=self.encoder.patch_embed.num_patches,
            num_classes=decoder_num_classes, 
            embed_dim=decoder_embed_dim, 
            depth=decoder_depth,
            num_heads=decoder_num_heads, 
            mlp_ratio=mlp_ratio, 
            qkv_bias=qkv_bias, 
            qk_scale=qk_scale, 
            drop_rate=drop_rate, 
            attn_drop_rate=attn_drop_rate,
            drop_path_rate=drop_path_rate, 
            norm_layer=norm_layer, 
            init_values=init_values,
            tubelet_size=tubelet_size,
            use_checkpoint=use_checkpoint)

        # Bias-free projection from encoder width to decoder width.
        self.encoder_to_decoder = nn.Linear(encoder_embed_dim, decoder_embed_dim, bias=False)

        # One learnable vector shared by every masked position.
        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))

        # Fixed sinusoidal table kept as a plain tensor attribute (it is not
        # registered as a parameter or buffer, so it is absent from state_dict).
        self.pos_embed = get_sinusoid_encoding_table(self.encoder.patch_embed.num_patches, decoder_embed_dim)

        trunc_normal_(self.mask_token, std=.02)

    def _init_weights(self, m):
        """Xavier-initialise ``nn.Linear`` and reset ``nn.LayerNorm`` to identity scale.

        Not applied by this class's ``__init__``; kept for callers that invoke
        it through ``apply``.
        """
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def get_num_layers(self):
        """Return the number of encoder transformer blocks.

        This wrapper has no ``blocks`` attribute of its own, so the previous
        ``len(self.blocks)`` always raised AttributeError.
        NOTE(review): assumes the encoder exposes ``blocks`` like the decoder
        does -- confirm against the encoder definition.
        """
        return len(self.encoder.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return the parameter names to exclude from weight decay."""
        return {'pos_embed', 'cls_token', 'mask_token'}

    def forward(self, x, mask):
        """Encode the visible tokens and decode predictions for the masked ones.

        Args:
            x: 5-D input tensor (the unpacking below fails otherwise).
            mask: Boolean tensor used to index the positional table; True marks
                masked positions and ``~mask`` the visible ones.

        Returns:
            The decoder output for the masked tokens.
        """
        _, _, _, _, _ = x.shape  # fails fast unless x is 5-D
        x_vis = self.encoder(x, mask) # [B, N_vis, C_e]
        x_vis = self.encoder_to_decoder(x_vis) # [B, N_vis, C_d]
        B, N, C = x_vis.shape
        # we don't unshuffle the correct visible token order, 
        # but shuffle the pos embedding accordingly.
        expand_pos_embed = self.pos_embed.expand(B, -1, -1).type_as(x).to(x.device).clone().detach()
        pos_emd_vis = expand_pos_embed[~mask].reshape(B, -1, C)
        pos_emd_mask = expand_pos_embed[mask].reshape(B, -1, C)
        # Visible tokens first, then one mask token per masked position.
        x_full = torch.cat([x_vis + pos_emd_vis, self.mask_token + pos_emd_mask], dim=1) # [B, N, C_d]
        x = self.decoder(x_full, pos_emd_mask.shape[1]) # [B, N_mask, 3 * 16 * 16]

        return x


@register_model
def pretrain_videomae_base_patch16_224(pretrained=False, **kwargs):
    """Build the base-size VideoMAE pre-training model (patch 16, 224 px).

    Args:
        pretrained: When true, load weights from the checkpoint file given by
            the ``init_ckpt`` keyword argument (its ``"model"`` entry).
        **kwargs: Extra constructor arguments for ``PretrainVisionTransformer``.
            ``init_ckpt`` is consumed here and is not forwarded.

    Returns:
        The constructed model with ``default_cfg`` set.

    Raises:
        KeyError: If ``pretrained`` is true and ``init_ckpt`` was not supplied.
    """
    # The constructor has no ``init_ckpt`` parameter; forwarding it through
    # **kwargs made every pretrained=True call fail with TypeError.
    init_ckpt = kwargs.pop("init_ckpt", None)
    if pretrained and init_ckpt is None:
        raise KeyError("init_ckpt")

    model = PretrainVisionTransformer(
        img_size=224,
        patch_size=16, 
        encoder_embed_dim=768, 
        encoder_depth=12, 
        encoder_num_heads=12,
        encoder_num_classes=0,
        decoder_num_classes=1536,
        decoder_embed_dim=384,
        decoder_num_heads=6,
        mlp_ratio=4, 
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), 
        **kwargs)
    model.default_cfg = _cfg()
    if pretrained:
        state = torch.load(init_ckpt, map_location="cpu")
        model.load_state_dict(state["model"])
    return model
 

@register_model
def pretrain_videomae_large_patch16_224(pretrained=False, **kwargs):
    """Build the large-size VideoMAE pre-training model (patch 16, 224 px).

    Args:
        pretrained: When true, load weights from the checkpoint file given by
            the ``init_ckpt`` keyword argument (its ``"model"`` entry).
        **kwargs: Extra constructor arguments for ``PretrainVisionTransformer``.
            ``init_ckpt`` is consumed here and is not forwarded.

    Returns:
        The constructed model with ``default_cfg`` set.

    Raises:
        KeyError: If ``pretrained`` is true and ``init_ckpt`` was not supplied.
    """
    # The constructor has no ``init_ckpt`` parameter; forwarding it through
    # **kwargs made every pretrained=True call fail with TypeError.
    init_ckpt = kwargs.pop("init_ckpt", None)
    if pretrained and init_ckpt is None:
        raise KeyError("init_ckpt")

    model = PretrainVisionTransformer(
        img_size=224,
        patch_size=16, 
        encoder_embed_dim=1024, 
        encoder_depth=24, 
        encoder_num_heads=16,
        encoder_num_classes=0,
        decoder_num_classes=1536, 
        decoder_embed_dim=512,
        decoder_num_heads=8,
        mlp_ratio=4, 
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), 
        **kwargs)
    model.default_cfg = _cfg()
    if pretrained:
        state = torch.load(init_ckpt, map_location="cpu")
        model.load_state_dict(state["model"])
    return model


@register_model
def pretrain_videomae_huge_patch16_224(pretrained=False, **kwargs):
    """Build the huge-size VideoMAE pre-training model (patch 16, 224 px).

    Args:
        pretrained: When true, load weights from the checkpoint file given by
            the ``init_ckpt`` keyword argument (its ``"model"`` entry).
        **kwargs: Extra constructor arguments for ``PretrainVisionTransformer``.
            ``init_ckpt`` is consumed here and is not forwarded.

    Returns:
        The constructed model with ``default_cfg`` set.

    Raises:
        KeyError: If ``pretrained`` is true and ``init_ckpt`` was not supplied.
    """
    # The constructor has no ``init_ckpt`` parameter; forwarding it through
    # **kwargs made every pretrained=True call fail with TypeError.
    init_ckpt = kwargs.pop("init_ckpt", None)
    if pretrained and init_ckpt is None:
        raise KeyError("init_ckpt")

    model = PretrainVisionTransformer(
        img_size=224,
        patch_size=16, 
        encoder_embed_dim=1280, 
        encoder_depth=32, 
        encoder_num_heads=16,
        encoder_num_classes=0,
        decoder_num_classes=1536, 
        decoder_embed_dim=640,
        decoder_num_heads=8,
        mlp_ratio=4, 
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), 
        **kwargs)
    model.default_cfg = _cfg()
    if pretrained:
        state = torch.load(init_ckpt, map_location="cpu")
        model.load_state_dict(state["model"])
    return model


================================================
FILE: Open-Sora/build/lib/vbench/third_pary/umt/models/modeling_pretrain_umt.py
================================================
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from functools import partial

from .modeling_finetune import Block, DropPath, Mlp, _cfg, PatchEmbed
from timm.models.registry import register_model
from timm.models.layers import trunc_normal_ as __call_trunc_normal_


def trunc_normal_(tensor, mean=0., std=1.):
    """Call timm's ``trunc_normal_`` with the cut-offs fixed at ``[-std, std]``.

    Args:
        tensor: Tensor handed to the timm initialiser.
        mean: Mean forwarded unchanged.
        std: Standard deviation; also used for both truncation bounds.
    """
    lower, upper = -std, std
    __call_trunc_normal_(tensor, mean=mean, std=std, a=lower, b=upper)


# sin-cos position encoding
# https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31
def get_sinusoid_encoding_table(n_position, d_hid):
    """Build a fixed sine/cosine positional-encoding table.

    Entry ``(pos, j)`` is computed from the angle
    ``pos / 10000 ** (2 * (j // 2) / d_hid)``: even columns take its sine and
    odd columns its cosine.

    Args:
        n_position: Number of positions (rows). Zero yields an empty table
            instead of the IndexError the loop-based version raised.
        d_hid: Encoding width (columns).

    Returns:
        A float32 tensor of shape ``(1, n_position, d_hid)`` that does not
        require gradients.
    """
    # Vectorised: one broadcasted division replaces the nested Python loops.
    positions = np.arange(n_position, dtype=np.float64)[:, None]
    pair_index = np.arange(d_hid) // 2
    divisors = np.power(10000, 2 * pair_index / d_hid)
    sinusoid_table = positions / divisors[None, :]

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

    return torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)


class PretrainVisionTransformerEncoder(nn.Module):
    """Transformer encoder over the visible (unmasked) tokens.

    Besides the final normalised tokens, it collects the outputs of the blocks
    listed in ``return_index`` and returns them stacked along a new leading
    axis.
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
                 drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, num_frames=16, tubelet_size=2,
                 use_checkpoint=False, checkpoint_num=0, use_learnable_pos_emb=False, clip_return_layer=1,
                 clip_student_return_interval=1):
        super().__init__()
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, 
            num_frames=num_frames, tubelet_size=tubelet_size
        )
        num_patches = self.patch_embed.num_patches
        self.use_checkpoint = use_checkpoint
        self.checkpoint_num = checkpoint_num
        print(f'Use checkpoint: {use_checkpoint}')
        print(f'Checkpoint number: {checkpoint_num}')
        # Block indices whose outputs are collected, counted back from the
        # last block in steps of ``clip_student_return_interval``.
        self.return_index = [
            depth - int(i * clip_student_return_interval) - 1
            for i in range(clip_return_layer)
        ]
        print(f'Student return index: {self.return_index}')
        
        self.use_learnable_pos_emb = use_learnable_pos_emb
        if use_learnable_pos_emb:
            print('Use learnable position embedding')
            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
        else:
            # sine-cosine positional embeddings 
            self.pos_embed = get_sinusoid_encoding_table(num_patches, embed_dim)

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
                init_values=init_values)
            for i in range(depth)])
        self.norm =  norm_layer(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        if use_learnable_pos_emb:
            trunc_normal_(self.pos_embed, std=.02)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Xavier-initialise ``nn.Linear`` and reset ``nn.LayerNorm`` to bias 0, weight 1."""
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def get_num_layers(self):
        """Return the number of transformer blocks."""
        return len(self.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return the parameter names to exclude from weight decay."""
        return {'pos_embed', 'cls_token'}

    def get_classifier(self):
        """Return the output head module."""
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        """Replace the head: ``nn.Linear`` for a positive class count, else ``nn.Identity``."""
        self.num_classes = num_classes
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x, mask):
        """Embed the input, drop masked tokens and run the transformer blocks.

        Args:
            x: Input passed to ``self.patch_embed``.
            mask: Boolean tensor indexing the embedded tokens; ``~mask``
                selects the tokens that are kept.

        Returns:
            A pair ``(x_vis, x_clip_vis)``: the normalised final tokens, and
            the normalised stack of the block outputs named in
            ``return_index``.
        """
        x = self.patch_embed(x)
        
        if self.use_learnable_pos_emb:
            x = x + self.pos_embed.type_as(x).to(x.device)
        else:
            x = x + self.pos_embed.type_as(x).to(x.device).clone().detach()

        B, _, C = x.shape
        x_vis = x[~mask].reshape(B, -1, C) # ~mask means visible
        x_clip_vis = []

        for idx, blk in enumerate(self.blocks):
            # Only the first ``checkpoint_num`` blocks are checkpointed.
            if self.use_checkpoint and idx < self.checkpoint_num:
                x_vis = checkpoint.checkpoint(blk, x_vis)
            else:
                x_vis = blk(x_vis)
            if idx in self.return_index:
                x_clip_vis.append(x_vis)

        x_vis = self.norm(x_vis)
        x_clip_vis = self.norm(torch.stack(x_clip_vis))
        return x_vis, x_clip_vis

    def forward(self, x, mask):
        """Return the head applied to the stacked intermediate block outputs.

        The final-token output of ``forward_features`` is not part of the
        result, so the head is no longer evaluated on it.
        """
        _, x_clip_vis = self.forward_features(x, mask)
        return self.head(x_clip_vis)


class Linear_Decoder(nn.Module):
    """Linear projection followed by a norm layer and optional L2 normalisation.

    Args:
        num_classes: Output width of the linear head and size given to the
            norm layer.
        embed_dim: Input width of the linear head.
        norm_layer: Factory called as ``norm_layer(num_classes)``.
        clip_norm_type: ``'l2'`` scales each output vector to unit length along
            the last dimension; ``'none'`` leaves it unchanged. Any other value
            makes ``forward`` raise ``NotImplementedError``.
    """
    def __init__(self, num_classes=768, embed_dim=768, 
                 norm_layer=nn.LayerNorm, clip_norm_type='l2'):
        super().__init__()
        self.clip_norm_type = clip_norm_type
        print(f'Normalization Type: {clip_norm_type}')

        self.head = nn.Linear(embed_dim, num_classes)
        self.norm =  norm_layer(num_classes)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Xavier-initialise ``nn.Linear`` and reset ``nn.LayerNorm`` to bias 0, weight 1."""
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, x):
        """Project ``x``, apply the norm layer, then the configured normalisation.

        Raises:
            NotImplementedError: If ``clip_norm_type`` is neither ``'l2'`` nor
                ``'none'``.
        """
        x = self.norm(self.head(x))

        if self.clip_norm_type == 'l2':
            # F.normalize clamps the norm away from zero, so an all-zero row
            # stays zero instead of becoming NaN via 0 / 0.
            x = F.normalize(x, p=2, dim=-1)
        elif self.clip_norm_type == 'none':
            pass
        else:
            raise NotImplementedError

        return x


class PretrainVisionTransformer(nn.Module):
    """Masked encoder with one linear CLIP-alignment decoder per returned layer.

    The encoder yields a stack of intermediate outputs for the visible tokens;
    a fixed sinusoidal positional table is added and each stack entry is passed
    through its own ``Linear_Decoder``.
    """
    def __init__(self,
                 img_size=224, 
                 patch_size=16, 
                 encoder_in_chans=3, 
                 encoder_num_classes=0, 
                 encoder_embed_dim=768, 
                 encoder_depth=12,
                 encoder_num_heads=12, 
                 mlp_ratio=4., 
                 qkv_bias=False, 
                 qk_scale=None, 
                 drop_rate=0., 
                 attn_drop_rate=0.,
                 drop_path_rate=0., 
                 norm_layer=nn.LayerNorm, 
                 init_values=0.,
                 use_learnable_pos_emb=False,
                 use_checkpoint=False,
                 checkpoint_num=0,
                 num_frames=16,
                 tubelet_size=2,
                 # clip,
                 clip_decoder_embed_dim=768,
                 clip_output_dim=512,
                 clip_norm_type='l2',
                 clip_return_layer=1,
                 clip_student_return_interval=1,
                ):
        super().__init__()

        self.encoder = PretrainVisionTransformerEncoder(
            img_size=img_size, 
            patch_size=patch_size, 
            in_chans=encoder_in_chans, 
            num_classes=encoder_num_classes, 
            embed_dim=encoder_embed_dim, 
            depth=encoder_depth,
            num_heads=encoder_num_heads, 
            mlp_ratio=mlp_ratio, 
            qkv_bias=qkv_bias, 
            qk_scale=qk_scale, 
            drop_rate=drop_rate, 
            attn_drop_rate=attn_drop_rate,
            drop_path_rate=drop_path_rate, 
            norm_layer=norm_layer, 
            init_values=init_values,
            num_frames=num_frames,
            tubelet_size=tubelet_size,
            use_checkpoint=use_checkpoint,
            checkpoint_num=checkpoint_num,
            use_learnable_pos_emb=use_learnable_pos_emb,
            clip_return_layer=clip_return_layer,
            clip_student_return_interval=clip_student_return_interval
        )

        # CLIP decoder: one independent linear decoder per returned layer.
        self.clip_decoder = nn.ModuleList([
            Linear_Decoder(
                num_classes=clip_output_dim, 
                embed_dim=clip_decoder_embed_dim, 
                norm_layer=norm_layer, 
                clip_norm_type=clip_norm_type
            ) for _ in range(clip_return_layer)
        ])

        # Fixed sinusoidal table kept as a plain tensor attribute (not a
        # parameter or buffer, so it is absent from state_dict).
        self.clip_pos_embed = get_sinusoid_encoding_table(self.encoder.patch_embed.num_patches, clip_decoder_embed_dim)

    def _init_weights(self, m):
        """Xavier-initialise ``nn.Linear`` and reset ``nn.LayerNorm`` to bias 0, weight 1.

        Not applied by this class's ``__init__``; the sub-modules initialise
        themselves.
        """
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def get_num_layers(self):
        """Return the number of encoder transformer blocks.

        This wrapper has no ``blocks`` attribute of its own, so the previous
        ``len(self.blocks)`` always raised AttributeError.
        """
        return len(self.encoder.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return the parameter names to exclude from weight decay."""
        return {'pos_embed', 'cls_token', 'mask_token', 'clip_mask_token', 'clip_pos_embed'}

    def forward(self, x, mask):
        """Encode the visible tokens and produce the CLIP-aligned outputs.

        Args:
            x: Input passed to the encoder.
            mask: Boolean tensor; ``~mask`` selects the visible positions of
                the positional table.

        Returns:
            The per-layer decoder outputs stacked along a new leading axis.
        """
        x_clip_vis = self.encoder(x, mask) # [K, B, N_vis, C_e]
        
        # align CLIP
        K, B, _, C_CLIP = x_clip_vis.shape
        expand_clip_pos_embed = self.clip_pos_embed.repeat(B, 1, 1).type_as(x).to(x.device).clone().detach()
        # Keep the visible positions only, then repeat for each of the K layers.
        clip_pos_emd_vis = expand_clip_pos_embed[~mask].view(B, -1, C_CLIP).unsqueeze(0).repeat(K, 1, 1, 1)
        x_clip_full = x_clip_vis + clip_pos_emd_vis # [K, B, N, C_d_clip]

        x_clip = []
        for idx, clip_decoder in enumerate(self.clip_decoder):
            x_clip.append(clip_decoder(x_clip_full[idx]))
        x_clip = torch.stack(x_clip) # align and normalize
        
        return x_clip
    

@register_model
def pretrain_umt_base_patch16_224(pretrained=False, **kwargs):
    """Build the base-size UMT pre-training model (patch 16, 224 px).

    Args:
        pretrained: When true, load weights from the checkpoint file given by
            the ``init_ckpt`` keyword argument (its ``"model"`` entry).
        **kwargs: Extra constructor arguments for ``PretrainVisionTransformer``.
            ``init_ckpt`` is consumed here and is not forwarded.

    Returns:
        The constructed model with ``default_cfg`` set.

    Raises:
        KeyError: If ``pretrained`` is true and ``init_ckpt`` was not supplied.
    """
    # The constructor has no ``init_ckpt`` parameter; forwarding it through
    # **kwargs made every pretrained=True call fail with TypeError.
    init_ckpt = kwargs.pop("init_ckpt", None)
    if pretrained and init_ckpt is None:
        raise KeyError("init_ckpt")

    model = PretrainVisionTransformer(
        img_size=224,
        patch_size=16, 
        encoder_embed_dim=768, 
        encoder_depth=12, 
        encoder_num_heads=12,
        encoder_num_classes=0,
        mlp_ratio=4, 
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), 
        **kwargs)
    model.default_cfg = _cfg()
    if pretrained:
        state = torch.load(init_ckpt, map_location="cpu")
        model.load_state_dict(state["model"])
    return model
 

@register_model
def pretrain_umt_large_patch16_224(pretrained=False, **kwargs):
    """Build the large-size UMT pre-training model (patch 16, 224 px).

    Args:
        pretrained: When true, load weights from the checkpoint file given by
            the ``init_ckpt`` keyword argument (its ``"model"`` entry).
        **kwargs: Extra constructor arguments for ``PretrainVisionTransformer``.
            ``init_ckpt`` is consumed here and is not forwarded.

    Returns:
        The constructed model with ``default_cfg`` set.

    Raises:
        KeyError: If ``pretrained`` is true and ``init_ckpt`` was not supplied.
    """
    # The constructor has no ``init_ckpt`` parameter; forwarding it through
    # **kwargs made every pretrained=True call fail with TypeError.
    init_ckpt = kwargs.pop("init_ckpt", None)
    if pretrained and init_ckpt is None:
        raise KeyError("init_ckpt")

    model = PretrainVisionTransformer(
        img_size=224,
        patch_size=16, 
        encoder_embed_dim=1024, 
        encoder_depth=24, 
        encoder_num_heads=16,
        encoder_num_classes=0,
        mlp_ratio=4, 
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), 
        **kwargs)
    model.default_cfg = _cfg()
    if pretrained:
        state = torch.load(init_ckpt, map_location="cpu")
        model.load_state_dict(state["model"])
    return model


if __name__ == '__main__':
    # Smoke test: build the base model and run one forward pass on random data.
    seed = 4217
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    model = pretrain_umt_base_patch16_224()

    # Optional FLOP count (needs fvcore, so it is not imported by default):
    # import time
    # from fvcore.nn import FlopCountAnalysis
    # from fvcore.nn import flop_count_table
    # flops = FlopCountAnalysis(model, torch.rand(1, 3, 16, 224, 224))
    # s = time.time()
    # print(flop_count_table(flops, max_depth=1))
    # print(time.time()-s)
    mask = torch.cat([
        torch.ones(1, 8 * int(14 * 14 * 0.75)),
        torch.zeros(1, 8 * int(14 * 14 * 0.25)),
    ], dim=-1).to(torch.bool)
    # The model returns a single stacked tensor whose leading axis has one
    # entry per returned layer (1 by default), so indexing [1] raised
    # IndexError; print the shape of the whole output instead.
    print(model(torch.rand(1, 3, 16, 224, 224), mask).shape)

================================================
FILE: Open-Sora/build/lib/vbench/utils.py
================================================
import os
import json
import numpy as np
import logging
import subprocess
import torch
import re
from pathlib import Path
from PIL import Image, ImageSequence
from decord import VideoReader, cpu
from torchvision import transforms
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, ToPILImage
try:
    from torchvision.transforms import InterpolationMode
    BICUBIC = InterpolationMode.BICUBIC
    BILINEAR = InterpolationMode.BILINEAR
except ImportError:
    BICUBIC = Image.BICUBIC
    BILINEAR = Image.BILINEAR

# Cache directory: $VBENCH_CACHE_DIR when that variable is set, otherwise
# ~/.cache/vbench.
CACHE_DIR = os.environ.get(
    'VBENCH_CACHE_DIR',
    os.path.join(os.path.expanduser('~'), '.cache', 'vbench'),
)

# Module logger; the root logger is configured at INFO level on import.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def clip_transform(n_px):
    """Return the tensor preprocessing pipeline for a target size of ``n_px``.

    Steps: bicubic resize to ``n_px``, centre crop to ``n_px``, convert to
    float and divide by 255, then normalise with the fixed per-channel mean
    and standard deviation below.
    """
    mean = (0.48145466, 0.4578275, 0.40821073)
    std = (0.26862954, 0.26130258, 0.27577711)
    to_unit_range = transforms.Lambda(lambda x: x.float().div(255.0))
    steps = [
        Resize(n_px, interpolation=BICUBIC),
        CenterCrop(n_px),
        to_unit_range,
        Normalize(mean, std),
    ]
    return Compose(steps)

def clip_transform_Image(n_px):
    """Return the image preprocessing pipeline for a target size of ``n_px``.

    Steps: bicubic resize to ``n_px``, centre crop to ``n_px``, ``ToTensor``,
    then normalise with the fixed per-channel mean and standard deviation
    below.
    """
    mean = (0.48145466, 0.4578275, 0.40821073)
    std = (0.26862954, 0.26130258, 0.27577711)
    steps = [
        Resize(n_px, interpolation=BICUBIC),
        CenterCrop(n_px),
        ToTensor(),
        Normalize(mean, std),
    ]
    return Compose(steps)

def dino_transform(n_px):
    """Return the tensor preprocessing pipeline resized to ``n_px``.

    Steps: resize to ``n_px``, convert to float and divide by 255, then
    normalise with the fixed per-channel mean and standard deviation below.
    """
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)
    to_unit_range = transforms.Lambda(lambda x: x.float().div(255.0))
    steps = [Resize(size=n_px), to_unit_range, Normalize(mean, std)]
    return Compose(steps)

def dino_transform_Image(n_px):
    """Return the image preprocessing pipeline resized to ``n_px``.

    Steps: resize to ``n_px``, ``ToTensor``, then normalise with the fixed
    per-channel mean and standard deviation below.
    """
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)
    steps = [Resize(size=n_px), ToTensor(), Normalize(mean, std)]
    return Compose(steps)

def tag2text_transform(n_px):
    """Return a pipeline that yields a normalised ``n_px`` x ``n_px`` tensor.

    Steps: convert to a PIL image, resize to ``(n_px, n_px)``, ``ToTensor``,
    then normalise with the fixed per-channel mean and standard deviation
    below.
    """
    normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    steps = [
        ToPILImage(),
        Resize((n_px, n_px)),
        ToTensor(),
        normalize,
    ]
    return Compose(steps)

def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1):
    """Choose which frame indices to read from a video of ``vlen`` frames.

    Args:
        num_frames: Number of indices wanted in 'rand'/'middle' mode.
        vlen: Number of frames in the video.
        sample: 'rand' (one random index per interval), 'middle' (interval
            midpoint, or interval start plus ``fix_start`` when that is given),
            or 'fps<rate>' such as 'fps0.5' (sequential sampling at that rate).
        fix_start: Optional offset added to each interval start; only
            consulted when ``sample`` is 'middle'.
        input_fps: Frame rate of the video; used in 'fps' mode only.
        max_num_frames: In 'fps' mode, cap on the number of indices when
            positive.

    Returns:
        A list of frame indices. In 'rand'/'middle' mode a short result is
        padded with its last index up to ``num_frames`` entries.

    Raises:
        ValueError: If ``sample`` is not a recognised mode.
    """
    # The import header of this module does not import ``random``, so the
    # lookup below used to raise NameError, which the former bare ``except``
    # swallowed: 'rand' mode always took the fallback path.
    import random

    if sample in ["rand", "middle"]: # uniform sampling
        acc_samples = min(num_frames, vlen)
        # split the video into `acc_samples` intervals, and sample from each interval.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = [(start, stop - 1) for start, stop in zip(intervals[:-1], intervals[1:])]
        if sample == 'rand':
            try:
                frame_indices = [random.choice(range(lo, hi)) for lo, hi in ranges]
            except IndexError:
                # An interval was empty (range(lo, hi) had no elements); fall
                # back to a sorted random subset of all frames.
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [lo + fix_start for lo, _ in ranges]
        elif sample == 'middle':
            frame_indices = [(lo + hi) // 2 for lo, hi in ranges]
        else:
            raise NotImplementedError

        if len(frame_indices) < num_frames:  # padded with last frame
            padded_frame_indices = [frame_indices[-1]] * num_frames
            padded_frame_indices[:len(frame_indices)] = frame_indices
            frame_indices = padded_frame_indices
    elif "fps" in sample:  # fps0.5, sequentially sample frames at 0.5 fps
        output_fps = float(sample[3:])
        duration = float(vlen) / input_fps
        delta = 1 / output_fps  # gap between frames, this is also the clip length each frame represents
        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
            frame_indices = frame_indices[:max_num_frames]
    else:
        raise ValueError
    return frame_indices

def load_video(video_path, data_transform=None, num_frames=None, return_tensor=True, width=None, height=None):
    """
    Load a video from a given path and apply optional data transformations.

    The function supports loading video in GIF (.gif), PNG (.png), and MP4 (.mp4) formats.
    The format is chosen from the (case-sensitive) file extension.

    Parameters:
    - video_path (str): The file path to the video or image to be loaded.
    - data_transform (callable, optional): Applied to the frame buffer; when given, its
      result is returned as-is and ``return_tensor`` is ignored.
    - num_frames (int, optional): When truthy, sub-sample this many frames using the
      "middle" strategy of ``get_frame_indices``.
    - return_tensor (bool): Without a ``data_transform``, convert the frames to a tensor
      and permute them to (T, C, H, W); otherwise the NumPy buffer is returned.
    - width, height (int, optional): Passed to the MP4 reader when ``width`` is truthy.

    Returns:
    - frames: The output of ``data_transform`` if provided; else a tensor of shape
      (T, C, H, W) when ``return_tensor`` is true; else the (T, H, W, C) uint8 NumPy array.

    Raises:
    - NotImplementedError: If the video format is not supported.

    For GIFs, every frame is converted to RGB. For PNGs, the single frame is converted
    to RGB. For MP4s, all frames are read with the VideoReader class and converted to a
    NumPy array.
    """
    if video_path.endswith('.gif'):
        # The context manager closes the file handle once all frames are decoded.
        with Image.open(video_path) as img:
            frame_ls = [
                np.array(frame.convert('RGB')).astype(np.uint8)
                for frame in ImageSequence.Iterator(img)
            ]
        buffer = np.array(frame_ls).astype(np.uint8)
    elif video_path.endswith('.png'):
        with Image.open(video_path) as img:
            frame = np.array(img.convert('RGB')).astype(np.uint8)
        buffer = np.array([frame])
    elif video_path.endswith('.mp4'):
        import decord
        decord.bridge.set_bridge('native')
        if width:
            video_reader = VideoReader(video_path, width=width, height=height, num_threads=1)
        else:
            video_reader = VideoReader(video_path, num_threads=1)
        frames = video_reader.get_batch(range(len(video_reader)))  # (T, H, W, C)

        buffer = frames.asnumpy().astype(np.uint8)
    else:
        raise NotImplementedError
    
    frames = buffer
    if num_frames:
        frame_indices = get_frame_indices(
        num_frames, len(frames), sample="middle"
        )
        frames = frames[frame_indices]
    
    if data_transform:
        frames = data_transform(frames)
    elif return_tensor:
        # torch.Tensor(...) produces the default float dtype, not uint8.
        frames = torch.Tensor(frames)
        frames = frames.permute(0, 3, 1, 2)  # (T, C, H, W)

    return frames

def read_frames_decord_by_fps(
        video_path, sample_fps=2, sample='rand', fix_start=None, 
        max_num_frames=-1,  trimmed30=False, num_frames=8
    ):
    """Read a sampled subset of frames from a video with decord.

    Args:
        video_path: Path handed to ``VideoReader``.
        sample_fps: Accepted but not used by this function.
        sample, fix_start, max_num_frames, num_frames: Forwarded to
            ``get_frame_indices``.
        trimmed30: When true and the video is longer than 30 seconds, only
            the first 30 seconds' worth of frames are considered.

    Returns:
        The selected frames, permuted from (T, H, W, C) to (T, C, H, W).
    """
    import decord
    decord.bridge.set_bridge("torch")

    reader = VideoReader(video_path, num_threads=1)
    total_frames = len(reader)
    avg_fps = reader.get_avg_fps()
    seconds = total_frames / float(avg_fps)

    if trimmed30 and seconds > 30:
        seconds = 30
        total_frames = int(30 * float(avg_fps))

    picked = get_frame_indices(
        num_frames,
        total_frames,
        sample=sample,
        fix_start=fix_start,
        input_fps=avg_fps,
        max_num_frames=max_num_frames,
    )
    batch = reader.get_batch(picked)  # (T, H, W, C), torch.uint8
    return batch.permute(0, 3, 1, 2)  # (T, C, H, W), torch.uint8
    
def load_dimension_info(json_dir, dimension, lang):
    """
    Collect the videos and prompts that belong to one evaluation dimension.

    Parameters:
    - json_dir (str): Path passed to ``load_json``.
    - dimension (str): Dimension name; an entry is kept when this value is found in its
      'dimension' field and the entry has a 'video_list' key.
    - lang (str): Language suffix; the prompt is read from the ``prompt_<lang>`` key.

    Returns:
    - video_list (list): All video paths of the kept entries, concatenated in file order.
    - prompt_dict_ls (list): One dict per kept entry with 'prompt' and 'video_list', plus
      'auxiliary_info' when the entry provides it for this dimension.

    Notes:
    - An entry's 'video_list' may be a list or a single value; a single value is wrapped
      in a one-element list.
    """
    video_list = []
    prompt_dict_ls = []
    for entry in load_json(json_dir):
        if dimension not in entry['dimension'] or 'video_list' not in entry:
            continue

        prompt = entry[f'prompt_{lang}']
        videos = entry['video_list']
        if not isinstance(videos, list):
            videos = [videos]
        video_list += videos

        record = {'prompt': prompt, 'video_list': videos}
        if 'auxiliary_info' in entry and dimension in entry['auxiliary_info']:
            record['auxiliary_info'] = entry['auxiliary_info'][dimension]
        prompt_dict_ls.append(record)
    return video_list, prompt_dict_ls

def init_submodules(dimension_list, local=False, read_frame=False):
    """
    Build the per-dimension model configuration, downloading missing weights on the way.

    Parameters:
    - dimension_list (iterable of str): Dimension names to prepare.
    - local (bool): When True, CLIP / DINO based dimensions point at files below CACHE_DIR
      (fetched with wget / git clone if absent) instead of hub identifiers.
    - read_frame (bool): Stored unchanged in the 'background_consistency' and
      'subject_consistency' entries.

    Returns:
    - submodules_dict (dict): Maps each recognised dimension to a list or dict describing
      the model it needs. Dimension names not handled by the chain below get no entry.

    Raises:
    - subprocess.CalledProcessError: When a download command fails (``check=True``), except
      for the RAFT download of 'dynamic_degree', whose failure is only printed.

    Notes:
    - Downloads shell out to ``wget``, ``git``, ``unzip`` and ``rm``.
    """
    submodules_dict = {}
    if local:
        logger.info("\x1b[32m[Local Mode]\x1b[0m Working in local mode, please make sure that the pre-trained model has been fully downloaded.")
    for dimension in dimension_list:
        # Executed once per dimension, so CACHE_DIR is only created when at least one dimension is requested.
        os.makedirs(CACHE_DIR, exist_ok=True)
        if dimension == 'background_consistency':
            # read_frame = False
            if local:
                vit_b_path = f'{CACHE_DIR}/clip_model/ViT-B-32.pt'
                if not os.path.isfile(vit_b_path):
                    wget_command = ['wget', 'https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt', '-P', os.path.dirname(vit_b_path)]
                    subprocess.run(wget_command, check=True)
            else:
                # Model name instead of a file path (presumably resolved by the CLIP loader - confirm in the consumer).
                vit_b_path = 'ViT-B/32'

            submodules_dict[dimension] = [vit_b_path, read_frame]
        elif dimension == 'human_action':
            umt_path = f'{CACHE_DIR}/umt_model/l16_ptk710_ftk710_ftk400_f16_res224.pth'
            if not os.path.isfile(umt_path):
                wget_command = ['wget', 'https://pjlab-gvm-data.oss-cn-shanghai.aliyuncs.com/umt/single_modality/l16_ptk710_ftk710_ftk400_f16_res224.pth', '-P', os.path.dirname(umt_path)]
                subprocess.run(wget_command, check=True)
            submodules_dict[dimension] = [umt_path,]
        elif dimension == 'temporal_flickering':
            # No model configuration is registered for this dimension.
            submodules_dict[dimension] = []
        elif dimension == 'motion_smoothness':
            # The AMT config is addressed relative to this source file, the checkpoint lives in the cache.
            CUR_DIR = os.path.dirname(os.path.abspath(__file__))
            submodules_dict[dimension] = {
                    'config': f'{CUR_DIR}/third_party/amt/cfgs/AMT-S.yaml',
                    'ckpt': f'{CACHE_DIR}/amt_model/amt-s.pth'
                }
            details = submodules_dict[dimension]
            # Check if the file exists, if not, download it with wget
            if not os.path.isfile(details['ckpt']):
                print(f"File {details['ckpt']} does not exist. Downloading...")
                wget_command = ['wget', '-P', os.path.dirname(details['ckpt']),
                                'https://huggingface.co/lalala125/AMT/resolve/main/amt-s.pth']
                subprocess.run(wget_command, check=True)

        elif dimension == 'dynamic_degree':
            submodules_dict[dimension] = {
                'model': f'{CACHE_DIR}/raft_model/models/raft-things.pth'
            }
            details = submodules_dict[dimension]
            if not os.path.isfile(details['model']):
                # raise NotImplementedError
                print(f"File {details['model']} does not exist. Downloading...")
                # Download the archive, unpack it next to itself, then delete the archive.
                wget_command = ['wget', '-P', f'{CACHE_DIR}/raft_model/', 'https://dl.dropboxusercontent.com/s/4j4z58wuv8o0mfz/models.zip']
                unzip_command = ['unzip', '-d', f'{CACHE_DIR}/raft_model/', f'{CACHE_DIR}/raft_model/models.zip']
                remove_command = ['rm', '-r', f'{CACHE_DIR}/raft_model/models.zip']
                try:
                    subprocess.run(wget_command, check=True)
                    subprocess.run(unzip_command, check=True)
                    subprocess.run(remove_command, check=True)
                except subprocess.CalledProcessError as err:
                    # Best effort: the failure is reported but the entry keeps pointing at the missing file.
                    print(f"Error during downloading RAFT model: {err}")
        # Assign the DINO model path for subject consistency dimension
        elif dimension == 'subject_consistency':
            if local:
                submodules_dict[dimension] = {
                    'repo_or_dir': f'{CACHE_DIR}/dino_model/facebookresearch_dino_main/',
                    'path': f'{CACHE_DIR}/dino_model/dino_vitbase16_pretrain.pth', 
                    'model': 'dino_vitb16',
                    'source': 'local',
                    'read_frame': read_frame
                    }
                details = submodules_dict[dimension]
                # Clone the DINO repository when the local checkout is missing
                if not os.path.isdir(details['repo_or_dir']):
                    print(f"Directory {details['repo_or_dir']} does not exist. Cloning repository...")
                    subprocess.run(['git', 'clone', 'https://github.com/facebookresearch/dino', details['repo_or_dir']], check=True)

                # Download the pretrained weights when they are missing
                if not os.path.isfile(details['path']):
                    print(f"File {details['path']} does not exist. Downloading...")
                    wget_command = ['wget', '-P', os.path.dirname(details['path']),
                                    'https://dl.fbaipublicfiles.com/dino/dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth']
                    subprocess.run(wget_command, check=True)
            else:
                submodules_dict[dimension] = {
                    'repo_or_dir':'facebookresearch/dino:main',
                    'source':'github',
                    'model': 'dino_vitb16',
                    'read_frame': read_frame
                    }
        elif dimension == 'aesthetic_quality':
            # aes_path is only recorded here; nothing is downloaded for it in this function.
            aes_path = f'{CACHE_DIR}/aesthetic_model/emb_reader'
            if local:
                vit_l_path = f'{CACHE_DIR}/clip_model/ViT-L-14.pt'
                if not os.path.isfile(vit_l_path):
                    wget_command = ['wget' ,'https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt', '-P', os.path.dirname(vit_l_path)]
                    subprocess.run(wget_command, check=True)
            else:
                vit_l_path = 'ViT-L/14'
            submodules_dict[dimension] = [vit_l_path, aes_path]
        elif dimension == 'imaging_quality':
            musiq_spaq_path = f'{CACHE_DIR}/pyiqa_model/musiq_spaq_ckpt-358bb6af.pth'
            if not os.path.isfile(musiq_spaq_path):
                wget_command = ['wget', 'https://github.com/chaofengc/IQA-PyTorch/releases/download/v0.1-weights/musiq_spaq_ckpt-358bb6af.pth', '-P', os.path.dirname(musiq_spaq_path)]
                subprocess.run(wget_command, check=True)
            submodules_dict[dimension] = {'model_path': musiq_spaq_path}
        elif dimension in ["object_class", "multiple_objects", "color", "spatial_relationship" ]:
            # These four dimensions share the same GRiT checkpoint.
            submodules_dict[dimension] = {
                "model_weight": f'{CACHE_DIR}/grit_model/grit_b_densecap_objectdet.pth'
            }
            if not os.path.exists(submodules_dict[dimension]['model_weight']):
                wget_command = ['wget', 'https://datarelease.blob.core.windows.net/grit/models/grit_b_densecap_objectdet.pth', '-P', os.path.dirname(submodules_dict[dimension]["model_weight"])]
                subprocess.run(wget_command, check=True)
        elif dimension == 'scene':
            submodules_dict[dimension] = {
                "pretrained": f'{CACHE_DIR}/caption_model/tag2text_swin_14m.pth',
                "image_size":384, 
                "vit":"swin_b"
            }
            if not os.path.exists(submodules_dict[dimension]['pretrained']):
                wget_command = ['wget', 'https://huggingface.co/spaces/xinyu1205/recognize-anything/resolve/main/tag2text_swin_14m.pth', '-P', os.path.dirname(submodules_dict[dimension]["pretrained"])]
                subprocess.run(wget_command, check=True)
        elif dimension == 'appearance_style':
            if local:
                submodules_dict[dimension] = {"name": f'{CACHE_DIR}/clip_model/ViT-B-32.pt'}
                if not os.path.isfile(submodules_dict[dimension]["name"]):
                    wget_command = ['wget', 'https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt', '-P', os.path.dirname(submodules_dict[dimension]["name"])]
                    subprocess.run(wget_command, check=True)
            else:
                submodules_dict[dimension] = {"name": 'ViT-B/32'}
        elif dimension in ["temporal_style", "overall_consistency"]:
            # Both dimensions share the same ViCLIP checkpoint.
            submodules_dict[dimension] = {
                "pretrain": f'{CACHE_DIR}/ViCLIP/ViClip-InternVid-10M-FLT.pth',
            }
            if not os.path.exists(submodules_dict[dimension]['pretrain']):
                wget_command = ['wget', 'https://pjlab-gvm-data.oss-cn-shanghai.aliyuncs.com/internvideo/viclip/ViClip-InternVid-10M-FLT.pth', '-P', os.path.dirname(submodules_dict[dimension]["pretrain"])]
                subprocess.run(wget_command, check=True)
    return submodules_dict

def get_prompt_from_filename(path: str):
    """
    Recover the prompt text from a video file name.

    The directory and the final suffix are dropped, then a trailing ``-<digits>`` sample
    index is removed when present:

    1. prompt-0.suffix -> prompt
    2. prompt.suffix -> prompt
    """
    stem = Path(path).stem
    # re.sub returns the stem untouched when it does not end with -<number>.
    return re.sub(r'-\d+$', '', stem)

def save_json(data, path, indent=4):
    """
    Write ``data`` as JSON to ``path``.

    Parameters:
    - data: Any JSON-serialisable object.
    - path (str): Destination file; it is opened for writing as UTF-8, replacing existing content.
    - indent (int): Indentation passed to ``json.dump``.
    """
    with open(path, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=indent)

def load_json(path):
    """
    Read and parse a JSON file.

    Parameters:
    - path (str): The path to the JSON file; it is read as UTF-8.

    Returns:
    - data (dict or list): The parsed content of the file.
    """
    with open(path, mode='r', encoding='utf-8') as in_file:
        parsed = json.load(in_file)
    return parsed


================================================
FILE: Open-Sora/build/lib/vbench2_beta_i2v/__init__.py
================================================
import os

from vbench2_beta_i2v.utils import init_submodules, save_json, load_json
from vbench import VBench
import importlib


class VBenchI2V(VBench):
    """Evaluator that runs the dimension modules of the ``vbench2_beta_i2v`` package."""

    def build_full_dimension_list(self, ):
        """Return the names of all dimensions evaluated when none are requested explicitly."""
        return [
            "subject_consistency",
            "background_consistency",
            "aesthetic_quality",
            "imaging_quality",
            "object_class",
            "multiple_objects",
            "color",
            "spatial_relationship",
            "scene",
            "temporal_style",
            'overall_consistency',
            "human_action",
            "temporal_flickering",
            "motion_smoothness",
            "dynamic_degree",
            "appearance_style",
            "i2v_subject",
            "i2v_background",
            "camera_motion",
        ]

    def evaluate(self, videos_path, name, dimension_list=None, local=False, read_frame=False, custom_prompt=False, resolution="1-1"):
        """
        Evaluate the requested dimensions and store the scores as JSON.

        Parameters:
        - videos_path, name, custom_prompt: Forwarded to ``self.build_full_info_json``;
          ``name`` also prefixes the result file.
        - dimension_list (list or None): Dimensions to run; None selects the full list.
        - local, read_frame, resolution: Forwarded to ``init_submodules``.

        For every dimension the module ``vbench2_beta_i2v.<dimension>`` is imported and its
        ``compute_<dimension>`` function is called with the info-file path, ``self.device``
        and the dimension's submodule configuration. The collected results are written to
        ``<self.output_path>/<name>_eval_results.json``; nothing is returned.

        Raises:
        - NotImplementedError: When the module or its compute function cannot be resolved.
        """
        if dimension_list is None:
            dimension_list = self.build_full_dimension_list()
        submodules_dict = init_submodules(dimension_list, local=local, read_frame=read_frame, resolution=resolution)
        full_info_path = self.build_full_info_json(videos_path, name, dimension_list, custom_prompt=custom_prompt)

        scores = {}
        for dimension in dimension_list:
            try:
                module = importlib.import_module(f'vbench2_beta_i2v.{dimension}')
                compute = getattr(module, f'compute_{dimension}')
            except Exception as e:
                raise NotImplementedError(f'UnImplemented dimension {dimension}!, {e}')
            dimension_submodules = submodules_dict[dimension]
            print(f'cur_full_info_path: {full_info_path}') # TODO: to delete
            scores[dimension] = compute(full_info_path, self.device, dimension_submodules)

        output_name = os.path.join(self.output_path, name+'_eval_results.json')
        save_json(scores, output_name)
        print(f'Evaluation results saved to {output_name}')


================================================
FILE: Open-Sora/build/lib/vbench2_beta_i2v/camera_motion.py
================================================
import torch
import os
import numpy as np
from tqdm import tqdm

from vbench2_beta_i2v.third_party.cotracker.utils.visualizer import Visualizer
from vbench2_beta_i2v.utils import load_video, load_dimension_info


def transform(vector):
    """
    Average a sequence of 2-D displacement vectors.

    Parameters:
    - vector (sequence): Items indexable at 0 and 1 (x and y components).

    Returns:
    - list: ``[mean_x, mean_y]`` computed with ``np.mean``.
    """
    return [np.mean([point[axis] for point in vector]) for axis in (0, 1)]


def transform_class(vector, min_reso, factor=0.005):
    """
    Turn a mean displacement into direction labels.

    Parameters:
    - vector (sequence): ``(x, y)`` displacement.
    - min_reso (number): Reference resolution; the threshold is ``min_reso * factor``.
    - factor (float): Fraction of ``min_reso`` a component must exceed to count as motion.

    Returns:
    - list of str: "right"/"left" for x and "down"/"up" for y when the component is strictly
      beyond the threshold; ``["static"]`` when neither component is.
    """
    threshold = min_reso * factor
    dx, dy = vector
    labels = []

    # Horizontal component.
    if dx > threshold:
        labels.append("right")
    elif dx < -threshold:
        labels.append("left")

    # Vertical component (positive y is reported as "down").
    if dy > threshold:
        labels.append("down")
    elif dy < -threshold:
        labels.append("up")

    return labels or ["static"]


class CameraPredict:
    """
    Predict coarse camera motion of a video from point tracks on a regular grid.

    A tracker obtained through ``torch.hub`` follows a ``grid_size`` x ``grid_size`` grid of
    points. The displacement of points on the four grid borders between the first and the
    last frame is converted into pan / tilt / zoom / static labels.
    """

    def __init__(self, device, submodules_list):
        """
        Parameters:
        - device: Device the tracker is moved to.
        - submodules_list (dict): Provides "repo" and "model", forwarded to ``torch.hub.load``.
        """
        self.device = device
        self.grid_size = 10
        try:
            self.model = torch.hub.load(submodules_list["repo"], submodules_list["model"]).to(self.device)
        except Exception:
            # Only ordinary errors trigger the retry; KeyboardInterrupt / SystemExit propagate.
            # workaround for CERTIFICATE_VERIFY_FAILED (see: https://github.com/pytorch/pytorch/issues/33288#issuecomment-954160699)
            # NOTE(security): this replaces the default HTTPS context for the whole process,
            # i.e. certificate verification stays disabled after this point.
            import ssl
            ssl._create_default_https_context = ssl._create_unverified_context
            self.model = torch.hub.load(submodules_list["repo"], submodules_list["model"]).to(self.device)

    def infer(self, video_path, save_video=False, save_dir="./saved_videos"):
        """
        Track the point grid through a video.

        Parameters:
        - video_path (str): Video to load with ``load_video(..., return_tensor=False)``.
        - save_video (bool): When True, a visualisation is written through ``Visualizer``.
        - save_dir (str): Output directory for that visualisation.

        Returns:
        - numpy.ndarray: Tracks of the first batch element cast to integers.

        Side effect: stores ``min(height, width)`` of the video in ``self.scale``.
        """
        # load video
        video = load_video(video_path, return_tensor=False)
        # set scale
        height, width = video.shape[1], video.shape[2]
        self.scale = min(height, width)
        video = torch.from_numpy(video).permute(0, 3, 1, 2)[None].float().to(self.device) # B T C H W
        pred_tracks, pred_visibility = self.model(video, grid_size=self.grid_size) # B T N 2,  B T N 1

        if save_video:
            # splitext instead of [:-4] so extensions of any length are stripped correctly.
            video_name = os.path.splitext(os.path.basename(video_path))[0]
            vis = Visualizer(save_dir=save_dir, pad_value=120, linewidth=3)
            vis.visualize(video, pred_tracks, pred_visibility, filename=video_name)

        return pred_tracks[0].long().detach().cpu().numpy()

    def get_edge_point(self, track):
        """
        Pick the four central points of each border of the grid.

        Parameters:
        - track: Array indexable as ``track[row, col, :]`` with ``grid_size`` rows and columns.

        Returns:
        - tuple: ``(top, down, left, right)``, each a list of four point lists.
        """
        middle = self.grid_size // 2
        central = range(middle - 2, middle + 2)
        last = self.grid_size - 1
        top = [list(track[0, i, :]) for i in central]
        down = [list(track[last, i, :]) for i in central]
        left = [list(track[i, 0, :]) for i in central]
        right = [list(track[i, last, :]) for i in central]

        return top, down, left, right

    def get_edge_direction(self, track1, track2):
        """
        Classify the mean movement of each border between two grids.

        Returns a list with the direction labels of the top, down, left and right border,
        in that order. Requires ``self.scale`` (set by ``infer``).
        """
        edge_points1 = self.get_edge_point(track1)
        edge_points2 = self.get_edge_point(track2)

        vector_results = []
        for points1, points2 in zip(edge_points1, edge_points2):
            vectors = [[end[0]-start[0], end[1]-start[1]] for start, end in zip(points1, points2)]
            vector_results.append(vectors)
        vector_results = list(map(transform, vector_results))
        class_results = [transform_class(vector, min_reso=self.scale) for vector in vector_results]

        return class_results

    @staticmethod
    def _classify_pairs(first, second, zoom_in_key, zoom_out_key):
        """Map every ``<first>_<second>`` label pair to a camera motion; ``["None"]`` if none match."""
        mapping = {
            "left_left": "pan_right",
            "right_right": "pan_left",
            "down_down": "tilt_up",
            "up_up": "tilt_down",
            zoom_in_key: "zoom_in",
            zoom_out_key: "zoom_out",
            "static_static": "static"
        }
        pairs = (f"{item_a}_{item_b}" for item_a in first for item_b in second)
        results = [mapping[pair] for pair in pairs if pair in mapping]
        return results if results else ["None"]

    def classify_top_down(self, top, down):
        """Combine the labels of the top and bottom border; opposite vertical motion means zoom."""
        return self._classify_pairs(top, down, "up_down", "down_up")

    def classify_left_right(self, left, right):
        """Combine the labels of the left and right border; opposite horizontal motion means zoom."""
        return self._classify_pairs(left, right, "left_right", "right_left")

    def camera_classify(self, track1, track2):
        """
        Merge the vertical and horizontal border classifications into one label list.

        "static" and "None" are dropped when any other label is present. The order of the
        returned labels is not defined because they pass through a ``set``.
        """
        top, down, left, right = self.get_edge_direction(track1, track2)

        top_results = self.classify_top_down(top, down)
        left_results = self.classify_left_right(left, right)

        results = list(set(top_results+left_results))
        if "static" in results and len(results)>1:
            results.remove("static")
        if "None" in results and len(results)>1:
            results.remove("None")

        return results

    def predict(self, video_path):
        """Return the camera motion labels of a video, comparing its first and last tracked frame."""
        pred_track = self.infer(video_path)
        track1 = pred_track[0].reshape((self.grid_size, self.grid_size, 2))
        track2 = pred_track[-1].reshape((self.grid_size, self.grid_size, 2))
        results = self.camera_classify(track1, track2)

        return results


def get_type(video_name):
    """
    Derive the expected camera motion from the phrase contained in a video name.

    Parameters:
    - video_name (str): Name that should contain one of the known "camera ..." phrases.

    Returns:
    - str: Motion label of the first phrase (in the order listed below) found in the name.

    Raises:
    - ValueError: When no known phrase occurs in ``video_name``.
    """
    phrase_to_motion = (
        ("camera pans left", "pan_left"),
        ("camera pans right", "pan_right"),
        ("camera tilts up", "tilt_up"),
        ("camera tilts down", "tilt_down"),
        ("camera zooms in", "zoom_in"),
        ("camera zooms out", "zoom_out"),
        ("camera static", "static"),
    )

    motion = next((label for phrase, label in phrase_to_motion if phrase in video_name), None)
    if motion is None:
        raise ValueError("Not a recognized video name")
    return motion


def camera_motion(camera, video_list):
    """
    Score how often the predicted camera motion contains the motion named in the file name.

    Parameters:
    - camera: Object whose ``predict(video_path)`` returns a collection of motion labels.
    - video_list (iterable of str): Video paths; the base name must be accepted by ``get_type``.

    Returns:
    - avg_score: ``np.mean`` of all per-video scores (1.0 for a hit, 0.0 otherwise).
    - diff_type_results (dict): ``np.mean`` of the scores per motion type; a type without
      videos is averaged over an empty list (NumPy yields NaN for that).
    - video_results (list): One dict per video with path, score, expected and predicted types.
    """
    motion_types = ("pan_left", "pan_right", "tilt_up", "tilt_down", "zoom_in", "zoom_out", "static")
    scores_by_type = {motion: [] for motion in motion_types}
    all_scores = []
    video_results = []

    for video_path in tqdm(video_list):
        target_type = get_type(os.path.basename(video_path))
        predict_results = camera.predict(video_path)

        if target_type in predict_results:
            video_score = 1.0
        else:
            video_score = 0.0
        scores_by_type[target_type].append(video_score)
        video_results.append({'video_path': video_path, 'video_results': video_score, 'prompt_type':target_type, 'predict_type': predict_results})
        all_scores.append(video_score)

    avg_score = np.mean(all_scores)
    diff_type_results = {motion: np.mean(scores) for motion, scores in scores_by_type.items()}

    return avg_score, diff_type_results, video_results


def compute_camera_motion(json_dir, device, submodules_list):
    """
    Entry point of the 'camera_motion' dimension.

    Builds a ``CameraPredict`` from ``device`` and ``submodules_list``, reads the videos of
    the 'camera_motion' dimension (language 'en') from ``json_dir`` and returns the overall
    score, the per-type scores and the per-video results produced by ``camera_motion``.
    """
    predictor = CameraPredict(device, submodules_list)
    videos, _prompts = load_dimension_info(json_dir, dimension='camera_motion', lang='en')
    return camera_motion(predictor, videos)


================================================
FILE: Open-Sora/build/lib/vbench2_beta_i2v/crop_to_diff_ratio.py
================================================
import os
from PIL import Image
import json
import os.path as osp
import random
import argparse
from tqdm import tqdm

import logging
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def save_json(data, save_file):
    """
    Write ``data`` as JSON to ``save_file``, replacing any existing content.

    The file is opened in a ``with`` block so the handle is flushed and closed
    deterministically instead of being left to garbage collection.
    """
    with open(save_file, "w") as f:
        json.dump(data, f)


def crop(img_path, bbox, save_root):
    """
    Crop one image and save the result under the same file name in ``save_root``.

    Parameters:
    - img_path (str): Image to read.
    - bbox (sequence): ``(x, y, width, height)``; every value is converted with ``int``.
    - save_root (str): Output directory, created when missing.
    """
    os.makedirs(save_root, exist_ok=True)
    x, y, width, height = map(int, bbox)
    # Context manager closes the source file; cropping and saving happen while it is open.
    with Image.open(img_path) as img:
        crop_img = img.crop((x, y, x+width, y+height))
        crop_img.save(osp.join(save_root, osp.basename(img_path)))
    
    
def get_other_ratio_crop(second_crop_info, ratio="8-5"):
    """
    Choose a crop box of the requested aspect ratio that still contains the second crop.

    Parameters:
    - second_crop_info (dict): Needs 'width', 'height' and 'second_bbox' (x, y, w, h).
    - ratio (str): Target ratio written as ``"<w>-<h>"``; it must satisfy
      ``1.0 <= w/h < 1.7778``.

    Returns:
    - list: ``[target_x, target_y, target_w, target_h]``.

    Notes:
    - The global ``random`` generator is re-seeded with 123 on every call, so the randomly
      placed offset is reproducible.
    - When width equals height the box starts at x = 0 and is placed along y; otherwise it
      starts at y = 0 and is placed along x.
    - Violated preconditions raise AssertionError.
    """
    random.seed(123)
    ratio_w, ratio_h = map(int, ratio.split('-'))
    assert 1.0 <= ratio_w/ratio_h < 1.7778, "The ratio does not meet the requirements, it needs to be between 1:1 and 16:9."
    width = second_crop_info['width']
    height = second_crop_info['height']
    x, y, crop_w, crop_h = second_crop_info['second_bbox']

    is_square = width == height
    # Largest whole multiple of the ratio that fits the limiting side.
    unit = int(width/ratio_w) if is_square else int(height/ratio_h)
    target_w = unit * ratio_w
    target_h = unit * ratio_h

    if is_square:
        assert target_h >= crop_h
        lowest = max(y - (target_h - crop_h), 0)
        highest = min(y + target_h, height) - target_h
        assert highest >= lowest
        return [0, random.randint(lowest, highest), target_w, target_h]

    assert target_w >= crop_w
    lowest = max(x - (target_w - crop_w), 0)
    highest = min(x + target_w, width) - target_w
    assert highest >= lowest
    return [random.randint(lowest, highest), 0, target_w, target_h]


def transfer_bbox_to_origin_img(first_crop_info, old_bbox):
    """
    Shift a box by the origin of the first crop.

    Parameters:
    - first_crop_info (dict): Needs 'first_bbox' with four values; only x and y are used.
    - old_bbox (sequence): ``(x, y, width, height)`` to shift.

    Returns:
    - list: ``[x, y, width, height]`` with the first crop's x / y added to the position.
    """
    offset_x, offset_y, _w, _h = first_crop_info["first_bbox"]
    box_x, box_y, box_w, box_h = old_bbox
    return [offset_x + box_x, offset_y + box_y, box_w, box_h]


def get_target_crop(args):
    """
    Compute target-ratio crop boxes for every image of the suite and write the crops.

    Parameters:
    - args: Namespace with ``crop_info_path`` (JSON list of items holding 'first_crop',
      'second_crop' and 'file_name'), ``target_ratio``, ``ori_image_path`` and ``result_path``.

    Outputs:
    - ``<result_path>/target_crop_info_<target_ratio>.json`` with every item extended by a
      'target_crop' entry (ratio and box in original-image coordinates).
    - The cropped images in ``<result_path>/<target_ratio>``.
    """
    # Read through a context manager so the metadata file is closed right after parsing.
    with open(args.crop_info_path, "r") as info_file:
        data = json.load(info_file)
    target_results = []
    os.makedirs(args.result_path, exist_ok=True)

    ####### get target crop info ########
    for item in tqdm(data):
        second_crop_info = item["second_crop"]
        first_crop_info = item["first_crop"]
        # The box is computed relative to the first crop, then moved into original-image coordinates.
        target_crop = transfer_bbox_to_origin_img(first_crop_info, get_other_ratio_crop(second_crop_info, args.target_ratio))
        item["target_crop"] = {
            "target_ratio":args.target_ratio,
            "target_bbox":target_crop
        }
        target_results.append(item)

    target_file = os.path.join(args.result_path, f"target_crop_info_{args.target_ratio}.json")
    save_json(target_results, target_file)
    logger.info(f"Target crop info are saved in the '{target_file}' file")

    ####### crop images #########
    ori_path = args.ori_image_path
    target_path = f"{args.result_path}/{args.target_ratio}"

    for sample in tqdm(target_results):
        img_path = osp.join(ori_path, sample["file_name"])
        target_bbox = sample["target_crop"]["target_bbox"]
        crop(img_path, target_bbox, target_path)

    logger.info(f"Cropped images are saved in the '{target_path}' path")


if __name__ == '__main__':
    # Command line entry point: collect the paths and the target ratio, then run the cropping.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--crop_info_path',
        type=str,
        default="vbench2_beta_i2v/data/i2v-bench-info.json",
        help="image suite meta info",
    )
    # NOTE(review): required=True means the default "5-4" is never used - confirm which is intended.
    cli.add_argument(
        '--target_ratio',
        default="5-4",
        required=True,
        help="the required crop ratio",
    )
    cli.add_argument(
        '--ori_image_path',
        type=str,
        default="vbench2_beta_i2v/data/origin",
        help='the file path of the original image data',
    )
    cli.add_argument(
        '--result_path',
        type=str,
        default="vbench2_beta_i2v/data/target_crop",
        help='result save path',
    )
    get_target_crop(cli.parse_args())

================================================
FILE: Open-Sora/build/lib/vbench2_beta_i2v/i2v_background.py
================================================
import io
import os
import cv2
import json
import numpy as np
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms

from vbench2_beta_i2v.utils import load_video, load_i2v_dimension_info, dino_transform, dino_transform_Image
import logging
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def i2v_background(model, video_pair_list, device):
    """
    Score how well generated videos keep the background of their input image.

    Parameters:
    - model: Feature extractor called on image batches (a DINO model in ``compute_i2v_background``).
    - video_pair_list (iterable): ``(image_path, video_path)`` pairs.
    - device: Device the image and frame tensors are moved to.

    Returns:
    - float: Mean score over all pairs.
    - list: One dict per pair with 'image_path', 'video_path' and 'video_results'.

    Per video the score is ``0.5 * max(similarity of a frame to the input image)
    + 0.5 * mean(similarity of consecutive frames) + 0.0 * min(similarity of consecutive
    frames)``; cosine similarities are clamped at 0. A video needs at least two frames,
    otherwise the consecutive-frame statistics are taken over an empty list.
    """
    video_results = []
    sim_list = []

    max_weight = 0.5
    mean_weight = 0.5
    min_weight = 0.0

    image_transform = dino_transform_Image(224)
    frames_transform = dino_transform(224)

    for image_path, video_path in tqdm(video_pair_list):
        # input image preprocess; the context manager releases the image file afterwards
        with Image.open(image_path) as raw_image:
            input_image = image_transform(raw_image).unsqueeze(0).to(device)

        # get frames from video
        images = frames_transform(load_video(video_path))

        conformity_scores = []
        consec_scores = []
        # Everything below is inference only; the reference image is handled under no_grad
        # as well so that no autograd graph is kept for it.
        with torch.no_grad():
            input_image_features = F.normalize(model(input_image), dim=-1, p=2)

            # calculate sim between input image and frames in generated video
            former_image_features = None
            for i in range(len(images)):
                image = images[i].unsqueeze(0).to(device)
                image_features = F.normalize(model(image), dim=-1, p=2)
                if former_image_features is not None:
                    sim_consec = max(0.0, F.cosine_similarity(former_image_features, image_features).item())
                    consec_scores.append(sim_consec)
                sim_to_input = max(0.0, F.cosine_similarity(input_image_features, image_features).item())
                conformity_scores.append(sim_to_input)
                former_image_features = image_features

        video_score = max_weight * np.max(conformity_scores) + \
            mean_weight * np.mean(consec_scores) + \
            min_weight * np.min(consec_scores)

        sim_list.append(video_score)
        video_results.append({'image_path': image_path, 'video_path': video_path, 'video_results': video_score})
    return np.mean(sim_list), video_results


def compute_i2v_background(json_dir, device, submodules_list):
    """
    Entry point of the 'i2v_background' dimension.

    Parameters:
    - json_dir: Passed to ``load_i2v_dimension_info`` as the info-file location.
    - device: Device the DINO model is moved to and evaluation runs on.
    - submodules_list (dict): Expanded as keyword arguments of ``torch.hub.load``; its
      'resolution' entry is additionally forwarded to ``load_i2v_dimension_info``.

    Returns:
    - The overall score and the per-video results produced by ``i2v_background``.
    """
    dino_model = torch.hub.load(**submodules_list)
    dino_model = dino_model.to(device)
    resolution = submodules_list['resolution']
    logger.info("Initialize DINO success")
    pairs, _prompts = load_i2v_dimension_info(json_dir, dimension='i2v_background', lang='en', resolution=resolution)
    return i2v_background(dino_model, pairs, device)


================================================
FILE: Open-Sora/build/lib/vbench2_beta_i2v/i2v_subject.py
================================================
import io
import os
import cv2
import json
import numpy as np
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms

from vbench2_beta_i2v.utils import load_video, load_i2v_dimension_info, dino_transform, dino_transform_Image
import logging
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def i2v_subject(model, video_pair_list, device):
    """
    Score how well generated videos keep the subject of their input image.

    Parameters:
    - model: Feature extractor called on image batches (a DINO model in ``compute_i2v_subject``).
    - video_pair_list (iterable): ``(image_path, video_path)`` pairs.
    - device: Device the image and frame tensors are moved to.

    Returns:
    - float: Mean score over all pairs.
    - list: One dict per pair with 'image_path', 'video_path' and 'video_results'.

    Per video the score is ``0.5 * max(similarity of a frame to the input image)
    + 0.5 * mean(similarity of consecutive frames) + 0.0 * min(similarity of consecutive
    frames)``; cosine similarities are clamped at 0. A video needs at least two frames,
    otherwise the consecutive-frame statistics are taken over an empty list.
    """
    video_results = []
    sim_list = []

    max_weight = 0.5
    mean_weight = 0.5
    min_weight = 0.0

    image_transform = dino_transform_Image(224)
    frames_transform = dino_transform(224)

    for image_path, video_path in tqdm(video_pair_list):
        # input image preprocess; the context manager releases the image file afterwards
        with Image.open(image_path) as raw_image:
            input_image = image_transform(raw_image).unsqueeze(0).to(device)

        # get frames from video
        images = frames_transform(load_video(video_path))

        conformity_scores = []
        consec_scores = []
        # Everything below is inference only; the reference image is handled under no_grad
        # as well so that no autograd graph is kept for it.
        with torch.no_grad():
            input_image_features = F.normalize(model(input_image), dim=-1, p=2)

            # calculate sim between input image and frames in generated video
            former_image_features = None
            for i in range(len(images)):
                image = images[i].unsqueeze(0).to(device)
                image_features = F.normalize(model(image), dim=-1, p=2)
                if former_image_features is not None:
                    sim_consec = max(0.0, F.cosine_similarity(former_image_features, image_features).item())
                    consec_scores.append(sim_consec)
                sim_to_input = max(0.0, F.cosine_similarity(input_image_features, image_features).item())
                conformity_scores.append(sim_to_input)
                former_image_features = image_features

        video_score = max_weight * np.max(conformity_scores) + \
            mean_weight * np.mean(consec_scores) + \
            min_weight * np.min(consec_scores)

        sim_list.append(video_score)
        video_results.append({'image_path': image_path, 'video_path': video_path, 'video_results': video_score})
    return np.mean(sim_list), video_results


def compute_i2v_subject(json_dir, device, submodules_list):
    """
    Entry point of the 'i2v_subject' dimension.

    Parameters:
    - json_dir: Passed to ``load_i2v_dimension_info`` as the info-file location.
    - device: Device the DINO model is moved to and evaluation runs on.
    - submodules_list (dict): Expanded as keyword arguments of ``torch.hub.load``; its
      'resolution' entry is additionally forwarded to ``load_i2v_dimension_info``.

    Returns:
    - The overall score and the per-video results produced by ``i2v_subject``.
    """
    dino_model = torch.hub.load(**submodules_list)
    dino_model = dino_model.to(device)
    resolution = submodules_list['resolution']
    logger.info("Initialize DINO success")
    pairs, _prompts = load_i2v_dimension_info(json_dir, dimension='i2v_subject', lang='en', resolution=resolution)
    return i2v_subject(dino_model, pairs, device)


================================================
FILE: Open-Sora/build/lib/vbench2_beta_i2v/utils.py
================================================
import os
import json
import numpy as np
import logging
import subprocess
import torch
from PIL import Image, ImageSequence
from decord import VideoReader, cpu
from torchvision import transforms
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, ToPILImage
# Interpolation constants: use torchvision's InterpolationMode when it can be imported,
# otherwise fall back to the PIL constants of the same name.
try:
    from torchvision.transforms import InterpolationMode
    BICUBIC = InterpolationMode.BICUBIC
    BILINEAR = InterpolationMode.BILINEAR
except ImportError:
    BICUBIC = Image.BICUBIC
    BILINEAR = Image.BILINEAR

# Cache root: taken from the VBENCH_CACHE_DIR environment variable, defaulting to ~/.cache/vbench.
CACHE_DIR = os.environ.get('VBENCH_CACHE_DIR')
if CACHE_DIR is None:
    CACHE_DIR = os.path.join(os.path.expanduser('~'), '.cache', 'vbench')

# Root logging is configured at import time (INFO level, timestamped format).
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def clip_transform(n_px):
    """
    Build the preprocessing pipeline for frames that already are tensors.

    Steps: bicubic resize to ``n_px``, center crop to ``n_px``, conversion to float with
    division by 255, normalisation with the mean / std constants below.
    """
    to_unit_float = transforms.Lambda(lambda x: x.float().div(255.0))
    normalize = Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    steps = [
        Resize(n_px, interpolation=BICUBIC),
        CenterCrop(n_px),
        to_unit_float,
        normalize,
    ]
    return Compose(steps)

def clip_transform_Image(n_px):
    """
    Build the preprocessing pipeline for inputs accepted by ``ToTensor``.

    Steps: bicubic resize to ``n_px``, center crop to ``n_px``, ``ToTensor``, normalisation
    with the mean / std constants below.
    """
    normalize = Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    steps = [
        Resize(n_px, interpolation=BICUBIC),
        CenterCrop(n_px),
        ToTensor(),
        normalize,
    ]
    return Compose(steps)

def dino_transform(n_px):
    """
    Build the preprocessing pipeline for frames that already are tensors.

    Steps: resize to ``n_px`` (no crop), conversion to float with division by 255,
    normalisation with the mean / std constants below.
    """
    to_unit_float = transforms.Lambda(lambda x: x.float().div(255.0))
    normalize = Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    steps = [
        Resize(size=n_px),
        to_unit_float,
        normalize,
    ]
    return Compose(steps)

def dino_transform_Image(n_px):
    """
    Build the preprocessing pipeline for inputs accepted by ``ToTensor``.

    Steps: resize to ``n_px`` (no crop), ``ToTensor``, normalisation with the mean / std
    constants below.
    """
    normalize = Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    steps = [
        Resize(size=n_px),
        ToTensor(),
        normalize,
    ]
    return Compose(steps)

def tag2text_transform(n_px):
    """
    Build the Tag2Text preprocessing pipeline.

    Steps: ``ToPILImage``, resize to a square of ``n_px`` x ``n_px``, ``ToTensor``,
    normalisation with the mean / std constants below.
    """
    normalize = Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
    steps = [
        ToPILImage(),
        Resize((n_px, n_px)),
        ToTensor(),
        normalize,
    ]
    return Compose(steps)

def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1):
    """
    Select the indices of the frames to read from a video.

    Parameters:
    - num_frames (int): Number of indices wanted in 'rand' / 'middle' mode.
    - vlen (int): Number of frames in the video.
    - sample (str): 'rand' (one random frame per interval), 'middle' (the middle frame of
      each interval) or 'fps<x>' (sequential sampling at ``x`` frames per second).
    - fix_start (int or None): In 'middle' mode, when given, the interval start plus this
      offset is used instead of the interval middle.
    - input_fps (number): Frame rate of the video, used by the 'fps<x>' mode.
    - max_num_frames (int): In 'fps<x>' mode, keep at most this many indices when positive.

    Returns:
    - list: The selected frame indices. In 'rand' / 'middle' mode a result shorter than
      ``num_frames`` is padded by repeating the last index.

    Raises:
    - ValueError: For an unknown ``sample`` mode.
    """
    # Local import: ``random`` is not among this module's top-level imports. Without it the
    # per-interval sampling below failed with NameError and always used the fallback.
    import random

    if sample in ["rand", "middle"]: # uniform sampling
        acc_samples = min(num_frames, vlen)
        # split the video into `acc_samples` intervals, and sample from each interval.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = [(start, stop - 1) for start, stop in zip(intervals[:-1], intervals[1:])]
        if sample == 'rand':
            try:
                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
            except (IndexError, ValueError):
                # An interval too short to draw from: fall back to a sorted random subset.
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [x[0] + fix_start for x in ranges]
        elif sample == 'middle':
            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
        else:
            raise NotImplementedError

        if len(frame_indices) < num_frames:  # padded with last frame
            padded_frame_indices = [frame_indices[-1]] * num_frames
            padded_frame_indices[:len(frame_indices)] = frame_indices
            frame_indices = padded_frame_indices
    elif "fps" in sample:  # fps0.5, sequentially sample frames at 0.5 fps
        output_fps = float(sample[3:])
        duration = float(vlen) / input_fps
        delta = 1 / output_fps  # gap between frames, this is also the clip length each frame represents
        # Sample at the centre of every ``delta``-long clip.
        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
            frame_indices = frame_indices[:max_num_frames]
    else:
        raise ValueError
    return frame_indices

def load_video(video_path, data_transform=None, num_frames=None, return_tensor=True, width=None, height=None):
    """
    Load a video (or a single image) from disk as a stack of RGB frames.

    The decoder is chosen from the file extension, matched case-insensitively:
    - ``.gif``: every frame is read with PIL and converted to RGB.
    - ``.png``: the single image is converted to RGB and treated as a one-frame video.
    - ``.mp4``: all frames are decoded with ``VideoReader`` using decord's ``native`` bridge.

    Parameters:
    - video_path (str): The file path to the video or image to be loaded.
    - data_transform (callable, optional): Applied to the (T, H, W, C) uint8 NumPy buffer.
      When given, its result is returned as-is and ``return_tensor`` is ignored.
    - num_frames (int, optional): When truthy, the frames are subsampled with
      ``get_frame_indices(num_frames, len(frames), sample="middle")``.
    - return_tensor (bool): When True (and no ``data_transform`` is given), the buffer is
      wrapped with ``torch.Tensor`` and permuted to (T, C, H, W). When False the NumPy
      buffer is returned in (T, H, W, C) layout.
    - width (int, optional): When truthy, ``width`` and ``height`` are both forwarded to
      ``VideoReader`` for ``.mp4`` inputs. Ignored for GIF/PNG.
    - height (int, optional): See ``width``; only used when ``width`` is truthy.

    Returns:
    - frames: The transformed buffer, a tensor of shape (T, C, H, W), or the raw NumPy
      buffer of shape (T, H, W, C), depending on the arguments above.

    Raises:
    - NotImplementedError: If the file extension is not one of .gif, .png or .mp4.
    """
    lowered_path = video_path.lower()  # accept upper-case extensions such as ".GIF"
    if lowered_path.endswith('.gif'):
        # The context manager closes the file as soon as every frame has been decoded.
        with Image.open(video_path) as img:
            frame_ls = [
                np.array(frame.convert('RGB')).astype(np.uint8)
                for frame in ImageSequence.Iterator(img)
            ]
        buffer = np.array(frame_ls).astype(np.uint8)
    elif lowered_path.endswith('.png'):
        with Image.open(video_path) as img:
            frame = np.array(img.convert('RGB')).astype(np.uint8)
        buffer = np.array([frame])
    elif lowered_path.endswith('.mp4'):
        import decord
        # The bridge is a process-wide setting; 'native' is required for .asnumpy() below.
        decord.bridge.set_bridge('native')
        if width:
            video_reader = VideoReader(video_path, width=width, height=height, num_threads=1)
        else:
            video_reader = VideoReader(video_path, num_threads=1)
        frames = video_reader.get_batch(range(len(video_reader)))  # (T, H, W, C) decord array

        buffer = frames.asnumpy().astype(np.uint8)
    else:
        raise NotImplementedError(f"Unsupported video format: {video_path}")

    frames = buffer
    if num_frames:
        frame_indices = get_frame_indices(
            num_frames, len(frames), sample="middle"
        )
        frames = frames[frame_indices]

    if data_transform:
        frames = data_transform(frames)
    elif return_tensor:
        # NOTE: torch.Tensor(...) produces torch's default tensor type, not uint8.
        frames = torch.Tensor(frames)
        frames = frames.permute(0, 3, 1, 2)  # (T, H, W, C) -> (T, C, H, W)

    return frames

def read_frames_decord_by_fps(
        video_path, sample_fps=2, sample='rand', fix_start=None, 
        max_num_frames=-1,  trimmed30=False, num_frames=8
    ):
    """
    Decode a sampled subset of frames from a video and return them channel-first.

    Parameters:
    - video_path (str): Path handed to ``VideoReader``.
    - sample_fps: Accepted for interface compatibility; not read by this function.
    - sample (str): Sampling strategy forwarded to ``get_frame_indices``.
    - fix_start: Forwarded to ``get_frame_indices``.
    - max_num_frames (int): Forwarded to ``get_frame_indices``.
    - trimmed30 (bool): When True, videos longer than 30 seconds are sampled as if
      they ended after 30 seconds.
    - num_frames (int): Number of frames requested from ``get_frame_indices``.

    Returns:
    - The batch returned by ``VideoReader.get_batch`` permuted with (0, 3, 1, 2).
    """
    import decord
    decord.bridge.set_bridge("torch")  # process-wide decord setting

    reader = VideoReader(video_path, num_threads=1)
    total_frames = len(reader)
    avg_fps = reader.get_avg_fps()
    clip_seconds = total_frames / float(avg_fps)

    # Cap the sampling range to the first 30 seconds when requested.
    if trimmed30 and clip_seconds > 30:
        total_frames = int(30 * float(avg_fps))

    picked = get_frame_indices(
        num_frames,
        total_frames,
        sample=sample,
        fix_start=fix_start,
        input_fps=avg_fps,
        max_num_frames=max_num_frames,
    )
    batch = reader.get_batch(picked)  # presumably (T, H, W, C) -- TODO confirm
    return batch.permute(0, 3, 1, 2)  # -> (T, C, H, W) under that assumption
    
def load_dimension_info(json_dir, dimension, lang):
    """
    Collect the videos and prompts that belong to one evaluation dimension.

    Parameters:
    - json_dir (str): Path passed to ``load_json``; the loaded value is iterated as a
      sequence of prompt dictionaries.
    - dimension (str): Dimension name; an entry is kept when ``dimension`` is contained
      in its ``'dimension'`` value and the entry has a ``'video_list'`` key.
    - lang (str): Language suffix; the prompt text is read from ``prompt_{lang}``.

    Returns:
    - video_list (list): All videos of the kept entries, concatenated in file order.
    - prompt_dict_ls (list): One dict per kept entry with keys ``'prompt'`` and
      ``'video_list'``, plus ``'auxiliary_info'`` when the entry provides auxiliary
      info for this dimension.

    Notes:
    - A ``'video_list'`` value that is not a list is wrapped into a one-element list.
    """
    collected_videos = []
    collected_prompts = []
    for record in load_json(json_dir):
        if dimension not in record['dimension'] or 'video_list' not in record:
            continue

        text = record[f'prompt_{lang}']
        videos = record['video_list']
        if not isinstance(videos, list):
            videos = [videos]
        collected_videos.extend(videos)

        entry = {'prompt': text, 'video_list': videos}
        if 'auxiliary_info' in record and dimension in record['auxiliary_info']:
            entry['auxiliary_info'] = record['auxiliary_info'][dimension]
        collected_prompts.append(entry)
    return collected_videos, collected_prompts


def load_i2v_dimension_info(json_dir, dimension, lang, resolution, image_root=None):
    """
    Load image-video pairs and prompt information for one dimension from a JSON file.

    Parameters:
    - json_dir (str): The path of the JSON file handed to ``load_json``.
    - dimension (str): The dimension for evaluation to filter the video prompts.
    - lang (str): The language key used to retrieve the appropriate prompt text.
    - resolution (str): The resolution of the conditioning images. It is not used by the
      default image root; pass ``image_root=f'vbench2_beta_i2v/data/crop/{resolution}'``
      to read images from the resolution-specific crop directory.
    - image_root (str, optional): Directory that contains the conditioning images.
      Defaults to the machine-specific sample directory this function has always used.

    Returns:
    - video_pair_list (list): ``(image_path, video)`` tuples for every video of every
      matching entry.
    - prompt_dict_ls (list): A list of dictionaries, each containing a prompt, its video
      list and, when available for this dimension, its ``'auxiliary_info'``.

    Notes:
    - The JSON file is expected to contain a list of dictionaries with keys 'dimension',
      'video_list', 'image_name', and language-based prompts.
    - The 'video_list' value can either be a list or a single value.
    """
    if image_root is None:
        # Previously a resolution-based path was assigned and then unconditionally
        # overwritten by this literal; the effective default is kept for compatibility.
        image_root = '/root/autodl-tmp/video_samples/samples_sora-original_model.safetensors_vbench'

    video_pair_list = []
    prompt_dict_ls = []
    full_prompt_list = load_json(json_dir)
    for prompt_dict in full_prompt_list:
        if dimension in prompt_dict['dimension'] and 'video_list' in prompt_dict:
            prompt = prompt_dict[f'prompt_{lang}']
            cur_video_list = prompt_dict['video_list'] if isinstance(prompt_dict['video_list'], list) else [prompt_dict['video_list']]
            # create image-video pair
            image_path = os.path.join(image_root, prompt_dict["image_name"])
            cur_video_pair = [(image_path, video) for video in cur_video_list]
            video_pair_list += cur_video_pair
            if 'auxiliary_info' in prompt_dict and dimension in prompt_dict['auxiliary_info']:
                prompt_dict_ls += [{'prompt': prompt, 'video_list': cur_video_list, 'auxiliary_info': prompt_dict['auxiliary_info'][dimension]}]
            else:
                prompt_dict_ls += [{'prompt': prompt, 'video_list': cur_video_list}]
    return video_pair_list, prompt_dict_ls


def init_submodules(dimension_list, local=False, read_frame=False, resolution="1-1"):
    """
    Build the per-dimension model-loading settings.

    Parameters:
    - dimension_list (iterable): Dimension names to configure.
    - local (bool): When True, the DINO repository and weights are expected under
      ``CACHE_DIR`` and are fetched with ``git clone`` / ``wget`` if missing.
    - read_frame: Accepted for interface compatibility; not read by this function.
    - resolution (str): Stored under ``'resolution'`` for the i2v dimensions.

    Returns:
    - dict: Maps each recognised dimension ('i2v_subject', 'i2v_background',
      'camera_motion') to its settings. Other dimensions get no entry.
    """
    if local:
        logger.info("\x1b[32m[Local Mode]\x1b[0m Working in local mode, please make sure that the pre-trained model has been fully downloaded.")

    settings = {}
    for dimension in dimension_list:
        os.makedirs(CACHE_DIR, exist_ok=True)

        if dimension == 'camera_motion':
            settings[dimension] = {
                "repo":"facebookresearch/co-tracker",
                "model":"cotracker2"
            }
            continue

        if dimension not in ('i2v_subject', 'i2v_background'):
            continue

        if not local:
            settings[dimension] = {
                'repo_or_dir':'facebookresearch/dino:main',
                'source':'github',
                'model': 'dino_vitb16',
                'resolution': resolution
            }
            continue

        details = {
            'repo_or_dir': f'{CACHE_DIR}/dino_model/facebookresearch_dino_main/',
            'path': f'{CACHE_DIR}/dino_model/dino_vitbase16_pretrain.pth', 
            'model': 'dino_vitb16',
            'source': 'local',
            'resolution': resolution
        }
        settings[dimension] = details

        # Clone the DINO code when its directory is missing.
        if not os.path.isdir(details['repo_or_dir']):
            print(f"Directory {details['repo_or_dir']} does not exist. Cloning repository...")
            clone_command = ['git', 'clone', 'https://github.com/facebookresearch/dino', details['repo_or_dir']]
            subprocess.run(clone_command, check=True)

        # Download the pretrained weights next to the expected path when absent.
        if not os.path.isfile(details['path']):
            print(f"File {details['path']} does not exist. Downloading...")
            target_dir = os.path.dirname(details['path'])
            subprocess.run(
                ['wget', '-P', target_dir,
                 'https://dl.fbaipublicfiles.com/dino/dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth'],
                check=True,
            )
    return settings


def save_json(data, path, indent=4):
    """Write ``data`` as JSON to the UTF-8 text file at ``path`` using ``indent``."""
    with open(path, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=indent)

def load_json(path):
    """
    Read and parse a UTF-8 encoded JSON file.

    Parameters:
    - path (str): The path to the JSON file.

    Returns:
    - The parsed JSON content (for example a dict or a list).
    """
    with open(path, mode='r', encoding='utf-8') as in_file:
        content = json.load(in_file)
    return content


================================================
FILE: Open-Sora/configs/dit/inference/16x256x256.py
================================================
# Output clip: 16 frames at 8 fps, 256x256 pixels
num_frames = 16
fps = 8
image_size = (256, 256)

# Text-conditioned DiT-XL/2
model = {
    "type": "DiT-XL/2",
    "condition": "text",
    "from_pretrained": "PRETRAINED_MODEL",
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
}
text_encoder = {
    "type": "clip",
    "from_pretrained": "openai/clip-vit-base-patch32",
    "model_max_length": 77,
}
scheduler = {
    "type": "dpm-solver",
    "num_sampling_steps": 20,
    "cfg_scale": 4.0,
}
dtype = "bf16"

# Run settings
batch_size = 2
seed = 42
prompt_path = "./assets/texts/ucf101_labels.txt"
save_dir = "./samples/samples/"


================================================
FILE: Open-Sora/configs/dit/inference/1x256x256-class.py
================================================
# Single-frame output, 256x256 pixels
num_frames = 1
fps = 1
image_size = (256, 256)

# Class-conditioned DiT-XL/2 (1000 labels), no temporal position embedding
model = {
    "type": "DiT-XL/2",
    "no_temporal_pos_emb": True,
    "condition": "label_1000",
    "from_pretrained": "DiT-XL-2-256x256.pt",
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
}
text_encoder = {
    "type": "classes",
    "num_classes": 1000,
}
scheduler = {
    "type": "dpm-solver",
    "num_sampling_steps": 20,
    "cfg_scale": 4.0,
}
dtype = "bf16"

# Run settings
batch_size = 2
seed = 42
prompt_path = "./assets/texts/imagenet_id.txt"
save_dir = "./samples/samples/"


================================================
FILE: Open-Sora/configs/dit/inference/1x256x256.py
================================================
# Single-frame output, 256x256 pixels
num_frames = 1
fps = 1
image_size = (256, 256)

# Text-conditioned DiT-XL/2, no temporal position embedding
model = {
    "type": "DiT-XL/2",
    "no_temporal_pos_emb": True,
    "condition": "text",
    "from_pretrained": "PRETRAINED_MODEL",
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
}
text_encoder = {
    "type": "clip",
    "from_pretrained": "openai/clip-vit-base-patch32",
    "model_max_length": 77,
}
scheduler = {
    "type": "dpm-solver",
    "num_sampling_steps": 20,
    "cfg_scale": 4.0,
}
dtype = "bf16"

# Run settings
batch_size = 2
seed = 42
prompt_path = "./assets/texts/imagenet_labels.txt"
save_dir = "./samples/samples/"


================================================
FILE: Open-Sora/configs/dit/train/16x256x256.py
================================================
# Training data: 16-frame clips, every 3rd frame, 256x256 pixels
dataset = {
    "type": "VideoTextDataset",
    "data_path": None,
    "num_frames": 16,
    "frame_interval": 3,
    "image_size": (256, 256),
}

# Acceleration settings
num_workers = 4
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Networks
model = {
    "type": "DiT-XL/2",
    "from_pretrained": "DiT-XL-2-256x256.pt",
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
}
text_encoder = {
    "type": "clip",
    "from_pretrained": "openai/clip-vit-base-patch32",
    "model_max_length": 77,
}
scheduler = {
    "type": "iddpm",
    "timestep_respacing": "",
}

# Bookkeeping
seed = 42
outputs = "outputs"
wandb = False

# Schedule and checkpointing
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

# Optimisation
batch_size = 8
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/dit/train/1x256x256.py
================================================
# Training data: single frames, 256x256 pixels, "center" transform
dataset = {
    "type": "VideoTextDataset",
    "data_path": None,
    "num_frames": 1,
    "frame_interval": 1,
    "image_size": (256, 256),
    "transform_name": "center",
}

# Acceleration settings
num_workers = 4
dtype = "bf16"
grad_checkpoint = False
plugin = "zero2"
sp_size = 1

# Networks
model = {
    "type": "DiT-XL/2",
    "no_temporal_pos_emb": True,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
}
text_encoder = {
    "type": "clip",
    "from_pretrained": "openai/clip-vit-base-patch32",
    "model_max_length": 77,
}
scheduler = {
    "type": "iddpm",
    "timestep_respacing": "",
}

# Bookkeeping
seed = 42
outputs = "outputs"
wandb = False

# Schedule and checkpointing
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

# Optimisation
batch_size = 128
lr = 1e-4  # according to DiT repo
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/latte/inference/16x256x256-class.py
================================================
# Output clip: 16 frames at 8 fps, 256x256 pixels
num_frames = 16
fps = 8
image_size = (256, 256)

# Class-conditioned Latte-XL/2 (101 labels)
model = {
    "type": "Latte-XL/2",
    "condition": "label_101",
    "from_pretrained": "Latte-XL-2-256x256-ucf101.pt",
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
}
text_encoder = {
    "type": "classes",
    "num_classes": 101,
}
scheduler = {
    "type": "dpm-solver",
    "num_sampling_steps": 20,
    "cfg_scale": 4.0,
}
dtype = "bf16"

# Run settings
batch_size = 2
seed = 42
prompt_path = "./assets/texts/ucf101_id.txt"
save_dir = "./samples/samples/"


================================================
FILE: Open-Sora/configs/latte/inference/16x256x256.py
================================================
# Output clip: 16 frames at 8 fps, 256x256 pixels
num_frames = 16
fps = 8
image_size = (256, 256)

# Text-conditioned Latte-XL/2
model = {
    "type": "Latte-XL/2",
    "condition": "text",
    "from_pretrained": "PRETRAINED_MODEL",
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
}
text_encoder = {
    "type": "clip",
    "from_pretrained": "openai/clip-vit-base-patch32",
    "model_max_length": 77,
}
scheduler = {
    "type": "dpm-solver",
    "num_sampling_steps": 20,
    "cfg_scale": 4.0,
}
dtype = "bf16"

# Run settings
batch_size = 2
seed = 42
prompt_path = "./assets/texts/ucf101_labels.txt"
save_dir = "./samples/samples/"


================================================
FILE: Open-Sora/configs/latte/train/16x256x256.py
================================================
# Training data: 16-frame clips, every 3rd frame, 256x256 pixels
dataset = {
    "type": "VideoTextDataset",
    "data_path": None,
    "num_frames": 16,
    "frame_interval": 3,
    "image_size": (256, 256),
}

# Acceleration settings
num_workers = 4
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Networks
model = {
    "type": "Latte-XL/2",
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
}
text_encoder = {
    "type": "clip",
    "from_pretrained": "openai/clip-vit-base-patch32",
    "model_max_length": 77,
}
scheduler = {
    "type": "iddpm",
    "timestep_respacing": "",
}

# Bookkeeping
seed = 42
outputs = "outputs"
wandb = False

# Schedule and checkpointing
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

# Optimisation
batch_size = 8
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora/inference/16x256x256.py
================================================
# Output clip: 16 frames, 256x256 pixels
num_frames = 16
fps = 24 // 3
image_size = (256, 256)

# Networks
model = {
    "type": "STDiT-XL/2",
    "space_scale": 0.5,
    "time_scale": 1.0,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
    "from_pretrained": "PRETRAINED_MODEL",
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 4,
}
text_encoder = {
    "type": "t5",
    # Absolute, machine-local checkpoint path; adjust for your environment.
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
}
scheduler = {
    "type": "iddpm",
    "num_sampling_steps": 100,
    "cfg_scale": 7.0,
    "cfg_channel": 3,  # or None
}
dtype = "bf16"

# Prompts
prompt_path = "./assets/texts/t2v_samples.txt"
prompt = None  # prompt has higher priority than prompt_path

# Run settings
batch_size = 1
seed = 42
save_dir = "./samples/samples/"


================================================
FILE: Open-Sora/configs/opensora/inference/16x512x512-rflow.py
================================================
# Output clip: 16 frames, 512x512 pixels
num_frames = 16
fps = 24 // 3
image_size = (512, 512)

# Networks
model = {
    "type": "STDiT-XL/2",
    "space_scale": 1.0,
    "time_scale": 1.0,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
    "from_pretrained": "PRETRAINED_MODEL",
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 2,
}
text_encoder = {
    "type": "t5",
    # Absolute, machine-local checkpoint path; adjust for your environment.
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
}
# "rflow" scheduler with 10 sampling steps
scheduler = {
    "type": "rflow",
    "num_sampling_steps": 10,
    "cfg_scale": 7.0,
}
dtype = "bf16"

# Run settings
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
save_dir = "./outputs/samples/"


================================================
FILE: Open-Sora/configs/opensora/inference/16x512x512.py
================================================
# Output clip: 16 frames, 512x512 pixels
num_frames = 16
fps = 24 // 3
image_size = (512, 512)

# Networks
model = {
    "type": "STDiT-XL/2",
    "space_scale": 1.0,
    "time_scale": 1.0,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
    "from_pretrained": "PRETRAINED_MODEL",
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 2,
}
text_encoder = {
    "type": "t5",
    # Absolute, machine-local checkpoint path; adjust for your environment.
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
}
scheduler = {
    "type": "iddpm",
    "num_sampling_steps": 100,
    "cfg_scale": 7.0,
}
dtype = "bf16"

# Run settings
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
save_dir = "./samples/samples/"


================================================
FILE: Open-Sora/configs/opensora/inference/64x512x512.py
================================================
# Output clip: 64 frames, 512x512 pixels
num_frames = 64
fps = 24 // 2
image_size = (512, 512)

# Networks
model = {
    "type": "STDiT-XL/2",
    "space_scale": 1.0,
    "time_scale": 2 / 3,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
    "from_pretrained": "PRETRAINED_MODEL",
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 128,
}
text_encoder = {
    "type": "t5",
    # Absolute, machine-local checkpoint path; adjust for your environment.
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
}
scheduler = {
    "type": "iddpm",
    "num_sampling_steps": 100,
    "cfg_scale": 7.0,
}
dtype = "bf16"

# Run settings
batch_size = 1
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
save_dir = "./samples/samples/"


================================================
FILE: Open-Sora/configs/opensora/train/16x256x256-mask.py
================================================
# Training data: 16-frame clips, every 3rd frame, 256x256 pixels
dataset = {
    "type": "VideoTextDataset",
    "data_path": None,
    "num_frames": 16,
    "frame_interval": 3,
    "image_size": (256, 256),
}

# Acceleration settings
num_workers = 4
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Networks
model = {
    "type": "STDiT-XL/2",
    "space_scale": 0.5,
    "time_scale": 1.0,
    "from_pretrained": "PixArt-XL-2-512x512.pth",
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
# Ratio assigned to each mask strategy (the values sum to 1.0)
mask_ratios = {
    "identity": 0.7,
    "random": 0.15,
    "mask_head": 0.05,
    "mask_tail": 0.05,
    "mask_head_tail": 0.05,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
}
text_encoder = {
    "type": "t5",
    # Absolute, machine-local checkpoint path; adjust for your environment.
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
    "shardformer": True,
}
scheduler = {
    "type": "iddpm",
    "timestep_respacing": "",
}

# Bookkeeping
seed = 42
outputs = "outputs"
wandb = False

# Schedule and checkpointing
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

# Optimisation
batch_size = 8
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora/train/16x256x256-spee-rflow.py
================================================
# Training data: 16-frame clips, every 3rd frame, 256x256 pixels
dataset = {
    "type": "VideoTextDataset",
    "data_path": None,
    "num_frames": 16,
    "frame_interval": 3,
    "image_size": (256, 256),
}

# Acceleration settings
num_workers = 4
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Networks
# Alternative checkpoints previously listed for from_pretrained:
#   "PixArt-XL-2-512x512.pth", "OpenSora-v1-HQ-16x512x512.pth"
model = {
    "type": "STDiT-XL/2",
    "space_scale": 0.5,
    "time_scale": 1.0,
    "from_pretrained": "PRETRAINED_MODEL",
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
# No mask_ratios is defined in this config. Values kept for reference only:
#   identity 0.9, random 0.06, mask_head 0.01, mask_tail 0.01, mask_head_tail 0.02
#   (an older list form was [0.5, 0.29, 0.07, 0.07, 0.07])
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
}
text_encoder = {
    "type": "t5",
    # Absolute, machine-local checkpoint path; adjust for your environment.
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
    "shardformer": True,
}
# "rflow" scheduler; timestep_respacing is intentionally not set here
scheduler = {
    "type": "rflow",
}

# Bookkeeping
seed = 42
outputs = "outputs"
wandb = True

# Schedule and checkpointing
epochs = 1
log_every = 10
ckpt_every = 1000
load = None

# Optimisation
batch_size = 16
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora/train/16x256x256-spee.py
================================================
# Training data: 16-frame clips, every 3rd frame, 256x256 pixels
dataset = {
    "type": "VideoTextDataset",
    "data_path": None,
    "num_frames": 16,
    "frame_interval": 3,
    "image_size": (256, 256),
}

# Acceleration settings
num_workers = 4
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Networks
model = {
    "type": "STDiT-XL/2",
    "space_scale": 0.5,
    "time_scale": 1.0,
    "from_pretrained": "PixArt-XL-2-512x512.pth",
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
# Ratio assigned to each mask strategy
mask_ratios = {
    "identity": 0.5,
    "random": 0.29,
    "mask_head": 0.07,
    "mask_tail": 0.07,
    "mask_head_tail": 0.07,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
}
text_encoder = {
    "type": "t5",
    # Absolute, machine-local checkpoint path; adjust for your environment.
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
    "shardformer": True,
}
scheduler = {
    "type": "iddpm-speed",
    "timestep_respacing": "",
}

# Bookkeeping
seed = 42
outputs = "outputs"
wandb = False

# Schedule and checkpointing
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

# Optimisation
batch_size = 8
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora/train/16x256x256.py
================================================
# Training data: 16-frame clips, every 3rd frame, 256x256 pixels
dataset = {
    "type": "VideoTextDataset",
    "data_path": None,
    "num_frames": 16,
    "frame_interval": 3,
    "image_size": (256, 256),
}

# Acceleration settings (note: num_workers is 0 in this config)
num_workers = 0
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Networks
model = {
    "type": "STDiT-XL/2",
    "space_scale": 0.5,
    "time_scale": 1.0,
    "from_pretrained": "PixArt-XL-2-512x512.pth",
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
}
text_encoder = {
    "type": "t5",
    # Absolute, machine-local checkpoint path; adjust for your environment.
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
    "shardformer": True,
}
scheduler = {
    "type": "iddpm",
    "timestep_respacing": "",
}

# Bookkeeping
seed = 42
outputs = "outputs"
wandb = False

# Schedule and checkpointing
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

# Optimisation
batch_size = 8
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora/train/16x512x512.py
================================================
# Training data: 16-frame clips, every 3rd frame, 512x512 pixels
dataset = {
    "type": "VideoTextDataset",
    "data_path": None,
    "num_frames": 16,
    "frame_interval": 3,
    "image_size": (512, 512),
}

# Acceleration settings
num_workers = 4
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Networks (no pretrained checkpoint is set for the model)
model = {
    "type": "STDiT-XL/2",
    "space_scale": 1.0,
    "time_scale": 1.0,
    "from_pretrained": None,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 128,
}
text_encoder = {
    "type": "t5",
    # Absolute, machine-local checkpoint path; adjust for your environment.
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
    "shardformer": True,
}
scheduler = {
    "type": "iddpm",
    "timestep_respacing": "",
}

# Bookkeeping
seed = 42
outputs = "outputs"
wandb = False

# Schedule and checkpointing
epochs = 1000
log_every = 10
ckpt_every = 500
load = None

# Optimisation
batch_size = 8
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora/train/360x512x512.py
================================================
# Define dataset: 360-frame clips, every 3rd frame, 512x512 pixels
dataset = dict(
    type="VideoTextDataset",
    data_path=None,
    num_frames=360,
    frame_interval=3,
    image_size=(512, 512),
)

# Define acceleration
# This section used to appear twice; the second copy overrode plugin/sp_size.
# Only the effective values are kept: sequence parallelism across 2 ranks.
num_workers = 4
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2-seq"
sp_size = 2

# Define model
model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=2 / 3,
    from_pretrained=None,
    enable_flash_attn=True,
    enable_layernorm_kernel=True,
    enable_sequence_parallelism=True,  # enable sq here
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    micro_batch_size=128,
)
text_encoder = dict(
    type="t5",
    # Absolute, machine-local checkpoint path; adjust for your environment.
    from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
    shardformer=True,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)

# Others
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 250
load = None

batch_size = 1
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora/train/64x512x512-sp.py
================================================
# Training data: every 3rd frame, 512x512 pixels
# NOTE(review): the file name says 64 frames but num_frames is 16 -- confirm.
dataset = {
    "type": "VideoTextDataset",
    "data_path": None,
    "num_frames": 16,
    "frame_interval": 3,
    "image_size": (512, 512),
}

# Acceleration settings
# NOTE(review): sp_size is 2 while plugin stays "zero2" -- confirm this is intended.
num_workers = 4
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 2

# Networks
model = {
    "type": "STDiT-XL/2",
    "space_scale": 1.0,
    "time_scale": 2 / 3,
    "from_pretrained": None,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
    "enable_sequence_parallelism": True,  # enable sq here
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
}
text_encoder = {
    "type": "t5",
    # Absolute, machine-local checkpoint path; adjust for your environment.
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
    "shardformer": True,
}
scheduler = {
    "type": "iddpm",
    "timestep_respacing": "",
}

# Bookkeeping
seed = 42
outputs = "outputs"
wandb = False

# Schedule and checkpointing
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

# Optimisation
batch_size = 1
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora/train/64x512x512.py
================================================
# Training data: 64-frame clips, every 3rd frame, 512x512 pixels
dataset = {
    "type": "VideoTextDataset",
    "data_path": None,
    "num_frames": 64,
    "frame_interval": 3,
    "image_size": (512, 512),
}

# Acceleration settings
num_workers = 4
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Networks (no pretrained checkpoint is set for the model)
model = {
    "type": "STDiT-XL/2",
    "space_scale": 1.0,
    "time_scale": 2 / 3,
    "from_pretrained": None,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 64,
}
text_encoder = {
    "type": "t5",
    # Absolute, machine-local checkpoint path; adjust for your environment.
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
    "shardformer": True,
}
scheduler = {
    "type": "iddpm",
    "timestep_respacing": "",
}

# Bookkeeping
seed = 42
outputs = "outputs"
wandb = False

# Schedule and checkpointing
epochs = 1000
log_every = 10
ckpt_every = 250
load = None

# Optimisation
batch_size = 4
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora-v1-1/inference/sample-ref.py
================================================
# Output clip geometry
num_frames = 16
frame_interval = 3
fps = 24
image_size = (240, 426)
multi_resolution = "STDiT2"

# Condition
# Each prompt may end with a JSON suffix carrying "reference_path" and
# "mask_strategy"; the mask_strategy fields are described further below.
prompt_path = None
prompt = [
    'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. {"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}',
    'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png","mask_strategy": "0"}',
    'A car driving on the ocean.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4","mask_strategy": "0,0,-8,0,8"}',
    'A snowy forest.{"reference_path": "https://cdn.pixabay.com/video/2021/04/25/72171-542991404_large.mp4","mask_strategy": "0,0,0,0,15,0.8"}',
    'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png;assets/images/condition/sunset2.png","mask_strategy": "0;0,1,0,-1,1"}',
    '|0|a white jeep equipped with a roof rack driving on a dirt road in a coniferous forest.|2|a white jeep equipped with a roof rack driving on a dirt road in the desert.|4|a white jeep equipped with a roof rack driving on a dirt road in a mountain.|6|A white jeep equipped with a roof rack driving on a dirt road in a city.|8|a white jeep equipped with a roof rack driving on a dirt road on the surface of a river.|10|a white jeep equipped with a roof rack driving on a dirt road under the lake.|12|a white jeep equipped with a roof rack flying into the sky.|14|a white jeep equipped with a roof rack driving in the universe. Earth is the background.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,15"}',
]

loop = 2
condition_frame_length = 4
# Fields of one mask_strategy entry, in order:
#   1. loop id          - the loop index of the condition image or video
#   2. reference id     - the index of the condition image or video in the reference_path
#   3. reference start  - the start frame of the condition image or video
#   4. target start     - the location to insert
#   5. length           - the number of frames to insert
#   6. edit_ratio       - the edit rate of the condition image or video
# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/config.md#advanced-inference-config for more details
# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/commands.md#inference-with-open-sora-11 for more examples

# Model components
model = {
    "type": "STDiT2-XL/2",
    "from_pretrained": "hpcai-tech/OpenSora-STDiT-v2-stage3",
    "input_sq_size": 512,
    "qk_norm": True,
    "qk_norm_legacy": True,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "cache_dir": None,  # e.g. "/mnt/hdd/cached_models"
    "micro_batch_size": 4,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "cache_dir": None,  # e.g. "/mnt/hdd/cached_models"
    "model_max_length": 200,
}
scheduler = {
    "type": "iddpm",
    "num_sampling_steps": 100,
    "cfg_scale": 7.0,
    "cfg_channel": 3,  # or None
}
dtype = "bf16"

# Others
batch_size = 1
seed = 42
save_dir = "./samples/samples/"


================================================
FILE: Open-Sora/configs/opensora-v1-1/inference/sample.py
================================================
# Output clip geometry
num_frames = 16
frame_interval = 3
fps = 24
image_size = (240, 426)
multi_resolution = "STDiT2"

# Model components
model = {
    "type": "STDiT2-XL/2",
    "from_pretrained": "hpcai-tech/OpenSora-STDiT-v2-stage3",
    "input_sq_size": 512,
    "qk_norm": True,
    "qk_norm_legacy": True,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "cache_dir": None,  # e.g. "/mnt/hdd/cached_models"
    "micro_batch_size": 4,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "cache_dir": None,  # e.g. "/mnt/hdd/cached_models"
    "model_max_length": 200,
}
scheduler = {
    "type": "iddpm",
    "num_sampling_steps": 100,
    "cfg_scale": 7.0,
    "cfg_channel": 3,  # or None
}
dtype = "bf16"

# Condition: `prompt`, when set, has higher priority than `prompt_path`
prompt_path = "./assets/texts/t2v_samples.txt"
prompt = None

# Others
batch_size = 1
seed = 42
save_dir = "./samples/samples/"


================================================
FILE: Open-Sora/configs/opensora-v1-1/train/benchmark.py
================================================
# NOTE: this config exists only for batch size search; it is not used for training.

# Dataset
dataset = {
    "type": "VariableVideoTextDataset",
    "data_path": None,
    "num_frames": None,
    "frame_interval": 3,
    "image_size": (None, None),
    "transform_name": "resize_crop",
}

# Accepted bucket config layouts:
# 1. { resolution: {num_frames: (prob, batch_size)} }
#      batch_size is ignored when searching
# 2. { resolution: {num_frames: (prob, (max_batch_size, ))} }
#      batch_size is searched in [batch_size_start, max_batch_size); batch_size_start is configured via CLI
# 3. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size))} }
#      batch_size is searched in [min_batch_size, max_batch_size)
# 4. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size, step_size))} }
#      batch_size is searched in [min_batch_size, max_batch_size) with step_size (grid search)
# 5. { resolution: {num_frames: (0.0, None)} }
#      this bucket will not be used

bucket_config = {
    # == manual search ==
    # "240p": {128: (1.0, 2)}, # 4.28s/it
    # "240p": {64: (1.0, 4)},
    # "240p": {32: (1.0, 8)},  # 4.6s/it
    # "240p": {16: (1.0, 16)},  # 4.6s/it
    # "480p": {16: (1.0, 4)},  # 4.6s/it
    # "720p": {16: (1.0, 2)},  # 5.89s/it
    # "256": {1: (1.0, 256)},  # 4.5s/it
    # "512": {1: (1.0, 96)}, # 4.7s/it
    # "512": {1: (1.0, 128)}, # 6.3s/it
    # "480p": {1: (1.0, 50)},  # 4.0s/it
    # "1024": {1: (1.0, 32)},  # 6.8s/it
    # "1024": {1: (1.0, 20)}, # 4.3s/it
    # "1080p": {1: (1.0, 16)}, # 8.6s/it
    # "1080p": {1: (1.0, 8)},  # 4.4s/it
    # == stage 2 ==
    # "240p": {
    #     16: (1.0, (2, 32)),
    #     32: (1.0, (2, 16)),
    #     64: (1.0, (2, 8)),
    #     128: (1.0, (2, 6)),
    # },
    # "256": {1: (1.0, (128, 300))},
    # "512": {1: (0.5, (64, 128))},
    # "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
    # "720p": {16: (0.1, (2, 16)), 32: (0.0, None)},  # No examples now
    # "1024": {1: (0.3, (8, 64))},
    # "1080p": {1: (0.3, (2, 32))},
    # == stage 3 ==
    # NOTE(review): the `1: (20, 40)` entry does not look like a (prob, batch_size)
    # pair from the layouts listed above - confirm the intended meaning.
    "720p": {
        1: (20, 40),
        32: (0.5, (2, 4)),
        64: (0.5, (1, 1)),
    },
}


# Acceleration
num_workers = 4
num_bucket_build_workers = 16
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Model components
model = {
    "type": "STDiT2-XL/2",
    "from_pretrained": None,
    "input_sq_size": 512,  # pretrained model is trained on 512x512
    "qk_norm": True,
    "qk_norm_legacy": True,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 4,
    "local_files_only": True,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 200,
    "shardformer": True,
    "local_files_only": True,
}
scheduler = {
    "type": "iddpm",
    "timestep_respacing": "",
}

# Others
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

batch_size = None
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora-v1-1/train/image.py
================================================
# Dataset
dataset = {
    "type": "VariableVideoTextDataset",
    "data_path": None,
    "num_frames": None,
    "frame_interval": 3,
    "image_size": (None, None),
    "transform_name": "resize_crop",
}
# Every bucket here uses the single-frame (num_frames == 1) key.
bucket_config = {  # 6s/it
    "256": {1: (1.0, 256)},
    "512": {1: (1.0, 80)},
    "480p": {1: (1.0, 52)},
    "1024": {1: (1.0, 20)},
    "1080p": {1: (1.0, 8)},
}

# Acceleration
num_workers = 4
num_bucket_build_workers = 16
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Model components
model = {
    "type": "STDiT2-XL/2",
    "from_pretrained": None,
    "input_sq_size": 512,  # pretrained model is trained on 512x512
    "qk_norm": True,
    "qk_norm_legacy": True,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 4,
    "local_files_only": True,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 200,
    "shardformer": True,
    "local_files_only": True,
}
scheduler = {
    "type": "iddpm",
    "timestep_respacing": "",
}

# Others
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 500
load = None

batch_size = 10  # only for logging
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora-v1-1/train/image_rflow.py
================================================
# Dataset
# Alternative (disabled) variable-resolution dataset:
# dataset = dict(
#     type="VariableVideoTextDataset",
#     data_path=None,
#     num_frames=None,
#     frame_interval=3,
#     image_size=(None, None),
#     transform_name="resize_crop",
# )
dataset = {
    "type": "VideoTextDataset",
    "data_path": None,
    "num_frames": 1,
    "frame_interval": 1,
    "image_size": (256, 256),
    "transform_name": "center",
}
bucket_config = {  # 6s/it
    "256": {1: (1.0, 256)},
    "512": {1: (1.0, 80)},
    "480p": {1: (1.0, 52)},
    "1024": {1: (1.0, 20)},
    "1080p": {1: (1.0, 8)},
}

# Acceleration
num_workers = 16
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Model components
# Alternative (disabled) DiT model definition:
# model = dict(
#     type="DiT-XL/2",
#     from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth",
#     # input_sq_size=512,  # pretrained model is trained on 512x512
#     enable_flash_attn=True,
#     enable_layernorm_kernel=True,
# )
model = {
    "type": "PixArt-XL/2",
    "space_scale": 1.0,
    "time_scale": 1.0,
    "no_temporal_pos_emb": True,
    "from_pretrained": "PixArt-XL-2-512x512.pth",
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
# Second alternative (disabled) DiT model definition:
# model = dict(
#     type="DiT-XL/2",
#     # space_scale=1.0,
#     # time_scale=1.0,
#     no_temporal_pos_emb=True,
#     # from_pretrained="PixArt-XL-2-512x512.pth",
#     from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth",
#     enable_flash_attn=True,
#     enable_layernorm_kernel=True,
# )
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 4,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 200,
    "shardformer": True,
}
scheduler = {
    "type": "rflow",
    # "timestep_respacing": "",
}

# Others
seed = 42
outputs = "outputs"
wandb = False

epochs = 10
log_every = 10
ckpt_every = 500
load = None

batch_size = 100  # only for logging
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora-v1-1/train/stage1.py
================================================
# Dataset
dataset = {
    "type": "VariableVideoTextDataset",
    "data_path": None,
    "num_frames": None,
    "frame_interval": 3,
    "image_size": (None, None),
    "transform_name": "resize_crop",
}
# IMG: 1024 (20%) 512 (30%) 256 (50%) drop (50%)
bucket_config = {  # 1s/it
    "144p": {1: (0.5, 48), 16: (1.0, 6), 32: (1.0, 3), 96: (1.0, 1)},
    "256": {1: (0.5, 24), 16: (0.5, 3), 48: (0.5, 1), 64: (0.0, None)},
    "240p": {16: (0.3, 2), 32: (0.3, 1), 64: (0.0, None)},
    "512": {1: (0.4, 12)},
    "1024": {1: (0.3, 3)},
}
# Per-strategy mask ratios; the values listed here add up to 1.0.
mask_ratios = {
    "identity": 0.75,
    "quarter_random": 0.025,
    "quarter_head": 0.025,
    "quarter_tail": 0.025,
    "quarter_head_tail": 0.05,
    "image_random": 0.025,
    "image_head": 0.025,
    "image_tail": 0.025,
    "image_head_tail": 0.05,
}

# Acceleration
num_workers = 8
num_bucket_build_workers = 16
dtype = "bf16"
grad_checkpoint = False
plugin = "zero2"
sp_size = 1

# Model components
model = {
    "type": "STDiT2-XL/2",
    "from_pretrained": None,
    "input_sq_size": 512,  # pretrained model is trained on 512x512
    "qk_norm": True,
    "qk_norm_legacy": True,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 4,
    "local_files_only": True,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 200,
    "shardformer": True,
    "local_files_only": True,
}
scheduler = {
    "type": "iddpm",
    "timestep_respacing": "",
}

# Others
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 500
load = None

batch_size = None
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora-v1-1/train/stage2.py
================================================
# Dataset
dataset = {
    "type": "VariableVideoTextDataset",
    "data_path": None,
    "num_frames": None,
    "frame_interval": 3,
    "image_size": (None, None),
    "transform_name": "resize_crop",
}
bucket_config = {  # 7s/it
    "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 1)},
    "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)},
    "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)},
    "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 2), 64: (0.2, 1), 128: (0.0, None)},
    "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)},
    "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)},
    "1024": {1: (0.3, 20)},
    "1080p": {1: (0.4, 8)},
}
# Per-strategy mask ratios; the values listed here add up to 1.0.
mask_ratios = {
    "identity": 0.75,
    "quarter_random": 0.025,
    "quarter_head": 0.025,
    "quarter_tail": 0.025,
    "quarter_head_tail": 0.05,
    "image_random": 0.025,
    "image_head": 0.025,
    "image_tail": 0.025,
    "image_head_tail": 0.05,
}

# Acceleration
num_workers = 8
num_bucket_build_workers = 16
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Model components
model = {
    "type": "STDiT2-XL/2",
    "from_pretrained": None,
    "input_sq_size": 512,  # pretrained model is trained on 512x512
    "qk_norm": True,
    "qk_norm_legacy": True,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 4,
    "local_files_only": True,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 200,
    "shardformer": True,
    "local_files_only": True,
}
scheduler = {
    "type": "iddpm",
    "timestep_respacing": "",
}

# Others
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 500
load = None

batch_size = None
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora-v1-1/train/stage3.py
================================================
# Dataset
dataset = {
    "type": "VariableVideoTextDataset",
    "data_path": None,
    "num_frames": None,
    "frame_interval": 3,
    "image_size": (None, None),
    "transform_name": "resize_crop",
}
bucket_config = {  # 13s/it
    "144p": {1: (1.0, 200), 16: (1.0, 36), 32: (1.0, 18), 64: (1.0, 9), 128: (1.0, 4)},
    "256": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 11), 64: (0.5, 6), 128: (0.8, 4)},
    "240p": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 10), 64: (0.5, 6), 128: (0.5, 3)},
    "360p": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.5, 1)},
    "512": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.8, 1)},
    "480p": {1: (0.4, 80), 16: (0.6, 6), 32: (0.6, 3), 64: (0.6, 1), 128: (0.0, None)},
    "720p": {1: (0.4, 40), 16: (0.6, 3), 32: (0.6, 1), 96: (0.0, None)},
    "1024": {1: (0.3, 40)},
}
# Per-strategy mask ratios; the values listed here add up to 1.0.
mask_ratios = {
    "identity": 0.75,
    "quarter_random": 0.025,
    "quarter_head": 0.025,
    "quarter_tail": 0.025,
    "quarter_head_tail": 0.05,
    "image_random": 0.025,
    "image_head": 0.025,
    "image_tail": 0.025,
    "image_head_tail": 0.05,
}

# Acceleration
num_workers = 8
num_bucket_build_workers = 16
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Model components
model = {
    "type": "STDiT2-XL/2",
    "from_pretrained": None,
    "input_sq_size": 512,  # pretrained model is trained on 512x512
    "qk_norm": True,
    "qk_norm_legacy": True,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 4,
    "local_files_only": True,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 200,
    "shardformer": True,
    "local_files_only": True,
}
scheduler = {
    "type": "iddpm",
    "timestep_respacing": "",
}

# Others
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 500
load = None

batch_size = None
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora-v1-1/train/video.py
================================================
# Dataset
dataset = {
    "type": "VariableVideoTextDataset",
    "data_path": None,
    "num_frames": None,
    "frame_interval": 3,
    "image_size": (None, None),
    "transform_name": "resize_crop",
}
bucket_config = {  # 6s/it
    "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
    "256": {1: (1.0, 256)},
    "512": {1: (0.5, 80)},
    "480p": {1: (0.4, 52), 16: (0.4, 4), 32: (0.0, None)},
    "720p": {16: (0.1, 2), 32: (0.0, None)},  # No examples now
    "1024": {1: (0.3, 20)},
    "1080p": {1: (0.3, 8)},
}

# Acceleration
num_workers = 4
num_bucket_build_workers = 16
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# Model components
model = {
    "type": "STDiT2-XL/2",
    "from_pretrained": None,
    "input_sq_size": 512,  # pretrained model is trained on 512x512
    "qk_norm": True,
    "qk_norm_legacy": True,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 4,
    "local_files_only": True,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 200,
    "shardformer": True,
    "local_files_only": True,
}
scheduler = {
    "type": "iddpm",
    "timestep_respacing": "",
}

# Others
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 500
load = None

batch_size = 10  # only for logging
lr = 2e-5
grad_clip = 1.0


================================================
FILE: Open-Sora/configs/opensora-v1-2/inference/sample.py
================================================
# Output clip geometry and frame rates
resolution = "240p"
aspect_ratio = "9:16"
num_frames = 51
fps = 24
frame_interval = 1
save_fps = 24

# Output location (alternative: save_dir = "./samples/samples/")
save_dir = "/root/autodl-tmp/video_samples/"
seed = 42
batch_size = 1
multi_resolution = "STDiT2"
dtype = "bf16"
condition_frame_length = 5
align = 5

# Model components
model = {
    "type": "STDiT3-XL/2",
    "from_pretrained": "/root/autodl-tmp/pretrained_models/hpcai-tech/OpenSora-STDiT-v3",
    "qk_norm": True,
    "enable_flash_attn": True,
    "enable_layernorm_kernel": True,
}
vae = {
    "type": "OpenSoraVAE_V1_2",
    "from_pretrained": "/root/autodl-tmp/pretrained_models/hpcai-tech/OpenSora-VAE-v1.2",
    "micro_frame_size": 17,
    "micro_batch_size": 4,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 300,
}
scheduler = {
    "type": "rflow",
    "use_timestep_transform": True,
    "num_sampling_steps": 30,
    "cfg_scale": 7.0,
}

aes = 6.5
flow = None
# num_sample = 1


================================================
FILE: Open-Sora/docs/acceleration.md
================================================
# Acceleration

>This document corresponds to our v1.1 release

Open-Sora aims to provide a high-speed training framework for diffusion models. We can achieve **55%** training speed acceleration when training on **64 frames 512x512 videos**. Our framework supports training **1min 1080p videos**.

## Accelerated Transformer

Open-Sora boosts the training speed by:

- Kernel optimization including [flash attention](https://github.com/Dao-AILab/flash-attention), fused layernorm kernel, and the ones compiled by colossalAI.
- Hybrid parallelism including ZeRO.
- Gradient checkpointing for larger batch size.

Our training speed on images is comparable to [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT), a project to accelerate DiT training. The training speed is measured on 8 H800 GPUs with batch size 128, image size 256x256.

| Model    | Throughput (img/s/GPU) | Throughput (tokens/s/GPU) |
| -------- | ---------------------- | ------------------------- |
| DiT      | 100                    | 26k                       |
| OpenDiT  | 175                    | 45k                       |
| OpenSora | 175                    | 45k                       |

## Efficient STDiT

Our STDiT adopts spatial-temporal attention to model the video data. Compared with directly applying full attention on DiT, our STDiT is more efficient as the number of frames increases. Our current framework only supports sequence parallelism for very long sequences.

The training speed is measured on 8 H800 GPUs with acceleration techniques applied, GC means gradient checkpointing. Both with T5 conditioning like PixArt.

| Model            | Setting        | Throughput (sample/s/GPU) | Throughput (tokens/s/GPU) |
| ---------------- | -------------- | ------------------------- | ------------------------- |
| DiT              | 16x256  (4k)   | 7.20                      | 29k                       |
| STDiT            | 16x256  (4k)   | 7.00                      | 28k                       |
| DiT              | 16x512  (16k)  | 0.85                      | 14k                       |
| STDiT            | 16x512  (16k)  | 1.45                      | 23k                       |
| DiT (GC)         | 64x512  (65k)  | 0.08                      | 5k                        |
| STDiT (GC)       | 64x512  (65k)  | 0.40                      | 25k                       |
| STDiT (GC, sp=2) | 360x512 (370k) | 0.10                      | 18k                       |

With a 4x downsampling in the temporal dimension with Video-VAE, a 24fps video has 450 frames. The gap between the speed of STDiT (28k tokens/s) and DiT on images (up to 45k tokens/s) mainly comes from the T5 and VAE encoding, and temporal attention.

## Accelerated Encoder (T5, VAE)

During training, texts are encoded by T5, and videos are encoded by VAE. Typically there are two ways to accelerate the training:

1. Preprocess text and video data in advance and save them to disk.
2. Encode text and video data during training, and accelerate the encoding process.

For option 1, 120 tokens for one sample require 1M disk space, and a 64x64x64 latent requires 4M. Considering a training dataset with 10M video clips, the total disk space required is 50TB. Our storage system is not ready at this time for this scale of data.

For option 2, we improve T5's speed and memory usage. According to [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT), we find that the VAE consumes a large amount of GPU memory. Thus we split each batch into smaller ones for VAE encoding. With both techniques, we can greatly accelerate the training speed.

The training speed is measured on 8 H800 GPUs with STDiT.

| Acceleration | Setting       | Throughput (img/s/GPU) | Throughput (tokens/s/GPU) |
| ------------ | ------------- | ---------------------- | ------------------------- |
| Baseline     | 16x256  (4k)  | 6.16                   | 25k                       |
| w. faster T5 | 16x256  (4k)  | 7.00                   | 29k                       |
| Baseline     | 64x512  (65k) | 0.94                   | 15k                       |
| w. both      | 64x512  (65k) | 1.45                   | 23k                       |


================================================
FILE: Open-Sora/docs/commands.md
================================================
# Commands

- [Config](#config)
- [Inference](#inference)
  - [Inference with Open-Sora 1.2](#inference-with-open-sora-12)
  - [Inference with Open-Sora 1.1](#inference-with-open-sora-11)
  - [Inference with DiT pretrained on ImageNet](#inference-with-dit-pretrained-on-imagenet)
  - [Inference with Latte pretrained on UCF101](#inference-with-latte-pretrained-on-ucf101)
  - [Inference with PixArt-α pretrained weights](#inference-with-pixart-α-pretrained-weights)
  - [Inference with checkpoints saved during training](#inference-with-checkpoints-saved-during-training)
  - [Inference Hyperparameters](#inference-hyperparameters)
- [Training](#training)
  - [Training Hyperparameters](#training-hyperparameters)
- [Search batch size for buckets](#search-batch-size-for-buckets)

## Config
Note that currently our model loading for vae and diffusion model supports two types:

* load from local file path
* load from huggingface

Our config supports loading from the online Hugging Face Hub by default.
If you wish to load from a local path that holds a model downloaded from Hugging Face, you need to set `force_huggingface=True`, for instance:

```python
# for vae
vae = dict(
    type="OpenSoraVAE_V1_2",
    from_pretrained="/root/commonData/OpenSora-VAE-v1.2",
    micro_frame_size=17,
    micro_batch_size=4,
    force_huggingface=True, # NOTE: set here
)
# for diffusion model
model = dict(
    type="STDiT3-XL/2",
    from_pretrained="/root/commonData/OpenSora-STDiT-v3",
    qk_norm=True,
    enable_flash_attn=True,
    enable_layernorm_kernel=True,
    force_huggingface=True, # NOTE: set here
)
```
However, if you want to load a self-trained model, do not set `force_huggingface=True` since your model won't be in huggingface format.

## Inference

You can modify corresponding config files to change the inference settings. See more details [here](/docs/structure.md#inference-config-demos).

### Inference with Open-Sora 1.2

The inference API is compatible with Open-Sora 1.1. To ease users' experience, we add support to `--resolution` and `--aspect-ratio` options, which is a more user-friendly way to specify the image size.

```bash
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
    --resolution 480p --aspect-ratio 9:16
# equivalent to
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
    --image-size 480 853
```

In this version, we have merged all functions in previous `inference-long.py` into `inference.py`. The command line arguments are the same as before (note only that the frame index and length are now calculated on the 4x-compressed frames).

### Inference with Open-Sora 1.1

Since Open-Sora 1.1 supports inference with dynamic input size, you can pass the input size as an argument.

```bash
# image sampling with prompt path
python scripts/inference.py configs/opensora-v1-1/inference/sample.py \
    --ckpt-path CKPT_PATH --prompt-path assets/texts/t2i_samples.txt --num-frames 1 --image-size 1024 1024

# image sampling with prompt
python scripts/inference.py configs/opensora-v1-1/inference/sample.py \
    --ckpt-path CKPT_PATH --prompt "A beautiful sunset over the city" --num-frames 1 --image-size 1024 1024

# video sampling
python scripts/inference.py configs/opensora-v1-1/inference/sample.py \
    --ckpt-path CKPT_PATH --prompt "A beautiful sunset over the city" --num-frames 16 --image-size 480 854
```

You can adjust the `--num-frames` and `--image-size` to generate different results. We recommend using the same image size as the training resolution, which is defined in [aspect.py](/opensora/datasets/aspect.py). Some examples are shown below.

- 240p
  - 16:9 240x426
  - 3:4 276x368
  - 1:1 320x320
- 480p
  - 16:9 480x854
  - 3:4 554x738
  - 1:1 640x640
- 720p
  - 16:9 720x1280
  - 3:4 832x1110
  - 1:1 960x960

`inference-long.py` is compatible with `inference.py` and supports advanced features.

```bash
# image condition
python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \
  --num-frames 32 --image-size 240 426 --sample-name image-cond \
  --prompt 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/wave.png","mask_strategy": "0"}'

# video extending
python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \
  --num-frames 32 --image-size 240 426 --sample-name image-cond \
  --prompt 'A car driving on the ocean.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4","mask_strategy": "0,0,0,-8,8"}'

# long video generation
python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \
  --num-frames 32 --image-size 240 426 --loop 16 --condition-frame-length 8 --sample-name long \
  --prompt '|0|a white jeep equipped with a roof rack driving on a dirt road in a coniferous forest.|2|a white jeep equipped with a roof rack driving on a dirt road in the desert.|4|a white jeep equipped with a roof rack driving on a dirt road in a mountain.|6|A white jeep equipped with a roof rack driving on a dirt road in a city.|8|a white jeep equipped with a roof rack driving on a dirt road on the surface of a river.|10|a white jeep equipped with a roof rack driving on a dirt road under the lake.|12|a white jeep equipped with a roof rack flying into the sky.|14|a white jeep equipped with a roof rack driving in the universe. Earth is the background.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,16"}'

# video connecting
python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \
  --num-frames 32 --image-size 240 426 --sample-name connect \
  --prompt 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png;assets/images/condition/sunset2.png","mask_strategy": "0;0,1,0,-1,1"}'

# video editing
python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \
  --num-frames 32 --image-size 480 853 --sample-name edit \
  --prompt 'A cyberpunk-style city at night.{"reference_path": "https://cdn.pixabay.com/video/2021/10/12/91744-636709154_large.mp4","mask_strategy": "0,0,0,0,32,0.4"}'
```

### Inference with DiT pretrained on ImageNet

The following command automatically downloads the pretrained weights on ImageNet and runs inference.

```bash
python scripts/inference.py configs/dit/inference/1x256x256-class.py --ckpt-path DiT-XL-2-256x256.pt
```

### Inference with Latte pretrained on UCF101

The following command automatically downloads the pretrained weights on UCF101 and runs inference.

```bash
python scripts/inference.py configs/latte/inference/16x256x256-class.py --ckpt-path Latte-XL-2-256x256-ucf101.pt
```

### Inference with PixArt-α pretrained weights

Download T5 into `./pretrained_models` and run the following command.

```bash
# 256x256
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x256x256.py --ckpt-path PixArt-XL-2-256x256.pth

# 512x512
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x512x512.py --ckpt-path PixArt-XL-2-512x512.pth

# 1024 multi-scale
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x1024MS.py --ckpt-path PixArt-XL-2-1024MS.pth
```

### Inference with checkpoints saved during training

During training, an experiment logging folder is created in the `outputs` directory. Under each checkpoint folder, e.g. `epoch12-global_step2000`, there is an `ema.pt` and the shared `model` folder. Run the following command to perform inference.

```bash
# inference with ema model
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000/ema.pt

# inference with model
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000

# inference with sequence parallelism
# sequence parallelism is enabled automatically when nproc_per_node is larger than 1
torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000
```

The second command will automatically generate a `model_ckpt.pt` file in the checkpoint folder.

### Inference Hyperparameters

1. DPM-solver is good at fast inference for images. However, the video result is not satisfactory. You can use it for fast demo purpose.

```python
type="dpm-solver"
num_sampling_steps=20
```

2. You can use [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)'s finetuned VAE decoder on videos for inference (consumes more memory). However, we do not see significant improvement in the video result. To use it, download [the pretrained weights](https://huggingface.co/maxin-cn/Latte/tree/main/t2v_required_models/vae_temporal_decoder) into `./pretrained_models/vae_temporal_decoder` and modify the config file as follows.

```python
vae = dict(
    type="VideoAutoencoderKLTemporalDecoder",
    from_pretrained="pretrained_models/vae_temporal_decoder",
)
```

## Training

To resume training, run the following command. ``--load`` is different from ``--ckpt-path`` as it also loads the optimizer and dataloader states.

```bash
torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --load YOUR_PRETRAINED_CKPT
```

To enable wandb logging, add `--wandb` to the command.

```bash
WANDB_API_KEY=YOUR_WANDB_API_KEY torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --wandb True
```

You can modify corresponding config files to change the training settings. See more details [here](/docs/structure.md#training-config-demos).

### Training Hyperparameters

1. `dtype` is the data type for training. Only `fp16` and `bf16` are supported. ColossalAI automatically enables the mixed precision training for `fp16` and `bf16`. During training, we find `bf16` more stable.

## Search batch size for buckets

To search the batch size for buckets, run the following command.

```bash
torchrun --standalone --nproc_per_node 1 scripts/misc/search_bs.py configs/opensora-v1-2/misc/bs.py --data-path /mnt/nfs-207/sora_data/meta/searchbs.csv
```

Here, your data should be a small one for searching purposes.

To control the batch size search range, you should specify `bucket_config` in the config file, where the value tuple is `(guess_value, range)` and the search will be performed in `guess_value±range`.

Here is an example of the bucket config:

```python
bucket_config = {
  "240p": {
        1: (100, 100),
        51: (24, 10),
        102: (12, 10),
        204: (4, 8),
        408: (2, 8),
    },
    "480p": {
        1: (50, 50),
        51: (6, 6),
        102: (3, 3),
        204: (1, 2),
    },
}
```

You can also specify a resolution to search for parallelism.

```bash
torchrun --standalone --nproc_per_node 1 scripts/misc/search_bs.py configs/opensora-v1-2/misc/bs.py --data-path /mnt/nfs-207/sora_data/meta/searchbs.csv --resolution 240p
```

The searching goal should be specified in the config file as well. There are two ways:

1. Specify a `base_step_time` in the config file. The searching goal is to find the batch size that can achieve the `base_step_time` for each bucket.
2. If `base_step_time` is not specified, it will be determined by `base` which is a tuple of `(batch_size, step_time)`. The step time is the maximum batch size allowed for the bucket.

The script will print the best batch size (and corresponding step time) for each bucket and save the output config file. Note that we assume a larger batch size is better, so the script uses binary search to find the best batch size.


================================================
FILE: Open-Sora/docs/config.md
================================================
# Config Guide

- [Inference Config](#inference-config)
- [Advanced Inference config](#advanced-inference-config)
- [Inference Args](#inference-args)
- [Training Config](#training-config)
- [Training Args](#training-args)
- [Training Bucket Configs](#training-bucket-configs)

Our config files follow [MMEngine](https://github.com/open-mmlab/mmengine). MMEngine reads the config file (a `.py` file) and parses it into a dictionary-like object. We expose some fields in the config file to the command line arguments (defined in [opensora/utils/config_utils.py](/opensora/utils/config_utils.py)). To change the inference settings, you can directly modify the corresponding config file, or you can pass arguments to overwrite the config file.

## Inference Config

The explanation of each field is provided below.

```python
# Define sampling size
num_frames = 64               # number of frames, 1 means image
fps = 24                      # frames per second (condition for generation)
frame_interval = 3            # output video will have fps/frame_interval frames per second
image_size = (240, 426)       # image size (height, width)

# Define model
model = dict(
    type="STDiT2-XL/2",       # Select model type (STDiT-XL/2, DiT-XL/2, etc.)
    from_pretrained="PRETRAINED_MODEL",  # (Optional) Load from pretrained model
    input_sq_size=512,        # Base spatial position embedding size
    qk_norm=True,             # Normalize query and key in attention
    enable_flash_attn=True,    # (Optional) Speed up training and inference with flash attention
    # Turn enable_flash_attn to False if you skip flashattn installation
    enable_layernorm_kernel=True, # (Optional) Speed up training and inference with fused kernel
    # Turn enable_layernorm_kernel to False if you skip apex installation
)
vae = dict(
    type="VideoAutoencoderKL", # Select VAE type
    from_pretrained="stabilityai/sd-vae-ft-ema", # Load from pretrained VAE
    micro_batch_size=4,        # VAE with micro batch size to save memory
)
text_encoder = dict(
    type="t5",                 # Select text encoder type (t5, clip)
    from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", # Load from pretrained text encoder
    model_max_length=200,      # Maximum length of input text
)
scheduler = dict(
    type="iddpm",              # Select scheduler type (iddpm, dpm-solver)
    num_sampling_steps=100,    # Number of sampling steps
    cfg_scale=7.0,             # hyper-parameter for classifier-free diffusion
    cfg_channel=3,             # how many channels to use for classifier-free diffusion, if None, use all channels
)
dtype = "bf16"                 # Computation type (fp16, fp32, bf16)

# Condition
prompt_path = "./assets/texts/t2v_samples.txt" # path to prompt file
prompt = None                  # prompt has higher priority than prompt_path

# Other settings
batch_size = 1                 # batch size
seed = 42                      # random seed
save_dir = "./samples"         # path to save samples
```

## Advanced Inference config

The [`inference-long.py`](/scripts/inference-long.py) script is used to generate long videos, and it also provides all functions of the [`inference.py`](/scripts/inference.py) script. The following arguments are specific to the `inference-long.py` script.

```python
loop = 10
condition_frame_length = 4
reference_path = [
    "https://cdn.openai.com/tmp/s/interp/d0.mp4",
    None,
    "assets/images/condition/wave.png",
]
mask_strategy = [
    "0,0,0,0,8,0.3",
    None,
    "0,0,0,0,1;0,0,0,-1,1",
]
```

The following figure provides an illustration of the `mask_strategy`:

![mask_strategy](/assets/readme/report_mask_config.png)

To generate a long video of infinite time, our strategy is to generate a video with a fixed length first, and then use the last `condition_frame_length` number of frames for the next video generation. This will loop for `loop` times. Thus, the total length of the video is `loop * (num_frames - condition_frame_length) + condition_frame_length`.

To condition the generation on images or videos, we introduce the `mask_strategy`. It consists of six-number tuples separated by `;`. Each tuple indicates an insertion of the condition image or video into the target generation. The meaning of each number is:

- **First number**: the loop index of the condition image or video. (0 means the first loop, 1 means the second loop, etc.)
- **Second number**: the index of the condition image or video in the `reference_path`.
- **Third number**: the start frame of the condition image or video. (0 means the first frame, and images only have one frame)
- **Fourth number**: the location to insert, given as a frame index in the target video. (0 means insert at the beginning, 1 means insert at the second frame, and -1 means insert at the end of the video)
- **Fifth number**: the number of frames to insert. (1 means insert one frame, and images only have one frame)
- **Sixth number**: the edit rate of the condition image or video. (0 means no edit, 1 means full edit).

To facilitate usage, we also accept passing the reference path and mask strategy as a json appended to the prompt. For example,

```plaintext
'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff\'s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff\'s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.{"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}'
```

## Inference Args

You can use `python scripts/inference.py --help` to see the following arguments:

- `--seed`: random seed
- `--ckpt-path`: path to the checkpoint (`model["from_pretrained"]`)
- `--batch-size`: batch size
- `--save-dir`: path to save samples
- `--sample-name`: if None, the sample will be named `sample_{index}.mp4/png`; otherwise, the sample will be named `{sample_name}_{index}.mp4/png`
- `--start-index`: start index of the sample
- `--end-index`: end index of the sample
- `--num-sample`: number of samples to generate for each prompt. The sample will be suffixed by `-0`, `-1`, `-2`, etc.
- `--prompt-as-path`: if True, use the prompt as the name for saving samples
- `--prompt-path`: path to the prompt file
- `--prompt`: prompt string list
- `--num-frames`: number of frames
- `--fps`: frames per second
- `--image-size`: image size
- `--num-sampling-steps`: number of sampling steps (`scheduler["num_sampling_steps"]`)
- `--cfg-scale`: hyper-parameter for classifier-free diffusion (`scheduler["cfg_scale"]`)
- `--loop`: loop for long video generation
- `--condition-frame-length`: condition frame length for long video generation
- `--reference-path`: reference path for long video generation
- `--mask-strategy`: mask strategy for long video generation

Example commands for inference can be found in [commands.md](/docs/commands.md).

## Training Config

```python
# Define dataset
dataset = dict(
    type="VariableVideoTextDataset",   # Select dataset type
    # VideoTextDataset for OpenSora 1.0, VariableVideoTextDataset for OpenSora 1.1 and 1.2
    data_path=None,                    # Path to the dataset
    num_frames=None,                   # Number of frames, set None since we support dynamic training
    frame_interval=3,                  # Frame interval
    image_size=(None, None),           # Image size, set None since we support dynamic training
    transform_name="resize_crop",      # Transform name
)
# bucket config usage see next section
bucket_config = {
    "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 1)},
    "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)},
    "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)},
    "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 2), 64: (0.2, 1), 128: (0.0, None)},
    "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)},
    "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)},
    "1024": {1: (0.3, 20)},
    "1080p": {1: (0.4, 8)},
}
# mask ratio in training
mask_ratios = {
    "identity": 0.75,                   # 75% no mask
    "quarter_random": 0.025,      # 2.5% random mask with 1 frame to 1/4 #frames
    "quarter_head": 0.025,        # 2.5% mask at the beginning with 1 frame to 1/4 #frames
    "quarter_tail": 0.025,        # 2.5% mask at the end with 1 frame to 1/4 #frames
    "quarter_head_tail": 0.05,    # 5% mask at the beginning and end with 1 frame to 1/4 #frames
    "image_random": 0.025,        # 2.5% random mask with 1 image to 1/4 #images
    "image_head": 0.025,          # 2.5% mask at the beginning with 1 image to 1/4 #images
    "image_tail": 0.025,          # 2.5% mask at the end with 1 image to 1/4 #images
    "image_head_tail": 0.05,      # 5% mask at the beginning and end with 1 image to 1/4 #images
}

# Define acceleration
num_workers = 8                        # Number of workers for dataloader
num_bucket_build_workers = 16          # Number of workers for bucket building
dtype = "bf16"                         # Computation type (fp16, fp32, bf16)
grad_checkpoint = True                 # Use gradient checkpointing
plugin = "zero2"                       # Plugin for training
sp_size = 1                            # Sequence parallel size

# Define model
model = dict(
    type="STDiT2-XL/2",                # Select model type (STDiT-XL/2, DiT-XL/2, etc.)
    from_pretrained=None,              # Load from pretrained model
    input_sq_size=512,                 # Base spatial position embedding size
    qk_norm=True,                      # Normalize query and key in attention
    enable_flash_attn=True,             # (Optional) Speed up training and inference with flash attention
    enable_layernorm_kernel=True,      # (Optional) Speed up training and inference with fused kernel
)
vae = dict(
    type="VideoAutoencoderKL",         # Select VAE type
    from_pretrained="stabilityai/sd-vae-ft-ema",
    micro_batch_size=4,                # VAE with micro batch size to save memory
    local_files_only=True,             # Load from local files only (first time should be false)
)
text_encoder = dict(
    type="t5",                         # Select text encoder type (t5, clip)
    from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    model_max_length=200,              # Maximum length of input text
    shardformer=True,                  # Use shardformer
    local_files_only=True,             # Load from local files only (first time should be false)
)
scheduler = dict(
    type="iddpm",                      # Select scheduler type (iddpm, iddpm-speed)
    timestep_respacing="",
)

# Others
seed = 42                              # random seed
outputs = "outputs"                    # path to save outputs
wandb = False                          # Use wandb or not

epochs = 1000                          # Number of epochs (set a large number and kill the process when you want to stop)
log_every = 10
ckpt_every = 500
load = None

batch_size = None
lr = 2e-5
grad_clip = 1.0
```

## Training Args

- `--seed`: random seed
- `--ckpt-path`: path to the checkpoint (`model["from_pretrained"]`)
- `--batch-size`: batch size
- `--wandb`: use wandb or not
- `--load`: path to the checkpoint to load
- `--data-path`: path to the dataset (`dataset["data_path"]`)

See [commands.md](/docs/commands.md) for example commands.

## Training Bucket Configs

We support multi-resolution/aspect-ratio/num_frames training with buckets. To enable dynamic training (for STDiT2), use the `VariableVideoTextDataset` dataset, and set the `bucket_config` in the config. An example is:

```python
bucket_config = {
    "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
    "256": {1: (1.0, 256)},
    "512": {1: (1.0, 80)},
    "480p": {1: (1.0, 52), 16: (0.5, 4), 32: (0.0, None)},
    "720p": {16: (1.0, 2), 32: (0.0, None)},
    "1024": {1: (1.0, 20)},
    "1080p": {1: (1.0, 8)},
}
```

This looks a bit difficult to understand at first glance. Let's understand this config step by step.

### Three-level bucket

![bucket](/assets/readme/report_bucket.png)

We design a three-level bucket: `(resolution, num_frames, aspect_ratios)`. The resolution and aspect ratios are predefined in [aspect.py](/opensora/datasets/aspect.py). Commonly used resolutions (e.g., 240p, 1080p) are supported, and the name represents the number of pixels (e.g., 240p is 240x426; however, we define 240p to represent any size with HxW approximately 240x426=102240 pixels). The aspect ratios are defined for each resolution. You do not need to define the aspect ratios in the `bucket_config`.

The `num_frames` is the number of frames in each sample, with `num_frames=1` reserved for images. If `frame_interval` is not 1, a bucket with `num_frames=k` will contain videos with `k*frame_interval` frames, except for images. Only a video with more than `num_frames` frames and more than `resolution` pixels may be put into the bucket.

The two numbers defined in the bucket config are `(keep_prob, batch_size)`. Since the memory and speed of samples from different buckets may be different, we use `batch_size` to balance the processing speed. Since our computation is limited, we cannot process videos with their original resolution as stated in OpenAI's Sora report. Thus, we give a `keep_prob` to control the number of samples in each bucket. The `keep_prob` is the probability of keeping a sample in the bucket. Let's take the following config as an example:

```python
bucket_config = {
    "480p": {16: (1.0, 8),},
    "720p": {16: (0.5, 4),},
    "1080p": {16: (0.2, 2)},
    "4K": {16: (0.1, 1)},
}
```

Given a 2K video with more than 16 frames, the program will first try to put it into bucket "1080p" since it has a larger resolution than 1080p but less than 4K. Since the `keep_prob` for 1080p is 20%, a random number is generated, and if it is less than 0.2, the video will be put into the bucket. If the video is not put into the bucket, the program will try to put it into the "720p" bucket. Since the `keep_prob` for 720p is 50%, the video has a 50% chance to be put into the bucket. If the video is not put into the bucket, the program will try to put it into the "480p" bucket directly as it is the smallest resolution.

### Examples

Let's see some simple examples to understand the bucket config. First, the aspect ratio bucket is compulsory; if you want to modify this, you need to add your own resolution definition in [aspect.py](/opensora/datasets/aspect.py). Then, to keep only 256x256 resolution and 16 frames as in OpenSora 1.0, you can use the following config:

```python
bucket_config = {
    "256": {16: (1.0, 8)},
}
```

If you want to train a model supporting different resolutions of images, you can use the following config (example [image.py](/configs/opensora-v1-1/train/image.py)):

```python
bucket_config = {
    "256": {1: (1.0, 256)},
    "512": {1: (1.0, 80)},
    "480p": {1: (1.0, 52)},
    "1024": {1: (1.0, 20)},
    "1080p": {1: (1.0, 8)},
}
```

Or if you find the number of high-resolution images is too large, you can modify the `keep_prob` to reduce the number of samples in the bucket:

```python
bucket_config = {
    "256": {1: (1.0, 256)},
    "512": {1: (0.8, 80)},
    "480p": {1: (0.5, 52)},
    "1024": {1: (0.5, 20)},
    "1080p": {1: (0.2, 8)},
}
```

And similarly for videos (example [video.py](/configs/opensora-v1-1/train/video.py)):

```python
bucket_config = {
    "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
    "480p": {16: (1.0, 4)},
    "720p": {16: (0.5, 2)},
}
```

Note that in the above case, a video with 480p resolution and more than 16 frames will all go into bucket `("480p", 16)`, since they all satisfy this bucket's requirement. But training long videos with 480p resolution may be slow, so you can modify the config as follows to enforce the video with more than 32 frames to go into the 240p bucket.

```python
bucket_config = {
    "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
    "480p": {16: (1.0, 4), 32: (0.0, None)},
    "720p": {16: (0.5, 2)},
}
```

Combining the above examples, we think you can understand the bucket config provided at the beginning of this section and in the config files.


================================================
FILE: Open-Sora/docs/data_processing.md
================================================
# Data Processing
>Open-Sora v1.2 uses Data Processing Pipeline v1.1.

We establish a complete pipeline for video/image data processing. The pipeline is shown below.

![pipeline](/assets/readme/report_data_pipeline.png)

First, raw videos,
either from the Internet or public datasets, are split into shorter clips based on scene detection.
Then, we evaluate these videos by predicting multiple scores using existing models. We first predict the aesthetic score
and the optical flow score for a video. We also conduct OCR to detect texts in the video. Only videos with satisfactory
evaluation results are sent to the next step for captioning. After captioning, the matching score is also calculated as
an assessment of video-text alignment. Finally, we filter samples based on the matching score and
conduct camera motion detection for the remaining samples.
In summary, our pipeline produces video-text pairs which have high aesthetic quality, large video motion and strong
semantic consistency.

Below is an example workflow to process videos.

```bash
ROOT_VIDEO="/path/to/video/folder"
ROOT_CLIPS="/path/to/video/clips/folder"
ROOT_META="/path/to/meta/folder"

# 1.1 Create a meta file from a video folder. This should output ${ROOT_META}/meta.csv
python -m tools.datasets.convert video ${ROOT_VIDEO} --output ${ROOT_META}/meta.csv

# 1.2 Get video information and remove broken videos. This should output ${ROOT_META}/meta_info_fmin1.csv
python -m tools.datasets.datautil ${ROOT_META}/meta.csv --info --fmin 1

# 2.1 Detect scenes. This should output ${ROOT_META}/meta_info_fmin1_timestamp.csv
python -m tools.scene_cut.scene_detect ${ROOT_META}/meta_info_fmin1.csv

# 2.2 Cut video into clips based on scenes. This should produce video clips under ${ROOT_CLIPS}
python -m tools.scene_cut.cut ${ROOT_META}/meta_info_fmin1_timestamp.csv --save_dir ${ROOT_CLIPS}

# 2.3 Create a meta file for video clips. This should output ${ROOT_META}/meta_clips.csv
python -m tools.datasets.convert video ${ROOT_CLIPS} --output ${ROOT_META}/meta_clips.csv

# 2.4 Get clips information and remove broken ones. This should output ${ROOT_META}/meta_clips_info_fmin1.csv
python -m tools.datasets.datautil ${ROOT_META}/meta_clips.csv --info --fmin 1

# 3.1 Predict aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes.csv
torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference \
  ${ROOT_META}/meta_clips_info_fmin1.csv \
  --bs 1024 \
  --num_workers 16

# 3.2 Filter by aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5.csv
python -m tools.datasets.datautil ${ROOT_META}/meta_clips_info_fmin1_aes.csv --aesmin 5

# 4.1 Generate caption. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5_caption_part*.csv
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava \
  ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5.csv \
  --dp-size 8 \
  --tp-size 1 \
  --model-path /path/to/llava-v1.6-mistral-7b \
  --prompt video

# 4.2 Merge caption results. This should output ${ROOT_META}/meta_clips_caption.csv
python -m tools.datasets.datautil ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5_caption_part*.csv --output ${ROOT_META}/meta_clips_caption.csv

# 4.3 Clean caption. This should output ${ROOT_META}/meta_clips_caption_cleaned.csv
python -m tools.datasets.datautil \
  ${ROOT_META}/meta_clips_caption.csv \
  --clean-caption \
  --refine-llm-caption \
  --remove-empty-caption \
  --output ${ROOT_META}/meta_clips_caption_cleaned.csv

# 4.4 Optionally generate tags (e.g., objects) based on the captions. This should output your_output_prefix_{key}.csv
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llama3 ${ROOT_META}/meta_clips_caption_cleaned.csv --key objects --output_prefix your_output_prefix

```


For more information, please refer to:
- [Dataset Management](../tools/datasets/README.md)
- [Scene Detection and Video Splitting](../tools/scene_cut/README.md)
- [Scoring and Filtering](../tools/scoring/README.md)
- [Captioning](../tools/caption/README.md)


================================================
FILE: Open-Sora/docs/datasets.md
================================================
# Datasets

For Open-Sora 1.2, we conduct mixed training with both images and videos. The main datasets we use are listed below.
Please refer to [README](/README.md#data-processing) for data processing.

## Video

### Webvid-10M

[Webvid-10M](https://github.com/m-bain/webvid) contains 10 million video-text pairs scraped from the stock footage sites.
We first train the model on this dataset (40k hours) for 30k steps (2 epochs).

### Panda-70M

[Panda-70M](https://github.com/snap-research/Panda-70M) is a large-scale dataset with 70M video-caption pairs.
We use the [training-10M subset](https://github.com/snap-research/Panda-70M/tree/main/dataset_dataloading) for training,
which contains ~10M videos of better quality.

### Mixkit

[Mixkit](https://mixkit.co/) is a video website where we obtained 9k videos.

### Pixabay

[Pixabay](https://pixabay.com/videos/) is a video website where we obtained 60.5k videos.

### Pexels

[Pexels](https://www.pexels.com/) is a popular online platform that provides high-quality stock photos, videos, and music for free.
Most videos from this website are of high quality. Thus, we use them for both pre-training and HQ fine-tuning.
We really appreciate the great platform and the contributors!

### Inter4K

[Inter4K](https://github.com/alexandrosstergiou/Inter4K) is a dataset containing 1K video clips with 4K resolution.
The dataset is proposed for super-resolution tasks. We use the dataset for HQ fine-tuning.

### HD-VG-130M

[HD-VG-130M](https://github.com/daooshee/HD-VG-130M?tab=readme-ov-file) comprises 130M text-video pairs.
The caption is generated by BLIP-2.
We find the scene and the text quality are relatively poor. For OpenSora 1.0, we only use ~350K samples from this dataset.

### MiraData

[MiraData](https://github.com/mira-space/MiraData): a high-quality dataset with 77k long videos, mainly from games and city/scenic exploration.


### Vript

[Vript](https://github.com/mutonix/Vript/tree/main): a densely annotated dataset of 400k videos.


## Image

### Midjourney-v5-1.7M

[Midjourney-v5-1.7M](https://huggingface.co/datasets/wanng/midjourney-v5-202304-clean) includes 1.7M image-text pairs.
In detail, this dataset introduces two subsets: original and upscale.
This dataset is proposed for exploring the relationship of prompts and high-quality images.

### Midjourney-kaggle-clean

[Midjourney-kaggle-clean](https://huggingface.co/datasets/wanng/midjourney-kaggle-clean) is a reconstructed version of [Midjourney User Prompts & Generated Images (250k)](https://www.kaggle.com/datasets/succinctlyai/midjourney-texttoimage?select=general-01_2022_06_20.json%5D), which is cleaned by rules.
Moreover, this dataset is divided into two subsets: original and upscale.
This dataset is proposed for enabling research on text-to-image model prompting.

### Unsplash-lite

The [Unsplash-lite](https://github.com/unsplash/datasets) Dataset comprises 25k nature-themed Unsplash photos, 25k keywords, and 1M searches.
This dataset covers a vast range of uses and contexts. Its extensive scope in intent and semantics opens new avenues for research and learning.

### LAION-AESTHETICS 6.5+

LAION aesthetic 6.5+ dataset is a subset of the LAION dataset, which contains 625K high-quality images with aesthetic scores > 6.5. However, as LAION is currently not publicly available, we use this 168k [subset](https://huggingface.co/datasets/bhargavsdesai/laion_improved_aesthetics_6.5plus_with_images).


================================================
FILE: Open-Sora/docs/installation.md
================================================
# Installation

Requirements are listed in `requirements` folder.
Note that besides these packages, some packages need to be manually installed; they are detailed in the following sections.

## Training & Inference

You need to install `opensora` for training and inference. You can follow the steps below for installation. We also provide guidelines for different CUDA versions for compatibility.

Please note that the default installation is for training and inference only. Other optional dependencies are detailed in the sections [Data Processing](#data-processing), [Evaluation](#evaluation), and [VAE](#vae) respectively.

### Step 1: Install PyTorch and xformers

First of all, make sure you have the latest build toolkit for Python.

```bash
# update build libs
pip install -U pip setuptools wheel
```

If you are using **CUDA 12.1**,  you can execute the command below to directly install PyTorch, torchvision and xformers.

```bash
# install pytorch, torchvision, and xformers
pip install -r requirements/requirements-cu121.txt
```

If you are using different CUDA versions, you need to manually install `torch`, `torchvision` and `xformers`. You can find the compatible distributions according to the links below.

- PyTorch: choose install commands from [PyTorch installation page](https://pytorch.org/get-started/locally/) based on your own CUDA version.
- xformers: choose install commands from [xformers repo](https://github.com/facebookresearch/xformers?tab=readme-ov-file#installing-xformers) based on your own CUDA version.

### Step 2: Install Open-Sora

Then, you can install the project for training and inference with the following commands:

```bash
# install this project
git clone https://github.com/hpcaitech/Open-Sora
cd Open-Sora

# the default installation is for inference only
pip install -v . # NOTE: for development mode, run `pip install -v -e .`
```

### Step 3: Install Acceleration Tools (Optional)

This is optional but recommended for faster speed, especially for training. To enable `layernorm_kernel` and `flash_attn`, you need to install `apex` and `flash-attn` with the following commands.

```bash
# install flash attention
# set enable_flash_attn=False in config to disable flash attention
pip install packaging ninja
pip install flash-attn --no-build-isolation

# install apex, the compilation will take a long time
# set enable_layernorm_kernel=False in config to disable apex
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
```

## Data Processing

### Step 1: Install Requirements

First, run the following command to install requirements:

```bash
pip install -v .[data]
# For development: `pip install -v -e .[data]`
```

Next, you need to manually install the packages listed in the following sections specific to your data processing needs.

### Step 2: Install OpenCV

To get image and video information, we use [opencv-python](https://github.com/opencv/opencv-python). You can install it with pip:

```bash
pip install opencv-python
```

However, if your videos are in av1 codec instead of h264, you need to install ffmpeg (already in our [requirement script](../requirements/requirements-data.txt)), then run the following to make conda support av1 codec:

```bash
pip uninstall opencv-python
conda install -c conda-forge opencv
```

### Step 3: Install Task-specific Dependencies

We have a variety of data processing pipelines, each requires its own dependencies. You can refer to the sections below to install dependencies according to your own needs.

#### LLaVA Captioning

You need to manually install LLaVA with the following command:

```bash
pip install --no-deps llava@git+https://github.com/haotian-liu/LLaVA.git@v1.2.2.post1
```

#### PLLaVA Captioning

You need to manually install PLLaVA with the following commands:

```bash
cd tools/caption/pllava_dir # Assume you are in Open-Sora-dev root directory
git clone https://github.com/magic-research/PLLaVA.git
cd PLLaVA
git checkout fd9194a # since there is no version tag, we use this commit
python python_scripts/hf.py # download the PLLaVA weights

# IMPORTANT: create new environment for reliable pllava performances:
conda create -n pllava python=3.10
# You need to manually install `torch`, `torchvision` and `xformers` for different CUDA versions, the following works for CUDA 12.1:
conda activate pllava
pip install -r ../../../../requirements/requirements-cu121.txt
pip install packaging ninja
pip install flash-attn --no-build-isolation
# You may manually remove any lines in requirements.txt that contains `cu11`, then run `pip install -r requirements.txt`
# Alternatively, use our prepared pllava environment:
pip install -r ../../../../requirements/requirements-pllava.txt
```

#### Scene Detection

We use [`PySceneDetect`](https://github.com/Breakthrough/PySceneDetect) for this job. You need to manually run the following:

```bash
pip install scenedetect[opencv] --upgrade
```

#### OCR

You need to go into `path_to_your_env/lib/python3.10/site-packages/mmdet/__init__.py`
and change the assert of `mmcv_version < digit_version(mmcv_maximum_version)` to `mmcv_version <= digit_version(mmcv_maximum_version)`.

If you are unsure of your path to the mmdet init file, simply run our [OCR command](../tools/scoring/README.md) and wait for the mmdet assertion error on mmcv versions.
The error will contain the exact path to the mmdet init file.


## Evaluation

### Step 1: Install Requirements

To conduct evaluation, run the following command to install requirements:

```bash
pip install -v .[eval]
# For development:`pip install -v -e .[eval]`
```

### Step 2: Install VBench

<!-- You need to manually install [VBench](https://github.com/Vchitect/VBench):

```bash
pip install --no-deps vbench==0.1.1
# If the installation shows a warning about the installed vbench not in PATH, you need to add it by:
export PATH="/path/to/vbench:$PATH"
``` -->

You need to install VBench manually by:
```bash
# first clone their repo
cd .. # assume you are in the Open-Sora root folder, you may install at other location but make sure the soft link paths later are correct
git clone https://github.com/Vchitect/VBench.git
cd VBench
git checkout v0.1.2

# next, fix their hard-coded path issue
vim vbench2_beta_i2v/utils.py
# find `image_root` in the `load_i2v_dimension_info` function, change it to point to your appropriate image folder

# last, create softlinks
cd ../Open-Sora # or `cd ../Open-Sora-dev` for development
ln -s ../VBench/vbench vbench # you may need to change ../VBench/vbench to your corresponding path
ln -s ../VBench/vbench2_beta_i2v vbench2_beta_i2v # you may need to change ../VBench/vbench2_beta_i2v to your corresponding path
# later you need to make sure to run evaluation from your Open-Sora folder, else vbench, vbench2_beta_i2v cannot be found
```


### Step 3: Install `cupy` for Potential VAE Errors

You need to manually install [cupy](https://docs.cupy.dev/en/stable/install.html).

- For CUDA v11.2~11.8 (x86_64 / aarch64), `pip install cupy-cuda11x`
- For CUDA v12.x (x86_64 / aarch64), `pip install cupy-cuda12x`

Note that for VAE evaluation, you may run into the error `ModuleNotFoundError: No module named 'torchvision.transforms.functional_tensor'`. In this case, go to the file reporting this error (`.../pytorchvideo/transforms/augmentations.py`) and change it as follows:

```python
# find the original line:
import torchvision.transforms.functional_tensor as F_t
# change to:
import torchvision.transforms._functional_tensor as F_t
```


## VAE

### Step 1: Install Requirements

To train and evaluate your own VAE, run the following command to install requirements:

```bash
pip install -v .[vae]
# For development:`pip install -v -e .[vae]`
```

### Step 2: VAE Evaluation (`cupy` and Potential VAE Errors)

Refer to the [Evaluation's VAE section](#step-3-install-cupy-for-potential-vae-errors) above.


================================================
FILE: Open-Sora/docs/report_01.md
================================================
# Open-Sora 1.0 Report

OpenAI's Sora is amazing at generating one-minute high-quality videos. However, it reveals almost no information about its details. To make AI more "open", we are dedicated to building an open-source version of Sora. This report describes our first attempt to train a transformer-based video diffusion model.

## Efficiency in choosing the architecture

To lower the computational cost, we want to utilize existing VAE models. Sora uses spatial-temporal VAE to reduce the temporal dimensions. However, we found that there is no open-source high-quality spatial-temporal VAE model. [MAGVIT](https://github.com/google-research/magvit)'s 4x4x4 VAE is not open-sourced, while [VideoGPT](https://wilson1yan.github.io/videogpt/index.html)'s 2x4x4 VAE has a low quality in our experiments. Thus, we decided to use a 2D VAE (from [Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original)) in our first version.

The video training involves a large amount of tokens. Considering 24fps 1min videos, we have 1440 frames. With VAE downsampling 4x and patch size downsampling 2x, we have 1440x1024≈1.5M tokens. Full attention on 1.5M tokens leads to a huge computational cost. Thus, we use spatial-temporal attention to reduce the cost following [Latte](https://github.com/Vchitect/Latte).

As shown in the figure, we insert a temporal attention right after each spatial attention in STDiT (ST stands for spatial-temporal). This is similar to variant 3 in Latte's paper. However, we do not control a similar number of parameters for these variants. While Latte's paper claims their variant is better than variant 3, our experiments on 16x256x256 videos show that with same number of iterations, the performance ranks as: DiT (full) > STDiT (Sequential) > STDiT (Parallel) ≈ Latte. Thus, we choose STDiT (Sequential) out of efficiency. Speed benchmark is provided [here](/docs/acceleration.md#efficient-stdit).

![Architecture Comparison](/assets/readme/report_arch_comp.png)

To focus on video generation, we hope to train the model based on a powerful image generation model. [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) is an efficiently trained high-quality image generation model with T5-conditioned DiT structure. We initialize our model with PixArt-α and initialize the projection layer of the inserted temporal attention with zero. This initialization preserves the model's image generation ability at the beginning, while Latte's architecture cannot. The inserted attention increases the number of parameters from 580M to 724M.

![Architecture](/assets/readme/report_arch.jpg)

Drawing from the success of PixArt-α and Stable Video Diffusion, we also adopt a progressive training strategy: 16x256x256 on 366K pretraining datasets, and then 16x256x256, 16x512x512, and 64x512x512 on 20K datasets. With scaled position embedding, this strategy greatly reduces the computational cost.

We also try to use a 3D patch embedder in DiT. However, with 2x downsampling on temporal dimension, the generated videos have a low quality. Thus, we leave the downsampling to temporal VAE in our next version. For now, we sample at every 3 frames with 16 frames training and every 2 frames with 64 frames training.

## Data is the key to high quality

We find that the number and quality of data have a great impact on the quality of generated videos, even larger than the model architecture and training strategy. At this time, we only prepared the first split (366K video clips) from [HD-VG-130M](https://github.com/daooshee/HD-VG-130M). The quality of these videos varies greatly, and the captions are not that accurate. Thus, we further collect 20k relatively high quality videos from [Pexels](https://www.pexels.com/), which provides free license videos. We label the video with LLaVA, an image captioning model, with three frames and a designed prompt. With designed prompt, LLaVA can generate good quality of captions.

![Caption](/assets/readme/report_caption.png)

As we lay more emphasis on the quality of data, we prepare to collect more data and build a video preprocessing pipeline in our next version.

## Training Details

With a limited training budget, we made only a few explorations. We find that a learning rate of 1e-4 is too large and scale it down to 2e-5. When training with a large batch size, we find `fp16` less stable than `bf16`, and it may lead to generation failure. Thus, we switch to `bf16` for training on 64x512x512. For other hyper-parameters, we follow previous works.

## Loss curves

16x256x256 Pretraining Loss Curve

![16x256x256 Pretraining Loss Curve](/assets/readme/report_loss_curve_1.png)

16x256x256 HQ Training Loss Curve

![16x256x256 HQ Training Loss Curve](/assets/readme/report_loss_curve_2.png)

16x512x512 HQ Training Loss Curve

![16x512x512 HQ Training Loss Curve](/assets/readme/report_loss_curve_3.png)

> Core Contributor: Zangwei Zheng*, Xiangyu Peng*, Shenggui Li, Hongxing Liu, Yang You


================================================
FILE: Open-Sora/docs/report_02.md
================================================
# Open-Sora 1.1 Report

- [Model Architecture Modification](#model-architecture-modification)
- [Support for Multi-time/resolution/aspect ratio/fps Training](#support-for-multi-timeresolutionaspect-ratiofps-training)
- [Masked DiT as Image/Video-to-Video Model](#masked-dit-as-imagevideo-to-video-model)
- [Data Collection \& Pipeline](#data-collection--pipeline)
- [Training Details](#training-details)
- [Limitation and Future Work](#limitation-and-future-work)

In the Open-Sora 1.1 release, we train a 700M model on 10M data (Open-Sora 1.0 trained on 400K data) with a better STDiT architecture. We implement the following features mentioned in [Sora's report](https://openai.com/research/video-generation-models-as-world-simulators):

- Variable durations, resolutions, aspect ratios (Sampling flexibility, Improved framing and composition)
- Prompting with images and videos (Animating images, Extending generated videos, Video-to-video editing, Connecting videos)
- Image generation capabilities

To achieve this goal, we use multi-task learning in the pretraining stage. For diffusion models, training with differently sampled timesteps is already a form of multi-task learning. We further extend this idea to multi-resolution, aspect ratio, frame length, fps, and different mask strategies for image and video conditioned generation. We train the model on **0s~15s, 144p to 720p, various aspect ratios** videos. Although the quality of time consistency is not that high due to limited training FLOPs, we can still see the potential of the model.

## Model Architecture Modification

We made the following modifications to the original ST-DiT for better training stability and performance (ST-DiT-2):

- **[Rope embedding](https://arxiv.org/abs/2104.09864) for temporal attention**: Following LLM's best practice, we change the sinusoidal positional encoding to rope embedding for temporal attention since it is also a sequence prediction task.
- **AdaIN and Layernorm for temporal attention**: we wrap the temporal attention with AdaIN and layernorm as the spatial attention to stabilize the training.
- **[QK-normalization](https://arxiv.org/abs/2302.05442) with [RMSNorm](https://arxiv.org/abs/1910.07467)**: Following [SD3](https://arxiv.org/pdf/2403.03206.pdf), we apply QK-normalization to all attention layers for better training stability in half-precision.
- **Dynamic input size support and video information condition**: To support multi-resolution, aspect ratio, and fps training, we make ST-DiT-2 accept any input size and automatically scale positional embeddings. Extending [PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha)'s idea, we condition on the video's height, width, aspect ratio, frame length, and fps.
- **Extending T5 tokens from 120 to 200**: our caption is usually less than 200 tokens, and we find the model can handle longer text well.

## Support for Multi-time/resolution/aspect ratio/fps Training

As mentioned in [Sora's report](https://openai.com/research/video-generation-models-as-world-simulators), training with the original video's resolution, aspect ratio, and length increases sampling flexibility and improves framing and composition. We found three ways to achieve this goal:

- [NaViT](https://arxiv.org/abs/2307.06304): support dynamic size within the same batch by masking, with little efficiency loss. However, the system is a bit complex to implement, and may not benefit from optimized kernels such as flash attention.
- Padding ([FiT](https://arxiv.org/abs/2402.12376), [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)): support dynamic size within the same batch by padding. However, padding different resolutions to the same size is not efficient.
- Bucket ([SDXL](https://arxiv.org/abs/2307.01952), [PixArt](https://arxiv.org/abs/2310.00426)): support dynamic size in different batches by bucketing, but the size must be the same within the same batch, and only a fixed number of size can be applied. With the same size in a batch, we do not need to implement complex masking or padding.

For simplicity of implementation, we choose the bucket method. We pre-define some fixed resolutions and allocate different samples to different buckets. The concerns about bucketing are listed below, but we can see that they are not a big issue in our case.

<details>
<summary>View the concerns</summary>

- The bucket size is limited to a fixed number: First, in real-world applications, only a few aspect ratios (9:16, 3:4) and resolutions (240p, 1080p) are commonly used. Second, we find trained models can generalize well to unseen resolutions.
- The size in each batch is the same, breaks the i.i.d. assumption: Since we are using multiple GPUs, the local batches on different GPUs have different sizes. We did not see a significant performance drop due to this issue.
- There may not be enough samples to fill each bucket and the distribution may be biased: First, our dataset is large enough to fill each bucket when the local batch size is not too large. Second, we should analyze the data's distribution on sizes and define the bucket size accordingly. Third, an unbalanced distribution did not affect the training process significantly.
- Different resolutions and frame lengths may have different processing speed: Different from PixArt, which only deals with aspect ratios of similar resolutions (similar token numbers), we need to consider the processing speed of different resolutions and frame lengths. We can use the `bucket_config` to define the batch size for each bucket to ensure the processing speed is similar.

</details>

![bucket](/assets/readme/report_bucket.png)

As shown in the figure, a bucket is a triplet of `(resolution, num_frame, aspect_ratio)`. We provide pre-defined aspect ratios for different resolutions that cover most of the common video aspect ratios. Before each epoch, we shuffle the dataset and allocate the samples to different buckets as shown in the figure. We put a sample into the bucket with the largest resolution and frame length that are smaller than the video's.

Considering our computational resource is limited, we further introduce two attributes `keep_prob` and `batch_size` for each `(resolution, num_frame)` to reduce the computational cost and enable multi-stage training. Specifically, a high-resolution video will be downsampled to a lower resolution with probability `1-keep_prob` and the batch size for each bucket is `batch_size`. In this way, we can control the number of samples in different buckets and balance the GPU load by searching for a good batch size for each bucket.

A detailed explanation of the bucket usage in training is available in [docs/config.md](/docs/config.md#training-bucket-configs).

## Masked DiT as Image/Video-to-Video Model

Transformers can be easily extended to support image-to-image and video-to-video tasks. We propose a mask strategy to support image and video conditioning. The mask strategy is shown in the figure below.

![mask strategy](/assets/readme/report_mask.png)

Typically, we unmask the frames to be conditioned on for image/video-to-video condition. During the ST-DiT forward, unmasked frames will have timestep 0, while others remain the same (t). We find that directly applying the strategy to a trained model yields poor results, as the diffusion model did not learn to handle different timesteps in one sample during training.

Inspired by [UL2](https://arxiv.org/abs/2205.05131), we introduce a random mask strategy during training. Specifically, we randomly unmask the frames during training, including unmasking the first frame, the first k frames, the last frame, the last k frames, the first and last k frames, random frames, etc. Based on Open-Sora 1.0, with 50% probability of applying masking, we see the model can learn to handle image conditioning (while 30% yields worse ability) for 10k steps, with a little text-to-video performance drop. Thus, for Open-Sora 1.1, we pretrain the model from scratch with the masking strategy.

An illustration of the masking strategy config to use in inference is given as follows. A five-number tuple provides great flexibility in defining the mask strategy. By conditioning on generated frames, we can autoregressively generate infinite frames (although error propagates).

![mask strategy config](/assets/readme/report_mask_config.png)

A detailed explanation of the mask strategy usage is available in [docs/config.md](/docs/config.md#advanced-inference-config).

## Data Collection & Pipeline

As we found in Open-Sora 1.0, the data number and quality are crucial for training a good model, so we work hard on scaling the dataset. First, we create an automatic pipeline following [SVD](https://arxiv.org/abs/2311.15127), including scene cutting, captioning, various scoring and filtering, and dataset management scripts and conventions. More information can be found in [docs/data_processing.md](/docs/data_processing.md).

![pipeline](/assets/readme/report_data_pipeline.png)

We planned to use [panda-70M](https://snap-research.github.io/Panda-70M/) and other data to train the model, which is approximately 30M+ data. However, we find disk IO a bottleneck for training and data processing at the same time. Thus, we could only prepare a 10M dataset and did not go through the whole processing pipeline that we built. Finally, we use a dataset with 9.7M videos + 2.6M images for pre-training, and 560k videos + 1.6M images for fine-tuning. The pretraining dataset statistics are shown below. More information about the dataset can be found in [docs/datasets.md](/docs/datasets.md).

Image text tokens (by T5 tokenizer):

![image text tokens](/assets/readme/report_image_textlen.png)

Video text tokens (by T5 tokenizer). We directly use panda's short caption for training, and caption other datasets by ourselves. The generated caption is usually less than 200 tokens.

![video text tokens](/assets/readme/report_video_textlen.png)

Video duration:

![video duration](/assets/readme/report_video_duration.png)

## Training Details

With limited computational resources, we have to carefully monitor the training process, and change the training strategy if we speculate the model is not learning well since there is no computation for ablation study. Thus, Open-Sora 1.1's training includes multiple changes, and as a result, ema is not applied.

1. First, we fine-tune **6k** steps with images of different resolution from `Pixart-alpha-1024` checkpoints. We find the model easily adapts to generate images with different resolutions. We use [SpeeDiT](https://github.com/1zeryu/SpeeDiT) (iddpm-speed) to accelerate the diffusion training.
2. **[Stage 1]** Then, we pretrain the model with gradient-checkpointing for **24k** steps, which takes **4 days** on 64 H800 GPUs. Although the number of samples seen by the model is the same, we find the model learns slowly compared to a smaller batch size. We speculate that at an early stage, the number of steps is more important for training. Most videos are in **240p** resolution, and the config is similar to [stage2.py](/configs/opensora-v1-1/train/stage2.py). The videos look good, but the model does not have much temporal knowledge. We use a mask ratio of 10%.
3. **[Stage 1]** To increase the number of steps, we switch to a smaller batch size without gradient-checkpointing. We also add fps conditioning at this point. We trained **40k** steps for **2 days**. Most videos are in **144p** resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py). We use a lower resolution as we find in Open-Sora 1.0 that the model can learn temporal knowledge with relatively low resolution.
4. **[Stage 1]** We find the model cannot learn well for long videos, and observe noisy generation results, which we speculate are caused by the half-precision problem found in Open-Sora 1.0 training. Thus, we adopt QK-normalization to stabilize the training. Similar to SD3, we find the model quickly adapts to QK-normalization. We also switch from iddpm-speed to iddpm, and increase the mask ratio to 25% as we find image conditioning is not learned well. We trained for **17k** steps for **14 hours**. Most videos are in **144p** resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py). The stage 1 training lasts for approximately one week, with **81k** steps in total.
5. **[Stage 2]** We switch to a higher resolution, where most videos are in **240p and 480p** resolution ([stage2.py](/configs/opensora-v1-1/train/stage2.py)). We trained **22k** steps for **one day** on all pre-training data.
6. **[Stage 3]** We switch to a higher resolution, where most videos are in **480p and 720p** resolution ([stage3.py](/configs/opensora-v1-1/train/stage3.py)). We trained **4k** steps for **one day** on high-quality data. We find loading the previous stage's optimizer state can help the model learn faster.

To summarize, the training of Open-Sora 1.1 requires approximately **9 days** on 64 H800 GPUs.

## Limitation and Future Work

As we get one step closer to the replication of Sora, we find many limitations for the current model, and these limitations point to the future work.

- **Generation Failure**: we find that in many cases (especially when the total token number is large or the content is complex), our model fails to generate the scene. There may be a collapse in the temporal attention and we have identified a potential bug in our code. We are working hard to fix it. Besides, we will increase our model size and training data to improve the generation quality in the next version.
- **Noisy generation and lack of fluency**: we find the generated video is sometimes noisy and not fluent, especially for long videos. We think the problem is due to not using a temporal VAE. As [Pixart-Sigma](https://arxiv.org/abs/2403.04692) finds that adapting to a new VAE is simple, we plan to develop a temporal VAE for the model in the next version.
- **Lack of time consistency**: we find the model cannot generate videos with high time consistency. We think the problem is due to the lack of training FLOPs. We plan to collect more data and continue training the model to improve the time consistency.
- **Bad human generation**: We find the model cannot generate high-quality human videos. We think the problem is due to the lack of human data. We plan to collect more human data and continue training the model to improve the human generation.
- **Low aesthetic score**: we find the model's aesthetic score is not high. The problem is due to the lack of aesthetic score filtering, which was not conducted due to the IO bottleneck. We plan to filter the data by aesthetic score and finetune the model to improve the aesthetic score.
- **Worse quality for longer video generation**: we find that with the same prompt, the longer video has worse quality. This means the image quality is not equally adapted to different lengths of sequences.

> - **Algorithm & Acceleration**: Zangwei Zheng, Xiangyu Peng, Shenggui Li, Hongxing Liu, Yukun Zhou, Tianyi Li
> - **Data Collection & Pipeline**: Xiangyu Peng, Zangwei Zheng, Chenhui Shen, Tom Young, Junjie Wang, Chenfeng Yu


================================================
FILE: Open-Sora/docs/report_03.md
================================================
# Open-Sora 1.2 Report

- [Video compression network](#video-compression-network)
- [Rectified flow and model adaptation](#rectified-flow-and-model-adaptation)
- [More data and better multi-stage training](#more-data-and-better-multi-stage-training)
- [Easy and effective model conditioning](#easy-and-effective-model-conditioning)
- [Evaluation](#evaluation)
- [Sequence parallelism](#sequence-parallelism)

In the Open-Sora 1.2 release, we train a 1.1B model on >30M data (\~80k hours), with a training cost of 35k H100 GPU hours, supporting 0s\~16s, 144p to 720p, various aspect ratios video generation. Our configurations are listed below. Following our 1.1 version, Open-Sora 1.2 can also do image-to-video generation and video extension.

|      | image | 2s  | 4s  | 8s  | 16s |
| ---- | ----- | --- | --- | --- | --- |
| 240p | ✅     | ✅   | ✅   | ✅   | ✅   |
| 360p | ✅     | ✅   | ✅   | ✅   | ✅   |
| 480p | ✅     | ✅   | ✅   | ✅   | 🆗   |
| 720p | ✅     | ✅   | ✅   | 🆗   | 🆗   |

Here ✅ means that the data is seen during training, and 🆗 means although not trained, the model can inference at that config. Inference for 🆗 requires more than one 80G memory GPU and sequence parallelism.

Besides features introduced in Open-Sora 1.1, Open-Sora 1.2 highlights:

- Video compression network
- Rectified-flow training
- More data and better multi-stage training
- Easy and effective model conditioning
- Better evaluation metrics

All implementations (both training and inference) of the above improvements are available in the Open-Sora 1.2 release. The following sections will introduce the details of the improvements. We also refine our codebase and documentation to make it easier to use and develop, and add a LLM to [refine input prompts](/README.md#gpt-4o-prompt-refinement) and support more languages.

## Video compression network

For Open-Sora 1.0 & 1.1, we used stability-ai's 83M 2D VAE, which compresses the video only in the spatial dimension by 8x8 times. To reduce the temporal dimension, we extracted one frame in every three frames. However, this method led to low fluency of the generated video as the generated fps is sacrificed. Thus, in this release, we introduce the video compression network as OpenAI's Sora does. With a 4 times compression in the temporal dimension, we do not need to extract frames and can generate videos with the original fps.

Considering the high computational cost of training a 3D VAE, we hope to re-use the knowledge learnt in the 2D VAE. We notice that after the 2D VAE's compression, the features adjacent in the temporal dimension are still highly correlated. Thus, we propose a simple video compression network, which first compresses the video in the spatial dimension by 8x8 times, then compresses the video in the temporal dimension by 4 times. The network is shown below:

![video_compression_network](/assets/readme/report_3d_vae.png)

We initialize the 2D VAE with [SDXL's VAE](https://huggingface.co/stabilityai/sdxl-vae), which is better than our previously used one. For the 3D VAE, we adopt the structure of VAE in [Magvit-v2](https://magvit.cs.cmu.edu/v2/), which contains 300M parameters. Along with the 83M 2D VAE, the total number of parameters of the video compression network is 384M. We train the 3D VAE for 1.2M steps with local batch size 1. The training data is videos from Pexels and Pixabay, and the training video size is mainly 17 frames, 256x256 resolution. Causal convolutions are used in the 3D VAE to make the image reconstruction more accurate.

Our training involves three stages:

1. For the first 380k steps, we train on 8 GPUs and freeze the 2D VAE. The training objective includes the reconstruction of the compressed features from the 2D VAE (pink one in the figure), and we also add a loss to make features from the 3D VAE similar to the features from the 2D VAE (pink one and green one, called identity loss). We find the latter loss can quickly make the whole VAE achieve good performance on images and converge much faster in the next stage.
2. For the next 260k steps, we remove the identity loss and just learn the 3D VAE.
3. For the last 540k steps, since we find that only reconstructing the 2D VAE's features cannot lead to further improvement, we remove the loss and train the whole VAE to reconstruct the original videos. This stage is trained on 24 GPUs.

For both stage 1 and stage 2 training, we adopt 20% images and 80% videos. Following [Magvit-v2](https://magvit.cs.cmu.edu/v2/), we train video using 17 frames, while zero-padding the first 16 frames for image. However, we find that this setting leads to blurring of videos with length different from 17 frames. Thus, in stage 3, we use a random number within 34 frames for mixed video length training (i.e., zero-pad the first `34-n` frames if we want to train an `n`-frame video), to make our VAE more robust to different video lengths. Our [training](/scripts/train_vae.py) and [inference](/scripts/inference_vae.py) code is available in the Open-Sora 1.2 release.

When using the VAE for the diffusion model, our stacked VAE requires little memory as our VAE's input is already compressed. We also split the input videos into several 17-frame clips to make the inference more efficient. The performance of our VAE is on par with another open-sourced 3D VAE in [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/docs/Report-v1.1.0.md).

| Model              | SSIM↑ | PSNR↑  |
| ------------------ | ----- | ------ |
| Open-Sora-Plan 1.1 | 0.882 | 29.890 |
| Open-Sora 1.2      | 0.880 | 30.590 |

## Rectified flow and model adaptation

The latest diffusion models, such as Stable Diffusion 3, adopt [rectified flow](https://github.com/gnobitab/RectifiedFlow) instead of DDPM for better performance. Unfortunately, SD3's rectified flow training code is not open-sourced. However, Open-Sora 1.2 provides the training code following SD3's paper, including:

- Basic rectified flow training ([original rectified flow paper](https://arxiv.org/abs/2209.03003))
- Logit-norm sampling for training acceleration ([SD3 paper](https://arxiv.org/pdf/2403.03206) Section 3.1, intuitively it is more likely to sample timesteps at middle noise level)
- Resolution and video length aware timestep sampling ([SD3 paper](https://arxiv.org/pdf/2403.03206) Section 5.3.2, intuitively it is more likely to sample timesteps with more noise for larger resolution, and we extend it to longer video)

For the resolution-aware timestep sampling, we should use more noise for images with larger resolution. We extend this idea to video generation and use more noise for videos with longer length.

Open-Sora 1.2 starts from the [PixArt-Σ 2K](https://github.com/PixArt-alpha/PixArt-sigma) checkpoint. Note that this model is trained with DDPM and SDXL VAE, also a much higher resolution. We find finetuning on a small dataset can easily adapt the model for our video generation setting. The adaptation process is as follows, all training is done on 8 GPUs (the adaptation for the diffusion model is quite fast and straightforward):

1. Multi-resolution image generation ability: we train the model to generate different resolution ranging from 144p to 2K for 20k steps.
2. QK-norm: we add the QK-norm to the model and train for 18k steps.
3. Rectified flow: we transform from discrete-time DDPM to continuous-time rectified flow and train for 10k steps.
4. Rectified flow with logit-norm sampling and resolution-aware timestep sampling: we train for 33k steps.
5. Smaller AdamW epsilon: following SD3, with QK-norm, we can use a smaller epsilon (1e-15) for AdamW, we train for 8k steps.
6. New VAE and fps conditioning: we replace the original VAE with ours and add fps conditioning to the timestep conditioning, we train for 25k steps. Note that normalizing each channel is important for rectified flow training.
7. Temporal attention blocks: we add temporal attention blocks with zero initialized projection layers. We train on images for 3k steps.
8. Temporal blocks only for video with mask strategy: we train the temporal attention blocks only on videos for 38k steps.

After the above adaptation, we are ready to train the model on videos. The adaptation above maintains the original model's ability to generate high-quality images, and brings multiple benefits for video generation:

- With rectified flow, we can accelerate the training and reduce the number of sampling steps for video from 100 to 30, which greatly reduces the waiting time for inference.
- With qk-norm, the training is more stable and an aggressive optimizer can be used.
- With new VAE, the temporal dimension is compressed by 4 times, which makes the training more efficient.
- With multi-resolution image generation ability, the model can generate videos with different resolutions.

## More data and better multi-stage training

Due to a limited computational budget, we carefully arrange the training data from low to high quality and split our training into three stages. Our training involves 12x8 GPUs, and the total training time is about 2 weeks for about 70k steps.

### First stage

We first train the model on Webvid-10M datasets (40k hours) for 30k steps (2 epochs). Since the videos are all lower than 360p resolution and contain watermarks, we train on this dataset first. The training mainly happens on 240p and 360p, with video length 2s~16s. We use the original caption in the dataset for training. The training config is located in [stage1.py](/configs/opensora-v1-2/train/stage1.py).

### Second stage

Then we train the model on the Panda-70M dataset. This dataset is large but the quality varies. We use the official 30M subset, whose clips are more diverse, and filter out videos with an aesthetic score lower than 4.5. This leads to a 20M subset with 41k hours. The captions in the dataset are directly used for our training. The training config is located in [stage2.py](/configs/opensora-v1-2/train/stage2.py).

The training mainly happens on 360p and 480p. We train the model for 23k steps, which is 0.5 epoch. The training is not fully done since we hope our new model can meet you earlier.

### Third stage

In this stage, we collect ~2M video clips with a total length of 5K hours from all kinds of sources, including:

- Free-license videos, sourced from Pexels, Pixabay, Mixkit, etc.
- [MiraData](https://github.com/mira-space/MiraData): a high-quality dataset with long videos, mainly from games and city/scenic exploration.
- [Vript](https://github.com/mutonix/Vript/tree/main): a densely annotated dataset.
- And some other datasets.

While MiraData and Vript have captions from GPT, we use [PLLaVA](https://github.com/magic-research/PLLaVA) to caption the rest ones. Compared with LLaVA, which is only capable of single frame/image captioning, PLLaVA is specially designed and trained for video captioning. The [accelerated PLLaVA](/tools/caption/README.md#pllava-captioning) is released in our `tools/`. In practice, we use the pretrained PLLaVA 13B model and select 4 frames from each video for captioning with a spatial pooling shape of 2*2.

Some statistics of the video data used in this stage are shown below. We present basic statistics of duration and resolution, as well as aesthetic score and optical flow score distribution.
We also extract tags for objects and actions from video captions and count their frequencies.
![stats](/assets/readme/report-03_video_stats.png)
![object_count](/assets/readme/report-03_objects_count.png)
![object_count](/assets/readme/report-03_actions_count.png)

We mainly train on 720p and 1080p videos in this stage, aiming to extend the model's ability to larger resolutions. We use a mask ratio of 25% during training. The training config is located in [stage3.py](/configs/opensora-v1-2/train/stage3.py). We train the model for 15k steps, which is approximately 2 epochs.

## Easy and effective model conditioning

For stage 3, we calculate the aesthetic score and motion score for each video clip. However, since the number of video clips is small, we are not willing to filter out clips with low scores, which would lead to a smaller dataset. Instead, we append the scores to the captions and use them as conditioning. We find this method makes the model aware of the scores, so that it follows them to generate videos with better quality.

For example, a video with aesthetic score 5.5, motion score 10, and a detected camera motion pan left, the caption will be:

```plaintext
[Original Caption] aesthetic score: 5.5, motion score: 10, camera motion: pan left.
```

During inference, we can also use the scores to condition the model. For camera motion, we only label 13k clips with high confidence, and the camera motion detection module is released in our tools.

## Evaluation

Previously, we monitored the training process only by human evaluation, as the DDPM training loss is not well correlated with the quality of generated videos. However, for rectified flow, we find the training loss is well correlated with the quality of generated videos, as stated in SD3. Thus, we keep track of the rectified flow evaluation loss on 100 images and 1k videos.

We sampled 1k videos from pixabay as validation dataset. We calculate the evaluation loss for image and different lengths of videos (2s, 4s, 8s, 16s) for different resolution (144p, 240p, 360p, 480p, 720p). For each setting, we equidistantly sample 10 timesteps. Then all the losses are averaged. We also provide a [video](https://streamable.com/oqkkf1) showing the sampled videos with a fixed prompt for different steps.

![Evaluation Loss](/assets/readme/report_val_loss.png)
![Video Evaluation Loss](/assets/readme/report_vid_val_loss.png)

In addition, we also keep track of [VBench](https://vchitect.github.io/VBench-project/) scores during training. VBench is an automatic video evaluation benchmark for short video generation. We calculate the VBench score with 240p 2s videos. The two metrics verify that our model continues to improve during training.

![VBench](/assets/readme/report_vbench_score.png)

All the evaluation code is released in `eval` folder. Check the [README](/eval/README.md) for more details.

| Model          | Total Score | Quality Score | Semantic Score |
| -------------- | ----------- | ------------- | -------------- |
| Open-Sora V1.0 | 75.91%      | 78.81%        | 64.28%         |
| Open-Sora V1.2 | 79.23%      | 80.71%        | 73.30%         |

## Sequence parallelism

We use sequence parallelism to support long-sequence training and inference. Our implementation is based on Ulysses and the workflow is shown below. When sequence parallelism is enabled, we only need to apply the `all-to-all` communication to the spatial block in STDiT as only spatial computation is dependent on the sequence dimension.

![SP](../assets/readme/sequence_parallelism.jpeg)

Currently, we have not used sequence parallelism for training as the data resolution is small, and we plan to do so in the next release. As for inference, we can use sequence parallelism in case your GPU goes out of memory. A simple benchmark shows that sequence parallelism can achieve a speedup:

| Resolution | Seconds | Number of GPUs | Enable SP | Time taken/s | Speedup per GPU |
| ---------- | ------- | -------------- | --------- | ------------ | --------------- |
| 720p       | 16s     | 1              | No        | 547.97       | -               |
| 720p       | 16s     | 2              | Yes       | 244.38       | 12%             |


================================================
FILE: Open-Sora/docs/structure.md
================================================
# Repo Structure

```plaintext
Open-Sora
├── README.md
├── assets
│   ├── images                     -> images used for image-conditioned generation
│   ├── demo                       -> images used for demo
│   ├── texts                      -> prompts used for text-conditioned generation
│   └── readme                     -> images used in README
├── configs                        -> Configs for training & inference
├── docker                         -> dockerfile for Open-Sora
├── docs
│   ├── acceleration.md            -> Report on acceleration & speed benchmark
│   ├── commands.md                -> Commands for training & inference
│   ├── datasets.md                -> Datasets used in this project
│   ├── data_processing.md         -> Data pipeline documents
│   ├── installation.md            -> Installation instructions
│   ├── structure.md               -> This file
│   ├── config.md                  -> Configs for training and inference
│   ├── report_01.md               -> Report for Open-Sora 1.0
│   ├── report_02.md               -> Report for Open-Sora 1.1
│   ├── report_03.md               -> Report for Open-Sora 1.2
│   ├── vae.md                     -> our VAE report
│   └── zh_CN                      -> Chinese version of the above
├── eval                           -> Evaluation scripts
│   ├── README.md                  -> Evaluation documentation
│   ├── human_eval                 -> for human eval
│   ├── launch.sh                  -> script for launching 8 cards sampling
│   ├── loss                       -> eval loss
│   ├── sample.sh                  -> script for quickly launching inference on predefined prompts
│   ├── vae                        -> for vae eval
│   ├── vbench                     -> for VBench evaluation
│   └── vbench_i2v                 -> for VBench i2v evaluation
├── gradio                         -> Gradio demo related code
├── notebooks                      -> Jupyter notebooks for generating commands to run
├── scripts
│   ├── train.py                   -> diffusion training script
│   ├── train_vae.py               -> vae training script
│   ├── inference.py               -> diffusion inference script
│   ├── inference_vae.py           -> vae inference script
│   └── misc                       -> misc scripts, including batch size search
├── opensora
│   ├── __init__.py
│   ├── registry.py                -> Registry helper
│   ├── acceleration               -> Acceleration related code
│   ├── datasets                    -> Dataset related code
│   ├── models
│   │   ├── dit                    -> DiT
│   │   ├── layers                 -> Common layers
│   │   ├── vae                    -> VAE as image encoder
│   │   ├── text_encoder           -> Text encoder
│   │   │   ├── classes.py         -> Class id encoder (inference only)
│   │   │   ├── clip.py            -> CLIP encoder
│   │   │   └── t5.py              -> T5 encoder
│   │   ├── dit
│   │   ├── latte
│   │   ├── pixart
│   │   └── stdit                  -> Our STDiT related code
│   ├── schedulers                 -> Diffusion schedulers
│   │   ├── iddpm                  -> IDDPM for training and inference
│   │   └── dpms                   -> DPM-Solver for fast inference
│   └── utils
├── tests                          -> Tests for the project
└── tools                          -> Tools for data processing and more
```

## Configs

Our config files follow [MMEngine](https://github.com/open-mmlab/mmengine). MMEngine reads the config file (a `.py` file) and parses it into a dictionary-like object.

```plaintext
Open-Sora
└── configs                        -> Configs for training & inference
    ├── opensora-v1-1              -> STDiT2 related configs
    │   ├── inference
    │   │   ├── sample.py          -> Sample videos and images
    │   │   └── sample-ref.py      -> Sample videos with image/video condition
    │   └── train
    │       ├── stage1.py          -> Stage 1 training config
    │       ├── stage2.py          -> Stage 2 training config
    │       ├── stage3.py          -> Stage 3 training config
    │       ├── image.py           -> Illustration of image training config
    │       ├── video.py           -> Illustration of video training config
    │       └── benchmark.py       -> For batch size searching
    ├── opensora                   -> STDiT related configs
    │   ├── inference
    │   │   ├── 16x256x256.py      -> Sample videos 16 frames 256x256
    │   │   ├── 16x512x512.py      -> Sample videos 16 frames 512x512
    │   │   └── 64x512x512.py      -> Sample videos 64 frames 512x512
    │   └── train
    │       ├── 16x256x256.py      -> Train on videos 16 frames 256x256
    │       ├── 16x512x512.py      -> Train on videos 16 frames 512x512
    │       └── 64x512x512.py      -> Train on videos 64 frames 512x512
    ├── dit                        -> DiT related configs
    │   ├── inference
    │   │   ├── 1x256x256-class.py -> Sample images with ckpts from DiT
    │   │   ├── 1x256x256.py       -> Sample images with clip condition
    │   │   └── 16x256x256.py      -> Sample videos
    │   └── train
    │       ├── 1x256x256.py       -> Train on images with clip condition
    │       └── 16x256x256.py      -> Train on videos
    ├── latte                      -> Latte related configs
    └── pixart                     -> PixArt related configs
```

## Tools

```plaintext
Open-Sora
└── tools
    ├── datasets                   -> dataset management related code
    ├── scene_cut                  -> scene cut related code
    ├── caption                    -> caption related code
    ├── scoring                    -> scoring related code
    │   ├── aesthetic              -> aesthetic scoring related code
    │   ├── matching               -> matching scoring related code
    │   ├── ocr                    -> ocr scoring related code
    │   └── optical_flow           -> optical flow scoring related code
    └── frame_interpolation        -> frame interpolation related code
```


================================================
FILE: Open-Sora/docs/vae.md
================================================
# VAE Report

As [Pixart-Sigma](https://arxiv.org/abs/2403.04692) finds that adapting to a new VAE is simple, we develop an additional temporal VAE.
Specifically, our VAE consists of a pipeline of a [spatial VAE](https://huggingface.co/PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers) followed by a temporal VAE.
For the temporal VAE, we follow the implementation of [MAGVIT-v2](https://arxiv.org/abs/2310.05737), with the following modifications:

* We remove the architecture specific to the codebook.
* We do not use the discriminator, and use the VAE reconstruction loss, kl loss, and perceptual loss for training.
* In the last linear layer of the encoder, we scale down to a diagonal Gaussian Distribution of 4 channels, following our previously trained STDiT that takes in 4 channels input.
* Our decoder is symmetric to the encoder architecture.

## Training

We train the model in different stages.

We first train the temporal VAE only by freezing the spatial VAE for 380k steps on a single machine (8 GPUs).
We use an additional identity loss to make features from the 3D VAE similar to the features from the 2D VAE.
We train the VAE using 20% images and 80% videos with 17 frames.

```bash
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH
```

Next, we remove the identity loss and train the 3D VAE pipeline to reconstruct the 2D-compressed videos for 260k steps.

```bash
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH
```

Finally, we remove the reconstruction loss for the 2D-compressed videos and train the VAE pipeline to construct the 3D videos for 540k steps.
We train our VAE with a random number of frames (up to 34) to make it more robust to different video lengths.
This stage is trained on 24 GPUs.

```bash
torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH
```

Note that you need to adjust the `epochs` in the config file accordingly with respect to your own csv data size.

## Inference

To visually check the performance of the VAE, you may run the following inference.
It saves the original video to your specified video directory with `_ori` postfix (i.e. `"YOUR_VIDEO_DIR"_ori`), the reconstructed video from the full pipeline with the `_rec` postfix (i.e. `"YOUR_VIDEO_DIR"_rec`), and the reconstructed video from the 2D compression and decompression with the `_spatial` postfix (i.e. `"YOUR_VIDEO_DIR"_spatial`).

```bash
torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR
```
## Evaluation

We can then calculate the scores of the VAE performances on metrics of SSIM, PSNR, LPIPS, and FLOLPIPS.

* SSIM: structural similarity index measure, the higher the better
* PSNR: peak-signal-to-noise ratio, the higher the better
* LPIPS:  learned perceptual image quality degradation, the lower the better
* [FloLPIPS](https://arxiv.org/pdf/2207.08119): LPIPS with video interpolation, the lower the better.

```bash
python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
```

## Acknowledgement
We are grateful for the following work:
* [MAGVIT-v2](https://arxiv.org/abs/2310.05737): Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation
* [Taming Transformers](https://github.com/CompVis/taming-transformers): Taming Transformers for High-Resolution Image Synthesis
* [3D blur pooling](https://github.com/adobe/antialiased-cnns/pull/39/commits/3d6f02b6943c58b68c19c07bc26fad57492ff3bc)
* [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)


================================================
FILE: Open-Sora/docs/zh_CN/README.md
================================================
<p align="center">
    <img src="../../assets/readme/icon.png" width="250"/>
</p>
<div align="center">
    <a href="https://github.com/hpcaitech/Open-Sora/stargazers"><img src="https://img.shields.io/github/stars/hpcaitech/Open-Sora?style=social"></a>
    <a href="https://hpcaitech.github.io/Open-Sora/"><img src="https://img.shields.io/badge/Gallery-View-orange?logo=&amp"></a>
    <a href="https://discord.gg/kZakZzrSUT"><img src="https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&amp"></a>
    <a href="https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-247ipg9fk-KRRYmUl~u2ll2637WRURVA"><img src="https://img.shields.io/badge/Slack-ColossalAI-blueviolet?logo=slack&amp"></a>
    <a href="https://twitter.com/yangyou1991/status/1769411544083996787?s=61&t=jT0Dsx2d-MS5vS9rNM5e5g"><img src="https://img.shields.io/badge/Twitter-Discuss-blue?logo=twitter&amp"></a>
    <a href="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png"><img src="https://img.shields.io/badge/微信-小助手加群-green?logo=wechat&amp"></a>
    <a href="https://hpc-ai.com/blog/open-sora-v1.0"><img src="https://img.shields.io/badge/Open_Sora-Blog-blue"></a>
    <a href="https://huggingface.co/spaces/hpcai-tech/open-sora"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Gradio Demo-blue"></a>
</div>

## Open-Sora: 让所有人都能轻松制作高效视频

我们设计并实施了**Open-Sora**，这是一项致力于高效制作高质量视频的计划。我们希望让所有人都能使用模型、工具和所有细节。通过采用开源原则，Open-Sora 不仅使高级视频生成技术的使用变得民主化，而且还提供了一个简化且用户友好的平台，简化了视频生成的复杂性。借助 Open-Sora，我们的目标是在内容创作领域促进创新、创造力和包容性。

[[中文文档](/docs/zh_CN/README.md)] [[潞晨云](https://cloud.luchentech.com/)|[OpenSora镜像](https://cloud.luchentech.com/doc/docs/image/open-sora/)|[视频教程](https://www.bilibili.com/video/BV1ow4m1e7PX/?vd_source=c6b752764cd36ff0e535a768e35d98d2)]

## 📰 资讯

* **[2024.06.22]** 🔥我们在[潞晨云](https://cloud.luchentech.com/)上发布了Open-Sora1.2镜像，并在B站上传了详细的[使用教程](https://www.bilibili.com/video/BV1ow4m1e7PX/)
* **[2024.06.17]** 🔥我们发布了**Open-Sora 1.2**，其中包括**3D-VAE**，**整流流**和**得分条件**。视频质量大大提高。[[模型权重]](#模型权重) [[技术报告]](report_v3.md) [[公众号文章]](https://mp.weixin.qq.com/s/QHq2eItZS9e00BVZnivdjg)
* **[2024.04.25]** 🤗 我们在 Hugging Face Spaces 上发布了 [Open-Sora的Gradio演示](https://huggingface.co/spaces/hpcai-tech/open-sora)。
* **[2024.04.25]** 我们发布了**Open-Sora 1.1**，支持**2s~15s、144p 到 720p、任意比例的文本转图片、文本转视频、图片转视频、视频转视频、无限时间生成**。此外，还发布了完整的视频处理管道。 [[模型权重]](#模型权重) [[技术报告]](report_v2.md)[[公众号文章]](https://mp.weixin.qq.com/s/nkPSTep2se__tzp5OfiRQQ)
* **[2024.03.18]** 我们发布了 **Open-Sora 1.0**, 一个完全开源的视频生成项目。Open-Sora 1.0 支持完整的视频数据预处理流程、加速训练
  <a href="https://github.com/hpcaitech/ColossalAI"><img src="/assets/readme/colossal_ai.png" width="8%" ></a>
、推理等。我们的模型只需 3 天的训练就可以生成 2 秒的 512x512 视频。 [[模型权重]](#模型权重)
  [[公众号文章]](https://mp.weixin.qq.com/s/H52GW8i4z1Dco3Sg--tCGw) [[技术报告]](report_v1.md)
* **[2024.03.04]** Open-Sora 提供培训，成本降低 46%。
  [[公众号文章]](https://mp.weixin.qq.com/s/OjRUdrM55SufDHjwCCAvXg)

## 🎥 最新演示

🔥 您可以在HuggingFace上的 [🤗 Gradio应用程序](https://huggingface.co/spaces/hpcai-tech/open-sora)上体验Open-Sora. 我们的[画廊](https://hpcaitech.github.io/Open-Sora/)中提供了更多示例.

| **4s 720×1280**                                                                                                                                      | **4s 720×1280**                                                                                                                                      | **4s 720×1280**                                                                                                                                      |
| ---------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="/assets/demo/v1.2/sample_0013.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/7895aab6-ed23-488c-8486-091480c26327) | [<img src="/assets/demo/v1.2/sample_1718.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/20f07c7b-182b-4562-bbee-f1df74c86c9a) | [<img src="/assets/demo/v1.2/sample_0087.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/3d897e0d-dc21-453a-b911-b3bda838acc2) |
| [<img src="/assets/demo/v1.2/sample_0052.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/644bf938-96ce-44aa-b797-b3c0b513d64c) | [<img src="/assets/demo/v1.2/sample_1719.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/272d88ac-4b4a-484d-a665-8d07431671d0) | [<img src="/assets/demo/v1.2/sample_0002.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/ebbac621-c34e-4bb4-9543-1c34f8989764) |
| [<img src="/assets/demo/v1.2/sample_0011.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/a1e3a1a3-4abd-45f5-8df2-6cced69da4ca) | [<img src="/assets/demo/v1.2/sample_0004.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/d6ce9c13-28e1-4dff-9644-cc01f5f11926) | [<img src="/assets/demo/v1.2/sample_0061.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/561978f8-f1b0-4f4d-ae7b-45bec9001b4a) |

<details>
<summary>OpenSora 1.1 演示</summary>

| **2秒 240×426**                                                                                                                                              | **2秒 240×426**                                                                                                                                             |
| ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="/assets/demo/sample_16x240x426_9.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [<img src="/assets/demo/sora_16x240x426_26.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) |
| [<img src="/assets/demo/sora_16x240x426_27.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/f7ce4aaa-528f-40a8-be7a-72e61eaacbbd)  | [<img src="/assets/demo/sora_16x240x426_40.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/5d58d71e-1fda-4d90-9ad3-5f2f7b75c6a9) |

| **2秒 426×240**                                                                                                                                             | **4秒 480×854**                                                                                                                                              |
| ---------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="/assets/demo/sora_16x426x240_24.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [<img src="/assets/demo/sample_32x480x854_9.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) |

| **16秒 320×320**                                                                                                                                        | **16秒 224×448**                                                                                                                                        | **2秒 426×240**                                                                                                                                            |
| ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="/assets/demo/sample_16s_320x320.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/3cab536e-9b43-4b33-8da8-a0f9cf842ff2) | [<img src="/assets/demo/sample_16s_224x448.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/9fb0b9e0-c6f4-4935-b29e-4cac10b373c4) | [<img src="/assets/demo/sora_16x426x240_3.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) |


</details>

<details>
<summary>OpenSora 1.0 演示</summary>

| **2秒 512×512**                                                                                                                                                                 | **2秒 512×512**                                                                                                                                                              | **2秒 512×512**                                                                                                                                    |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="/assets/readme/sample_0.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/de1963d3-b43b-4e68-a670-bb821ebb6f80)                                 | [<img src="/assets/readme/sample_1.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/13f8338f-3d42-4b71-8142-d234fbd746cc)                              | [<img src="/assets/readme/sample_2.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/fa6a65a6-e32a-4d64-9a9e-eabb0ebb8c16)    |
|森林地区宁静的夜景。 [...] 该视频是一段延时摄影，捕捉了白天到夜晚的转变，湖泊和森林始终作为背景。 | 无人机拍摄的镜头捕捉到了海岸悬崖的壮丽美景，[...] 海水轻轻地拍打着岩石底部和紧贴悬崖顶部的绿色植物。| 瀑布从悬崖上倾泻而下，流入宁静的湖泊，气势磅礴。[...] 摄像机角度提供了瀑布的鸟瞰图。 |
| [<img src="/assets/readme/sample_3.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/64232f84-1b36-4750-a6c0-3e610fa9aa94)                                 | [<img src="/assets/readme/sample_4.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/983a1965-a374-41a7-a76b-c07941a6c1e9)                              | [<img src="/assets/readme/sample_5.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/ec10c879-9767-4c31-865f-2e8d6cf11e65)    |
| 夜晚繁华的城市街道，充满了汽车前灯的光芒和路灯的氛围光。 [...]                                                           | 向日葵田的生机勃勃，美不胜收。向日葵整齐排列，给人一种秩序感和对称感。 [...]                                            |宁静的水下场景，一只海龟在珊瑚礁中游动。这只海龟的壳呈绿褐色 [...]                   |

视频经过降采样以.gif用于显示。单击查看原始视频。提示经过修剪以用于显示，请参阅[此处](/assets/texts/t2v_samples.txt)查看完整提示。

</details>

## 🔆 新功能/更新

* 📍 **Open-Sora 1.2** 发布。模型权重可在[此处](#模型权重)查看。有关更多详细信息，请参阅我们的**[技术报告 v1.2](docs/report_03.md)** 。
* ✅ 支持整流流调度。
* ✅ 训练我们的 3D-VAE 进行时间维度压缩。
* 📍 **Open-Sora 1.1**发布。模型权重可在[此处](#模型权重)获得。它针对**0s~15s、144p 到 720p、各种宽高比**的视频进行训练。有关更多讨论，请参阅我们的**[技术报告 v1.1](/docs/report_02.md)** 。
* 🔧 **数据处理流程** v1.1发布，提供从原始视频到（文本，视频片段）对的自动处理流程，包括场景剪切$\rightarrow$过滤（美学、光流、OCR 等）$\rightarrow$字幕$\rightarrow$管理。使用此工具，您可以轻松构建视频数据集。
* ✅ 改进的 ST-DiT 架构包括 rope 位置编码、qk 范数、更长的文本长度等。
* ✅ 支持任意分辨率、纵横比和时长（包括图像）的训练。
* ✅ 支持图像和视频调节以及视频编辑，从而支持动画图像，连接视频等。
* 📍 **Open-Sora 1.0**发布。模型权重可在[此处](#模型权重)获得。仅使用 400K 视频片段和 200 个 H800 天（相比 Stable Video Diffusion 中的 152M 样本），我们就能生成 2s 512×512 视频。有关更多讨论，请参阅我们的**[技术报告 v1.0](docs/report_01.md)**。
* ✅从图像扩散模型到视频扩散模型的三阶段训练。我们为每个阶段提供权重。
* ✅ 支持训练加速，包括加速 Transformer、更快的 T5 和 VAE 以及序列并行。Open-Sora 在 64x512x512 视频上训练时可将训练速度提高**55%**。详细信息位于[训练加速.md](docs/acceleration.md)。
* 🔧 **数据预处理流程 v1.0**,包括 [下载](tools/datasets/README.md), [视频剪辑](tools/scene_cut/README.md), 和 [字幕](tools/caption/README.md) 工具. 我们的数据收集计划可在 [数据集.md](docs/datasets.md)中找到.

<details>
<summary>查看更多</summary>

✅ 我们发现[VideoGPT](https://wilson1yan.github.io/videogpt/index.html)的 VQ-VAE质量较低，因此采用了[Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original)中的更好的 VAE 。我们还发现时间维度的修补会降低质量。有关更多讨论，请参阅我们的**[技术报告v1.0](docs/report_01.md)**。
✅ 我们研究了不同的架构，包括 DiT、Latte 和我们提出的 **STDiT**。我们的STDiT在质量和速度之间实现了更好的平衡。请参阅我们的 **[技术报告v1.0](docs/report_01.md)**以了解更多讨论。
✅ 支持 CLIP 和 T5 文本条件控制。
✅ 通过将图像视为单帧视频，我们的项目支持在图像和视频上训练 DiT（例如 ImageNet 和 UCF101）。有关更多说明，请参阅[commands.md](docs/commands.md) 。
✅ 支持使用[DiT](https://github.com/facebookresearch/DiT), [Latte](https://github.com/Vchitect/Latte),
  和 [PixArt](https://pixart-alpha.github.io/).的官方权重进行推理。
✅ 重构代码库。查看[structure.md](docs/structure.md)以了解项目结构以及如何使用配置文件。

</details>

### 按优先级排序的 TODO 列表

<details>
<summary>查看更多</summary>

* [x] 训练视频 VAE 并使我们的模型适应新的 VAE
* [x] 缩放模型参数和数据集大小
* [x] 纳入更好的调度程序（整流流程）
* [x] 评估流程
* [x] 完成数据处理流程（包括密集光流、美学评分、文本-图像相似度等）。有关更多信息，请参阅[数据集](/docs/datasets.md)
* [x] 支持图像和视频调节
* [x] 支持可变的纵横比、分辨率和持续时间

</details>

## 内容

* [安装](#安装)
* [模型权重](#模型权重)
* [Gradio演示](#gradio演示)
* [推理](#推理)
* [数据处理](#数据处理)
* [训练](#训练)
* [评估](#评估)
* [贡献](#贡献)
* [引用](#引用)
* [致谢](#致谢)

下面列出了其他有用的文档和链接。

* 报告: [技术报告 v1.2](docs/report_v3.md), [技术报告 v1.1](/docs/report_v2.md), [技术报告 v1.0](/docs/report_v1.md), [训练加速.md](docs/acceleration.md)
* Repo 结构: [结构.md](docs/structure.md)
* 配置文件说明: [config.md](docs/config.md)
* 常用命令: [commands.md](docs/commands.md)
* 数据处理管道和数据集: [datasets.md](docs/datasets.md)
* 每个数据处理工具的 README: [dataset conventions and management](/tools/datasets/README.md), [scene cutting](/tools/scene_cut/README.md), [scoring](/tools/scoring/README.md), [caption](/tools/caption/README.md)
* 评估: [eval](/eval/README.md)
* 画廊: [gallery](https://hpcaitech.github.io/Open-Sora/)

## 安装

### 从源头安装

对于 CUDA 12.1，您可以使用以下命令[安装](/docs/installation.md)依赖项。否则，请参阅安装以获取有关不同 cuda 版本的更多说明以及数据预处理的其他依赖项。

```bash
# create a virtual env and activate (conda as an example)
conda create -n opensora python=3.9
conda activate opensora

# install torch, torchvision and xformers
pip install -r requirements/requirements-cu121.txt

# download the repo
git clone https://github.com/hpcaitech/Open-Sora
cd Open-Sora

# the default installation is for inference only
pip install -v . # for development mode, `pip install -v -e .`
```


(Optional, recommended for fast speed, especially for training) To enable `layernorm_kernel` and `flash_attn`, you need to install `apex` and `flash-attn` with the following commands.

```bash
# install flash attention
# set enable_flash_attn=False in config to disable flash attention
pip install packaging ninja
pip install flash-attn --no-build-isolation

# install apex
# set enable_layernorm_kernel=False in config to disable apex
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
```

### 使用Docker

运行以下命令从提供的Dockerfile 构建docker 镜像。

```bash
docker build -t opensora .
```

运行以下命令以交互模式启动docker容器。

```bash
docker run -ti --gpus all -v .:/workspace/Open-Sora opensora
```

## 模型权重

### Open-Sora 1.2 模型权重
| 模型 | 模型大小 | 数据 | 迭代次数 | 批次大小 | 网址 |
| ---------- | ---------- | ---- | ----------- | ---------- | --- |
| Diffusion | 1.1B       | 30M  | 70k         | 动态大小    | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v3) |
| VAE       | 384M       | 3M   | 1M          | 8          | [:link:](https://huggingface.co/hpcai-tech/OpenSora-VAE-v1.2) |

请参阅我们的**[report 1.2](docs/report_v3.md)**以了解更多信息。

### Open-Sora 1.1 模型权重

<details>
<summary>查看更多</summary>

| 分辨率         | 模型大小 | Data                       | #iterations | Batch Size                                        | URL                                                                  |
| ------------------ | ---------- | -------------------------- | ----------- | ------------------------------------------------- | -------------------------------------------------------------------- |
| mainly 144p & 240p | 700M       | 10M videos + 2M images     | 100k        | [dynamic](/configs/opensora-v1-1/train/stage2.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage2) |
| 144p to 720p       | 700M       | 500K HQ videos + 1M images | 4k          | [dynamic](/configs/opensora-v1-1/train/stage3.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage3) |

请参阅我们的 **[报告 1.1](docs/report_02.md)** 以了解更多信息。

:warning: **局限性**: 此版本包含已知问题，我们将在下一版本中修复这些问题（因为我们为下一版本节省了计算资源）。此外，由于此问题，视频生成可能会长时间失败，高分辨率将产生嘈杂的结果。

</details>

### Open-Sora 1.0 模型权重
<details>
<summary>查看更多</summary>

| 分辨率 | 模型大小 | 数据   | 迭代次数 | 批量大小 | GPU 天数 (H800) | 网址 |
| ---------- | ---------- | ------ | ----------- | ---------- | --------------- | --- |
| 16×512×512 | 700M       | 20K HQ | 20k         | 2×64       | 35              | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x512x512.pth) |
| 16×256×256 | 700M       | 20K HQ | 24k         | 8×64       | 45              | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x256x256.pth) |
| 16×256×256 | 700M       | 366K   | 80k         | 8×64       | 117             | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-16x256x256.pth)    |

训练流程: 16x256x256 $\rightarrow$ 16x256x256 高清 $\rightarrow$ 16x512x512 高质量.

我们的模型权重部分由 [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha)初始化，参数数量为724M.更多信息请参阅  **[技术报告v1.0](docs/report_v1.md)**。数据集相关信息请参阅[数据集文件](docs/datasets.md). HQ 表示高质量.

:warning: **局限性**: 我们的模型是在有限的预算下训练的。质量和文本对齐相对较差。该模型表现不佳，特别是在生成人类时，无法遵循详细的说明。我们正在努力提高质量和文本对齐。

</details>

## Gradio演示

🔥 您可以在Hugging Face 上的[🤗 Gradio 应用程序](https://huggingface.co/spaces/hpcai-tech/open-sora)上在线体验Open-Sora。【由于GPU资源不足，已失效】

### 本地部署

如果您想在本地部署 gradio，我们还在这个存储库中提供了一个[Gradio 应用程序](./gradio) ，您可以使用以下命令启动一个交互式 Web 应用程序来体验使用 Open-Sora 生成视频。

```bash
pip install gradio spaces
python gradio/app.py
```

这将在您的本地主机上启动 Gradio 应用程序。如果您想了解有关 Gradio 应用程序的更多信息，可以参考[Gradio README](./gradio/README.md)。

要启用提示增强和其他语言输入（例如中文输入），您需要OPENAI_API_KEY在环境中进行设置。查看[OpenAI的文档](https://platform.openai.com/docs/quickstart)以获取您的 API 密钥。

```bash
export OPENAI_API_KEY=YOUR_API_KEY
```

### 入门

在 Gradio 应用程序中，基本选项如下：

![Gradio Demo](/assets/readme/gradio_basic.png)

生成视频最简单的方式是输入文本提示，然后点击“**生成视频**”按钮（如果找不到，请向下滚动）。生成的视频将显示在右侧面板中。勾选“**使用 GPT4o 增强提示**”将使用 GPT-4o 来细化提示，而“**随机提示**”按钮将由 GPT-4o 为您生成随机提示。由于 OpenAI 的 API 限制，提示细化结果具有一定的随机性。

然后，你可以选择生成视频的**分辨率**、**时长**、**长宽比**。不同的分辨率和视频长度会影响视频生成速度。在 80G H100 GPU 上，生成速度和峰值内存使用量为：

|   分辨率   | 图像   | 2秒       | 4秒        | 8秒        | 16秒       |
| ---- | ------- | -------- | --------- | --------- | --------- |
| 360p | 3s, 24G | 18s, 27G | 31s, 27G  | 62s, 28G  | 121s, 33G |
| 480p | 2s, 24G | 29s, 31G | 55s, 30G  | 108s, 32G | 219s, 36G |
| 720p | 6s, 27G | 68s, 41G | 130s, 39G | 260s, 45G | 547s, 67G |

注意，除了文本转视频，你还可以使用图片转视频。你可以上传图片，然后点击“**生成视频**”按钮，生成以图片为第一帧的视频。或者，你可以填写文本提示，然后点击“**生成图片**”按钮，根据文本提示生成图片，然后点击“**生成视频**”按钮，根据同一模型生成的图片生成视频。

![Gradio Demo](/assets/readme/gradio_option.png)

然后您可以指定更多选项，包括“**运动强度**”、“**美学**”和“**相机运动**”。如果未选中“启用”或选择“无”，则不会将信息传递给模型。否则，模型将生成具有指定运动强度、美学分数和相机运动的视频。

对于**美学分数**，我们建议使用高于 6 的值。对于**运动强度**，较小的值将导致更平滑但动态性较差的视频，而较大的值将导致更动态但可能更模糊的视频。因此，您可以尝试不使用它，然后根据生成的视频进行调整。对于**相机运动**，有时模型无法很好地遵循指令，我们正在努力改进它。

您还可以调整“**采样步数**”，这是去噪的次数，与生成速度直接相关。小于 30 的数字通常会导致较差的生成结果，而大于 100 的数字通常不会有明显的改善。“种子”用于可重复性，您可以将其设置为固定数字以生成相同的视频。“**CFG 比例**”控制模型遵循文本提示的程度，较小的值会导致视频更随机，而较大的值会导致视频更遵循文本（建议为 7）。

对于更高级的用法，您可以参考[Gradio README](./gradio/README.md#advanced-usage).

## 推理

### Open-Sora 1.2 命令行推理

基础的命令行推理:

```bash
# text to video
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
  --num-frames 4s --resolution 720p --aspect-ratio 9:16 \
  --prompt "a beautiful waterfall"
```

您可以向命令行添加更多选项来定制生成。

```bash
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
  --num-frames 4s --resolution 720p --aspect-ratio 9:16 \
  --num-sampling-steps 30 --flow 5 --aes 6.5 \
  --prompt "a beautiful waterfall"
```

对于图像到视频生成和其他功能，API 与 Open-Sora 1.1 兼容。请参阅[此处](commands.md)了解更多说明。

如果您的安装不包含 `apex` 和 `flash-attn`, 则需要在配置文件中或通过以下命令禁用它们。

```bash
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
  --num-frames 4s --resolution 720p \
  --layernorm-kernel False --flash-attn False \
  --prompt "a beautiful waterfall"
```

### 序列并行推理

要启用序列并行，您需要使用 `torchrun` 来运行推理脚本。以下命令将使用 2 个 GPU 运行推理。

```bash
# text to video
CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node 2 scripts/inference.py configs/opensora-v1-2/inference/sample.py \
  --num-frames 4s --resolution 720p --aspect-ratio 9:16 \
  --prompt "a beautiful waterfall"
```

:warning: **注意**: gradio 部署不支持序列并行。目前，只有当维度可以除以 GPU 数量时才支持序列并行。因此，在某些情况下可能会失败。我们测试了 4 个 GPU 用于 720p 和 2 个 GPU 用于 480p。


### GPT-4o 快速细化

我们发现 GPT-4o 可以细化提示并提高生成视频的质量。利用此功能，您还可以使用其他语言（例如中文）作为提示。要启用此功能，您需要在环境中准备您的 openai api 密钥：

```bash
export OPENAI_API_KEY=YOUR_API_KEY
```

然后您可以用 `--llm-refine True` 启用GPT-4o进行提示细化以完成推理。

### Open-Sora 1.1 命令行推理
<details>
<summary>查看更多</summary>

由于 Open-Sora 1.1 支持动态输入大小的推理，因此您可以将输入大小作为参数传递。

```bash
# text to video
python scripts/inference.py configs/opensora-v1-1/inference/sample.py --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854
```

如果您的安装不包含`apex` 和 `flash-attn`，则需要在配置文件中或通过以下命令禁用它们。

```bash
python scripts/inference.py configs/opensora-v1-1/inference/sample.py --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854 --layernorm-kernel False --flash-attn False
```

请参阅[此处](docs/commands.md#inference-with-open-sora-11)了解更多说明，包括文本转图像、图像转视频、视频转视频和无限时间生成。

</details>

### Open-Sora 1.0 命令行推理

<details>
<summary>查看更多</summary>

我们还提供了离线推理脚本。运行以下命令生成样本，所需的模型权重将自动下载。要更改采样提示，请修改传递给 `--prompt-path` 的 txt 文件。请参阅[此处](docs/structure.md#inference-config-demos)以自定义配置。

```bash
# Sample 16x512x512 (20s/sample, 100 time steps, 24 GB memory)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x512x512.py --ckpt-path OpenSora-v1-HQ-16x512x512.pth --prompt-path ./assets/texts/t2v_samples.txt

# Sample 16x256x256 (5s/sample, 100 time steps, 22 GB memory)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path OpenSora-v1-HQ-16x256x256.pth --prompt-path ./assets/texts/t2v_samples.txt

# Sample 64x512x512 (40s/sample, 100 time steps)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt

# Sample 64x512x512 with sequence parallelism (30s/sample, 100 time steps)
# sequence parallelism is enabled automatically when nproc_per_node is larger than 1
torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt
```

速度是在 H800 GPU 上测试的。有关使用其他型号进行推理，请参阅[此处](docs/commands.md) 了解更多说明。要降低内存使用量，请在配置中将 `vae.micro_batch_size` 设置为较小的值（采样速度会略微降低）。

</details>

## 数据处理

高质量的数据对于训练良好的生成模型至关重要。为此，我们建立了完整的数据处理流程，可以将原始视频无缝转换为高质量的视频-文本对。流程如下所示。有关详细信息，请参阅[数据处理](docs/data_processing.md)。另请查看我们使用的[数据集](docs/datasets.md)。

![Data Processing Pipeline](/assets/readme/report_data_pipeline.png)

## 训练

### Open-Sora 1.2 训练

训练过程与Open-Sora 1.1相同。

```bash
# one node
torchrun --standalone --nproc_per_node 8 scripts/train.py \
    configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
# multiple nodes
colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py \
    configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```

### Open-Sora 1.1 训练

<details>
<summary>查看更多</summary>

在 `csv` 文件中准备好数据后，运行以下命令在单个节点上启动训练。

```bash
# one node
torchrun --standalone --nproc_per_node 8 scripts/train.py \
    configs/opensora-v1-1/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
# multiple nodes
colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py \
    configs/opensora-v1-1/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```

</details>

### Open-Sora 1.0 训练

<details>
<summary>查看更多</summary>

在 `csv` 文件中准备好数据后，运行以下命令在单个节点上启动训练。

```bash
# 1 GPU, 16x256x256
torchrun --nnodes=1 --nproc_per_node=1 scripts/train.py configs/opensora/train/16x256x256.py --data-path YOUR_CSV_PATH
# 8 GPUs, 64x512x512
torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```

要在多个节点上启动训练，请根据[ColossalAI](https://colossalai.org/docs/basics/launch_colossalai/#launch-with-colossal-ai-cli)准备一个主机文件，并运行以下命令。

```bash
colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```
有关训练其他模型和高级用法，请参阅[此处](docs/commands.md)获取更多说明。

</details>

## 评估

我们支持基于以下方面的评估：

- 验证损失
- [VBench](https://github.com/Vchitect/VBench/tree/master) 分数
- VBench-i2v 分数
- 批量生成以供人工评估

所有评估代码均发布在 `eval` 文件夹中。查看[README](/eval/README.md)了解更多详细信息。我们的 [技术报告](report_v3.md#评估)还提供了有关训练期间评估的更多信息。下表显示 Open-Sora 1.2 相比 Open-Sora 1.0 有大幅提升。

| 模型          | 总得分 | 质量得分 | 语义得分 |
| -------------- | ----------- | ------------- | -------------- |
| Open-Sora V1.0 | 75.91%      | 78.81%        | 64.28%         |
| Open-Sora V1.2 | 79.23%      | 80.71%        | 73.30%         |

## VAE 训练与评估

我们训练一个由空间 VAE 和时间 VAE 组成的 VAE 管道。有关更多详细信息，请参阅[VAE 文档](vae.md)。在运行以下命令之前，请按照我们的[安装文档](installation.md)安装 VAE 和评估所需的依赖项。

如果您想训练自己的 VAE，您需要按照[数据处理](#数据处理)流程在 csv 中准备数据，然后运行以下命令。请注意，您需要根据自己的 csv 数据大小相应地调整配置文件中的训练 `epochs` 数量。


```bash
# stage 1 training, 380k steps, 8 GPUs
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH
# stage 2 training, 260k steps, 8 GPUs
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH
# stage 3 training, 540k steps, 24 GPUs
torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH
```

为了评估 VAE 的性能，您需要首先运行 VAE 推理来生成视频，然后计算生成的视频的分数：

```bash
# video generation
torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR
# the original videos will be saved to `YOUR_VIDEO_DIR_ori`
# the reconstructed videos through the pipeline will be saved to `YOUR_VIDEO_DIR_rec`
# the reconstructed videos through the spatial VAE only will be saved to `YOUR_VIDEO_DIR_spatial`

# score calculation
python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
```


## 贡献

感谢以下出色的贡献者：

<a href="https://github.com/hpcaitech/Open-Sora/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=hpcaitech/Open-Sora" />
</a>

如果您希望为该项目做出贡献，请参阅[Contribution Guideline](./CONTRIBUTING.md)。

## 致谢

这里我们仅列出了部分项目，其他研究成果及数据集请参考我们的报告。

* [ColossalAI](https://github.com/hpcaitech/ColossalAI): 强大的大型模型并行加速与优化系统。
* [DiT](https://github.com/facebookresearch/DiT): 带有 Transformer 的可扩展扩散模型。
* [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT): DiT 训练的加速器。我们从 OpenDiT 中采用了有价值的训练进度加速策略。
* [PixArt](https://github.com/PixArt-alpha/PixArt-alpha): 一个基于 DiT 的开源文本转图像模型。
* [Latte](https://github.com/Vchitect/Latte): 尝试高效地训练视频的 DiT。
* [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): 一个强大的图像 VAE 模型。
* [CLIP](https://github.com/openai/CLIP): 一个强大的文本图像嵌入模型。
* [T5](https://github.com/google-research/text-to-text-transfer-transformer): 强大的文本编码器。
* [LLaVA](https://github.com/haotian-liu/LLaVA): 基于 [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) 和 [Yi-34B](https://huggingface.co/01-ai/Yi-34B) 的强大图像字幕模型。
* [PLLaVA](https://github.com/magic-research/PLLaVA): 一个强大的视频字幕模型。
* [MiraData](https://github.com/mira-space/MiraData):具有长持续时间和结构化字幕的大规模视频数据集。

我们感谢他们的出色工作和对开源的慷慨贡献。

## 引用

```bibtex
@software{opensora,
  author = {Zangwei Zheng and Xiangyu Peng and Tianji Yang and Chenhui Shen and Shenggui Li and Hongxin Liu and Yukun Zhou and Tianyi Li and Yang You},
  title = {Open-Sora: Democratizing Efficient Video Production for All},
  month = {March},
  year = {2024},
  url = {https://github.com/hpcaitech/Open-Sora}
}
```

## Star增长

[![Star History Chart](https://api.star-history.com/svg?repos=hpcaitech/Open-Sora&type=Date)](https://star-history.com/#hpcaitech/Open-Sora&Date)


================================================
FILE: Open-Sora/docs/zh_CN/READMEv1.1.md
================================================
<p align="center">
    <img src="../../assets/readme/icon.png" width="250"/>
</p>

<div align="center">
    <a href="https://github.com/hpcaitech/Open-Sora/stargazers"><img src="https://img.shields.io/github/stars/hpcaitech/Open-Sora?style=social"></a>
    <a href="https://hpcaitech.github.io/Open-Sora/"><img src="https://img.shields.io/badge/Gallery-View-orange?logo=&amp"></a>
    <a href="https://discord.gg/shpbperhGs"><img src="https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&amp"></a>
    <a href="https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-247ipg9fk-KRRYmUl~u2ll2637WRURVA"><img src="https://img.shields.io/badge/Slack-ColossalAI-blueviolet?logo=slack&amp"></a>
    <a href="https://twitter.com/yangyou1991/status/1769411544083996787?s=61&t=jT0Dsx2d-MS5vS9rNM5e5g"><img src="https://img.shields.io/badge/Twitter-Discuss-blue?logo=twitter&amp"></a>
    <a href="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png"><img src="https://img.shields.io/badge/微信-小助手加群-green?logo=wechat&amp"></a>
    <a href="https://hpc-ai.com/blog/open-sora-v1.0"><img src="https://img.shields.io/badge/Open_Sora-Blog-blue"></a>
</div>

## Open-Sora： 完全开源的高效复现类Sora视频生成方案
**Open-Sora**项目是一项致力于**高效**制作高质量视频，并使所有人都能使用其模型、工具和内容的计划。
通过采用**开源**原则，Open-Sora 不仅实现了先进视频生成技术的低成本普及，还提供了一个精简且用户友好的方案，简化了视频制作的复杂性。
通过 Open-Sora，我们希望更多开发者一起探索内容创作领域的创新、创造和包容。

[[English Document]](/README.md)

 <h4>Open-Sora 项目目前处在早期阶段，并将持续更新。</h4>

## 📰 资讯
> 由于文档需要进行翻译，最新资讯请看[英文文档](/README.md#-news)
* **[2024.04.25]** 🤗 我们在Hugging Face Spaces上发布了Open-Sora的[Gradio demo](https://huggingface.co/spaces/hpcai-tech/open-sora)。
* **[2024.04.25]** 🔥 我们发布了支持**2秒至15秒、144p至720p、任意宽高比**的文本到图像、文本到视频、图像到视频、视频到视频、无限时间生成的**Open-Sora 1.1**版本。此外，还发布了一个完整的视频处理流程。 [[checkpoints]]() [[report]](/docs/report_02.md)
* **[2024.03.18]** 🔥 我们发布了**Open-Sora 1.0**，这是一个完全开源的视频生成项目。
* Open-Sora 1.0 支持视频数据预处理、加速训练、推理等全套流程。
* 我们提供的[模型权重](#模型权重)只需 3 天的训练就能生成 2 秒的 512x512 视频。
* **[2024.03.04]** Open-Sora：开源Sora复现方案，成本降低46%，序列扩充至近百万。[[英文博客]](https://hpc-ai.com/blog/open-sora)

## 🎥 最新视频

| **2s 512×512**                                                                                                                                                                 | **2s 512×512**                                                                                                                                                              | **2s 512×512**                                                                                                                                    |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="/assets/readme/sample_0.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/de1963d3-b43b-4e68-a670-bb821ebb6f80)                                 | [<img src="/assets/readme/sample_1.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/13f8338f-3d42-4b71-8142-d234fbd746cc)                              | [<img src="/assets/readme/sample_2.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/fa6a65a6-e32a-4d64-9a9e-eabb0ebb8c16)    |
| A serene night scene in a forested area. [...] The video is a time-lapse, capturing the transition from day to night, with the lake and forest serving as a constant backdrop. | A soaring drone footage captures the majestic beauty of a coastal cliff, [...] The water gently laps at the rock base and the greenery that clings to the top of the cliff. | The majestic beauty of a waterfall cascading down a cliff into a serene lake. [...] The camera angle provides a bird's eye view of the waterfall. |
| [<img src="/assets/readme/sample_3.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/64232f84-1b36-4750-a6c0-3e610fa9aa94) | [<img src="/assets/readme/sample_4.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/983a1965-a374-41a7-a76b-c07941a6c1e9) | [<img src="/assets/readme/sample_5.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/ec10c879-9767-4c31-865f-2e8d6cf11e65) |
| A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. [...]                                                           | The vibrant beauty of a sunflower field. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. [...]                                            | A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell [...]                   |

视频经过降采样处理为`.gif`格式，以便显示。点击查看原始视频。为便于显示，文字经过修剪，全文请参见 [此处](/assets/texts/t2v_samples.txt)。在我们的[图片库](https://hpcaitech.github.io/Open-Sora/)中查看更多样本。

## 🔆 新功能
> 由于文档需要进行翻译，最新资讯请看[英文文档](/README.md#-new-featuresupdates)
* 📍Open-Sora-v1 已发布。[这里](#模型权重)提供了模型权重。只需 400K 视频片段和在单卡 H800 上训练 200 天（类比Stable Video Diffusion 的 152M 样本），我们就能生成 2 秒的 512×512 视频。
* ✅ 从图像扩散模型到视频扩散模型的三阶段训练。我们提供每个阶段的权重。
* ✅ 支持训练加速，包括Transformer加速、更快的 T5 和 VAE 以及序列并行。在对 64x512x512 视频进行训练时，Open-Sora 可将训练速度提高**55%**。详细信息请参见[训练加速](acceleration.md)。
* 🔧 我们提供用于数据预处理的视频切割和字幕工具。有关说明请点击[此处](tools/data/README.md)，我们的数据收集计划请点击 [数据集](datasets.md)。
* ✅ 我们发现来自[VideoGPT](https://wilson1yan.github.io/videogpt/index.html)的 VQ-VAE 质量较低，因此采用了来自[Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original) 的高质量 VAE。我们还发现在时间维度上进行 patch 划分会导致生成质量降低。更多讨论，请参阅我们的 **[报告](report_v1.md)**。
* ✅ 我们研究了不同的架构，包括 DiT、Latte 和我们提出的 **STDiT**。我们的STDiT在质量和速度之间实现了更好的权衡。更多讨论，请参阅我们的 **[报告](report_v1.md)**。
* ✅ 支持 CLIP 和 T5 文本条件化。
* ✅ 通过将图像视为单帧视频，我们的项目支持在图像和视频（如 ImageNet 和 UCF101）上训练 DiT。更多说明请参见 [指令解析](commands.md)。
* ✅ 利用[DiT](https://github.com/facebookresearch/DiT)、[Latte](https://github.com/Vchitect/Latte) 和 [PixArt](https://pixart-alpha.github.io/) 的官方权重支持推理。

<details>
<summary>查看更多</summary>

* ✅ 重构代码库。请参阅[结构](structure.md)，了解项目结构以及如何使用配置文件。

</details>

### 下一步计划【按优先级排序】

* [ ] 训练视频-VAE并让模型适应新的VAE **[项目进行中]**
* [ ] 缩放模型参数和数据集大小 **[项目进行中]**
* [ ] 采用更好的调度器（scheduler），例如 SD3 中的修正流（rectified flow）。 **[项目进行中]**

<details>
<summary>查看更多</summary>

* [x] 评估流程。
* [x] 完成数据处理流程（包括密集光流、美学评分、文本图像相似性、重复数据删除等）。更多信息请参见[数据集](datasets.md)
* [x] 支持图像和视频调节。
* [x] 支持可变长宽比、分辨率和持续时间。

</details>

## 目录

* [安装](#安装)
* [模型权重](#模型权重)
* [推理](#推理)
* [数据处理](#数据处理)
* [训练](#训练)
* [评估](#评估)
* [贡献](#贡献)
* [声明](#声明)
* [引用](#引用)

## 安装

### 从源码安装
```bash
# create a virtual env
conda create -n opensora python=3.10

# install torch
# the command below is for CUDA 12.1, choose install commands from
# https://pytorch.org/get-started/locally/ based on your own CUDA version
pip3 install torch torchvision

# install flash attention (optional)
pip install packaging ninja
pip install flash-attn --no-build-isolation

# install apex (optional)
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git

# install xformers
pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121

# install this project
git clone https://github.com/hpcaitech/Open-Sora
cd Open-Sora
pip install -v .
```

### 使用Docker镜像

运行如下指令使用提供的Dockerfile构建镜像：

```bash
docker build -t opensora ./docker
```

运行以下命令以启动交互模式下的 Docker 容器：

```bash
docker run -ti --gpus all -v {MOUNT_DIR}:/data opensora
```

安装完成后，建议阅读[结构](structure.md)，了解项目结构以及如何使用配置文件。

## 模型权重

| 分辨率  | 数据   | 迭代次数 | 批量大小 | GPU 天数 (H800) | 网址       |
| ---------- | ------ | ----------- | ---------- | --------------- | ---------- |
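| 16×256×256 | 366K   | 80k         | 8×64       | 117             | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-16x256x256.pth) |
| 16×256×256 | 20K HQ | 24k         | 8×64       | 45              | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x256x256.pth) |
| 16×512×512 | 20K HQ | 20k         | 2×64       | 35              | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x512x512.pth) |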

我们模型的权重部分由[PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) 初始化。参数数量为 724M。有关训练的更多信息，请参阅我们的 **[报告](report_v1.md)**。有关数据集的更多信息，请参阅[数据](datasets.md)。HQ 表示高质量。
:warning: **局限性**：我们的模型是在有限的预算内训练出来的。质量和文本对齐度相对较差。特别是在生成人类时，模型表现很差，无法遵循详细的指令。我们正在努力改进质量和文本对齐。

## 推理

要使用我们提供的权重进行推理，首先要将[T5](https://huggingface.co/DeepFloyd/t5-v1_1-xxl/tree/main)权重下载到pretrained_models/t5_ckpts/t5-v1_1-xxl 中。然后下载模型权重。运行以下命令生成样本。请参阅[此处](structure.md#推理配置演示)自定义配置。

```bash
# Sample 16x512x512 (20s/sample, 100 time steps, 24 GB memory)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x512x512.py --ckpt-path OpenSora-v1-HQ-16x512x512.pth --prompt-path ./assets/texts/t2v_samples.txt

# Sample 16x256x256 (5s/sample, 100 time steps, 22 GB memory)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path OpenSora-v1-HQ-16x256x256.pth --prompt-path ./assets/texts/t2v_samples.txt

# Sample 64x512x512 (40s/sample, 100 time steps)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt

# Sample 64x512x512 with sequence parallelism (30s/sample, 100 time steps)
# sequence parallelism is enabled automatically when nproc_per_node is larger than 1
torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt

```

我们在 H800 GPU 上进行了速度测试。如需使用其他模型进行推理，请参阅[此处](commands.md)获取更多说明。减小`vae.micro_batch_size`来降低显存使用（但取样速度会略微减慢）。

## 数据处理

高质量数据是高质量模型的关键。[这里](datasets.md)有我们使用过的数据集和数据收集计划。我们提供处理视频数据的工具。目前，我们的数据处理流程包括以下步骤：

1. 下载数据集。[[文件](/tools/datasets/README.md)]
2. 将视频分割成片段。 [[文件](/tools/scene_cut/README.md)]
3. 生成视频字幕。 [[文件](/tools/caption/README.md)]

## 训练

### Open-Sora 1.0 训练
<details>
<summary>查看更多</summary>

要启动训练，首先要将[T5](https://huggingface.co/DeepFloyd/t5-v1_1-xxl/tree/main)权重下载到pretrained_models/t5_ckpts/t5-v1_1-xxl 中。然后运行以下命令在单个节点上启动训练。

```bash
# 1 GPU, 16x256x256
torchrun --nnodes=1 --nproc_per_node=1 scripts/train.py configs/opensora/train/16x256x256.py --data-path YOUR_CSV_PATH
# 8 GPUs, 64x512x512
torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```

要在多个节点上启动训练，请根据[ColossalAI](https://colossalai.org/docs/basics/launch_colossalai/#launch-with-colossal-ai-cli) 准备一个主机文件，并运行以下命令。

```bash
colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```

有关其他模型的训练和高级使用方法，请参阅[此处](commands.md)获取更多说明。

</details>

## 评估

点击[这里](https://github.com/hpcaitech/Open-Sora/blob/main/eval/README.md)查看评估

## 贡献

本中文翻译还有许多不足，如果您希望为该项目做出贡献，可以参考 [贡献指南](/CONTRIBUTING.md).

目前需要翻译或更新的文件：
* [ ] 更新[资讯](#-资讯)
* [ ] 更新[最新视频](#-最新视频)
* [ ] 更新[新功能](#-新功能)。
* [ ] 翻译[评估](https://github.com/hpcaitech/Open-Sora/blob/main/eval/README.md)文件
* [ ] 更新Open-Sora 1.1[训练](#训练)

## 声明

* [ColossalAI](https://github.com/hpcaitech/ColossalAI): A powerful large model parallel acceleration and optimization system.
* [DiT](https://github.com/facebookresearch/DiT): Scalable Diffusion Models with Transformers.
* [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT): An acceleration for DiT training. We adopt valuable acceleration strategies for training progress from OpenDiT.
* [PixArt](https://github.com/PixArt-alpha/PixArt-alpha): An open-source DiT-based text-to-image model.
* [Latte](https://github.com/Vchitect/Latte): An attempt to efficiently train DiT for video.
* [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): A powerful image VAE model.
* [CLIP](https://github.com/openai/CLIP): A powerful text-image embedding model.
* [T5](https://github.com/google-research/text-to-text-transfer-transformer): A powerful text encoder.
* [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [Yi-34B](https://huggingface.co/01-ai/Yi-34B).

我们对他们的出色工作和对开源的慷慨贡献表示感谢。

## 引用

```bibtex
@software{opensora,
  author = {Zangwei Zheng and Xiangyu Peng and Yang You},
  title = {Open-Sora: Democratizing Efficient Video Production for All},
  month = {March},
  year = {2024},
  url = {https://github.com/hpcaitech/Open-Sora}
}
```

[Zangwei Zheng](https://github.com/zhengzangw) and [Xiangyu Peng](https://github.com/xyupeng) equally contributed to this work during their internship at [HPC-AI Tech](https://hpc-ai.com/).

## Star 走势

[![Star History Chart](https://api.star-history.com/svg?repos=hpcaitech/Open-Sora&type=Date)](https://star-history.com/#hpcaitech/Open-Sora&Date)


================================================
FILE: Open-Sora/docs/zh_CN/acceleration.md
================================================
# 加速

>本文档对应于Open-Sora v1.1版本。

Open-Sora 旨在为扩散模型提供一个高速训练框架。在 64 帧 512x512 视频上训练时，我们可以实现 **55%** 的训练速度加速。我们的框架支持训练
**1分钟1080p视频**。

## 加速的 Transformer

Open-Sora 通过以下方式提高训练速度：

- 内核优化，包括 [flash attention](https://github.com/Dao-AILab/flash-attention), 融合 layernorm 内核以及由 colossalAI
  编译的内核。
- 混合并行性，包括 ZeRO。
- 用于更大批量的梯度检查点。

我们在图像上的训练速度可与 [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT) 相媲美，这是一个加速 DiT
训练的项目。训练速度是在批处理大小为 128、图像大小为 256x256 的 8 个 H800 GPU 上测量的。

| 模型       | 吞吐量 (img/s/GPU) | 吞吐量 (tokens/s/GPU) |
|----------|-----------------|--------------------|
| DiT      | 100             | 26k                |
| OpenDiT  | 175             | 45k                |
| OpenSora | 175             | 45k                |

## 高效的 STDiT

我们的 STDiT 采用时空注意力对视频数据进行建模。与直接在 DiT 上使用全注意力相比，我们的 STDiT 随着帧数的增加而更有效率。我们当前的框架仅对超长序列支持序列并行。

训练速度是在 8 个 H800 GPU 上测量的，应用了加速技术，GC 表示梯度检查点。
两者都具有像 PixArt 一样的 T5 调节。

| 模型               | 设置             | 吞吐量 (sample/s/GPU) | 吞吐量 (tokens/s/GPU) |
|------------------|----------------|--------------------|--------------------|
| DiT              | 16x256  (4k)   | 7.20               | 29k                |
| STDiT            | 16x256  (4k)   | 7.00               | 28k                |
| DiT              | 16x512  (16k)  | 0.85               | 14k                |
| STDiT            | 16x512  (16k)  | 1.45               | 23k                |
| DiT (GC)         | 64x512  (65k)  | 0.08               | 5k                 |
| STDiT (GC)       | 64x512  (65k)  | 0.40               | 25k                |
| STDiT (GC, sp=2) | 360x512 (370k) | 0.10               | 18k                |

使用 Video-VAE 在时间维度上进行 4 倍下采样时，24fps 视频有 450 帧。STDiT(28k tokens/s) 和 DiT 对图像 (高达 45k tokens/s)
两者之间的速度差距主要来自 T5 和 VAE 编码，以及时间注意力。

## 加速的编码器 (T5, VAE)

在训练过程中，文本由 T5 编码，视频由 VAE 编码。通常有两种方法可以加速训练：

1. 提前预处理文本和视频数据并保存到磁盘。
2. 在训练过程中对文本和视频数据进行编码，并加快编码过程。

对于选项 1，一个样本的 120 个令牌需要 1M 磁盘空间，而 64x64x64 的潜变量（latent）需要 4M。考虑训练包含 10M 视频剪辑的数据集，所需的总磁盘空间为
50TB。我们的存储系统目前还无法支撑这种数据规模。

对于选项 2，我们优化了 T5 的速度和内存占用。根据 [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT)，我们发现 VAE
消耗了大量的 GPU 内存。因此，我们将批大小拆分为较小的批大小，以便进行 VAE 编码。使用这两种技术，我们可以大大加快训练速度。

训练速度是在 8 个带有 STDiT 的 H800 GPU 上测量的。

| 加速模式         | 设置            | 吞吐量 (img/s/GPU) | 吞吐量 (tokens/s/GPU) |
|--------------|---------------|-----------------|--------------------|
| Baseline     | 16x256  (4k)  | 6.16            | 25k                |
| w. faster T5 | 16x256  (4k)  | 7.00            | 29k                |
| Baseline     | 64x512  (65k) | 0.94            | 15k                |
| w. both      | 64x512  (65k) | 1.45            | 23k                |


================================================
FILE: Open-Sora/docs/zh_CN/commands.md
================================================
# 命令

## 推理

您可以修改相应的配置文件来更改推理设置。在 [此处](/docs/structure.md#inference-config-demos) 查看更多详细信息。

### 在 ImageNet 上使用 DiT 预训练进行推理

以下命令会自动在 ImageNet 上下载预训练权重并运行推理。

```bash
python scripts/inference.py configs/dit/inference/1x256x256-class.py --ckpt-path DiT-XL-2-256x256.pt
```

### 在 UCF101 上使用 Latte 预训练进行推理

以下命令会自动下载 UCF101 上的预训练权重并运行推理。

```bash
python scripts/inference.py configs/latte/inference/16x256x256-class.py --ckpt-path Latte-XL-2-256x256-ucf101.pt
```

### 使用 PixArt-α 预训练权重进行推理

将 T5 下载到 `./pretrained_models` 并运行以下命令。

```bash
# 256x256
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x256x256.py --ckpt-path PixArt-XL-2-256x256.pth

# 512x512
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x512x512.py --ckpt-path PixArt-XL-2-512x512.pth

# 1024 multi-scale
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x1024MS.py --ckpt-path PixArt-XL-2-1024MS.pth
```

### 使用训练期间保存的 checkpoints 进行推理

在训练期间，会在 `outputs` 目录中创建一个实验日志记录文件夹。在每个 checkpoint 文件夹下（例如 `epoch12-global_step2000`），有一个 `ema.pt` 文件和共享的 `model` 文件夹。执行以下命令进行推理。

```bash
# 使用 ema 模型进行推理
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000/ema.pt

# 使用模型进行推理
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000

# 使用序列并行进行推理
# 当 nproc_per_node 大于 1 时，将自动启用序列并行
torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000
```

第二个命令将在 checkpoint 文件夹中自动生成一个 `model_ckpt.pt` 文件。

### 推理超参数

1. DPM 求解器擅长对图像进行快速推理。但是，它的视频推理的效果并不令人满意。若出于快速演示目的您可以使用这个求解器。

```python
type="dpm-solver"
num_sampling_steps=20
```

2. 您可以在视频推理上使用 [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) 微调的 VAE 解码器（消耗更多内存）。但是，我们没有看到视频推理效果有明显改善。要使用它，请将 [预训练权重](https://huggingface.co/maxin-cn/Latte/tree/main/t2v_required_models/vae_temporal_decoder) 下载到 `./pretrained_models/vae_temporal_decoder` 中，并修改配置文件，如下所示。

```python
vae = dict(
    type="VideoAutoencoderKLTemporalDecoder",
    from_pretrained="pretrained_models/vae_temporal_decoder",
)
```

## 训练

如果您要继续训练，请运行以下命令。参数 ``--load`` 与 ``--ckpt-path`` 的不同之处在于，``--load`` 还会加载优化器和数据加载器的状态。

```bash
torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --load YOUR_PRETRAINED_CKPT
```

如果要启用 wandb 日志，请将 `--wandb` 参数添加到命令中。

```bash
WANDB_API_KEY=YOUR_WANDB_API_KEY torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --wandb True
```

您可以修改相应的配置文件来更改训练设置。在 [此处](/docs/structure.md#training-config-demos) 查看更多详细信息。

### 训练超参数

1. `dtype` 是用于训练的数据类型。仅支持 `fp16` 和 `bf16`。ColossalAI 自动启用 `fp16` 和 `bf16` 的混合精度训练。在训练过程中，我们发现 `bf16` 更稳定。


================================================
FILE: Open-Sora/docs/zh_CN/datasets.md
================================================
# 数据集

## 正在使用的数据集

### HD-VG-130M

[HD-VG-130M](https://github.com/daooshee/HD-VG-130M?tab=readme-ov-file) 包括 130M 个文本视频对。其字幕由 BLIP-2 生成。我们发现其视频切分和文本质量相对较差。它包含 20 个拆分。对于 OpenSora 1.0，我们使用第一个拆分。我们计划使用整个数据集并对其进行重新处理。

### Inter4k

[Inter4k](https://github.com/alexandrosstergiou/Inter4K) 是一个包含 1k 个 4K 分辨率视频剪辑的数据集。该数据集最初是为超分辨率任务提出的。我们使用该数据集进行 HQ 训练。处理过的视频可以在[这里](README.md#数据处理)找到。

### Pexels.com

[Pexels.com](https://www.pexels.com/) 是一个提供免费库存照片和视频的网站。我们从该网站收集了 19K 个视频剪辑，用于高质量训练。处理过的视频可以在[这里](README.md#数据处理)找到。

## 数据集监视列表

我们也在关注以下数据集，并考虑在未来使用它们，这取决于我们的存储空间以及数据集的质量。

| 名称                | 大小           | 描述                            |
|-------------------|--------------|-------------------------------|
| Panda-70M         | 70M videos   | High quality video-text pairs |
| WebVid-10M        | 10M videos   | Low quality                   |
| InternVid-10M-FLT | 10M videos   |                               |
| EGO4D             | 3670 hours   |                               |
| OpenDV-YouTube    | 1700 hours   |                               |
| VidProM           | 6.69M videos |                               |


================================================
FILE: Open-Sora/docs/zh_CN/report_v1.md
================================================
# Open-Sora v1 技术报告

OpenAI的Sora在生成一分钟高质量视频方面非常出色。然而，它几乎没有透露任何关于其细节的信息。为了使人工智能更加“开放”，我们致力于构建一个开源版本的Sora。这份报告描述了我们第一次尝试训练一个基于Transformer的视频扩散模型。

## 选择高效的架构

为了降低计算成本，我们希望利用现有的VAE模型。Sora使用时空VAE来减少时间维度。然而，我们发现没有开源的高质量时空VAE模型。[MAGVIT](https://github.com/google-research/magvit)的4x4x4 VAE并未开源，而[VideoGPT](https://wilson1yan.github.io/videogpt/index.html)的2x4x4 VAE在我们的实验中质量较低。因此，我们决定在我们第一个版本中使用2D VAE（来自[Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original)）。

视频训练涉及大量的token。考虑到24fps的1分钟视频，我们有1440帧。通过VAE下采样4倍和patch大小下采样2倍，我们得到了1440x1024≈150万个token。在150万个token上进行全注意力计算将带来巨大的计算成本。因此，我们使用时空注意力来降低成本，这是遵循[Latte](https://github.com/Vchitect/Latte)的方法。

如图中所示，在STDiT（ST代表时空）中，我们在每个空间注意力之后立即插入一个时间注意力。这类似于Latte论文中的变种3。然而，我们并没有控制这些变体的相似数量的参数。虽然Latte的论文声称他们的变体比变种3更好，但我们在16x256x256视频上的实验表明，相同数量的迭代次数下，性能排名为：DiT（完整）> STDiT（顺序）> STDiT（并行）≈ Latte。因此，我们出于效率考虑选择了STDiT（顺序）。[这里](/docs/acceleration.md#efficient-stdit)提供了速度基准测试。


![Architecture Comparison](/assets/readme/report_arch_comp.png)

为了专注于视频生成，我们希望基于一个强大的图像生成模型来训练我们的模型。PixArt-α是一个经过高效训练的高质量图像生成模型，具有T5条件化的DiT结构。我们使用[PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha)初始化我们的模型，并将插入的时间注意力的投影层初始化为零。这种初始化在开始时保留了模型的图像生成能力，而Latte的架构则不能。插入的注意力将参数数量从5.8亿增加到7.24亿。

![Architecture](/assets/readme/report_arch.jpg)

借鉴PixArt-α和Stable Video Diffusion的成功，我们还采用了渐进式训练策略：在366K预训练数据集上进行16x256x256的训练，然后在20K数据集上进行16x256x256、16x512x512和64x512x512的训练。通过扩展位置嵌入，这一策略极大地降低了计算成本。

我们还尝试在DiT中使用3D patch嵌入器。然而，在时间维度上2倍下采样后，生成的视频质量较低。因此，我们将在下一版本中将下采样留给时间VAE。目前，我们在每3帧采样一次进行16帧训练，以及在每2帧采样一次进行64帧训练。


## 数据是训练高质量模型的核心

我们发现数据的数量和质量对生成视频的质量有很大的影响，甚至比模型架构和训练策略的影响还要大。目前，我们只从[HD-VG-130M](https://github.com/daooshee/HD-VG-130M)准备了第一批分割（366K个视频片段）。这些视频的质量参差不齐，而且字幕也不够准确。因此，我们进一步从提供免费许可视频的[Pexels](https://www.pexels.com/)收集了20k相对高质量的视频。我们使用LLaVA，一个图像字幕模型，通过三个帧和一个设计好的提示来标记视频。有了设计好的提示，LLaVA能够生成高质量的字幕。

![Caption](/assets/readme/report_caption.png)

由于我们更加注重数据质量，我们准备收集更多数据，并在下一版本中构建一个视频预处理流程。

## 训练细节

在有限的训练预算下，我们只进行了一些探索。我们发现学习率1e-4过大，因此将其降低到2e-5。在进行大批量训练时，我们发现`fp16`不如`bf16`稳定，可能会导致生成失败。因此，我们在64x512x512的训练中切换到`bf16`。对于其他超参数，我们遵循了之前的研究工作。

## 损失曲线

16x256x256 预训练损失曲线

![16x256x256 Pretraining Loss Curve](/assets/readme/report_loss_curve_1.png)

16x256x256 高质量训练损失曲线

![16x256x256 HQ Training Loss Curve](/assets/readme/report_loss_curve_2.png)

16x512x512 高质量训练损失曲线

![16x512x512 HQ Training Loss Curve](/assets/readme/report_loss_curve_3.png)


================================================
FILE: Open-Sora/docs/zh_CN/report_v2.md
================================================
# Open-Sora 1.1 技术报告

- [模型架构修改](#模型架构修改)
- [支持不同视频长度/分辨率/宽高比/帧率（fps）训练](#支持不同视频长度分辨率宽高比帧率fps训练)
- [使用Masked DiT作为图生视频/视频生视频模型](#使用masked-dit作为图生视频视频生视频模型)
- [数据收集和流程](#数据收集和流程)
- [训练详情](#训练详情)
- [结果和评价](#结果和评价)
- [不足和下一步计划](#不足和下一步计划)

在Open-Sora1.1版本中，我们使用了10M数据来训练经过结构调优后的STDiT的700M模型（Open-Sora1.0版本仅用400K数据）。我们实现了[Sora报告](https://openai.com/research/video-generation-models-as-world-simulators)中提到的以下功能：

- 可变的视频时长、分辨率、宽高比（包括采样灵活性、改进的取景范围和构图）
- 提示词增加图片和视频选项（使图像动起来、生成式增长视频、视频到视频编辑、连接不同视频）
- 图像生成功能

为了实现这一目标，我们在预训练阶段使用了多任务学习。对于扩散模型来说，用不同的采样时间步长进行训练已经是一种多任务学习。我们将这一思想在图像和视频的条件生成模型上，进一步扩展到多分辨率、宽高比、帧长、fps以及不同的掩码策略。我们在**0~15s、144p到720p、各种宽高比的视频**上训练模型。虽然由于训练FLOPs不足的限制，生成的视频在时间一致性上的表现没有那么高，但我们仍然可以看到这个模型的巨大潜力。

## 模型架构修改

我们对原始ST-DiT模型进行了以下修改，以获得更好的训练稳定性和模型性能（ST-DiT-2）：

- **在时间注意力模块中添加[旋转位置编码](https://arxiv.org/abs/2104.09864)**：遵循目前LLM的最佳实践，我们将时间注意力模块中的正弦位置编码更改为旋转位置编码，因为它也算一项序列预测任务。
- **在时间注意力模块中添加AdaIN和LayerNorm**：与空间注意力的做法相同，我们用AdaIN和LayerNorm包裹时间注意力，以稳定训练。
- **[QK归一化](https://arxiv.org/abs/2302.05442)与[RMSNorm](https://arxiv.org/abs/1910.07467)**：和[SD3](https://arxiv.org/pdf/2403.03206.pdf)类似地，我们应用QK归一化来提高半精度训练的稳定性。
- **支持动态输入大小和视频条件限定**：为了支持多分辨率、宽高比和fps训练，我们让ST-DiT-2可以接受任何输入大小。延伸[PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha)的想法，我们支持限定视频的高度、宽度、宽高比、帧长和fps。
- **将T5token数量从120扩展到200**：我们使用的视频描述通常少于200个token，我们发现模型也可以很好地处理更长的文本。

## 支持不同视频长度/分辨率/宽高比/帧率（fps）训练

正如[Sora报告](https://openai.com/research/video-generation-models-as-world-simulators)中提到的，使用原始无损视频的分辨率、宽高比和视频长度进行训练可以增加采样灵活性，改善取景和构图。我们找到了三种实现这一目标的方法：
- [NaViT](https://arxiv.org/abs/2307.06304)：通过不同掩码策略支持在同一训练批次内使用不同大小的数据，并且训练效率下降很少。然而，该系统实现起来有点复杂，并且可能无法兼容kernel优化技术（如flashattention）。
- 填充（[FiT](https://arxiv.org/abs/2402.12376)，[Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)）：通过填充支持同一批次内的不同大小的数据。然而，将不同的分辨率填充到相同的大小会导致效率降低。
- 分桶训练（[SDXL](https://arxiv.org/abs/2307.01952)、[PixArt](https://arxiv.org/abs/2310.00426)）：支持通过分桶的方式在不同批次中动态调整大小，但在同一批次内数据大小必须相同，只能应用固定数量的数据大小。在一个批次中，我们不需要实现复杂的掩码或填充。

为了更便捷的实现，我们选择分桶训练的方式。我们预先定义了一些固定的分辨率，并将不同的样本分配到不同的桶中。下面列出了分桶方案中值得注意的点。但我们可以看到，这些在我们的实验中并不是一个大问题。

<details>
<summary>查看注意事项</summary>

- 桶大小被限制为固定数量：首先，在实际应用中，通常只使用少数宽高比（9:16、3:4）和分辨率（240p、1080p）。其次，我们发现经过训练的模型可以很好地推广到未见过的分辨率。
- 每批的大小相同，打破了独立同分布（i.i.d.）假设：由于我们使用多个 GPU，因此不同 GPU 上的本地批次具有不同的大小。我们没有发现此问题导致性能显著下降。
- 可能没有足够的样本来填充每个桶，并且分布可能有偏差：首先，当本地批量大小不太大时，我们的数据集足够大以填充每个桶。其次，我们应该分析数据大小的分布并相应地定义桶大小。第三，分配不平衡并没有显著影响训练过程。
- 不同的分辨率和帧长可能有不同的处理速度：与PixArt只处理相似分辨率（相似token数）的宽高比不同，我们需要考虑不同分辨率和帧长的处理速度。我们可以使用“bucket_config”来定义每个桶的批量大小，以确保处理速度相似。

</details>

![bucket](/assets/readme/report_bucket.png)

如图所示，桶是（分辨率，帧数量，宽高比）的三元组。我们为不同的分辨率提供预定义的宽高比，涵盖了大多数常见的视频宽高比。在每个epoch之前，我们打乱数据集并将样本分配到不同的桶中，如图所示。我们将每个样本放入分辨率和帧长度均不超过该视频的桶中最大的那一个。

考虑到我们的计算资源有限，我们进一步为每个（分辨率，num_frame）二元组引入keep_prob和batch_size两个属性，以降低计算成本并实现多阶段训练。具体来说，高清视频将以概率1-keep_prob下采样到较低分辨率的桶中，并且每个桶的样本数量是由batch_size属性决定的。这样，我们可以控制不同桶中的样本数量，并通过为每个桶搜索合适的数据量来平衡GPU负载。

有关训练中桶使用的详细说明，请参阅[配置文件](/docs/config.md#training-bucket-configs).

## 使用Masked DiT作为图生视频/视频生视频模型

Transformer可以很容易地扩展到支持图生图和视频生视频的任务。我们提出了一种蒙版策略来支持图像和视频的调节。蒙版策略如下图所示。

![mask strategy](/assets/readme/report_mask.png)

在将图像或视频转换成另一个视频的过程中，我们通常会选择出需要作为条件的帧并取消其掩码（unmask）。在使用ST-DiT模型进行前向传播时，被选择取消掩码（unmask）的帧将被赋予时间步长0，而其他帧则保持它们原有的时间步长t。我们发现，如果直接将这种策略应用到训练好的模型上，会得到较差的结果，因为扩散模型在训练过程中并未学会如何处理一个样本中具有不同时间步长的帧。

受[UL2](https://arxiv.org/abs/2205.05131)的启发，我们在训练期间引入了随机掩码策略。具体来说，我们在训练期间随机取消掩码帧，包括取消掩码第一帧，前k帧，最后一帧，最后k帧，第一和最后k帧，随机帧等。基于Open-Sora 1.0模型，以50%的概率应用掩码策略，我们发现模型能够在10,000步的训练中学会处理图像条件（而30%的概率会导致处理能力变差），同时文本到视频的性能略有下降。因此，在Open-Sora 1.1版本中，我们从头开始预训练模型，并采用了掩码策略。

下图给出了用于推理的掩码策略配置的说明。五数字元组在定义掩码策略方面提供了极大的灵活性。

![mask strategy config](/assets/readme/report_mask_config.png)

掩码策略用法的详细说明可在[配置文件](/docs/config.md#advanced-inference-config)中查看.


## 数据收集和流程

正如我们在Open-Sora 1.0版本中看见的那样，数据数量和质量对于训练一个好的模型至关重要，因此，我们努力扩展数据集。首先，我们创建了一个遵循[SVD](https://arxiv.org/abs/2311.15127)的自动流水线，包括场景切割、字幕、各种评分和过滤以及数据集管理脚本和通用惯例。

![pipeline](/assets/readme/report_data_pipeline.png)

我们计划使用[panda-70M](https://snap-research.github.io/Panda-70M/)和其他数据来训练模型，大约包含3000万条数据。然而，我们发现磁盘输入输出（disk IO）在同时进行训练和数据处理时成为了一个瓶颈。因此，我们只能准备一个包含1000万条数据的数据集，并且没有完成我们构建的所有处理流程。最终，我们使用了包含970万视频和260万图像的数据集进行预训练，以及560,000视频和160万图像的数据集进行微调。预训练数据集的统计信息如下所示。

图像文本标记 (使用T5分词器)：
![image text tokens](/assets/readme/report_image_textlen.png)

视频文本标记 (使用T5分词器)。我们直接使用Panda的短视频描述进行训练，并自己给其他数据集加视频描述。生成的字幕通常少于200个token。
![video text tokens](/assets/readme/report_video_textlen.png)

视频时长：
![video duration](/assets/readme/report_video_duration.png)

## 训练详情

由于计算资源有限，我们必须仔细监控训练过程，并在推测模型学习不佳时更改训练策略，因为没有消融研究的计算。因此，Open-Sora1.1版本的训练包括多个更改，所以，指数移动平均（EMA）未被应用。

1. 首先，我们从`Pixart-alpha-1024`的模型checkpoint开始，使用不同分辨率的图像进行了6000步的微调。我们发现模型能够很容易地适应并生成不同分辨率的图像。为了加快扩散过程的训练，我们使用了[SpeeDiT](https://github.com/1zeryu/SpeeDiT)（iddpm-speed）技术。
2. **[阶段一]** 然后，我们使用梯度检查点（gradient-checkpointing）技术对模型进行了**24,000**步的预训练，这个过程在64个H800 GPU上运行了**4天**。尽管模型看到的数据样本数量相同，我们发现与使用较小批量大小相比，模型的学习速度较慢。我们推测，在训练的早期阶段，步数的数量对于训练更为重要。大多数视频的分辨率是**240p**，预训练时使用的配置与[stage2.py](/configs/opensora-v1-1/train/stage2.py)相似。
3. **[阶段一]** 为了增加训练步数，我们改用了更小的批量大小，并且没有使用梯度检查点技术。在这个阶段，我们还引入了帧率（fps）条件。模型训练了**40,000**步，持续了**2天**。训练中使用的视频大多数是**144p**分辨率，使用的配置文件是[stage1.py](/configs/opensora-v1-1/train/stage1.py)。我们使用较低的分辨率，因为我们在Open-Sora 1.0版本中发现模型可以以相对较低的分辨率学习时间知识。
4. **[阶段一]** 我们发现模型不能很好地学习长视频，并在Open-Sora1.0训练中发现了一个噪声生成结果，推测是半精度问题。因此，我们采用QK-归一化来稳定训练。我们还将iddpm-speed切换成iddpm。我们训练了**17k**步**14小时**。大多数视频的分辨率是144p，预训练时使用的配置是[stage1.py](/configs/opensora-v1-1/train/stage1.py)。阶段1训练持续约一周，总步长**81k**。
5. **[阶段二]** 我们切换到更高的分辨率，其中大多数视频是**240p和480p**分辨率（[stage2.py](/configs/opensora-v1-1/train/stage2.py)）。我们在所有预训练数据上训练了**22000**步，持续**一天**。
6. **[阶段三]** 我们切换到更高的分辨率，大多数视频的分辨率是**480p和720p**（[stage3.py](/configs/opensora-v1-1/train/stage3.py)）。我们在高质量数据上训了**4000**步，用时**一天**。

## 结果和评价

## 不足和下一步计划

随着我们离Sora的复现又近了一步，我们发现当前模型存在许多不足，这些不足将在我们下阶段工作中得到改善。

- **噪音的生成和影响**：我们发现生成的模型，特别是长视频中，有时很多噪点，不流畅。我们认为问题在于没有使用时间VAE。由于[Pixart-Sigma](https://arxiv.org/abs/2403.04692)发现适应新VAE很容易，我们计划在下一个版本中为模型开发时间VAE。
- **缺乏时间一致性**：我们发现模型无法生成具有高时间一致性的视频，我们认为问题是由于缺乏训练FLOPs，我们计划收集更多数据并继续训练模型以提高时间一致性。
- **人像生成质量低**：我们发现模型无法生成高质量的人类视频，我们认为问题是由于缺乏人类数据，我们计划收集更多的人类数据，并继续训练模型以提高人类生成。
- **美学得分低**：我们发现模型的美学得分不高。问题在于缺少美学得分过滤，由于IO瓶颈，我们没有进行这一步骤。我们计划通过美学得分和微调模型来过滤数据，以提高美学得分。
- **长视频生成质量低**：我们发现，使用同样的提示词，视频越长，质量越差。这意味着图像质量不能同等地被不同长度的序列所适应。

> - **算法与加速实现**：Zangwei Zheng, Xiangyu Peng, Shenggui Li, Hongxing Liu, Yukun Zhou
> - **数据收集与处理**：Xiangyu Peng, Zangwei Zheng, Chenhui Shen, Tom Young, Junjie Wang, Chenfeng Yu


================================================
FILE: Open-Sora/docs/zh_CN/report_v3.md
================================================
# Open-Sora 1.2 报告

- [视频压缩网络](#视频压缩网络)
- [整流流和模型适应](#整流流和模型适应)
- [更多数据和更好的多阶段训练](#更多数据和更好的多阶段训练)
- [简单有效的模型调节](#简单有效的模型调节)
- [评估](#评估)

在 Open-Sora 1.2 版本中，我们在 >30M 数据上训练了 一个1.1B 的模型，支持 0s~16s、144p 到 720p、各种宽高比的视频生成。我们的配置如下所列。继 1.1 版本之后，Open-Sora 1.2 还可以进行图像到视频的生成和视频扩展。

|      | 图像 | 2秒  | 4秒  | 8秒  | 16秒 |
| ---- | ----- | --- | --- | --- | --- |
| 240p | ✅     | ✅   | ✅   | ✅   | ✅   |
| 360p | ✅     | ✅   | ✅   | ✅   | ✅   |
| 480p | ✅     | ✅   | ✅   | ✅   | 🆗   |
| 720p | ✅     | ✅   | ✅   | 🆗   | 🆗   |

这里✅表示在训练期间可以看到数据，🆗表示虽然没有经过训练，但模型可以在该配置下进行推理。🆗的推理需要多个80G内存的GPU和序列并行。

除了 Open-Sora 1.1 中引入的功能外，Open-Sora 1.2 还有以下重磅更新：

- 视频压缩网络
- 整流流训练
- 更多数据和更好的多阶段训练
- 简单有效的模型调节
- 更好的评估指标

上述改进的所有实现（包括训练和推理）均可在 Open-Sora 1.2 版本中使用。以下部分将介绍改进的细节。我们还改进了代码库和文档，使其更易于使用。

## 视频压缩网络

对于 Open-Sora 1.0 & 1.1，我们使用了 stability-ai 的 83M 2D VAE，它仅在空间维度上压缩，将视频压缩 8x8 倍。为了减少时间维度，我们每三帧提取一帧。然而，这种方法导致生成的视频流畅度较低，因为牺牲了生成的帧率（fps）。因此，在这个版本中，我们引入了像 OpenAI 的 Sora 一样的视频压缩网络。该网络在时域上将视频大小压缩至四分之一，因此，我们不必再额外抽帧，而可以以原有帧率生成视频。

考虑到训练 3D VAE 的计算成本很高，我们希望重新利用在 2D VAE 中学到的知识。我们注意到，经过 2D VAE 压缩后，时间维度上相邻的特征仍然高度相关。因此，我们提出了一个简单的视频压缩网络，首先将视频在空间维度上压缩 8x8 倍，然后将视频在时间维度上压缩 4 倍。网络如下所示：

![video_compression_network](/assets/readme/report_3d_vae.png)

我们用[SDXL 的 VAE](https://huggingface.co/stabilityai/sdxl-vae)初始化 2D VAE ，它比我们以前使用的更好。对于 3D VAE，我们采用[Magvit-v2](https://magvit.cs.cmu.edu/v2/)中的 VAE 结构，它包含 300M 个参数。加上 83M 的 2D VAE，视频压缩网络的总参数为 384M。我们设定batch size 为 1， 对 3D VAE 进行了 1.2M 步的训练。训练数据是来自 Pexels 和 Pixabay 的视频，训练视频大小主要是 17 帧，256x256 分辨率。3D VAE 中使用因果卷积（causal convolutions）使图像重建更加准确。

我们的训练包括三个阶段：

1. 对于前 380k 步，我们冻结 2D VAE并在 8 个 GPU 上进行训练。训练目标包括重建 2D VAE 的压缩特征（图中粉红色），并添加损失以使 3D VAE 的特征与 2D VAE 的特征相似（粉红色和绿色，称为identity loss）。我们发现后者的损失可以快速使整个 VAE 在图像上取得良好的性能，并在下一阶段更快地收敛。
2. 对于接下来的 260k 步，我们消除identity loss并仅学习 3D VAE。
3. 对于最后 540k 步，由于我们发现仅重建 2D VAE 的特征无法带来进一步的改进，因此我们移除了loss并训练整个 VAE 来重建原始视频。此阶段在 24 个 GPU 上进行训练。

对于训练的前半部分，我们采用 20% 的图像和 80% 的视频。按照[Magvit-v2](https://magvit.cs.cmu.edu/v2/)，我们使用 17 帧训练视频，同时对图像的前 16 帧进行零填充。然而，我们发现这种设置会导致长度不同于 17 帧的视频变得模糊。因此，在第 3 阶段，我们使用不超过34帧长度的任意帧长度视频进行混合视频长度训练,以使我们的 VAE 对不同视频长度更具鲁棒性（也就是说，如果我们希望训练含有n帧的视频，我们就把原视频中`34-n`帧用0进行填充）。我们的 [训练](/scripts/train_vae.py)和[推理](/scripts/inference_vae.py)代码可在 Open-Sora 1.2 版本中找到。

当使用 VAE 进行扩散模型时，我们的堆叠 VAE 所需的内存较少，因为我们的 VAE 的输入已经经过压缩。我们还将输入视频拆分为几个 17 帧剪辑，以提高推理效率。我们的 VAE 与[Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/docs/Report-v1.1.0.md)中的另一个开源 3D VAE 性能相当。

| 模型          | 结构相似性↑ | 峰值信噪比↑  |
| ------------------ | ----- | ------ |
| Open-Sora-Plan 1.1 | 0.882 | 29.890 |
| Open-Sora 1.2      | 0.880 | 30.590 |

## 整流流和模型适应

最新的扩散模型 Stable Diffusion 3 为了获得更好的性能，采用了[rectified flow](https://github.com/gnobitab/RectifiedFlow)替代了 DDPM。可惜 SD3 的 rectified flow 训练代码没有开源。不过 Open-Sora 1.2 提供了遵循 SD3 论文的训练代码，包括：

- 基本整流流训练
- 用于训练加速的 Logit-norm 采样
- 分辨率和视频长度感知时间步长采样

对于分辨率感知的时间步长采样，我们应该对分辨率较大的图像使用更多的噪声。我们将这个想法扩展到视频生成，对长度较长的视频使用更多的噪声。

Open-Sora 1.2 从[PixArt-Σ 2K](https://github.com/PixArt-alpha/PixArt-sigma) 模型checkpoint开始。请注意，此模型使用 DDPM 和 SDXL VAE 进行训练，分辨率也高得多。我们发现在小数据集上进行微调可以轻松地使模型适应我们的视频生成设置。适应过程如下，所有训练都在 8 个 GPU 上完成：

1. 多分辨率图像生成能力：我们训练模型以 20k 步生成从 144p 到 2K 的不同分辨率。
2. QK-norm：我们将 QK-norm 添加到模型中并训练 18k 步。
3. 整流流：我们从离散时间 DDPM 转变为连续时间整流流并训练 10k 步。
4. 使用 logit-norm 采样和分辨率感知时间步采样的整流流：我们训练 33k 步。
5. 较小的 AdamW epsilon：按照 SD3，使用 QK-norm，我们可以对 AdamW 使用较小的 epsilon（1e-15），我们训练 8k 步。
6. 新的 VAE 和 fps 调节：我们用自己的 VAE 替换原来的 VAE，并将 fps 调节添加到时间步调节中，我们训练 25k 步。请注意，对每个通道进行规范化对于整流流训练非常重要。
7. 时间注意力模块：我们添加时间注意力模块，并将其投影层初始化为零。我们在图像上进行 3k 步训练。
8. 仅针对具有掩码策略的视频的时间块：我们仅在视频上训练时间注意力块，步长为 38k。

经过上述调整后，我们就可以开始在视频上训练模型了。上述调整保留了原始模型生成高质量图像的能力，并为后续的视频生成提供了许多助力：

- 通过整流，我们可以加速训练，将视频的采样步数从100步减少到30步，大大减少了推理的等待时间。
- 使用 qk-norm，训练更加稳定，并且可以使用积极的优化器。
- 采用新的VAE，时间维度压缩了4倍，使得训练更加高效。
- 该模型具有多分辨率图像生成能力，可以生成不同分辨率的视频。

## 更多数据和更好的多阶段训练

由于计算预算有限，我们精心安排了训练数据的质量从低到高，并将训练分为三个阶段。我们的训练涉及 12x8 GPU，总训练时间约为 2 周， 约70k步。

### 第一阶段

我们首先在 Webvid-10M 数据集（40k 小时）上训练模型，共 30k 步（2 个 epoch）。由于视频分辨率均低于 360p 且包含水印，因此我们首先在此数据集上进行训练。训练主要在 240p 和 360p 上进行，视频长度为 2s~16s。我们使用数据集中的原始字幕进行训练。训练配置位于[stage1.py](/configs/opensora-v1-2/train/stage1.py)中。

### 第二阶段

然后我们在 Panda-70M 数据集上训练模型。这个数据集很大，但质量参差不齐。我们使用官方的 30M 子集，其中的片段更加多样化，并过滤掉美学评分低于 4.5 的视频。这产生了一个 20M 子集，包含 41k 小时。数据集中的字幕直接用于我们的训练。训练配置位于[stage2.py](/configs/opensora-v1-2/train/stage2.py)中。

训练主要在 360p 和 480p 上进行。我们训练模型 23k 步，即 0.5 个 epoch。训练尚未完成，因为我们希望我们的新模型能早日与大家见面。

### 第三阶段

在此阶段，我们从各种来源收集了 200 万个视频片段，总时长 5000 小时，其中包括：

- 来自 Pexels、Pixabay、Mixkit 等的免费授权视频。
- [MiraData](https://github.com/mira-space/MiraData)：一个包含长视频的高质量数据集，主要来自游戏和城市/风景探索。
- [Vript](https://github.com/mutonix/Vript/tree/main)：一个密集注释的数据集。
- 还有一些其他数据集。

MiraData 和 Vript 有来自 GPT 的字幕，而我们使用[PLLaVA](https://github.com/magic-research/PLLaVA)为其余视频添加字幕。与只能进行单帧/图像字幕的 LLaVA 相比，PLLaVA 是专门为视频字幕设计和训练的。[加速版PLLaVA](/tools/caption/README.md#pllava-captioning)已在我们的`tools/`中发布。在实践中，我们使用预训练的 PLLaVA 13B 模型，并从每个视频中选择 4 帧生成字幕，空间池化形状为 2*2。

下面显示了此阶段使用的视频数据的一些统计数据。我们提供了持续时间和分辨率的基本统计数据，以及美学分数和光流分数分布。我们还从视频字幕中提取了对象和动作的标签并计算了它们的频率。
![stats](/assets/readme/report-03_video_stats.png)
![object_count](/assets/readme/report-03_objects_count.png)
![object_count](/assets/readme/report-03_actions_count.png)

此阶段我们主要在 720p 和 1080p 上进行训练，以提高模型在高清视频上的表现力。在训练中，我们使用的掩码率为25%。训练配置位于[stage3.py](/configs/opensora-v1-2/train/stage3.py)中。我们对模型进行 15k 步训练，大约为 2 个 epoch。

## 简单有效的模型调节

对于第 3 阶段，我们计算每个视频片段的美学分数和运动分数。但是，由于视频片段数量较少，我们不愿意过滤掉得分较低的片段，这会导致数据集较小。相反，我们将分数附加到字幕中并将其用作条件。我们发现这种方法可以让模型了解分数并遵循分数来生成质量更好的视频。

例如，一段美学评分为 5.5、运动评分为 10 且检测到摄像头运动向左平移的视频，其字幕将为：

```plaintext
[Original Caption] aesthetic score: 5.5, motion score: 10, camera motion: pan left.
```

在推理过程中，我们还可以使用分数来调节模型。对于摄像机运动，我们仅标记了 13k 个具有高置信度的剪辑，并且摄像机运动检测模块已在我们的工具中发布。

## 评估

之前，我们仅通过人工评估来监控训练过程，因为 DDPM 训练损失与生成的视频质量没有很好的相关性。但是，对于整流流，如 SD3 中所述，我们发现训练损失与生成的视频质量有很好的相关性。因此，我们跟踪了 100 张图像和 1k 个视频的整流流评估损失。

我们从 pixabay 中抽样了 1k 个视频作为验证数据集。我们计算了不同分辨率（144p、240p、360p、480p、720p）下图像和不同长度的视频（2s、4s、8s、16s）的评估损失。对于每个设置，我们等距采样 10 个时间步长。然后对所有损失取平均值。

![Evaluation Loss](/assets/readme/report_val_loss.png)
![Video Evaluation Loss](/assets/readme/report_vid_val_loss.png)

此外，我们还会在训练过程中跟踪[VBench](https://vchitect.github.io/VBench-project/)得分。VBench 是用于短视频生成的自动视频评估基准。我们用 240p 2s 视频计算 vbench 得分。这两个指标验证了我们的模型在训练过程中持续改进。

![VBench](/assets/readme/report_vbench_score.png)

所有评估代码均发布在`eval`文件夹中。查看[评估指南](/eval/README.md)了解更多详细信息。

|模型        | 总得分 | 质量得分 | 语义分数 |
| -------------- | ----------- | ------------- | -------------- |
| Open-Sora V1.0 | 75.91%      | 78.81%        | 64.28%         |
| Open-Sora V1.2 | 79.23%      | 80.71%        | 73.30%         |

## 序列并行

我们使用序列并行来支持长序列训练和推理。我们的实现基于Ulysses，工作流程如下所示。启用序列并行后，我们只需要将 `all-to-all` 通信应用于STDiT中的空间模块（spatial block），因为在序列维度上，只有对空间信息的计算是相互依赖的。

![SP](/assets/readme/sequence_parallelism.jpeg)

目前，由于训练数据分辨率较小，我们尚未使用序列并行进行训练，我们计划在下一个版本中使用。至于推理，我们可以使用序列并行，以防您的 GPU 内存不足。下表显示，序列并行可以实现加速：

| 分辨率 | 时长 | GPU数量 | 是否启用序列并行 |用时（秒） | 加速效果/GPU |
| ---------- | ------- | -------------- | --------- | ------------ | --------------- |
| 720p       | 16秒     | 1              | 否        | 547.97       | -               |
| 720p       | 16秒     | 2              | 是        | 244.38       | 12%             |


================================================
FILE: Open-Sora/docs/zh_CN/structure.md
================================================
# 代码仓库和配置文件结构

## 代码仓库结构

```plaintext
Open-Sora
├── README.md
├── docs
│   ├── acceleration.md            -> Acceleration & Speed benchmark
│   ├── command.md                 -> Commands for training & inference
│   ├── datasets.md                -> Datasets used in this project
│   ├── structure.md               -> This file
│   └── report_v1.md               -> Report for Open-Sora v1
├── scripts
│   ├── train.py                   -> diffusion training script
│   └── inference.py               -> diffusion inference script
├── configs                        -> Configs for training & inference
├── opensora
│   ├── __init__.py
│   ├── registry.py                -> Registry helper
│   ├── acceleration               -> Acceleration related code
│   ├── dataset                    -> Dataset related code
│   ├── models
│   │   ├── layers                 -> Common layers
│   │   ├── vae                    -> VAE as image encoder
│   │   ├── text_encoder           -> Text encoder
│   │   │   ├── classes.py         -> Class id encoder (inference only)
│   │   │   ├── clip.py            -> CLIP encoder
│   │   │   └── t5.py              -> T5 encoder
│   │   ├── dit
│   │   ├── latte
│   │   ├── pixart
│   │   └── stdit                  -> Our STDiT related code
│   ├── schedulers                 -> Diffusion schedulers
│   │   ├── iddpm                  -> IDDPM for training and inference
│   │   └── dpms                   -> DPM-Solver for fast inference
│   └── utils
└── tools                          -> Tools for data processing and more
```

## 配置文件结构


我们的配置文件遵循[MMEngine](https://github.com/open-mmlab/mmengine)。 MMEngine 将读取配置文件（“.py”文件）并将其解析为类似字典的对象。

```plaintext
Open-Sora
└── configs                        -> Configs for training & inference
    ├── opensora                   -> STDiT related configs
    │   ├── inference
    │   │   ├── 16x256x256.py      -> Sample videos 16 frames 256x256
    │   │   ├── 16x512x512.py      -> Sample videos 16 frames 512x512
    │   │   └── 64x512x512.py      -> Sample videos 64 frames 512x512
    │   └── train
    │       ├── 16x256x256.py      -> Train on videos 16 frames 256x256
    │       ├── 16x512x512.py      -> Train on videos 16 frames 512x512
    │       └── 64x512x512.py      -> Train on videos 64 frames 512x512
    ├── dit                        -> DiT related configs
    │   ├── inference
    │   │   ├── 1x256x256-class.py -> Sample images with ckpts from DiT
    │   │   ├── 1x256x256.py       -> Sample images with clip condition
    │   │   └── 16x256x256.py      -> Sample videos
    │   └── train
    │       ├── 1x256x256.py       -> Train on images with clip condition
    │       └── 16x256x256.py      -> Train on videos
    ├── latte                      -> Latte related configs
    └── pixart                     -> PixArt related configs
```

## 推理配置演示

要更改推理设置，可以直接修改相应的配置文件。或者您可以传递参数来覆盖配置文件（[config_utils.py](/opensora/utils/config_utils.py)）。要更改采样提示，您应该修改传递给“--prompt_path”参数的“.txt”文件。

```plaintext
--prompt_path ./assets/texts/t2v_samples.txt  -> prompt_path
--ckpt-path ./path/to/your/ckpt.pth           -> model["from_pretrained"]
```

下面提供了每个字段的解释。

```python
# Define sampling size
num_frames = 64               # number of frames
fps = 24 // 2                 # frames per second (divided by 2 for frame_interval=2)
image_size = (512, 512)       # image size (height, width)

# Define model
model = dict(
    type="STDiT-XL/2",        # Select model type (STDiT-XL/2, DiT-XL/2, etc.)
    space_scale=1.0,          # (Optional) Space positional encoding scale (new height / old height)
    time_scale=2 / 3,         # (Optional) Time positional encoding scale (new frame_interval / old frame_interval)
    enable_flash_attn=True,    # (Optional) Speed up training and inference with flash attention
    enable_layernorm_kernel=True, # (Optional) Speed up training and inference with fused kernel
    from_pretrained="PRETRAINED_MODEL",  # (Optional) Load from pretrained model
    no_temporal_pos_emb=True,  # (Optional) Disable temporal positional encoding (for image)
)
vae = dict(
    type="VideoAutoencoderKL", # Select VAE type
    from_pretrained="stabilityai/sd-vae-ft-ema", # Load from pretrained VAE
    micro_batch_size=128,      # VAE with micro batch size to save memory
)
text_encoder = dict(
    type="t5",                 # Select text encoder type (t5, clip)
    from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl", # Load from pretrained text encoder
    model_max_length=120,      # Maximum length of input text
)
scheduler = dict(
    type="iddpm",              # Select scheduler type (iddpm, dpm-solver)
    num_sampling_steps=100,    # Number of sampling steps
    cfg_scale=7.0,             # hyper-parameter for classifier-free diffusion
)
dtype = "fp16"                 # Computation type (fp16, fp32, bf16)

# Other settings
batch_size = 1                 # batch size
seed = 42                      # random seed
prompt_path = "./assets/texts/t2v_samples.txt"  # path to prompt file
save_dir = "./samples"         # path to save samples
```

## 训练配置演示

```python
# Define sampling size
num_frames = 64
frame_interval = 2             # sample every 2 frames
image_size = (512, 512)

# Define dataset
root = None                    # root path to the dataset
data_path = "CSV_PATH"         # path to the csv file
use_image_transform = False    # True if training on images
num_workers = 4                # number of workers for dataloader

# Define acceleration
dtype = "bf16"                 # Computation type (fp16, bf16)
grad_checkpoint = True         # Use gradient checkpointing
plugin = "zero2"               # Plugin for distributed training (zero2, zero2-seq)
sp_size = 1                    # Sequence parallelism size (1 for no sequence parallelism)

# Define model
model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=2 / 3,
    from_pretrained="YOUR_PRETRAINED_MODEL",
    enable_flash_attn=True,        # Enable flash attention
    enable_layernorm_kernel=True, # Enable layernorm kernel
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    micro_batch_size=128,
)
text_encoder = dict(
    type="t5",
    from_pretrained="/root/autodl-tmp/pretrained_models/DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
    shardformer=True,           # Enable shardformer for T5 acceleration
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",      # Default 1000 timesteps
)

# Others
seed = 42
outputs = "outputs"             # path to save checkpoints
wandb = False                   # Use wandb for logging

epochs = 1000                   # number of epochs (just large enough, kill when satisfied)
log_every = 10
ckpt_every = 250
load = None                     # path to resume training

batch_size = 4
lr = 2e-5
grad_clip = 1.0                 # gradient clipping
```


================================================
FILE: Open-Sora/docs/zh_CN/vae.md
================================================
# VAE 技术报告

由于 [Pixart-Sigma](https://arxiv.org/abs/2403.04692) 论文中指出适应新的VAE很简单，因此我们开发了一个额外的时间VAE。
具体而言, 我们的VAE由一个[空间 VAE](https://huggingface.co/PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers)和一个时间VAE级联组成.
对于时间VAE，我们遵循 [MAGVIT-v2](https://arxiv.org/abs/2310.05737)的实现, 并做了以下修改:

* 我们删除了码本特有的架构。
* 我们不使用鉴别器（discriminator），而是使用VAE重建损失、kl损失和感知损失进行训练。
* 在编码器的最后一个线性层中，我们缩小到 4 通道的对角高斯分布，遵循我们之前训练的接受 4 通道输入的 STDiT。
* 我们的解码器与编码器架构对称。

## 训练
我们分不同阶段训练模型。

我们首先通过在单台机器（8 个 GPU）上冻结空间 VAE 380k 步来训练时间 VAE。我们使用额外的身份损失使 3D VAE 的特征与 2D VAE 的特征相似。我们使用 20% 的图像和 80% 的视频（17 帧）来训练 VAE。

```bash
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH
```

接下来，我们移除身份损失，并对 3D VAE 管道训练 260k 步，以重建 2D 压缩后的视频。

```bash
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH
```

最后，我们移除了 2D 压缩视频的重建损失，并对整个 VAE 管道训练 540k 步，以重建原始 3D 视频。我们使用 34 帧以内的随机帧数来训练 VAE，使其对不同长度的视频更具鲁棒性。此阶段在 24 个 GPU 上进行训练。

```bash
torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH
```

请注意，您需要根据自己的 csv 数据大小相应地调整配置文件中的 `epochs` 。

## 推理

为了直观地检查 VAE 的性能，您可以运行以下推理。它使用 `_ori` 后缀（即 `"YOUR_VIDEO_DIR"_ori`）将原始视频保存到您指定的视频目录中，使用`_rec`后缀（即`"YOUR_VIDEO_DIR"_rec`）将来自完整管道的重建视频保存到指定的视频目录中，并使用 `_spatial`后缀（即`"YOUR_VIDEO_DIR"_spatial`）将来自 2D 压缩和解压缩的重建视频保存到指定的视频目录中。

```bash
torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR
```
## 评估
然后，我们可以计算 VAE 在 SSIM、PSNR、LPIPS 和 FLOLPIPS 指标上的表现得分。

* SSIM: 结构相似性指数度量，越高越好
* PSNR: 峰值信噪比，越高越好
* LPIPS: 学习感知图像块相似度（Learned Perceptual Image Patch Similarity），越低越好
* [FloLPIPS](https://arxiv.org/pdf/2207.08119): 带有视频插值的LPIPS，越低越好。

```bash
python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
```

## 致谢
我们非常感谢以下工作：
* [MAGVIT-v2](https://arxiv.org/abs/2310.05737): Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation
* [Taming Transformers](https://github.com/CompVis/taming-transformers): Taming Transformers for High-Resolution Image Synthesis
* [3D blur pooling](https://github.com/adobe/antialiased-cnns/pull/39/commits/3d6f02b6943c58b68c19c07bc26fad57492ff3bc)
* [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)


================================================
FILE: Open-Sora/environment-opensora.yml
================================================
name: opensora
channels:
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - ca-certificates=2024.7.2=h06a4308_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.4.4=h6a678d5_1
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - ncurses=6.4=h6a678d5_0
  - openssl=3.0.15=h5eee18b_0
  - pip=24.2=py39h06a4308_0
  - python=3.9.19=h955ad1f_1
  - readline=8.2=h5eee18b_0
  - setuptools=72.1.0=py39h06a4308_0
  - sqlite=3.45.3=h5eee18b_0
  - tk=8.6.14=h39e8969_0
  - wheel=0.43.0=py39h06a4308_0
  - xz=5.4.6=h5eee18b_1
  - zlib=1.2.13=h5eee18b_1
  - pip:
      - absl-py==2.1.0
      - accelerate==0.29.2
      - addict==2.4.0
      - aiofiles==23.2.1
      - aiosignal==1.3.1
      - altair==5.4.1
      - annotated-types==0.7.0
      - antlr4-python3-runtime==4.9.3
      - anyio==4.4.0
      - apex==0.1
      - asttokens==2.4.1
      - attrs==24.2.0
      - av==13.0.0
      - bcrypt==4.2.0
      - beartype==0.18.5
      - beautifulsoup4==4.12.3
      - bitsandbytes==0.43.3
      - black==24.8.0
      - boto3==1.35.20
      - botocore==1.35.20
      - calflops==0.3.2
      - certifi==2024.8.30
      - cffi==1.17.1
      - cfgv==3.4.0
      - charset-normalizer==3.3.2
      - click==8.1.7
      - cloudpickle==3.0.0
      - colossalai==0.4.0
      - comm==0.2.2
      - contexttimer==0.3.3
      - contourpy==1.3.0
      - cryptography==43.0.1
      - cycler==0.12.1
      - cython==3.0.11
      - debugpy==1.8.5
      - decorator==5.1.1
      - decord==0.6.0
      - deprecated==1.2.14
      - detectron2==0.6
      - diffusers==0.27.2
      - dill==0.3.8
      - distlib==0.3.8
      - distro==1.9.0
      - docker-pycreds==0.4.0
      - easydict==1.13
      - einops==0.8.0
      - exceptiongroup==1.2.2
      - executing==2.1.0
      - fabric==3.2.2
      - facexlib==0.3.0
      - fairscale==0.4.13
      - fastapi==0.114.0
      - ffmpy==0.4.0
      - filelock==3.16.0
      - filterpy==1.4.5
      - flash-attn==2.6.3
      - fonttools==4.53.1
      - frozenlist==1.4.1
      - fsspec==2024.9.0
      - ftfy==6.2.3
      - future==1.0.0
      - fvcore==0.1.5.post20221221
      - galore-torch==1.0
      - gitdb==4.0.11
      - gitpython==3.1.43
      - google==3.0.0
      - gradio==4.26.0
      - gradio-client==0.15.1
      - grpcio==1.66.1
      - h11==0.14.0
      - httpcore==1.0.5
      - httpx==0.27.2
      - huggingface-hub==0.24.6
      - hydra-core==1.3.2
      - identify==2.6.0
      - idna==3.8
      - imageio==2.35.1
      - imgaug==0.4.0
      - importlib-metadata==8.4.0
      - importlib-resources==6.4.5
      - invoke==2.2.0
      - iopath==0.1.9
      - ipykernel==6.29.5
      - ipython==8.18.1
      - ipywidgets==8.1.5
      - jedi==0.19.1
      - jinja2==3.1.4
      - jiter==0.5.0
      - jmespath==1.0.1
      - joblib==1.4.2
      - jsonschema==4.23.0
      - jsonschema-specifications==2023.12.1
      - jupyter-client==8.6.2
      - jupyter-core==5.7.2
      - jupyterlab-widgets==3.0.13
      - kiwisolver==1.4.7
      - lazy-loader==0.4
      - llvmlite==0.43.0
      - lmdb==1.5.1
      - lpips==0.1.4
      - lvis==0.5.3
      - markdown==3.7
      - markdown-it-py==3.0.0
      - markupsafe==2.1.5
      - matplotlib==3.9.2
      - matplotlib-inline==0.1.7
      - mdurl==0.1.2
      - mmengine==0.10.4
      - mpmath==1.3.0
      - msgpack==1.1.0
      - mypy-extensions==1.0.0
      - narwhals==1.8.1
      - nest-asyncio==1.6.0
      - networkx==3.2.1
      - ninja==1.11.1.1
      - nodeenv==1.9.1
      - numba==0.60.0
      - numpy==1.26.4
      - nvidia-cublas-cu12==12.1.3.1
      - nvidia-cuda-cupti-cu12==12.1.105
      - nvidia-cuda-nvrtc-cu12==12.1.105
      - nvidia-cuda-runtime-cu12==12.1.105
      - nvidia-cudnn-cu12==8.9.2.26
      - nvidia-cufft-cu12==11.0.2.54
      - nvidia-curand-cu12==10.3.2.106
      - nvidia-cusolver-cu12==11.4.5.107
      - nvidia-cusparse-cu12==12.1.0.106
      - nvidia-nccl-cu12==2.19.3
      - nvidia-nvjitlink-cu12==12.6.68
      - nvidia-nvtx-cu12==12.1.105
      - omegaconf==2.3.0
      - openai==1.44.1
      - openai-clip==1.0.1
      - opencv-python==4.10.0.84
      - opensora==1.2.0
      - orjson==3.10.7
      - packaging==24.1
      - pandarallel==1.6.5
      - pandas==2.2.2
      - parameterized==0.9.0
      - paramiko==3.4.1
      - parso==0.8.4
      - pathspec==0.12.1
      - peft==0.12.0
      - pexpect==4.9.0
      - pillow==10.4.0
      - platformdirs==4.3.2
      - plumbum==1.8.3
      - portalocker==2.10.1
      - pre-commit==3.8.0
      - prompt-toolkit==3.0.47
      - protobuf==5.28.0
      - psutil==5.9.8
      - ptyprocess==0.7.0
      - pure-eval==0.2.3
      - pyarrow==17.0.0
      - pycocotools==2.0.8
      - pycparser==2.22
      - pydantic==2.9.1
      - pydantic-core==2.23.3
      - pydub==0.25.1
      - pygments==2.18.0
      - pyiqa==0.1.10
      - pynacl==1.5.0
      - pyparsing==3.1.4
      - python-dateutil==2.9.0.post0
      - python-multipart==0.0.9
      - pytorchvideo==0.1.5
      - pytz==2024.1
      - pyyaml==6.0.2
      - pyzmq==26.2.0
      - ray==2.35.0
      - referencing==0.35.1
      - regex==2024.7.24
      - requests==2.32.3
      - rich==13.8.1
      - rotary-embedding-torch==0.5.3
      - rpds-py==0.20.0
      - rpyc==6.0.0
      - ruff==0.6.4
      - s3transfer==0.10.2
      - safetensors==0.4.5
      - scikit-image==0.24.0
      - scikit-learn==1.5.2
      - scipy==1.13.1
      - semantic-version==2.10.0
      - sentencepiece==0.2.0
      - sentry-sdk==2.14.0
      - setproctitle==1.3.3
      - shapely==2.0.6
      - shellingham==1.5.4
      - six==1.16.0
      - smmap==5.0.1
      - sniffio==1.3.1
      - soupsieve==2.6
      - spaces==0.30.2
      - stack-data==0.6.3
      - starlette==0.38.5
      - sympy==1.13.2
      - tabulate==0.9.0
      - tensorboard==2.17.1
      - tensorboard-data-server==0.7.2
      - termcolor==2.4.0
      - threadpoolctl==3.5.0
      - tifffile==2024.8.30
      - timm==0.9.16
      - tokenizers==0.15.2
      - tomli==2.0.1
      - tomlkit==0.12.0
      - torch==2.2.2
      - torchvision==0.17.2
      - tornado==6.4.1
      - tqdm==4.66.5
      - traitlets==5.14.3
      - transformers==4.39.3
      - triton==2.2.0
      - typer==0.12.5
      - typing-extensions==4.12.2
      - tzdata==2024.1
      - urllib3==1.26.20
      - uvicorn==0.29.0
      - virtualenv==20.26.4
      - wandb==0.17.9
      - wcwidth==0.2.13
      - websockets==11.0.3
      - werkzeug==3.0.4
      - widgetsnbextension==4.0.13
      - wrapt==1.16.0
      - xformers==0.0.25.post1
      - yacs==0.1.8
      - yapf==0.40.2
      - zipp==3.20.1
prefix: /root/miniconda3/envs/opensora


================================================
FILE: Open-Sora/eval/README.md
================================================
# Evaluation

## Human evaluation

To conduct human evaluation, we need to generate various samples. We provide many prompts in `assets/texts`, and define some test settings covering different resolutions, durations and aspect ratios in `eval/sample.sh`. To facilitate the usage of multiple GPUs, we split sampling tasks into several parts.

```bash
# image (1)
bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -1
# video (2a 2b 2c ...)
bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -2a
# launch 8 jobs at once (you must read the script to understand the details)
bash eval/human_eval/launch.sh /path/to/ckpt num_frames model_name_for_log
```

## Rectified Flow Loss

Evaluate the rectified flow loss with the following commands.

```bash
# image
torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py --data-path /path/to/img.csv --ckpt-path /path/to/ckpt

# video
torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py --data-path /path/to/vid.csv --ckpt-path /path/to/ckpt

# select resolution
torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py --data-path /path/to/vid.csv --ckpt-path /path/to/ckpt --resolution 720p
```

To launch multiple jobs at once, use the following script.

```bash
bash eval/loss/launch.sh /path/to/ckpt model_name
```

To obtain an organized list of scores:
```bash
python eval/loss/tabulate_rl_loss.py --log_dir path/to/log/dir
```

## VBench

[VBench](https://github.com/Vchitect/VBench) is a benchmark for short text to video generation. We provide a script for easily generating samples required by VBench.

First, generate the relevant videos with the following commands:

```bash
# vbench task; to evaluate all prompts, set start_index to 0 and end_index to 2000
bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log  -4 start_index end_index

# Alternatively, launch 8 jobs at once (you must read the script to understand the details)
bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name

# in addition, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine
bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name res_value aspect_ratio_value steps_value flow_value llm_refine_value
# for example
# bash eval/vbench/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True
```

After generation, install the VBench package following the "Evaluation Dependencies" section of our [installation guide](../docs/installation.md). Then, run the following commands to evaluate the generated samples.

<!-- ```bash
bash eval/vbench/vbench.sh /path/to/video_folder /path/to/model/ckpt
``` -->

```bash
python eval/vbench/calc_vbench.py /path/to/video_folder /path/to/model/ckpt
```

Finally, we obtain the scaled scores for the model by:
```bash
python eval/vbench/tabulate_vbench_scores.py --score_dir path/to/score/dir
```

## VBench-i2v

[VBench-i2v](https://github.com/Vchitect/VBench/tree/master/vbench2_beta_i2v) is a benchmark for short image to video generation (beta version).
Similarly, install the VBench package following the "Evaluation Dependencies" section of our [installation guide](../docs/installation.md).

```bash
# Step 1: generate the relevant videos
# vbench i2v tasks; to evaluate all prompts, set start_index to 0 and end_index to 2000
bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -5 start_index end_index
# Alternatively, launch 8 jobs at once
bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name

# Step 2: run vbench to evaluate the generated samples
python eval/vbench_i2v/vbench_i2v.py /path/to/video_folder /path/to/model/ckpt
# Note that you need to go to `VBench/vbench2_beta_i2v/utils.py` and change the hard-coded variable `image_root` in the `load_i2v_dimension_info` function to your corresponding image folder.

# Step 3: obtain the scaled scores
python eval/vbench_i2v/tabulate_vbench_i2v_scores.py path/to/videos/folder path/to/your/model/ckpt
# this will store the results under `eval/vbench_i2v` in the path/to/your/model/ckpt

```

As with VBench, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine:

```bash
bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name_for_log res_value aspect_ratio_value steps_value flow_value llm_refine_value
# for example
# bash eval/vbench_i2v/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True
# if no flow control, use "None" instead
```

## VAE

Install the dependency packages following the "Evaluation Dependencies" section of our [installation guide](../docs/installation.md). Then, run the following evaluation command:

```bash
# metric can be any one or a list of: ssim, psnr, lpips, flolpips
python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir path/to/original/videos --generated_video_dir path/to/generated/videos --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
```


================================================
FILE: Open-Sora/eval/human_eval/generate.sh
================================================
#!/bin/bash
# Generate PixArt images in bulk: one background inference job per GPU, each
# covering its own slice of the prompt file at its own image size.
#
# Optional environment variable:
#   CKPT  if set, logs go to $(dirname $CKPT)/eval/generate; otherwise they go
#         to logs/sample/generate. (The script never assigns CKPT itself, so
#         without this fallback `dirname` fails and `set -e` aborts the run.)

set -x
set -e

TEXT_PATH=/home/data/sora_data/pixart-sigma-generated/text.txt
OUTPUT_PATH=/home/data/sora_data/pixart-sigma-generated/raw
CMD="python scripts/inference.py configs/pixart/inference/1x2048MS.py"

if [ -n "${CKPT:-}" ]; then
    LOG_BASE=$(dirname "$CKPT")/eval/generate
else
    LOG_BASE=logs/sample/generate
fi
mkdir -p "${LOG_BASE}"

# Image size ("<first> <second>" as passed to --image-size) for each GPU;
# the array index is the GPU id.
IMAGE_SIZES=(
    "2048 2048"
    "1408 2816"
    "2816 1408"
    "1664 2304"
    "2304 1664"
    "1536 2560"
    "2560 1536"
    "2048 2048"
)
NUM_GPUS=${#IMAGE_SIZES[@]}

NUM_PER_GPU=10000
N_LAUNCH=2
# Each launch consumes NUM_PER_GPU prompts on every GPU, so launch N_LAUNCH
# starts after the prompts used by the launches numbered below it.
NUM_START=$((N_LAUNCH * NUM_PER_GPU * NUM_GPUS))

for gpu in "${!IMAGE_SIZES[@]}"; do
    # ${IMAGE_SIZES[gpu]} is intentionally unquoted: it must split into the
    # two separate values expected by --image-size.
    CUDA_VISIBLE_DEVICES=$gpu $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $((NUM_START + NUM_PER_GPU * gpu)) --end-index $((NUM_START + NUM_PER_GPU * (gpu + 1))) --image-size ${IMAGE_SIZES[gpu]} --verbose 1 --batch-size 2 >"${LOG_BASE}/${N_LAUNCH}_$((gpu + 1)).log" 2>&1 &
done


================================================
FILE: Open-Sora/eval/human_eval/launch.sh
================================================
#!/bin/bash
# Launch the eight video human-evaluation sampling tasks, one per GPU.
#
# Usage: bash eval/human_eval/launch.sh /path/to/ckpt num_frames model_name_for_log
#
# Every task runs `eval/sample.sh` in the background with its own task id;
# the log of task <id> is written to $(dirname CKPT)/eval/<id>.log.

CKPT=$1
NUM_FRAMES=$2
MODEL_NAME=$3

# All three arguments are forwarded positionally to eval/sample.sh; if one is
# missing, the task id would land in the wrong positional slot.
if [[ -z "$CKPT" || -z "$NUM_FRAMES" || -z "$MODEL_NAME" ]]; then
    echo "Usage: bash $0 /path/to/ckpt num_frames model_name_for_log" >&2
    exit 1
fi

LOG_BASE=$(dirname "$CKPT")/eval
mkdir -p "${LOG_BASE}"
echo "Logging to $LOG_BASE"

GPUS=(0 1 2 3 4 5 6 7)
# Alternative task list that includes the image task (1) instead of 2h:
# TASK_ID_LIST=(1 2a 2b 2c 2d 2e 2f 2g)
TASK_ID_LIST=(2a 2b 2c 2d 2e 2f 2g 2h)

for i in "${!GPUS[@]}"; do
    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh "$CKPT" "$NUM_FRAMES" "$MODEL_NAME" -${TASK_ID_LIST[i]} >"${LOG_BASE}/${TASK_ID_LIST[i]}.log" 2>&1 &
done

# kill all by: pkill -f "inference"


================================================
FILE: Open-Sora/eval/loss/eval_loss.py
================================================
from pprint import pformat

import colossalai
import torch
import torch.distributed as dist
from colossalai.cluster import DistCoordinator
from mmengine.runner import set_random_seed
from tqdm import tqdm

from opensora.acceleration.parallel_states import get_data_parallel_group, set_data_parallel_group
from opensora.datasets.dataloader import prepare_dataloader
from opensora.registry import DATASETS, MODELS, SCHEDULERS, build_module
from opensora.utils.config_utils import parse_configs
from opensora.utils.misc import create_logger, to_torch_dtype
from opensora.utils.train_utils import MaskGenerator


def main():
    """Evaluate a checkpoint's diffusion training loss on every configured bucket.

    The configuration is read with ``parse_configs(training=False)``. For each
    ``(resolution, num_frames)`` entry of ``cfg.bucket_config`` that has a batch
    size, the loss returned by ``scheduler.training_losses`` is averaged over
    the samples of that bucket at ``num_eval_timesteps`` (default 10) evenly
    spaced interior timesteps, and those per-timestep losses are then averaged.
    Nothing is returned; results are logged, ending with a single
    ``Evaluation losses: {...}`` line keyed by ``(resolution, num_frames)``.

    Optional config keys read here: ``dtype`` (default ``bf16``), ``seed``,
    ``mask_ratios``, ``resolution`` (restrict to one bucket resolution),
    ``start_index`` / ``end_index`` (restrict to a slice of the resolutions)
    and ``num_eval_timesteps``.

    Raises:
        AssertionError: if ``dtype`` is not one of fp16/bf16/fp32, or if no
            ``bucket_config`` is configured.
    """
    torch.set_grad_enabled(False)
    # ======================================================
    # configs & runtime variables
    # ======================================================
    # == parse configs ==
    cfg = parse_configs(training=False)

    # == device and dtype ==
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Validate the same value that is actually used; bf16 is the effective default.
    cfg_dtype = cfg.get("dtype", "bf16")
    assert cfg_dtype in ["fp16", "bf16", "fp32"], f"Unknown mixed precision {cfg_dtype}"
    dtype = to_torch_dtype(cfg_dtype)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # == init distributed env ==
    colossalai.launch_from_torch({})
    DistCoordinator()
    set_random_seed(seed=cfg.get("seed", 1024))
    set_data_parallel_group(dist.group.WORLD)

    # == init logger ==
    logger = create_logger()
    logger.info("Eval loss configuration:\n %s", pformat(cfg.to_dict()))

    # ======================================================
    # build model & load weights
    # ======================================================
    logger.info("Building models...")
    # == build text-encoder and vae ==
    text_encoder = build_module(cfg.text_encoder, MODELS, device=device)
    vae = build_module(cfg.vae, MODELS).to(device, dtype).eval()

    # == build diffusion model ==
    input_size = (None, None, None)
    latent_size = vae.get_latent_size(input_size)
    model = (
        build_module(
            cfg.model,
            MODELS,
            input_size=latent_size,
            in_channels=vae.out_channels,
            caption_channels=text_encoder.output_dim,
            model_max_length=text_encoder.model_max_length,
        )
        .to(device, dtype)
        .eval()
    )
    text_encoder.y_embedder = model.y_embedder  # HACK: for classifier-free guidance

    # == build scheduler ==
    scheduler = build_module(cfg.scheduler, SCHEDULERS)

    # Always bind the name so the per-batch check below needs no second config lookup.
    mask_generator = None
    if cfg.get("mask_ratios", None) is not None:
        mask_generator = MaskGenerator(cfg.mask_ratios)

    # ======================================================
    # inference
    # ======================================================
    # start evaluation, prepare a dataset everytime in the loop
    bucket_config = cfg.get("bucket_config", None)
    # Check before subscripting so a missing bucket_config yields this message
    # instead of an unrelated TypeError/AttributeError.
    assert bucket_config is not None, "bucket_config is required for evaluation"
    if cfg.get("resolution", None) is not None:
        bucket_config = {cfg.resolution: bucket_config[cfg.resolution]}
    logger.info("Evaluating bucket_config: %s", bucket_config)

    def build_dataset(resolution, num_frames, batch_size):
        """Build a dataloader restricted to the single (resolution, num_frames) bucket.

        Returns ``(dataloader, num_steps_per_epoch, num_batch)`` where
        ``num_steps_per_epoch`` is ``num_batch`` floor-divided by the world size.
        """
        bucket_config = {resolution: {num_frames: (1.0, batch_size)}}
        dataset = build_module(cfg.dataset, DATASETS)
        dataloader_args = dict(
            dataset=dataset,
            batch_size=None,
            num_workers=cfg.num_workers,
            shuffle=False,
            drop_last=False,
            pin_memory=True,
            process_group=get_data_parallel_group(),
        )
        dataloader, sampler = prepare_dataloader(bucket_config=bucket_config, **dataloader_args)
        num_batch = sampler.get_num_batch()
        num_steps_per_epoch = num_batch // dist.get_world_size()
        return dataloader, num_steps_per_epoch, num_batch

    evaluation_losses = {}
    start = cfg.start_index if "start_index" in cfg else 0
    end = cfg.end_index if "end_index" in cfg else len(bucket_config)
    for i, res in enumerate(bucket_config):
        if i < start or i >= end:  # skip task
            continue

        t_bucket = bucket_config[res]
        for num_frames, (_, batch_size) in t_bucket.items():
            if batch_size is None:
                continue
            logger.info("Evaluating resolution: %s, num_frames: %s", res, num_frames)
            dataloader, num_steps_per_epoch, num_batch = build_dataset(res, num_frames, batch_size)
            if num_batch == 0:
                logger.warning("No data for resolution: %s, num_frames: %s", res, num_frames)
                continue
            if num_steps_per_epoch == 0:
                # Fewer batches than ranks: no batch would be processed and the
                # per-sample average below would divide by zero.
                logger.warning(
                    "Only %s batch(es) for resolution: %s, num_frames: %s; fewer than world size, skipping",
                    num_batch,
                    res,
                    num_frames,
                )
                continue

            evaluation_t_losses = []
            # linspace yields num_eval_timesteps + 2 points; dropping both ends
            # keeps only the interior timesteps.
            for t in torch.linspace(0, scheduler.num_timesteps, cfg.get("num_eval_timesteps", 10) + 2)[1:-1]:
                loss_t = 0.0
                num_samples = 0
                dataloader_iter = iter(dataloader)
                for _ in tqdm(range(num_steps_per_epoch), desc=f"res: {res}, num_frames: {num_frames}, t: {t:.2f}"):
                    batch = next(dataloader_iter)
                    x = batch.pop("video").to(device, dtype)
                    y = batch.pop("text")
                    x = vae.encode(x)
                    model_args = text_encoder.encode(y)

                    # == mask ==
                    mask = None
                    if mask_generator is not None:
                        mask = mask_generator.get_masks(x)
                        model_args["x_mask"] = mask

                    # == video meta info ==
                    # Everything left in the batch after popping video/text is
                    # forwarded to the model as an extra argument.
                    for k, v in batch.items():
                        model_args[k] = v.to(device, dtype)

                    # == diffusion loss computation ==
                    timestep = torch.tensor([t] * x.shape[0], device=device, dtype=dtype)
                    loss_dict = scheduler.training_losses(model, x, model_args, mask=mask, t=timestep)
                    losses = loss_dict["loss"]  # (batch_size)
                    num_samples += x.shape[0]
                    loss_t += losses.sum().item()
                loss_t /= num_samples
                evaluation_t_losses.append(loss_t)
                logger.info("resolution: %s, num_frames: %s, timestep: %.2f, loss: %.4f", res, num_frames, t, loss_t)

            evaluation_losses[(res, num_frames)] = sum(evaluation_t_losses) / len(evaluation_t_losses)
            logger.info(
                "Evaluation losses for resolution: %s, num_frames: %s, loss: %s\n %s",
                res,
                num_frames,
                evaluation_losses[(res, num_frames)],
                evaluation_t_losses,
            )
    logger.info("Evaluation losses: %s", evaluation_losses)


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/eval/loss/launch.sh
================================================
#!/bin/bash
# Launch all rectified-flow loss evaluation jobs for one checkpoint in the
# background: three image jobs on GPUs 0-2 (split by bucket index) and one
# video job per resolution on GPUs 3-7.
#
# Usage: bash eval/loss/launch.sh CKPT_PATH MODEL_NAME [IMG_PATH] [VID_PATH]
#
# Logs are written to $(dirname CKPT_PATH)/eval/ as img_<n>.log and
# <resolution>_vid.log.

CMD="torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py"
CKPT_PATH=$1
MODEL_NAME=$2 # accepted for command-line compatibility; not used below
# Fall back to the default validation sets when the paths are omitted or empty.
IMG_PATH=${3:-/mnt/jfs-hdd/sora/meta/validation/img_1k.csv}
VID_PATH=${4:-/mnt/jfs-hdd/sora/meta/validation/vid_100.csv}

if [[ -z "$CKPT_PATH" ]]; then
    echo "Usage: bash $0 CKPT_PATH MODEL_NAME [IMG_PATH] [VID_PATH]" >&2
    exit 1
fi

LOG_BASE=$(dirname "$CKPT_PATH")/eval
mkdir -p "$LOG_BASE"
echo "Logging to $LOG_BASE"


# GPUS[i] evaluates the video set at RESOLUTION[i].
GPUS=(3 4 5 6 7)
RESOLUTION=(144p 240p 360p 480p 720p)

# $CMD stays unquoted on purpose: it must split into the command and its arguments.
CUDA_VISIBLE_DEVICES=0 $CMD --data-path "$IMG_PATH" --ckpt-path "$CKPT_PATH" --start-index 0 --end-index 5 >"${LOG_BASE}/img_0.log" 2>&1 &
CUDA_VISIBLE_DEVICES=1 $CMD --data-path "$IMG_PATH" --ckpt-path "$CKPT_PATH" --start-index 5 --end-index 6 >"${LOG_BASE}/img_1.log" 2>&1 &
CUDA_VISIBLE_DEVICES=2 $CMD --data-path "$IMG_PATH" --ckpt-path "$CKPT_PATH" --start-index 6 >"${LOG_BASE}/img_2.log" 2>&1 &


for i in "${!GPUS[@]}"; do
    CUDA_VISIBLE_DEVICES=${GPUS[i]} $CMD --data-path "$VID_PATH" --ckpt-path "$CKPT_PATH" --resolution ${RESOLUTION[i]} >"${LOG_BASE}/${RESOLUTION[i]}_vid.log" 2>&1 &
done


================================================
FILE: Open-Sora/eval/loss/tabulate_rl_loss.py
================================================
"""
Tabulate the rectified flow losses found in the evaluation logs.

usage:
    python eval/loss/tabulate_rl_loss.py --log_dir path/to/log/dir

The log directory is expected to contain img_0.log, img_1.log, img_2.log and
<resolution>_vid.log for 144p, 240p, 360p, 480p and 720p.

save the processed json to:
    <log_dir>/loss.json
"""

import argparse
import json
import os
from ast import literal_eval


def parse_args():
    """Parse the command line.

    Returns:
        argparse.Namespace with ``log_dir``: the directory holding the
        evaluation logs; the resulting ``loss.json`` is written there too.

    ``--log_dir`` is required: without it the script would fail later with an
    opaque ``TypeError`` when joining paths onto ``None``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--log_dir",
        type=str,
        required=True,
        help="directory containing the loss evaluation logs",
    )
    args = parser.parse_args()
    return args


# Log files expected in the log directory, one per evaluation job.
LOG_FILES = (
    "img_0.log",
    "img_1.log",
    "img_2.log",
    "144p_vid.log",
    "240p_vid.log",
    "360p_vid.log",
    "480p_vid.log",
    "720p_vid.log",
)


def collect_losses(log_dir, files=LOG_FILES):
    """Gather the final per-bucket losses from the evaluation logs.

    Each log is searched from the end for the last line containing
    ``losses:``; the text after that marker must be a Python dict literal
    mapping ``(resolution, frame)`` to a numeric loss.

    Args:
        log_dir: directory containing the log files.
        files: log file names to read, relative to ``log_dir``.

    Returns:
        ``{resolution: {frame: "<loss formatted with 4 decimals>"}}``.
        Logs that are missing or have no summary line are reported on stdout
        and skipped, so one failed job does not discard the other results.
    """
    loss_info = {}
    for fname in files:
        path = os.path.join(log_dir, fname)
        if not os.path.isfile(path):
            print(f"warning: log file not found, skipping: {path}")
            continue
        with open(path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        # Search backwards instead of trusting lines[-1]: the log may end with
        # blank lines or with output written after the summary.
        summary = next((line for line in reversed(lines) if "losses:" in line), None)
        if summary is None:
            print(f"warning: no loss summary found, skipping: {path}")
            continue
        loss_dict = literal_eval(summary.split("losses:")[-1].strip())
        for (resolution, frame), loss in loss_dict.items():
            loss_info.setdefault(resolution, {})[frame] = format(loss, ".4f")
    return loss_info


if __name__ == "__main__":
    args = parse_args()

    loss_info = collect_losses(args.log_dir)

    # Convert and write JSON object to file
    output_file_path = os.path.join(args.log_dir, "loss.json")
    with open(output_file_path, "w") as outfile:
        json.dump(loss_info, outfile, indent=4, sort_keys=True)
    print(f"results saved to: {output_file_path}")


================================================
FILE: Open-Sora/eval/sample.sh
================================================
#!/bin/bash
# Shared setup for the evaluation sampling tasks defined further down.
#
# Positional arguments (in order):
#   CKPT NUM_FRAMES MODEL_NAME TASK_TYPE VBENCH_START_INDEX VBENCH_END_INDEX
#   VBENCH_RES VBENCH_ASP_RATIO NUM_SAMPLING_STEPS FLOW LLM_REFINE

CKPT=$1
NUM_FRAMES=$2
MODEL_NAME=$3
TASK_TYPE=$4
VBENCH_START_INDEX=$5
VBENCH_END_INDEX=$6
VBENCH_RES=$7
VBENCH_ASP_RATIO=$8

NUM_SAMPLING_STEPS=$9
FLOW=${10}
LLM_REFINE=${11}

# NOTE: despite the "ASPECT_RATIO" names, the values here are resolution
# levels (they are passed to --resolution by the task functions).
BASE_ASPECT_RATIO=360p
ASPECT_RATIOS=(144p 240p 360p 480p 720p 1080p)
# Find the base level in the list and pick the levels one and two steps above
# it, clamping to the highest level when the list runs out.
i=0
for r in "${ASPECT_RATIOS[@]}"; do
  if [[ "$r" == "$BASE_ASPECT_RATIO" ]]; then
    # get aspect ratio 1 level up
    if [[ $((i+1)) -lt ${#ASPECT_RATIOS[@]} ]]; then
      ASPECT_RATIO_INCR_1=${ASPECT_RATIOS[$((i+1))]}
    else
      # If this is the highest ratio, return the highest ratio
      ASPECT_RATIO_INCR_1=${ASPECT_RATIOS[-1]}
    fi
    # get aspect ratio 2 levels up
    if [[ $((i+2)) -lt ${#ASPECT_RATIOS[@]} ]]; then
      ASPECT_RATIO_INCR_2=${ASPECT_RATIOS[$((i+2))]}
    else
      # If this is the highest ratio, return the highest ratio
      ASPECT_RATIO_INCR_2=${ASPECT_RATIOS[-1]}
    fi
  fi
  i=$((i+1))
done
echo "base aspect ratio: ${BASE_ASPECT_RATIO}"
echo "aspect ratio 1 level up: ${ASPECT_RATIO_INCR_1}"
echo "aspect ratio 2 levels up: ${ASPECT_RATIO_INCR_2}"
echo "Note that this aspect ratio level setting is used for videos only, not images"

echo "NUM_FRAMES=${NUM_FRAMES}"

if [ -z "${NUM_FRAMES}" ]; then
  echo "you need to pass NUM_FRAMES"
else
  # The expressions are quoted so the `*` can never be treated as a filename
  # pattern by the shell before `let` evaluates it.
  let "DOUBLE_FRAMES=${NUM_FRAMES}*2"
  let "QUAD_FRAMES=${NUM_FRAMES}*4"
  let "OCT_FRAMES=${NUM_FRAMES}*8"
fi

echo "DOUBLE_FRAMES=${DOUBLE_FRAMES}"
echo "QUAD_FRAMES=${QUAD_FRAMES}"
echo "OCT_FRAMES=${OCT_FRAMES}"

CMD="python scripts/inference.py configs/opensora-v1-2/inference/sample.py"
# For an EMA checkpoint the name is taken from the parent directory, with an
# "_ema" suffix; otherwise it is the last path component of CKPT.
if [[ $CKPT == *"ema"* ]]; then
  parentdir=$(dirname $CKPT)
  CKPT_BASE=$(basename $parentdir)_ema
else
  CKPT_BASE=$(basename $CKPT)
fi
OUTPUT="/root/autodl-tmp/video_samples/samples_${MODEL_NAME}_${CKPT_BASE}"
start=$(date +%s)
DEFAULT_BS=1

### Functions

# called inside run_video_b
function run_image() { # 14min
  # Single-frame (image) sampling at several resolutions and aspect ratios.
  local k ratio

  # 1.1 resolution 1024, 1:1, full t2i prompt list
  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT \
    --num-frames 1 --resolution 1024 --aspect-ratio 1:1 --sample-name image_1024_1_1 --batch-size $DEFAULT_BS

  # 1.2 resolution 240p, 9:16, stops at --end-index 3
  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT \
    --num-frames 1 --resolution 240p --aspect-ratio 9:16 --sample-name image_240p_9_16 --end-index 3 --batch-size $DEFAULT_BS

  # 1.3 resolution 512, 1:1: every prompt file, each stopping at --end-index 3.
  # tags[k] names the sample produced from prompt_files[k].
  local tags=(t2i t2v short sora)
  local prompt_files=(t2i_samples.txt t2v_samples.txt t2v_short.txt t2v_sora.txt)
  for k in "${!tags[@]}"; do
    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/${prompt_files[k]} --save-dir $OUTPUT \
      --num-frames 1 --resolution 512 --aspect-ratio 1:1 --sample-name image_${tags[k]}_512_1_1 --end-index 3 --batch-size $DEFAULT_BS
  done

  # 1.4 720p, one fixed prompt across aspect ratios. The sample name carries
  # the ratio with ':' replaced by '_'.
  PROMPT="Bright scene, aerial view,ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens."
  for ratio in 1:1 9:16 16:9 4:3 3:4 1:2 2:1; do
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --resolution 720p --aspect-ratio $ratio --sample-name image_720p_${ratio/:/_}
  done
}

# for (sample, short, sora)
#   for ( (4s, 720p), (8s, 480p), (16s, 360p) )

function run_video_a() { # ~ 30min ?
  # t2v_samples.txt prompts at 9:16, pairing longer clips with lower resolution:
  # 4s at two levels above base, 8s at one level above base, 16s at base.
  #
  # Retired runs (~42min): the same prompts at 144p/2s, 240p/2s,4s,8s,
  # 480p/2s,4s and 720p/2s, all 9:16.
  local durations=(4s 8s 16s)
  local resolutions=("${ASPECT_RATIO_INCR_2}" "${ASPECT_RATIO_INCR_1}" "${BASE_ASPECT_RATIO}")
  local k
  for k in "${!durations[@]}"; do
    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT \
      --num-frames ${durations[k]} --resolution ${resolutions[k]} --aspect-ratio 9:16 \
      --sample-name sample_${durations[k]}_${resolutions[k]} --batch-size $DEFAULT_BS
  done
}

function run_video_b() { # 18min + 14min = 32min, short 16x240p & 64x240p
  # Runs the image samples first, then the t2v_short.txt prompts at 9:16:
  # 8s one level above base (~24min) and 16s at base (~24min).
  #
  # Retired runs (18min): t2v_short.txt at 240p, 9:16, for 4s and 8s.
  echo "Inside run_video_b, running image samples..."
  run_image

  echo "Inside run_video_b, running video samples..."

  local durations=(8s 16s)
  local resolutions=("${ASPECT_RATIO_INCR_1}" "${BASE_ASPECT_RATIO}")
  local k
  for k in "${!durations[@]}"; do
    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT \
      --num-frames ${durations[k]} --resolution ${resolutions[k]} --aspect-ratio 9:16 \
      --sample-name short_${durations[k]}_${resolutions[k]} --batch-size $DEFAULT_BS
  done
}

function run_video_c() {
  # Two 9:16 runs: t2v_short.txt for 4s at two levels above base (~9min),
  # then t2v_sora.txt for 16s at base (~40min).
  #
  # Retired runs (60min): t2v_sora.txt at 240p for 2s (16:9 and 9:16) and
  # for 16s (9:16).
  local tags=(short sora)
  local prompt_files=(t2v_short.txt t2v_sora.txt)
  local durations=(4s 16s)
  local resolutions=("${ASPECT_RATIO_INCR_2}" "${BASE_ASPECT_RATIO}")
  local k
  for k in "${!tags[@]}"; do
    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/${prompt_files[k]} --save-dir $OUTPUT \
      --num-frames ${durations[k]} --resolution ${resolutions[k]} --aspect-ratio 9:16 \
      --sample-name ${tags[k]}_${durations[k]}_${resolutions[k]} --batch-size $DEFAULT_BS
  done
}

function run_video_d() {
  # First part of the sora 8s run, one level above base, 9:16: prompt
  # indices 0 to 16 (the rest is handled by run_video_e). ~30min.
  #
  # Retired runs (21min + 30min): t2v_short.txt at 480p/4s, and this same
  # sora slice pinned to 480p.
  local res=${ASPECT_RATIO_INCR_1}
  eval $CMD --ckpt-path $CKPT \
    --prompt-path assets/texts/t2v_sora.txt \
    --save-dir $OUTPUT \
    --num-frames 8s \
    --resolution $res \
    --aspect-ratio 9:16 \
    --sample-name sora_8s_$res \
    --batch-size $DEFAULT_BS \
    --start-index 0 \
    --end-index 16
}

function run_video_e() { # 90min * 2/3 = 60min
  # Second part of the sora 8s run, one level above base, 9:16: prompt
  # indices 16 to 100 (the first 16 are handled by run_video_d).
  local res=${ASPECT_RATIO_INCR_1}
  eval $CMD --ckpt-path $CKPT \
    --prompt-path assets/texts/t2v_sora.txt \
    --save-dir $OUTPUT \
    --num-frames 8s \
    --resolution $res \
    --aspect-ratio 9:16 \
    --sample-name sora_8s_$res \
    --batch-size $DEFAULT_BS \
    --start-index 16 \
    --end-index 100
}

function run_video_f() { # 60min
  # All t2v_sora.txt prompts for 4s at two levels above base, 9:16.
  local res=${ASPECT_RATIO_INCR_2}
  eval $CMD --ckpt-path $CKPT \
    --prompt-path assets/texts/t2v_sora.txt \
    --save-dir $OUTPUT \
    --num-frames 4s \
    --resolution $res \
    --aspect-ratio 9:16 \
    --sample-name sora_4s_$res \
    --batch-size $DEFAULT_BS
}

# --resolution 720p --aspect-ratio [16:9, 9:16, ...]

function run_video_g() { # 15min
  # Part 1: one fixed prompt rendered as a 2s clip at every supported aspect ratio.
  PROMPT="A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures."
  local ratio ratio_tag
  for ratio in 1:1 16:9 9:16 4:3 3:4 1:2 2:1; do
    # The sample name carries the ratio with ':' replaced by '_' (e.g. 16:9 -> 16_9).
    ratio_tag=${ratio/:/_}
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio ${ratio} --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_${ratio_tag}
  done

  # Part 2: same base prompt with an increasing "motion score" suffix.
  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name motion_2s_${ASPECT_RATIO_INCR_2} --prompt \
    \"A stylish woman walking in the street of Tokyo.\" \"A stylish woman walking in the street of Tokyo. motion score: 0.0\" \
    \"A stylish woman walking in the street of Tokyo. motion score: 2.0\" \
    \"A stylish woman walking in the street of Tokyo. motion score: 4.0\" \
    \"A stylish woman walking in the street of Tokyo. motion score: 6.0\" \
    \"A stylish woman walking in the street of Tokyo. motion score: 10.0\" \
    \"A stylish woman walking in the street of Tokyo. motion score: 25.0\" \
    \"A stylish woman walking in the street of Tokyo. motion score: 50.0\" \
    \"A stylish woman walking in the street of Tokyo. motion score: 100.0\"

  # Part 3: same base prompt with an increasing "aesthetic score" suffix.
  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name aes_2s_${ASPECT_RATIO_INCR_2} --prompt \
    \"A stylish woman walking in the street of Tokyo.\" \"A stylish woman walking in the street of Tokyo. aesthetic score: 4.0\" \
    \"A stylish woman walking in the street of Tokyo. aesthetic score: 4.5\" \
    \"A stylish woman walking in the street of Tokyo. aesthetic score: 5.0\" \
    \"A stylish woman walking in the street of Tokyo. aesthetic score: 5.5\" \
    \"A stylish woman walking in the street of Tokyo. aesthetic score: 6.0\" \
    \"A stylish woman walking in the street of Tokyo. aesthetic score: 6.5\" \
    \"A stylish woman walking in the street of Tokyo. aesthetic score: 7.0\"
}

# resolution -> 480p

function run_video_h() { # 61min
  # Reference-conditioned sampling: every command below passes --reference-path
  # together with a matching list of --mask-strategy entries (one per prompt).

  # 3.1 image-conditioned long video generation
  # Prompts 0-3 of t2v_ref.txt, 2s clips, 5 loops with 5 condition frames.
  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C5_2s_${BASE_ASPECT_RATIO}_9_16 \
    --prompt-path assets/texts/t2v_ref.txt --start-index 0 --end-index 3 \
    --num-frames 2s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \
    --loop 5 --condition-frame-length 5 \
    --reference-path assets/images/condition/cliff.png assets/images/condition/wave.png assets/images/condition/ship.png \
    --mask-strategy "0" "0" "0" --batch-size $DEFAULT_BS

  # Same prompts and references, 16s clips, 5 loops with 10 condition frames.
  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C10_16s_${BASE_ASPECT_RATIO}_9_16 \
    --prompt-path assets/texts/t2v_ref.txt --start-index 0 --end-index 3 \
    --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \
    --loop 5 --condition-frame-length 10 \
    --reference-path assets/images/condition/cliff.png assets/images/condition/wave.png assets/images/condition/ship.png \
    --mask-strategy "0" "0" "0" --batch-size $DEFAULT_BS

  # 3.2
  # Prompts 3-6 of t2v_ref.txt, single loop. The references are: one image, a pair of
  # images joined by an escaped ';' (the backslash keeps eval from treating ';' as a
  # command separator), and a remote mp4. QUAD_FRAMES is defined outside this section.
  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L1_16s_${BASE_ASPECT_RATIO}_9_16 \
    --prompt-path assets/texts/t2v_ref.txt --start-index 3 --end-index 6 \
    --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \
    --loop 1 \
    --reference-path assets/images/condition/cliff.png "assets/images/condition/cactus-sad.png\;assets/images/condition/cactus-happy.png" https://cdn.openai.com/tmp/s/interp/d0.mp4 \
    --mask-strategy "0" "0\;0,1,0,-1,1" "0,0,0,0,${QUAD_FRAMES},0.5" --batch-size $DEFAULT_BS
}

# vbench has 950 samples

# Defaults consumed by run_vbench (VBENCH_BS is also used by run_vbench_i2v):
#   VBENCH_BS          -> --batch-size
#   VBENCH_H/VBENCH_W  -> --image-size, used only when VBENCH_RES / VBENCH_ASP_RATIO are unset
#   VBENCH_NUM_SAMPLE  -> --num-sample
VBENCH_BS=1 # 80GB
VBENCH_H=240
VBENCH_W=426
VBENCH_NUM_SAMPLE=5

function run_vbench() {
  # Sample the VBench prompts in assets/texts/VBench/all_dimension.txt.
  #   $1 / $2 are forwarded as --start-index / --end-index.
  # The optional environment variables below select progressively richer command lines:
  #   VBENCH_RES + VBENCH_ASP_RATIO  use --resolution/--aspect-ratio instead of --image-size
  #   NUM_SAMPLING_STEPS             adds --num-sampling-steps
  #   FLOW                           adds --flow (skipped when FLOW is the literal "None"
  #                                  and LLM_REFINE is set)
  #   LLM_REFINE                     adds --llm-refine
  # The -z tests are quoted so that an empty or multi-word value is tested as one string.
  if [ -z "${VBENCH_RES}" ] || [ -z "${VBENCH_ASP_RATIO}" ]; then
    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample $VBENCH_NUM_SAMPLE \
      --prompt-path assets/texts/VBench/all_dimension.txt \
      --image-size $VBENCH_H $VBENCH_W \
      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
  elif [ -z "${NUM_SAMPLING_STEPS}" ]; then
    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample $VBENCH_NUM_SAMPLE \
      --prompt-path assets/texts/VBench/all_dimension.txt \
      --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
  elif [ -z "${FLOW}" ]; then
    # Fixed: this branch used to expand the undefined $VBENCH_NUM_SAMPLE5, which left
    # --num-sample without a value.
    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample $VBENCH_NUM_SAMPLE \
      --prompt-path assets/texts/VBench/all_dimension.txt \
      --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \
      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
  elif [ -z "${LLM_REFINE}" ]; then
    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample $VBENCH_NUM_SAMPLE \
      --prompt-path assets/texts/VBench/all_dimension.txt \
      --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \
      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
  elif [ "${FLOW}" = "None" ]; then
    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample $VBENCH_NUM_SAMPLE \
      --prompt-path assets/texts/VBench/all_dimension.txt \
      --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \
      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
  else
    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample $VBENCH_NUM_SAMPLE \
      --prompt-path assets/texts/VBench/all_dimension.txt \
      --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \
      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
  fi
}

# vbench-i2v has 1120 samples

# Height/width passed as --image-size by run_vbench_i2v when VBENCH_RES or
# VBENCH_ASP_RATIO is unset.
VBENCH_I2V_H=256
VBENCH_I2V_W=256

function run_vbench_i2v() {
  # Sample the VBench image-to-video prompts (assets/texts/VBench/all_i2v.txt).
  #   $1 / $2 are forwarded as --start-index / --end-index.
  # Each branch adds the flags made available by the environment:
  #   VBENCH_RES + VBENCH_ASP_RATIO -> --resolution/--aspect-ratio (otherwise --image-size)
  #   NUM_SAMPLING_STEPS            -> --num-sampling-steps
  #   FLOW                          -> --flow (omitted when it is "None" and LLM_REFINE is set)
  #   LLM_REFINE                    -> --llm-refine
  if [ -z ${VBENCH_RES} ] || [ -z ${VBENCH_ASP_RATIO} ]; then
    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
      --prompt-path assets/texts/VBench/all_i2v.txt \
      --image-size $VBENCH_I2V_H $VBENCH_I2V_W \
      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
  elif [ -z ${NUM_SAMPLING_STEPS} ]; then
    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
      --prompt-path assets/texts/VBench/all_i2v.txt \
      --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
  elif [ -z ${FLOW} ]; then
    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
      --prompt-path assets/texts/VBench/all_i2v.txt \
      --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \
      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
  elif [ -z ${LLM_REFINE} ]; then
    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
      --prompt-path assets/texts/VBench/all_i2v.txt \
      --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \
      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
  elif [ "${FLOW}" = "None" ]; then
    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
      --prompt-path assets/texts/VBench/all_i2v.txt \
      --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \
      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
  else
    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
      --prompt-path assets/texts/VBench/all_i2v.txt \
      --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \
      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
  fi
}

### Main

# Dispatch: every command-line argument selects one or more sample groups.
for arg in "$@"; do
  # image
  if [[ "$arg" == -1 || "$arg" == --image ]]; then
    echo "Running image samples..."
    run_image
  fi
  # video: -2a ... -2h pick a single group, --video runs all groups in order
  for group in a b c d e f g h; do
    if [[ "$arg" == "-2${group}" || "$arg" == --video ]]; then
      echo "Running video samples ${group}..."
      run_video_${group}
    fi
  done
  # vbench (requires VBENCH_START_INDEX and VBENCH_END_INDEX)
  if [[ "$arg" == -4 || "$arg" == --vbench ]]; then
    echo "Running vbench samples ..."
    if [ -z ${VBENCH_START_INDEX} ] || [ -z ${VBENCH_END_INDEX} ]; then
      echo "need to set start_index and end_index"
    else
      run_vbench $VBENCH_START_INDEX $VBENCH_END_INDEX
    fi
  fi
  # vbench-i2v (same index requirement)
  if [[ "$arg" == -5 || "$arg" == --vbench-i2v ]]; then
    echo "Running vbench-i2v samples ..."
    if [ -z ${VBENCH_START_INDEX} ] || [ -z ${VBENCH_END_INDEX} ]; then
      echo "need to set start_index and end_index"
    else
      run_vbench_i2v $VBENCH_START_INDEX $VBENCH_END_INDEX
    fi
  fi
done

### End

# Wall-clock end time, in seconds since the epoch.
end=$(date +%s)

# `start` is not set in this section; it is expected to have been recorded
# earlier in the script, before the sampling commands ran.
runtime=$((end - start))

echo "Runtime: $runtime seconds"


================================================
FILE: Open-Sora/eval/vae/cal_flolpips.py
================================================
import sys

import numpy as np
import torch
from tqdm import tqdm

sys.path.append(".")

from flolpips.flolpips import FloLPIPS
from flolpips.pwcnet import Network as PWCNet

# Module-level models, built once at import time in eval mode with gradients
# disabled. calculate_flolpips() rebinds both globals after moving them to the
# requested device.
loss_fn = FloLPIPS(net="alex", version="0.1").eval().requires_grad_(False)
flownet = PWCNet().eval().requires_grad_(False)


def trans(x):
    """Return ``x`` unchanged.

    Placeholder for input preprocessing; this metric applies none.
    """
    return x


def calculate_flolpips(videos1, videos2, device):
    """Compute FloLPIPS between two batches of videos.

    Args:
        videos1: batch of videos indexed as [video, frame, ...]; its frames are
            treated as the reconstruction ("rec") side.
        videos2: batch of videos with the same layout; its frames are treated
            as the ground-truth ("gt") side.
        device: device the module-level ``loss_fn`` / ``flownet`` and each
            video are moved to.

    Returns:
        dict with per-timestamp mean (``"value"``) and standard deviation
        (``"value_std"``) over the batch, the shape of the last processed video
        (``"video_setting"``), the raw score array (``"result"``) and the same
        scores as nested lists (``"details"``).
    """
    global loss_fn, flownet

    print("calculate_flowlpips...")
    loss_fn = loss_fn.to(device)
    flownet = flownet.to(device)

    if videos1.shape != videos2.shape:
        print("Warning: the shape of videos are not equal.")
        # Only the frame axis is reconciled: both batches are cut to the shorter length.
        shared_len = min(videos1.shape[1], videos2.shape[1])
        videos1, videos2 = videos1[:, :shared_len], videos2[:, :shared_len]

    videos1, videos2 = trans(videos1), trans(videos2)

    per_video_scores = []
    for video_num in tqdm(range(videos1.shape[0])):
        video1 = videos1[video_num].to(device)
        video2 = videos2[video_num].to(device)

        # Pair every frame with its successor so optical flow can be estimated.
        frames_rec, frames_rec_next = video1[:-1], video1[1:]
        frames_gt, frames_gt_next = video2[:-1], video2[1:]
        # The values are unused; the unpacking only succeeds for 4-D frame stacks.
        _t, _c, _h, _w = frames_gt.shape

        flow_diff = flownet(frames_gt, frames_gt_next) - flownet(frames_rec, frames_rec_next)
        score = loss_fn.forward(frames_gt, frames_rec, flow_diff, normalize=True)
        per_video_scores.append(score.cpu().numpy().tolist())

    flolpips_results = np.array(per_video_scores)  # [batch_size, num_frames]

    timestamps = range(flolpips_results.shape[1])
    flolpips = {t: np.mean(flolpips_results[:, t], axis=-1) for t in timestamps}
    flolpips_std = {t: np.std(flolpips_results[:, t], axis=-1) for t in timestamps}

    return {
        "value": flolpips,
        "value_std": flolpips_std,
        "video_setting": video1.shape,
        "video_setting_name": "time, channel, heigth, width",
        "result": flolpips_results,
        "details": flolpips_results.tolist(),
    }


# test code / using example


def main():
    """Smoke-test ``calculate_flolpips`` on two all-zero video batches and print the result.

    The batch is 8 videos of 50 frames, 3 channels, 64x64 pixels, evaluated on
    ``"cuda:0"``.
    """
    import json

    NUMBER_OF_VIDEOS = 8
    VIDEO_LENGTH = 50
    CHANNEL = 3
    SIZE = 64
    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)

    def _to_builtin(obj):
        # The result dict carries a NumPy array under "result" (and possibly NumPy
        # values elsewhere), which the json module cannot encode by itself.
        if isinstance(obj, (np.ndarray, np.generic)):
            return obj.tolist()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    result = calculate_flolpips(videos1, videos2, "cuda:0")
    print(json.dumps(result, indent=4, default=_to_builtin))


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/eval/vae/cal_lpips.py
================================================
import lpips
import numpy as np
import torch
from tqdm import tqdm

# When True the LPIPS model is built with spatial=True; calculate_lpips()
# reduces its output to one number per frame with .mean().
spatial = True  # Return a spatial map of perceptual distance.

# Module-level LPIPS model shared by every calculate_lpips() call.
# Linearly calibrated models (LPIPS)
loss_fn = lpips.LPIPS(net="alex", spatial=spatial)  # Can also set net = 'squeeze' or 'vgg'
# loss_fn = lpips.LPIPS(net='alex', spatial=spatial, lpips=False) # Can also set net = 'squeeze' or 'vgg'


def trans(x):
    """Prepare a 5-D video batch for LPIPS.

    A batch whose third-from-last axis has size 1 (greyscale) is repeated three
    times along that axis, then values are rescaled from [0, 1] to [-1, 1].
    """
    is_greyscale = x.shape[-3] == 1
    if is_greyscale:
        # Tile the single channel so the network sees three channels.
        x = x.repeat(1, 1, 3, 1, 1)

    return x * 2 - 1


def calculate_lpips(videos1, videos2, device):
    """Compute frame-wise LPIPS between two equally shaped video batches.

    Args:
        videos1: tensor laid out as [batch_size, timestamps, channel, h, w] with
            values in [0, 1]; greyscale input is expanded to 3 channels by ``trans``.
        videos2: tensor with exactly the same shape as ``videos1``.
        device: device the module-level ``loss_fn`` and every frame are moved to.

    Returns:
        dict with the per-timestamp mean (``"value"``) and standard deviation
        (``"value_std"``) across the batch, plus the shape of the last video
        (``"video_setting"``) and a description of its axes.

    Raises:
        AssertionError: if the two batches differ in shape.
    """
    # image should be RGB, IMPORTANT: normalized to [-1,1]
    print("calculate_lpips...")

    assert videos1.shape == videos2.shape

    # videos [batch_size, timestamps, channel, h, w]

    # support grayscale input, if grayscale -> channel*3
    # value range [0, 1] -> [-1, 1]
    videos1 = trans(videos1)
    videos2 = trans(videos2)

    # Move the model once up front; this used to be repeated for every frame.
    loss_fn.to(device)

    lpips_results = []

    for video_num in tqdm(range(videos1.shape[0])):
        # video [timestamps, channel, h, w]
        video1 = videos1[video_num]
        video2 = videos2[video_num]

        lpips_results_of_a_video = []
        for clip_timestamp in range(len(video1)):
            # img [channel, h, w] -> [1, channel, h, w] on the target device
            img1 = video1[clip_timestamp].unsqueeze(0).to(device)
            img2 = video2[clip_timestamp].unsqueeze(0).to(device)

            # The spatial distance map is collapsed to one score per frame.
            lpips_results_of_a_video.append(loss_fn.forward(img1, img2).mean().detach().cpu().tolist())
        lpips_results.append(lpips_results_of_a_video)

    lpips_results = np.array(lpips_results)

    # Named *_mean / *_std so the imported ``lpips`` module is not shadowed.
    lpips_mean = {}
    lpips_std = {}

    for clip_timestamp in range(len(video1)):
        lpips_mean[clip_timestamp] = np.mean(lpips_results[:, clip_timestamp])
        lpips_std[clip_timestamp] = np.std(lpips_results[:, clip_timestamp])

    result = {
        "value": lpips_mean,
        "value_std": lpips_std,
        "video_setting": video1.shape,
        "video_setting_name": "time, channel, heigth, width",
    }

    return result


# test code / using example


def main():
    """Smoke-test ``calculate_lpips``: all-zero videos against all-one videos on CUDA."""
    import json

    batch, frames, channels, side = 8, 50, 3, 64
    shape = (batch, frames, channels, side, side)
    videos1 = torch.zeros(*shape, requires_grad=False)
    videos2 = torch.ones(*shape, requires_grad=False)
    device = torch.device("cuda")
    # device = torch.device("cpu")

    result = calculate_lpips(videos1, videos2, device)
    print(json.dumps(result, indent=4))


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/eval/vae/cal_psnr.py
================================================
import math

import numpy as np
import torch
from tqdm import tqdm


def img_psnr(img1, img2):
    """Return the PSNR of two images.

    The formula uses a peak signal of 1, i.e. it presumes pixel values in [0, 1].
    Images whose mean squared error is below 1e-10 are reported as 100 instead
    of evaluating the logarithm.
    """
    # Dividing by 1.0 promotes integer inputs to float before subtracting.
    mse = np.mean((img1 / 1.0 - img2 / 1.0) ** 2)
    if mse < 1e-10:
        return 100
    return 20 * math.log10(1 / math.sqrt(mse))


def trans(x):
    """Return ``x`` unchanged.

    Placeholder for input preprocessing; PSNR is computed on the raw values.
    """
    return x


def calculate_psnr(videos1, videos2):
    """Compute frame-wise PSNR between two equally shaped video batches.

    Args:
        videos1: tensor laid out as [batch_size, timestamps, channel, h, w].
        videos2: tensor with exactly the same shape as ``videos1``.

    Returns:
        dict with the per-timestamp mean (``"value"``) and standard deviation
        (``"value_std"``) across the batch, plus the shape of the last video
        (``"video_setting"``) and a description of its axes.

    Raises:
        AssertionError: if the two batches differ in shape.
    """
    print("calculate_psnr...")

    assert videos1.shape == videos2.shape

    videos1, videos2 = trans(videos1), trans(videos2)

    per_video_scores = []
    for video_num in tqdm(range(videos1.shape[0])):
        # video [timestamps, channel, h, w]
        video1 = videos1[video_num]
        video2 = videos2[video_num]

        # One PSNR value per frame; frames are handed over as [channel, h, w] numpy arrays.
        per_video_scores.append(
            [img_psnr(video1[t].numpy(), video2[t].numpy()) for t in range(len(video1))]
        )

    psnr_results = np.array(per_video_scores)  # [batch_size, num_frames]

    timestamps = range(len(video1))
    psnr = {t: np.mean(psnr_results[:, t]) for t in timestamps}
    psnr_std = {t: np.std(psnr_results[:, t]) for t in timestamps}

    return {
        "value": psnr,
        "value_std": psnr_std,
        "video_setting": video1.shape,
        "video_setting_name": "time, channel, heigth, width",
    }


# test code / using example


def main():
    """Smoke-test ``calculate_psnr`` on two identical all-zero video batches."""
    import json

    batch, frames, channels, side = 8, 50, 3, 64
    shape = (batch, frames, channels, side, side)
    videos1 = torch.zeros(*shape, requires_grad=False)
    videos2 = torch.zeros(*shape, requires_grad=False)

    result = calculate_psnr(videos1, videos2)
    print(json.dumps(result, indent=4))


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/eval/vae/cal_ssim.py
================================================
import cv2
import numpy as np
import torch
from tqdm import tqdm


def ssim(img1, img2):
    """Return the mean SSIM of two single-channel images.

    Local statistics come from an 11x11 Gaussian window (sigma 1.5); a 5-pixel
    border is cropped from every filtered map so only fully covered positions
    contribute. The stabilising constants are 0.01**2 and 0.03**2, which
    presumably corresponds to pixel values in [0, 1].
    """
    C1 = 0.01**2
    C2 = 0.03**2

    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)
    kernel = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(kernel, kernel.transpose())

    def _local_mean(image):
        # Gaussian-weighted local average, restricted to the "valid" region.
        return cv2.filter2D(image, -1, window)[5:-5, 5:-5]

    mu1 = _local_mean(img1)
    mu2 = _local_mean(img2)
    mu1_sq = mu1**2
    mu2_sq = mu2**2
    mu1_mu2 = mu1 * mu2

    sigma1_sq = _local_mean(img1**2) - mu1_sq
    sigma2_sq = _local_mean(img2**2) - mu2_sq
    sigma12 = _local_mean(img1 * img2) - mu1_mu2

    numerator = (2 * mu1_mu2 + C1) * (2 * sigma12 + C2)
    denominator = (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
    return (numerator / denominator).mean()


def calculate_ssim_function(img1, img2):
    """Return the SSIM of two images given as numpy arrays.

    Accepted layouts are a 2-D image, or a channel-first 3-D image with either
    1 or 3 channels (3-channel SSIM is the mean over the channels).

    Raises:
        ValueError: if the shapes differ, if the arrays are neither 2-D nor 3-D,
            or if a 3-D input has a channel count other than 1 or 3 (this case
            used to fall through and silently return ``None``).
    """
    # [0,1]
    # ssim is the only metric extremely sensitive to gray being compared to b/w
    if not img1.shape == img2.shape:
        raise ValueError("Input images must have the same dimensions.")

    if img1.ndim == 2:
        return ssim(img1, img2)

    if img1.ndim == 3:
        channels = img1.shape[0]
        if channels == 3:
            return np.array([ssim(img1[i], img2[i]) for i in range(3)]).mean()
        if channels == 1:
            return ssim(np.squeeze(img1), np.squeeze(img2))
        raise ValueError(f"Unsupported channel count {channels}; expected 1 or 3 (channel-first).")

    raise ValueError("Wrong input image dimensions.")


def trans(x):
    """Return ``x`` unchanged.

    Placeholder for input preprocessing; SSIM is computed on the raw values.
    """
    return x


def calculate_ssim(videos1, videos2):
    """Compute frame-wise SSIM between two equally shaped video batches.

    Args:
        videos1: tensor laid out as [batch_size, timestamps, channel, h, w].
        videos2: tensor with exactly the same shape as ``videos1``.

    Returns:
        dict with the per-timestamp mean (``"value"``) and standard deviation
        (``"value_std"``) across the batch, plus the shape of the last video
        (``"video_setting"``) and a description of its axes.

    Raises:
        AssertionError: if the two batches differ in shape.
    """
    print("calculate_ssim...")

    assert videos1.shape == videos2.shape

    videos1, videos2 = trans(videos1), trans(videos2)

    per_video_scores = []
    for video_num in tqdm(range(videos1.shape[0])):
        # video [timestamps, channel, h, w]
        video1 = videos1[video_num]
        video2 = videos2[video_num]

        # One SSIM value per frame; frames are handed over as [channel, h, w] numpy arrays.
        per_video_scores.append(
            [
                calculate_ssim_function(video1[t].numpy(), video2[t].numpy())
                for t in range(len(video1))
            ]
        )

    ssim_results = np.array(per_video_scores)

    timestamps = range(len(video1))
    mean_by_timestamp = {t: np.mean(ssim_results[:, t]) for t in timestamps}
    std_by_timestamp = {t: np.std(ssim_results[:, t]) for t in timestamps}

    return {
        "value": mean_by_timestamp,
        "value_std": std_by_timestamp,
        "video_setting": video1.shape,
        "video_setting_name": "time, channel, heigth, width",
    }


# test code / using example


def main():
    """Smoke-test ``calculate_ssim`` on two identical all-zero video batches.

    The computation runs on CPU numpy arrays, so no device is selected here
    (a stray, unused ``torch.device("cuda")`` expression was removed).
    """
    import json

    NUMBER_OF_VIDEOS = 8
    VIDEO_LENGTH = 50
    CHANNEL = 3
    SIZE = 64
    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)

    result = calculate_ssim(videos1, videos2)
    print(json.dumps(result, indent=4))


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/eval/vae/eval_common_metric.py
================================================
"""Calculates the CLIP Scores

The CLIP model is a contrastively learned language-image model. There is
an image encoder and a text encoder. It is believed that the CLIP model could
measure the similarity of cross modalities. Please find more information from
https://github.com/openai/CLIP.

The CLIP Score measures the Cosine Similarity between two embedded features.
This repository utilizes the pretrained CLIP Model to calculate
the mean average of cosine similarities.

See --help to see further details.

Code adapted from https://github.com/mseitzer/pytorch-fid and https://github.com/openai/CLIP.

Copyright 2023 The Hong Kong Polytechnic University

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import os.path as osp
import sys
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

import numpy as np
import torch
from decord import VideoReader, cpu
from pytorchvideo.transforms import ShortSideScale
from torch.utils.data import DataLoader, Dataset, Subset
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import CenterCropVideo

sys.path.append(".")
from cal_flolpips import calculate_flolpips
from cal_lpips import calculate_lpips
from cal_psnr import calculate_psnr
from cal_ssim import calculate_ssim

# Progress bars are optional: fall back to a pass-through when tqdm is missing.
try:
    from tqdm import tqdm
except ImportError:
    # If tqdm is not available, provide a mock version of it
    def tqdm(x):
        """Return the iterable unchanged (no progress reporting)."""
        return x


class VideoDataset(Dataset):
    """Dataset of (real, generated) video pairs read from two directories.

    Files are paired by their position in the sorted listing of each directory;
    the dataset length is the number of real videos.
    """

    def __init__(
        self,
        real_video_dir,
        generated_video_dir,
        num_frames,
        sample_rate=1,
        crop_size=None,
        resolution=128,
    ) -> None:
        """Index both directories and remember the sampling settings.

        Args:
            real_video_dir: directory holding the reference videos.
            generated_video_dir: directory holding the generated videos.
            num_frames: number of frames sampled from each video.
            sample_rate: multiplied by ``num_frames`` to get the span of source
                frames the samples are drawn from.
            crop_size: forwarded to ``_preprocess`` as ``crop_size``.
            resolution: forwarded to ``_preprocess`` as ``short_size``.
        """
        super().__init__()
        self.real_video_files = self._combine_without_prefix(real_video_dir)
        self.generated_video_files = self._combine_without_prefix(generated_video_dir)
        self.num_frames = num_frames
        self.sample_rate = sample_rate
        self.crop_size = crop_size
        self.short_size = resolution

    def __len__(self):
        """Return the number of real videos."""
        return len(self.real_video_files)

    def __getitem__(self, index):
        """Load the ``index``-th pair as ``{"real": tensor, "generated": tensor}``.

        Raises:
            IndexError: if ``index`` is not smaller than the dataset length.
        """
        if index >= len(self):
            raise IndexError
        real_path = self.real_video_files[index]
        generated_path = self.generated_video_files[index]
        print(real_path, generated_path)
        real_video_tensor = self._load_video(real_path)
        generated_video_tensor = self._load_video(generated_path)
        return {"real": real_video_tensor, "generated": generated_video_tensor}

    def _load_video(self, video_path):
        """Decode evenly spaced frames from ``video_path`` and preprocess them."""
        sample_rate = self.sample_rate
        num_frames = self.num_frames
        reader = VideoReader(video_path, ctx=cpu(0))
        total_frames = len(reader)
        sample_frames_len = sample_rate * num_frames

        end = sample_frames_len
        if total_frames < sample_frames_len:
            # The clip is too short: use all of it and sample proportionally fewer frames.
            end = total_frames
            num_frames = int(total_frames / sample_frames_len * num_frames)
            print(
                f"sample_frames_len {sample_frames_len}, only can sample {num_frames * sample_rate}",
                video_path,
                total_frames,
            )

        frame_ids = np.linspace(0, end - 1, num_frames, dtype=int)
        frames = torch.from_numpy(reader.get_batch(frame_ids).asnumpy())
        frames = frames.permute(0, 3, 1, 2)  # (T, H, W, C) -> (T, C, H, W)
        return _preprocess(frames, short_size=self.short_size, crop_size=self.crop_size)

    def _combine_without_prefix(self, folder_path, prefix="."):
        """Return the sorted paths of regular files in ``folder_path``.

        Entries whose first character equals ``prefix`` (hidden files by
        default) are skipped. The folder is created if it does not exist.
        """
        os.makedirs(folder_path, exist_ok=True)
        candidates = (
            osp.join(folder_path, name) for name in os.listdir(folder_path) if name[0] != prefix
        )
        return sorted(path for path in candidates if osp.isfile(path))


def _preprocess(video_data, short_size=128, crop_size=None):
    """Scale pixel values to [0, 1], resize the short side and optionally center-crop.

    Args:
        video_data: frame tensor with 0-255 pixel values; the spatial axes are
            expected last.
        short_size: target length of the shorter spatial side.
        crop_size: side length (or size pair) of the center crop. ``None``
            skips cropping; previously ``None`` was handed to
            ``CenterCropVideo``, which cannot unpack it and fails when applied.

    Returns:
        The transformed video tensor.
    """
    steps = [
        Lambda(lambda x: x / 255.0),
        ShortSideScale(size=short_size),
    ]
    if crop_size is not None:
        steps.append(CenterCropVideo(crop_size=crop_size))

    video_outputs = Compose(steps)(video_data)
    # video_outputs = torch.unsqueeze(video_outputs, 0) # (bz,c,t,h,w)
    return video_outputs


def calculate_common_metric(args, dataloader, device):
    """Average the requested video metrics over every batch of ``dataloader``.

    Args:
        args: namespace with ``metric`` (a list of names, or one comma-separated
            string that is split and written back to ``args.metric``) and
            ``device`` (used only as a fallback when ``device`` is ``None``).
        dataloader: iterable of ``{"real": tensor, "generated": tensor}`` batches.
        device: device handed to the model-based metrics (``lpips``,
            ``flolpips``). The function used to ignore this argument and pass
            ``args.device``, which is ``None`` unless set explicitly.

    Returns:
        dict mapping each requested metric name to the mean of its per-timestamp
        values over all batches. Unsupported names map to NaN.
    """
    supported_metrics = ("ssim", "psnr", "flolpips", "lpips")
    metric_device = device if device is not None else args.device

    metric_dict = {}
    if isinstance(args.metric, str):
        args.metric = [m.strip() for m in args.metric.split(",")]
    print(args.metric)

    for metric in args.metric:
        if metric not in supported_metrics:
            # Report once and skip the (expensive) pass over the dataloader; the
            # entry stays NaN, matching the mean of an empty score list.
            print(f"metric {metric} is not in acceped list, not calculated")
            metric_dict[metric] = np.nan
            continue

        score_list = []
        for batch_data in tqdm(dataloader):  # {'real': real_video_tensor, 'generated':generated_video_tensor }
            real_videos = batch_data["real"]
            generated_videos = batch_data["generated"]
            assert real_videos.shape[2] == generated_videos.shape[2]
            if metric == "ssim":
                result = calculate_ssim(real_videos, generated_videos)
            elif metric == "psnr":
                result = calculate_psnr(real_videos, generated_videos)
            elif metric == "flolpips":
                result = calculate_flolpips(real_videos, generated_videos, metric_device)
            else:  # "lpips"
                result = calculate_lpips(real_videos, generated_videos, metric_device)
            score_list += list(result["value"].values())
        metric_dict[metric] = np.mean(score_list)

    return metric_dict


def main():
    """Command-line entry point: build the paired video dataset and print metrics.

    Parses the CLI flags, resolves the torch device and the number of
    data-loading workers, wraps ``VideoDataset`` (optionally truncated to
    ``--subset_size`` items) in a ``DataLoader`` and prints the result of
    ``calculate_common_metric``.
    """
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument("--batch_size", type=int, default=2, help="Batch size to use")
    parser.add_argument("--real_video_dir", type=str, help=("the path of real videos`"))
    parser.add_argument("--generated_video_dir", type=str, help=("the path of generated videos`"))
    parser.add_argument("--device", type=str, default=None, help="Device to use. Like cuda, cuda:0 or cpu")
    # default=None (not 8) so that the documented `min(8, num_cpus)` fallback
    # below is actually reachable when the flag is omitted.
    parser.add_argument(
        "--num_workers",
        type=int,
        default=None,
        help=("Number of processes to use for data loading. " "Defaults to `min(8, num_cpus)`"),
    )
    parser.add_argument("--sample_fps", type=int, default=30)
    parser.add_argument("--resolution", type=int, default=336)
    parser.add_argument("--crop_size", type=int, default=None)
    parser.add_argument("--num_frames", type=int, default=100)
    parser.add_argument("--sample_rate", type=int, default=1)
    parser.add_argument("--subset_size", type=int, default=None)
    # parser.add_argument("--metric", type=str, default="fvd",choices=['fvd','psnr','ssim','lpips', 'flolpips'])
    parser.add_argument("--metric", nargs="+", default=[])
    parser.add_argument("--fvd_method", type=str, default="styleganv", choices=["styleganv", "videogpt"])

    args = parser.parse_args()

    if args.device is None:
        device = torch.device("cuda" if (torch.cuda.is_available()) else "cpu")
    else:
        device = torch.device(args.device)

    if args.num_workers is None:
        try:
            num_cpus = len(os.sched_getaffinity(0))
        except AttributeError:
            # os.sched_getaffinity is not available under Windows, use
            # os.cpu_count instead (which may not return the *available* number
            # of CPUs).
            num_cpus = os.cpu_count()

        num_workers = min(num_cpus, 8) if num_cpus is not None else 0
    else:
        num_workers = args.num_workers

    dataset = VideoDataset(
        args.real_video_dir,
        args.generated_video_dir,
        num_frames=args.num_frames,
        sample_rate=args.sample_rate,
        crop_size=args.crop_size,
        resolution=args.resolution,
    )

    if args.subset_size:
        # Clamp so that a subset larger than the dataset does not index past
        # its end (assumes VideoDataset implements __len__ -- TODO confirm).
        subset_len = min(args.subset_size, len(dataset))
        dataset = Subset(dataset, indices=range(subset_len))

    # Pinned host memory only helps (and is only available) for CUDA transfers.
    dataloader = DataLoader(
        dataset, args.batch_size, num_workers=num_workers, pin_memory=device.type == "cuda"
    )

    metric_score = calculate_common_metric(args, dataloader, device)
    print("metric: ", args.metric, " ", metric_score)


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/eval/vae/flolpips/correlation/correlation.py
================================================
#!/usr/bin/env python

import re

import cupy
import torch

# CUDA source for the "rearrange" step. Each thread copies one element of
# `input` (sample = blockIdx.z, channel = blockIdx.y, flat spatial index =
# intIndex) into `output`, shifting the row and the column by 4 and making the
# channel the innermost index of `output`. Threads with intIndex >= n exit early.
# The SIZE_n(tensor) tokens are placeholders that cupy_kernel() replaces with
# the literal tensor sizes before the source is compiled.
kernel_Correlation_rearrange = """
	extern "C" __global__ void kernel_Correlation_rearrange(
		const int n,
		const float* input,
		float* output
	) {
	  int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x;

	  if (intIndex >= n) {
	    return;
	  }

	  int intSample = blockIdx.z;
	  int intChannel = blockIdx.y;

	  float fltValue = input[(((intSample * SIZE_1(input)) + intChannel) * SIZE_2(input) * SIZE_3(input)) + intIndex];

	  __syncthreads();

	  int intPaddedY = (intIndex / SIZE_3(input)) + 4;
	  int intPaddedX = (intIndex % SIZE_3(input)) + 4;
	  int intRearrange = ((SIZE_3(input) + 8) * intPaddedY) + intPaddedX;

	  output[(((intSample * SIZE_1(output) * SIZE_2(output)) + intRearrange) * SIZE_1(input)) + intChannel] = fltValue;
	}
"""

# CUDA source for the forward correlation. Each block handles one position
# (x = blockIdx.x, y = blockIdx.y) of one batch item (blockIdx.z); its threads
# walk the channels in steps of 32, first caching the rbot0 values for that
# position in dynamically sized shared memory. For every output channel the
# displacement is (top_channel % 9 - 4, top_channel / 9 - 4); each thread
# accumulates its partial dot product in sum[threadIdx.x], then thread 0 adds
# the 32 partial sums and stores the total divided by the channel count
# SIZE_3(rbot0) into `top`. The kernel parameter `n` is not referenced.
# NOTE(review): no barrier separates thread 0's reduction from the other
# threads resetting sum[] for the next top_channel; presumably this relies on
# the 32 threads of a block executing as a single warp -- confirm.
kernel_Correlation_updateOutput = """
	extern "C" __global__ void kernel_Correlation_updateOutput(
	  const int n,
	  const float* rbot0,
	  const float* rbot1,
	  float* top
	) {
	  extern __shared__ char patch_data_char[];

	  float *patch_data = (float *)patch_data_char;

	  // First (upper left) position of kernel upper-left corner in current center position of neighborhood in image 1
	  int x1 = blockIdx.x + 4;
	  int y1 = blockIdx.y + 4;
	  int item = blockIdx.z;
	  int ch_off = threadIdx.x;

	  // Load 3D patch into shared shared memory
	  for (int j = 0; j < 1; j++) { // HEIGHT
	    for (int i = 0; i < 1; i++) { // WIDTH
	      int ji_off = (j + i) * SIZE_3(rbot0);
	      for (int ch = ch_off; ch < SIZE_3(rbot0); ch += 32) { // CHANNELS
	        int idx1 = ((item * SIZE_1(rbot0) + y1+j) * SIZE_2(rbot0) + x1+i) * SIZE_3(rbot0) + ch;
	        int idxPatchData = ji_off + ch;
	        patch_data[idxPatchData] = rbot0[idx1];
	      }
	    }
	  }

	  __syncthreads();

	  __shared__ float sum[32];

	  // Compute correlation
	  for (int top_channel = 0; top_channel < SIZE_1(top); top_channel++) {
	    sum[ch_off] = 0;

	    int s2o = top_channel % 9 - 4;
	    int s2p = top_channel / 9 - 4;

	    for (int j = 0; j < 1; j++) { // HEIGHT
	      for (int i = 0; i < 1; i++) { // WIDTH
	        int ji_off = (j + i) * SIZE_3(rbot0);
	        for (int ch = ch_off; ch < SIZE_3(rbot0); ch += 32) { // CHANNELS
	          int x2 = x1 + s2o;
	          int y2 = y1 + s2p;

	          int idxPatchData = ji_off + ch;
	          int idx2 = ((item * SIZE_1(rbot0) + y2+j) * SIZE_2(rbot0) + x2+i) * SIZE_3(rbot0) + ch;

	          sum[ch_off] += patch_data[idxPatchData] * rbot1[idx2];
	        }
	      }
	    }

	    __syncthreads();

	    if (ch_off == 0) {
	      float total_sum = 0;
	      for (int idx = 0; idx < 32; idx++) {
	        total_sum += sum[idx];
	      }
	      const int sumelems = SIZE_3(rbot0);
	      const int index = ((top_channel*SIZE_2(top) + blockIdx.y)*SIZE_3(top))+blockIdx.x;
	      top[index + item*SIZE_1(top)*SIZE_2(top)*SIZE_3(top)] = total_sum / (float)sumelems;
	    }
	  }
	}
"""

# CUDA source for the gradient with respect to the first input, computed for
# one batch item (`intSample`) per launch. Threads cover the `n` elements with
# a grid-stride loop; inside the loop a local `int n` shadows the kernel
# parameter and holds the channel index. For each element the kernel sums
# gradOutput * rbot1 over the 9x9 displacements (p, o in [-4, 4]) and stores
# the sum divided by SIZE_1(gradFirst) into `gradFirst`. The `gradSecond`
# parameter is not referenced by this kernel.
# NOTE: the in-kernel comment on `ymin` reads "ceil (l - 4)" although the
# expression is built from `m`.
kernel_Correlation_updateGradFirst = """
	#define ROUND_OFF 50000

	extern "C" __global__ void kernel_Correlation_updateGradFirst(
	  const int n,
	  const int intSample,
	  const float* rbot0,
	  const float* rbot1,
	  const float* gradOutput,
	  float* gradFirst,
	  float* gradSecond
	) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) {
	  int n = intIndex % SIZE_1(gradFirst); // channels
	  int l = (intIndex / SIZE_1(gradFirst)) % SIZE_3(gradFirst) + 4; // w-pos
	  int m = (intIndex / SIZE_1(gradFirst) / SIZE_3(gradFirst)) % SIZE_2(gradFirst) + 4; // h-pos

	  // round_off is a trick to enable integer division with ceil, even for negative numbers
	  // We use a large offset, for the inner part not to become negative.
	  const int round_off = ROUND_OFF;
	  const int round_off_s1 = round_off;

	  // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior:
	  int xmin = (l - 4 + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4)
	  int ymin = (m - 4 + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4)

	  // Same here:
	  int xmax = (l - 4 + round_off_s1) - round_off; // floor (l - 4)
	  int ymax = (m - 4 + round_off_s1) - round_off; // floor (m - 4)

	  float sum = 0;
	  if (xmax>=0 && ymax>=0 && (xmin<=SIZE_3(gradOutput)-1) && (ymin<=SIZE_2(gradOutput)-1)) {
	    xmin = max(0,xmin);
	    xmax = min(SIZE_3(gradOutput)-1,xmax);

	    ymin = max(0,ymin);
	    ymax = min(SIZE_2(gradOutput)-1,ymax);

	    for (int p = -4; p <= 4; p++) {
	      for (int o = -4; o <= 4; o++) {
	        // Get rbot1 data:
	        int s2o = o;
	        int s2p = p;
	        int idxbot1 = ((intSample * SIZE_1(rbot0) + (m+s2p)) * SIZE_2(rbot0) + (l+s2o)) * SIZE_3(rbot0) + n;
	        float bot1tmp = rbot1[idxbot1]; // rbot1[l+s2o,m+s2p,n]

	        // Index offset for gradOutput in following loops:
	        int op = (p+4) * 9 + (o+4); // index[o,p]
	        int idxopoffset = (intSample * SIZE_1(gradOutput) + op);

	        for (int y = ymin; y <= ymax; y++) {
	          for (int x = xmin; x <= xmax; x++) {
	            int idxgradOutput = (idxopoffset * SIZE_2(gradOutput) + y) * SIZE_3(gradOutput) + x; // gradOutput[x,y,o,p]
	            sum += gradOutput[idxgradOutput] * bot1tmp;
	          }
	        }
	      }
	    }
	  }
	  const int sumelems = SIZE_1(gradFirst);
	  const int bot0index = ((n * SIZE_2(gradFirst)) + (m-4)) * SIZE_3(gradFirst) + (l-4);
	  gradFirst[bot0index + intSample*SIZE_1(gradFirst)*SIZE_2(gradFirst)*SIZE_3(gradFirst)] = sum / (float)sumelems;
	} }
"""

# CUDA source for the gradient with respect to the second input, computed for
# one batch item (`intSample`) per launch. Threads cover the `n` elements with
# a grid-stride loop; inside the loop a local `int n` shadows the kernel
# parameter and holds the channel index. For every displacement (p, o in
# [-4, 4]) the gradOutput window is clamped to the valid range, multiplied by
# the rbot0 value at the mirrored offset (m - s2p, l - s2o) and accumulated;
# the sum divided by SIZE_1(gradSecond) is stored into `gradSecond`. The
# `gradFirst` parameter is not referenced by this kernel.
# NOTE: the in-kernel comment on `ymin` reads "ceil (l - 4 - s2o)" although the
# expression is built from `m` and `s2p`, and the comment next to `bot0tmp`
# mentions rbot1 while the value is read from rbot0.
kernel_Correlation_updateGradSecond = """
	#define ROUND_OFF 50000

	extern "C" __global__ void kernel_Correlation_updateGradSecond(
	  const int n,
	  const int intSample,
	  const float* rbot0,
	  const float* rbot1,
	  const float* gradOutput,
	  float* gradFirst,
	  float* gradSecond
	) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) {
	  int n = intIndex % SIZE_1(gradSecond); // channels
	  int l = (intIndex / SIZE_1(gradSecond)) % SIZE_3(gradSecond) + 4; // w-pos
	  int m = (intIndex / SIZE_1(gradSecond) / SIZE_3(gradSecond)) % SIZE_2(gradSecond) + 4; // h-pos

	  // round_off is a trick to enable integer division with ceil, even for negative numbers
	  // We use a large offset, for the inner part not to become negative.
	  const int round_off = ROUND_OFF;
	  const int round_off_s1 = round_off;

	  float sum = 0;
	  for (int p = -4; p <= 4; p++) {
	    for (int o = -4; o <= 4; o++) {
	      int s2o = o;
	      int s2p = p;

	      //Get X,Y ranges and clamp
	      // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior:
	      int xmin = (l - 4 - s2o + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4 - s2o)
	      int ymin = (m - 4 - s2p + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4 - s2o)

	      // Same here:
	      int xmax = (l - 4 - s2o + round_off_s1) - round_off; // floor (l - 4 - s2o)
	      int ymax = (m - 4 - s2p + round_off_s1) - round_off; // floor (m - 4 - s2p)

	      if (xmax>=0 && ymax>=0 && (xmin<=SIZE_3(gradOutput)-1) && (ymin<=SIZE_2(gradOutput)-1)) {
	        xmin = max(0,xmin);
	        xmax = min(SIZE_3(gradOutput)-1,xmax);

	        ymin = max(0,ymin);
	        ymax = min(SIZE_2(gradOutput)-1,ymax);

	        // Get rbot0 data:
	        int idxbot0 = ((intSample * SIZE_1(rbot0) + (m-s2p)) * SIZE_2(rbot0) + (l-s2o)) * SIZE_3(rbot0) + n;
	        float bot0tmp = rbot0[idxbot0]; // rbot1[l+s2o,m+s2p,n]

	        // Index offset for gradOutput in following loops:
	        int op = (p+4) * 9 + (o+4); // index[o,p]
	        int idxopoffset = (intSample * SIZE_1(gradOutput) + op);

	        for (int y = ymin; y <= ymax; y++) {
	          for (int x = xmin; x <= xmax; x++) {
	            int idxgradOutput = (idxopoffset * SIZE_2(gradOutput) + y) * SIZE_3(gradOutput) + x; // gradOutput[x,y,o,p]
	            sum += gradOutput[idxgradOutput] * bot0tmp;
	          }
	        }
	      }
	    }
	  }
	  const int sumelems = SIZE_1(gradSecond);
	  const int bot1index = ((n * SIZE_2(gradSecond)) + (m-4)) * SIZE_3(gradSecond) + (l-4);
	  gradSecond[bot1index + intSample*SIZE_1(gradSecond)*SIZE_2(gradSecond)*SIZE_3(gradSecond)] = sum / (float)sumelems;
	} }
"""


def cupy_kernel(strFunction, objVariables):
    """Specialise a kernel source template for concrete tensors.

    The template is the module-level string named ``strFunction``. Two kinds
    of placeholders are expanded:

    * ``SIZE_k(name)`` becomes the literal ``objVariables[name].size()[k]``.
    * ``VALUE_k(name, i0, ..., i{k-1})`` becomes an indexing expression
      ``name[((i0)*stride0)+...]`` built from ``objVariables[name].stride()``;
      ``{`` / ``}`` inside an index are rewritten to ``(`` / ``)``.

    Args:
        strFunction: name of the module-level kernel source string.
        objVariables: mapping from tensor names used in the template to
            objects exposing ``size()`` and ``stride()``.

    Returns:
        The kernel source with every placeholder replaced.
    """
    strKernel = globals()[strFunction]

    # Raw strings: the patterns contain "\(" / "\)", which are invalid escape
    # sequences in ordinary string literals and trigger warnings on import.
    reSize = re.compile(r"(SIZE_)([0-4])(\()([^\)]*)(\))")
    reValue = re.compile(r"(VALUE_)([0-4])(\()([^\)]+)(\))")

    while True:
        objMatch = reSize.search(strKernel)

        if objMatch is None:
            break
        # end

        intArg = int(objMatch.group(2))

        strTensor = objMatch.group(4)
        intSizes = objVariables[strTensor].size()

        # str.replace rewrites every textual occurrence of this placeholder.
        strKernel = strKernel.replace(objMatch.group(), str(intSizes[intArg]))
    # end

    while True:
        objMatch = reValue.search(strKernel)

        if objMatch is None:
            break
        # end

        intArgs = int(objMatch.group(2))
        strArgs = objMatch.group(4).split(",")

        strTensor = strArgs[0]
        intStrides = objVariables[strTensor].stride()
        strIndex = [
            "(("
            + strArgs[intArg + 1].replace("{", "(").replace("}", ")").strip()
            + ")*"
            + str(intStrides[intArg])
            + ")"
            for intArg in range(intArgs)
        ]

        strKernel = strKernel.replace(objMatch.group(0), strTensor + "[" + "+".join(strIndex) + "]")
    # end

    return strKernel


# end


@cupy.memoize(for_each_device=True)
def cupy_launch(strFunction, strKernel):
    """Wrap kernel source ``strKernel`` in a ``cupy.RawKernel`` named ``strFunction``.

    The function is decorated with ``cupy.memoize(for_each_device=True)``.
    """
    objKernel = cupy.RawKernel(strKernel, strFunction)
    return objKernel


# end


class _FunctionCorrelation(torch.autograd.Function):
    """Autograd function computing an 81-channel correlation of two CUDA tensors.

    Both passes launch the CUDA kernels defined as source strings in this
    module; non-CUDA tensors are not supported.
    """

    @staticmethod
    def forward(self, first, second):
        """Return the correlation volume of ``first`` and ``second``.

        The result has shape ``[first.shape[0], 81, first.shape[2],
        first.shape[3]]``. Here ``self`` is the autograd context object.
        """
        # Scratch buffers: the spatial dims are enlarged by 8 and
        # first.shape[1] becomes the last dimension.
        rbot0 = first.new_zeros([first.shape[0], first.shape[2] + 8, first.shape[3] + 8, first.shape[1]])
        rbot1 = first.new_zeros([first.shape[0], first.shape[2] + 8, first.shape[3] + 8, first.shape[1]])

        # rbot0 / rbot1 are still zero here; the rearrange kernels below fill
        # these same tensors in place before backward reads them.
        self.save_for_backward(first, second, rbot0, rbot1)

        first = first.contiguous()
        assert first.is_cuda == True
        second = second.contiguous()
        assert second.is_cuda == True

        output = first.new_zeros([first.shape[0], 81, first.shape[2], first.shape[3]])

        if first.is_cuda == True:
            # Rearrange `first` into rbot0: 16 threads per block, grid y/z
            # enumerate first.shape[1] and first.shape[0].
            n = first.shape[2] * first.shape[3]
            cupy_launch(
                "kernel_Correlation_rearrange",
                cupy_kernel("kernel_Correlation_rearrange", {"input": first, "output": rbot0}),
            )(
                grid=tuple([int((n + 16 - 1) / 16), first.shape[1], first.shape[0]]),
                block=tuple([16, 1, 1]),
                args=[n, first.data_ptr(), rbot0.data_ptr()],
            )

            # Same rearrangement for `second` into rbot1.
            n = second.shape[2] * second.shape[3]
            cupy_launch(
                "kernel_Correlation_rearrange",
                cupy_kernel("kernel_Correlation_rearrange", {"input": second, "output": rbot1}),
            )(
                grid=tuple([int((n + 16 - 1) / 16), second.shape[1], second.shape[0]]),
                block=tuple([16, 1, 1]),
                args=[n, second.data_ptr(), rbot1.data_ptr()],
            )

            # Correlation proper: one 32-thread block per output position and
            # batch item; shared memory is sized at 4 bytes per channel of
            # `first` (the kernel stores one float per channel there).
            n = output.shape[1] * output.shape[2] * output.shape[3]
            cupy_launch(
                "kernel_Correlation_updateOutput",
                cupy_kernel("kernel_Correlation_updateOutput", {"rbot0": rbot0, "rbot1": rbot1, "top": output}),
            )(
                grid=tuple([output.shape[3], output.shape[2], output.shape[0]]),
                block=tuple([32, 1, 1]),
                shared_mem=first.shape[1] * 4,
                args=[n, rbot0.data_ptr(), rbot1.data_ptr(), output.data_ptr()],
            )

        elif first.is_cuda == False:
            # Only reachable when the asserts above are disabled.
            raise NotImplementedError()

        # end

        return output

    # end

    @staticmethod
    def backward(self, gradOutput):
        """Return ``(gradFirst, gradSecond)`` for the two forward inputs.

        An entry is ``None`` when ``needs_input_grad`` reports that the
        corresponding input does not require a gradient.
        """
        first, second, rbot0, rbot1 = self.saved_tensors

        gradOutput = gradOutput.contiguous()
        assert gradOutput.is_cuda == True

        gradFirst = (
            first.new_zeros([first.shape[0], first.shape[1], first.shape[2], first.shape[3]])
            if self.needs_input_grad[0] == True
            else None
        )
        # Allocated from `first` as well, so it takes first's shape.
        gradSecond = (
            first.new_zeros([first.shape[0], first.shape[1], first.shape[2], first.shape[3]])
            if self.needs_input_grad[1] == True
            else None
        )

        if first.is_cuda == True:
            if gradFirst is not None:
                # One launch per batch item; the kernel receives the item
                # index as `intSample`.
                for intSample in range(first.shape[0]):
                    n = first.shape[1] * first.shape[2] * first.shape[3]
                    cupy_launch(
                        "kernel_Correlation_updateGradFirst",
                        cupy_kernel(
                            "kernel_Correlation_updateGradFirst",
                            {
                                "rbot0": rbot0,
                                "rbot1": rbot1,
                                "gradOutput": gradOutput,
                                "gradFirst": gradFirst,
                                "gradSecond": None,
                            },
                        ),
                    )(
                        grid=tuple([int((n + 512 - 1) / 512), 1, 1]),
                        block=tuple([512, 1, 1]),
                        # gradSecond is not used by this kernel, hence None.
                        args=[
                            n,
                            intSample,
                            rbot0.data_ptr(),
                            rbot1.data_ptr(),
                            gradOutput.data_ptr(),
                            gradFirst.data_ptr(),
                            None,
                        ],
                    )
                # end
            # end

            if gradSecond is not None:
                # Mirror of the loop above for the second input.
                for intSample in range(first.shape[0]):
                    n = first.shape[1] * first.shape[2] * first.shape[3]
                    cupy_launch(
                        "kernel_Correlation_updateGradSecond",
                        cupy_kernel(
                            "kernel_Correlation_updateGradSecond",
                            {
                                "rbot0": rbot0,
                                "rbot1": rbot1,
                                "gradOutput": gradOutput,
                                "gradFirst": None,
                                "gradSecond": gradSecond,
                            },
                        ),
                    )(
                        grid=tuple([int((n + 512 - 1) / 512), 1, 1]),
                        block=tuple([512, 1, 1]),
                        # gradFirst is not used by this kernel, hence None.
                        args=[
                            n,
                            intSample,
                            rbot0.data_ptr(),
                            rbot1.data_ptr(),
                            gradOutput.data_ptr(),
                            None,
                            gradSecond.data_ptr(),
                        ],
                    )
                # end
            # end

        elif first.is_cuda == False:
            raise NotImplementedError()

        # end

        return gradFirst, gradSecond

    # end


# end


def FunctionCorrelation(tenFirst, tenSecond):
    """Functional entry point: apply ``_FunctionCorrelation`` to the two tensors."""
    tenOutput = _FunctionCorrelation.apply(tenFirst, tenSecond)
    return tenOutput


# end


class ModuleCorrelation(torch.nn.Module):
    """Parameter-free ``nn.Module`` wrapper around ``_FunctionCorrelation``."""

    def __init__(self):
        super().__init__()

    # end

    def forward(self, tenFirst, tenSecond):
        """Return ``_FunctionCorrelation.apply(tenFirst, tenSecond)``."""
        tenOutput = _FunctionCorrelation.apply(tenFirst, tenSecond)
        return tenOutput

    # end


# end


================================================
FILE: Open-Sora/eval/vae/flolpips/flolpips.py
================================================
from __future__ import absolute_import

import hashlib
import os

import requests
import torch
import torch.nn
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from tqdm import tqdm

from .pretrained_networks import alexnet, squeezenet, vgg16
from .pwcnet import Network as PWCNet
from .utils import *

# Lookup tables used by get_ckpt_path(), all keyed by backbone name.
# URL_MAP: where the checkpoint is downloaded from.
URL_MAP = {"alex": "https://raw.githubusercontent.com/danier97/flolpips/main/weights/v0.1/alex.pth"}

# CKPT_MAP: file name of the checkpoint inside the local weights directory.
CKPT_MAP = {"alex": "alex.pth"}

# MD5_MAP: expected MD5 hex digest of the checkpoint file.
MD5_MAP = {"alex": "9642209e2b57a85d20f86d812320f9e6"}


def spatial_average(in_tens, keepdim=True):
    """Return the mean of ``in_tens`` over dimensions 2 and 3."""
    return torch.mean(in_tens, dim=[2, 3], keepdim=keepdim)


def mw_spatial_average(in_tens, flow, keepdim=True):
    """Average ``in_tens`` over dimensions 2 and 3, weighted by flow magnitude.

    ``flow`` is bilinearly resized to the spatial size of ``in_tens``; the
    magnitude is computed from its channels 0 and 1 and normalised per sample
    so that the weights of each sample sum to one.

    When the total magnitude of a sample is zero the normalisation would be
    0/0 and the result NaN; such samples fall back to uniform weights, i.e.
    the plain spatial mean.

    Args:
        in_tens: 4-D tensor to be pooled.
        flow: tensor whose channels 0 and 1 hold the two flow components.
        keepdim: whether the pooled dimensions are kept with size one.

    Returns:
        ``in_tens`` summed over dimensions 2 and 3 with the weights applied.
    """
    _, _, h, w = in_tens.shape
    flow = F.interpolate(flow, (h, w), align_corners=False, mode="bilinear")
    flow_mag = torch.sqrt(flow[:, 0:1] ** 2 + flow[:, 1:2] ** 2)

    total = torch.sum(flow_mag, dim=[1, 2, 3], keepdim=True)
    has_motion = total > 0
    # Divide by 1 where the total is zero so no NaN is ever produced; those
    # entries are replaced by the uniform weights right below.
    safe_total = torch.where(has_motion, total, torch.ones_like(total))
    uniform = torch.full_like(flow_mag, 1.0 / (h * w))
    weight = torch.where(has_motion, flow_mag / safe_total, uniform)

    return torch.sum(in_tens * weight, dim=[2, 3], keepdim=keepdim)


def mtw_spatial_average(in_tens, flow, texture, keepdim=True):
    """Pool ``in_tens`` over dimensions 2 and 3 with flow/texture weights.

    ``flow`` and ``texture`` are bilinearly resized to the spatial size of
    ``in_tens``. The flow magnitude and the texture map are each min-max
    rescaled using the minimum and maximum of the whole tensor, offset by
    1e-6, and their ratio, normalised by its sum over the whole tensor, is
    used as the weight.
    """
    _, _, height, width = in_tens.shape
    target = (height, width)

    def _rescale(values):
        lowest = values.min()
        return (values - lowest) / (values.max() - lowest) + 1e-6

    resized_flow = F.interpolate(flow, target, align_corners=False, mode="bilinear")
    resized_texture = F.interpolate(texture, target, align_corners=False, mode="bilinear")

    magnitude = torch.sqrt(resized_flow[:, 0:1] ** 2 + resized_flow[:, 1:2] ** 2)
    ratio = _rescale(magnitude) / _rescale(resized_texture)
    weight = ratio / torch.sum(ratio)
    return torch.sum(in_tens * weight, dim=[2, 3], keepdim=keepdim)


def m2w_spatial_average(in_tens, flow, keepdim=True):
    """Pool ``in_tens`` over dimensions 2 and 3, weighted by squared flow magnitude.

    ``flow`` is bilinearly resized to the spatial size of ``in_tens``; the
    squared magnitude of its channels 0 and 1 is normalised by its sum over
    the whole tensor and used as the weight.
    """
    _, _, height, width = in_tens.shape
    resized = F.interpolate(flow, (height, width), align_corners=False, mode="bilinear")
    horizontal, vertical = resized[:, 0:1], resized[:, 1:2]
    squared_mag = horizontal ** 2 + vertical ** 2  # B,1,H,W
    weight = squared_mag / torch.sum(squared_mag)
    weighted = in_tens * weight
    return torch.sum(weighted, dim=[2, 3], keepdim=keepdim)


def upsample(in_tens, out_HW=(64, 64)):
    """Bilinearly resize ``in_tens`` to the spatial size ``out_HW``.

    Args:
        in_tens: tensor accepted by ``nn.Upsample`` in bilinear mode.
        out_HW: target ``(height, width)``.

    Returns:
        The resized tensor (``align_corners=False``).
    """
    # The target size is given explicitly, so the input's own height and
    # width are not needed.
    return nn.Upsample(size=out_HW, mode="bilinear", align_corners=False)(in_tens)


def md5_hash(path):
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is hashed in fixed-size chunks so that large checkpoints are
    never held in memory as a whole.
    """
    digest = hashlib.md5()
    with open(path, "rb") as f:
        # iter(callable, sentinel) stops at the empty read that marks EOF.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()


def download(url, local_path, chunk_size=1024):
    """Stream ``url`` into ``local_path`` while showing a progress bar.

    Args:
        url: address to fetch with ``requests.get(..., stream=True)``.
        local_path: destination file; its parent directory is created when
            the path has one.
        chunk_size: number of bytes requested per ``iter_content`` chunk.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is never written out as the downloaded file.
    """
    target_dir = os.path.dirname(local_path)
    if target_dir:
        # os.makedirs("") raises, so only create a directory when there is one.
        os.makedirs(target_dir, exist_ok=True)
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        total_size = int(r.headers.get("content-length", 0))
        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
            with open(local_path, "wb") as f:
                for data in r.iter_content(chunk_size=chunk_size):
                    if data:
                        f.write(data)
                        # The last chunk is usually shorter than chunk_size.
                        pbar.update(len(data))


def get_ckpt_path(name, root, check=False):
    """Return the local path of checkpoint ``name``, downloading it if needed.

    Args:
        name: key into ``URL_MAP`` / ``CKPT_MAP`` / ``MD5_MAP``.
        root: directory in which the checkpoint file is stored.
        check: when True, an existing file whose MD5 digest does not match
            ``MD5_MAP[name]`` is downloaded again.

    Returns:
        Path of the checkpoint file below ``root``.

    Raises:
        AssertionError: if ``name`` is unknown, or if a freshly downloaded
            file does not have the expected MD5 digest. In the latter case
            the bad file is removed so that a later call (which by default
            does not re-check existing files) cannot pick it up.
    """
    if name not in URL_MAP:
        raise AssertionError("unknown checkpoint name: {}".format(name))
    path = os.path.join(root, CKPT_MAP[name])
    if not os.path.exists(path) or (check and not md5_hash(path) == MD5_MAP[name]):
        print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
        download(URL_MAP[name], path)
        md5 = md5_hash(path)
        if md5 != MD5_MAP[name]:
            os.remove(path)
            raise AssertionError(
                "MD5 mismatch for {}: expected {}, got {}".format(path, MD5_MAP[name], md5)
            )
    return path


# Learned perceptual metric
# Learned perceptual metric
class LPIPS(nn.Module):
    """Perceptual distance between two image batches from backbone features.

    Features of both inputs are taken from every backbone stage, normalised
    with ``normalize_tensor``, squared-differenced, optionally passed through
    per-stage ``NetLinLayer`` 1x1 convolutions and then either spatially
    averaged or upsampled to the input size.
    """

    def __init__(
        self,
        pretrained=True,
        net="alex",
        version="0.1",
        lpips=True,
        spatial=False,
        pnet_rand=False,
        pnet_tune=False,
        use_dropout=True,
        model_path=None,
        eval_mode=True,
        verbose=False,
    ):
        """Build the backbone and, when ``lpips`` is True, the linear layers.

        Args:
            pretrained: load the linear-layer weights via
                ``load_from_pretrained`` (only when ``lpips`` is True).
            net: backbone name: ``"alex"``, ``"vgg"``/``"vgg16"`` or
                ``"squeeze"``.
            version: ``"0.1"`` enables the input ``ScalingLayer`` in
                ``forward``; it also selects the weights directory.
            lpips: True adds the linear calibration layers on top of the
                backbone; False averages all layers directly.
            spatial: return per-pixel maps instead of spatial averages.
            pnet_rand: use a randomly initialised backbone.
            pnet_tune: let the backbone parameters require gradients.
            use_dropout: put a dropout layer in front of each linear layer.
            model_path: only echoed in the verbose message; it is not used
                to locate the weights.
            eval_mode: switch the module to eval mode after construction.
            verbose: print setup information.

        Raises:
            ValueError: if ``net`` is not one of the supported backbones.
        """
        # lpips - [True] means with linear calibration on top of base network
        # pretrained - [True] means load linear weights

        super(LPIPS, self).__init__()

        # Fail early with a clear message; an unknown name would otherwise
        # surface as an AttributeError on the missing ``self.chns``.
        if net not in ("vgg", "vgg16", "alex", "squeeze"):
            raise ValueError("unsupported net: %r (expected 'alex', 'vgg', 'vgg16' or 'squeeze')" % (net,))

        if verbose:
            print(
                "Setting up [%s] perceptual loss: trunk [%s], v[%s], spatial [%s]"
                % ("LPIPS" if lpips else "baseline", net, version, "on" if spatial else "off")
            )

        self.pnet_type = net
        self.pnet_tune = pnet_tune
        self.pnet_rand = pnet_rand
        self.spatial = spatial
        self.lpips = lpips  # false means baseline of just averaging all layers
        self.version = version
        self.scaling_layer = ScalingLayer()

        if self.pnet_type in ["vgg", "vgg16"]:
            net_type = vgg16
            self.chns = [64, 128, 256, 512, 512]
        elif self.pnet_type == "alex":
            net_type = alexnet
            self.chns = [64, 192, 384, 256, 256]
        else:  # "squeeze": 7 layers
            net_type = squeezenet
            self.chns = [64, 128, 256, 384, 384, 512, 512]
        self.L = len(self.chns)

        self.net = net_type(pretrained=not self.pnet_rand, requires_grad=self.pnet_tune)

        if lpips:
            # One linear layer per backbone stage. Each is registered both as
            # ``lin<k>`` and inside ``self.lins`` so that state-dict keys of
            # either form keep matching.
            lin_layers = []
            for idx, chn in enumerate(self.chns):
                layer = NetLinLayer(chn, use_dropout=use_dropout)
                setattr(self, "lin%d" % idx, layer)
                lin_layers.append(layer)
            self.lins = nn.ModuleList(lin_layers)

            if pretrained:
                self.load_from_pretrained(version, net)
                if verbose:
                    print("Loaded model from: %s" % model_path)

        if eval_mode:
            self.eval()

    def load_from_pretrained(self, version, net):
        """Fetch the checkpoint for ``net`` and load it non-strictly."""
        ckpt = get_ckpt_path(net, "pretrained_models/flolpips/weights/v%s" % (version))
        self.load_state_dict(torch.load(ckpt, map_location="cpu"), strict=False)

    def forward(self, in0, in1, retPerLayer=False, normalize=False):
        """Return the distance between ``in0`` and ``in1``.

        Args:
            in0, in1: image batches of identical shape.
            retPerLayer: accepted for interface compatibility; currently
                ignored.
            normalize: set when the inputs are in [0, 1] so they are mapped
                to [-1, +1] first.

        Returns:
            The per-stage results concatenated along dimension 1 and summed
            over dimensions 1, 2 and 3.
        """
        if normalize:  # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1]
            in0 = 2 * in0 - 1
            in1 = 2 * in1 - 1

        # v0.0 - original release had a bug, where input was not scaled
        in0_input, in1_input = (
            (self.scaling_layer(in0), self.scaling_layer(in1)) if self.version == "0.1" else (in0, in1)
        )
        outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input)
        feats0, feats1, diffs = {}, {}, {}

        for kk in range(self.L):
            feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
            diffs[kk] = (feats0[kk] - feats1[kk]) ** 2

        if self.lpips:
            if self.spatial:
                res = [upsample(self.lins[kk](diffs[kk]), out_HW=in0.shape[2:]) for kk in range(self.L)]
            else:
                res = [spatial_average(self.lins[kk](diffs[kk]), keepdim=True) for kk in range(self.L)]
        else:
            if self.spatial:
                res = [upsample(diffs[kk].sum(dim=1, keepdim=True), out_HW=in0.shape[2:]) for kk in range(self.L)]
            else:
                res = [spatial_average(diffs[kk].sum(dim=1, keepdim=True), keepdim=True) for kk in range(self.L)]

        return torch.sum(torch.cat(res, 1), dim=(1, 2, 3), keepdim=False)


class ScalingLayer(nn.Module):
    """Subtract a fixed per-channel shift and divide by a fixed scale.

    Both constants are three-element vectors registered as buffers with
    singleton dimensions around the channel axis.
    """

    def __init__(self):
        super().__init__()
        shift = torch.Tensor([-0.030, -0.088, -0.188])
        scale = torch.Tensor([0.458, 0.448, 0.450])
        self.register_buffer("shift", shift[None, :, None, None])
        self.register_buffer("scale", scale[None, :, None, None])

    def forward(self, inp):
        centered = inp - self.shift
        return centered / self.scale


class NetLinLayer(nn.Module):
    """A bias-free 1x1 convolution, optionally preceded by dropout."""

    def __init__(self, chn_in, chn_out=1, use_dropout=False):
        super().__init__()

        # Dropout (when requested) comes first, so the convolution is
        # ``model.1`` with dropout and ``model.0`` without.
        stack = [nn.Dropout()] if use_dropout else []
        stack.append(nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False))
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        return self.model(x)


class Dist2LogitLayer(nn.Module):
    """Map a pair of distances to one value through three 1x1 convolutions.

    With ``use_sigmoid`` the output is squashed into [0, 1].
    """

    def __init__(self, chn_mid=32, use_sigmoid=True):
        super().__init__()

        stack = [
            nn.Conv2d(5, chn_mid, 1, stride=1, padding=0, bias=True),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(chn_mid, chn_mid, 1, stride=1, padding=0, bias=True),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(chn_mid, 1, 1, stride=1, padding=0, bias=True),
        ]
        if use_sigmoid:
            stack.append(nn.Sigmoid())
        self.model = nn.Sequential(*stack)

    def forward(self, d0, d1, eps=0.1):
        # Five input channels: both distances, their difference and the two
        # eps-stabilised ratios.
        stacked = torch.cat((d0, d1, d0 - d1, d0 / (d1 + eps), d1 / (d0 + eps)), dim=1)
        return self.model.forward(stacked)


class BCERankingLoss(nn.Module):
    """Binary cross-entropy between ``Dist2LogitLayer`` output and a judgement."""

    def __init__(self, chn_mid=32):
        super().__init__()
        self.net = Dist2LogitLayer(chn_mid=chn_mid)
        # self.parameters = list(self.net.parameters())
        self.loss = torch.nn.BCELoss()

    def forward(self, d0, d1, judge):
        # ``judge`` is rescaled with (judge + 1) / 2 before being used as the
        # BCE target.
        target = (judge + 1.0) / 2.0
        self.logit = self.net.forward(d0, d1)
        return self.loss(self.logit, target)


# L2, DSSIM metrics
class FakeNet(nn.Module):
    """Base class for the non-learned metrics; only stores two settings."""

    def __init__(self, use_gpu=True, colorspace="Lab"):
        super().__init__()
        self.colorspace = colorspace
        self.use_gpu = use_gpu


class L2(FakeNet):
    """Mean squared error between two single-image batches."""

    def forward(self, in0, in1, retPerLayer=None):
        assert in0.size()[0] == 1  # currently only supports batchSize 1

        if self.colorspace == "RGB":
            (N, C, X, Y) = in0.size()
            # Average in three steps: over dim 1, then dim 2, then dim 3.
            sq_err = (in0 - in1) ** 2
            per_pixel = torch.mean(sq_err, dim=1).view(N, 1, X, Y)
            per_column = torch.mean(per_pixel, dim=2).view(N, 1, 1, Y)
            return torch.mean(per_column, dim=3).view(N)
        elif self.colorspace == "Lab":
            lab0 = tensor2np(tensor2tensorlab(in0.data, to_norm=False))
            lab1 = tensor2np(tensor2tensorlab(in1.data, to_norm=False))
            value = l2(lab0, lab1, range=100.0).astype("float")
            ret_var = Variable(torch.Tensor((value,)))
            return ret_var.cuda() if self.use_gpu else ret_var


class DSSIM(FakeNet):
    """DSSIM between two single-image batches, via the ``dssim`` helper."""

    def forward(self, in0, in1, retPerLayer=None):
        assert in0.size()[0] == 1  # currently only supports batchSize 1

        if self.colorspace == "RGB":
            img0 = 1.0 * tensor2im(in0.data)
            img1 = 1.0 * tensor2im(in1.data)
            value = dssim(img0, img1, range=255.0).astype("float")
        elif self.colorspace == "Lab":
            lab0 = tensor2np(tensor2tensorlab(in0.data, to_norm=False))
            lab1 = tensor2np(tensor2tensorlab(in1.data, to_norm=False))
            value = dssim(lab0, lab1, range=100.0).astype("float")
        ret_var = Variable(torch.Tensor((value,)))
        return ret_var.cuda() if self.use_gpu else ret_var


def print_network(net):
    """Print ``net`` followed by its total number of parameter elements."""
    num_params = sum(param.numel() for param in net.parameters())
    print("Network", net)
    print("Total number of parameters: %d" % num_params)


class FloLPIPS(LPIPS):
    """``LPIPS`` variant whose spatial pooling is weighted by a flow field."""

    def __init__(
        self,
        pretrained=True,
        net="alex",
        version="0.1",
        lpips=True,
        spatial=False,
        pnet_rand=False,
        pnet_tune=False,
        use_dropout=True,
        model_path=None,
        eval_mode=True,
        verbose=False,
    ):
        # All options are forwarded unchanged to LPIPS.
        super().__init__(
            pretrained, net, version, lpips, spatial, pnet_rand, pnet_tune, use_dropout, model_path, eval_mode, verbose
        )

    def forward(self, in0, in1, flow, retPerLayer=False, normalize=False):
        """Return the flow-weighted distance between ``in0`` and ``in1``.

        Each stage's linear-layer response is pooled with
        ``mw_spatial_average`` using ``flow``; ``retPerLayer`` is not used.
        """
        if normalize:  # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1]
            in0 = 2 * in0 - 1
            in1 = 2 * in1 - 1

        if self.version == "0.1":
            in0_input, in1_input = self.scaling_layer(in0), self.scaling_layer(in1)
        else:
            in0_input, in1_input = in0, in1

        outs0 = self.net.forward(in0_input)
        outs1 = self.net.forward(in1_input)

        res = []
        for kk in range(self.L):
            feat0 = normalize_tensor(outs0[kk])
            feat1 = normalize_tensor(outs1[kk])
            sq_diff = (feat0 - feat1) ** 2
            res.append(mw_spatial_average(self.lins[kk](sq_diff), flow, keepdim=True))

        return torch.sum(torch.cat(res, 1), dim=(1, 2, 3), keepdims=False)


class Flolpips(nn.Module):
    """FloLPIPS score for an interpolated frame against its ground truth.

    Combines a ``FloLPIPS`` distance (AlexNet backbone, version "0.1") with a
    ``PWCNet`` flow estimator.
    """

    def __init__(self):
        super().__init__()
        self.loss_fn = FloLPIPS(net="alex", version="0.1")
        self.flownet = PWCNet()

    def _score_against(self, anchor, frame_dis, frame_ref):
        """Return the FloLPIPS distance using flows estimated towards ``anchor``."""
        flow_diff = self.flownet(frame_ref, anchor) - self.flownet(frame_dis, anchor)
        return self.loss_fn.forward(frame_ref, frame_dis, flow_diff, normalize=True)

    @torch.no_grad()
    def forward(self, I0, I1, frame_dis, frame_ref):
        """
        args:
            I0: first frame of the triplet, shape: [B, C, H, W]
            I1: third frame of the triplet, shape: [B, C, H, W]
            frame_dis: prediction of the intermediate frame, shape: [B, C, H, W]
            frame_ref: ground-truth of the intermediate frame, shape: [B, C, H, W]
        returns:
            mean of the two distances obtained with I0 and with I1 as the flow anchor
        """
        assert (
            I0.size() == I1.size() == frame_dis.size() == frame_ref.size()
        ), "the 4 input tensors should have same size"

        flolpips_wrt_I0 = self._score_against(I0, frame_dis, frame_ref)
        flolpips_wrt_I1 = self._score_against(I1, frame_dis, frame_ref)
        return (flolpips_wrt_I0 + flolpips_wrt_I1) / 2


================================================
FILE: Open-Sora/eval/vae/flolpips/pretrained_networks.py
================================================
from collections import namedtuple

import torch
from torchvision import models as tv


class squeezenet(torch.nn.Module):
    """SqueezeNet 1.1 feature extractor split into seven consecutive stages.

    Args:
        requires_grad: when False, every parameter is frozen.
        pretrained: forwarded to ``tv.squeezenet1_1``.
    """

    def __init__(self, requires_grad=False, pretrained=True):
        super().__init__()
        features = tv.squeezenet1_1(pretrained=pretrained).features

        def stage(start, stop):
            # Child modules keep their original index as name, e.g. "slice2.3".
            block = torch.nn.Sequential()
            for idx in range(start, stop):
                block.add_module(str(idx), features[idx])
            return block

        self.slice1 = stage(0, 2)
        self.slice2 = stage(2, 5)
        self.slice3 = stage(5, 8)
        self.slice4 = stage(8, 10)
        self.slice5 = stage(10, 11)
        self.slice6 = stage(11, 12)
        self.slice7 = stage(12, 13)
        self.N_slices = 7

        if requires_grad:
            return
        for weight in self.parameters():
            weight.requires_grad = False

    def forward(self, X):
        """Run the seven stages in order and return every intermediate output.

        Returns:
            A ``SqueezeOutputs`` namedtuple with fields ``relu1`` .. ``relu7``.
        """
        stages = (self.slice1, self.slice2, self.slice3, self.slice4, self.slice5, self.slice6, self.slice7)
        activations = []
        h = X
        for layer in stages:
            h = layer(h)
            activations.append(h)

        squeeze_outputs = namedtuple(
            "SqueezeOutputs", ["relu1", "relu2", "relu3", "relu4", "relu5", "relu6", "relu7"]
        )
        return squeeze_outputs(*activations)


class alexnet(torch.nn.Module):
    """AlexNet feature extractor split into five consecutive stages.

    Args:
        requires_grad: when False, every parameter is frozen.
        pretrained: forwarded to ``tv.alexnet``.
    """

    def __init__(self, requires_grad=False, pretrained=True):
        super().__init__()
        features = tv.alexnet(pretrained=pretrained).features

        def stage(start, stop):
            # Child modules keep their original index as name, e.g. "slice2.3".
            block = torch.nn.Sequential()
            for idx in range(start, stop):
                block.add_module(str(idx), features[idx])
            return block

        self.slice1 = stage(0, 2)
        self.slice2 = stage(2, 5)
        self.slice3 = stage(5, 8)
        self.slice4 = stage(8, 10)
        self.slice5 = stage(10, 12)
        self.N_slices = 5

        if requires_grad:
            return
        for weight in self.parameters():
            weight.requires_grad = False

    def forward(self, X):
        """Run the five stages in order and return every intermediate output.

        Returns:
            An ``AlexnetOutputs`` namedtuple with fields ``relu1`` .. ``relu5``.
        """
        activations = []
        h = X
        for layer in (self.slice1, self.slice2, self.slice3, self.slice4, self.slice5):
            h = layer(h)
            activations.append(h)

        alexnet_outputs = namedtuple("AlexnetOutputs", ["relu1", "relu2", "relu3", "relu4", "relu5"])
        return alexnet_outputs(*activations)


class vgg16(torch.nn.Module):
    """VGG-16 feature extractor split into five consecutive stages.

    Args:
        requires_grad: when False, every parameter is frozen.
        pretrained: forwarded to ``tv.vgg16``.
    """

    def __init__(self, requires_grad=False, pretrained=True):
        super().__init__()
        features = tv.vgg16(pretrained=pretrained).features

        def stage(start, stop):
            # Child modules keep their original index as name, e.g. "slice2.4".
            block = torch.nn.Sequential()
            for idx in range(start, stop):
                block.add_module(str(idx), features[idx])
            return block

        self.slice1 = stage(0, 4)
        self.slice2 = stage(4, 9)
        self.slice3 = stage(9, 16)
        self.slice4 = stage(16, 23)
        self.slice5 = stage(23, 30)
        self.N_slices = 5

        if requires_grad:
            return
        for weight in self.parameters():
            weight.requires_grad = False

    def forward(self, X):
        """Run the five stages in order and return every intermediate output.

        Returns:
            A ``VggOutputs`` namedtuple with fields ``relu1_2``, ``relu2_2``,
            ``relu3_3``, ``relu4_3`` and ``relu5_3``.
        """
        activations = []
        h = X
        for layer in (self.slice1, self.slice2, self.slice3, self.slice4, self.slice5):
            h = layer(h)
            activations.append(h)

        vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"])
        return vgg_outputs(*activations)


class resnet(torch.nn.Module):
    """ResNet feature extractor exposing five intermediate activations.

    Args:
        requires_grad: accepted for signature parity with the other backbones
            in this file; it is not used by this class.
        pretrained: forwarded to the torchvision ResNet constructor.
        num: ResNet depth; one of 18, 34, 50, 101 or 152.

    Raises:
        ValueError: if ``num`` is not a supported depth.
    """

    _SUPPORTED_DEPTHS = (18, 34, 50, 101, 152)

    def __init__(self, requires_grad=False, pretrained=True, num=18):
        super().__init__()
        # Fail early with a clear message; previously an unsupported depth left
        # ``self.net`` unset and surfaced as an AttributeError further down.
        if num not in self._SUPPORTED_DEPTHS:
            raise ValueError(
                "unsupported ResNet depth %r; expected one of %s" % (num, list(self._SUPPORTED_DEPTHS))
            )
        build = getattr(tv, "resnet%d" % num)
        self.net = build(pretrained=pretrained)
        self.N_slices = 5

        # Re-expose the stem and the four residual stages for use in forward().
        self.conv1 = self.net.conv1
        self.bn1 = self.net.bn1
        self.relu = self.net.relu
        self.maxpool = self.net.maxpool
        self.layer1 = self.net.layer1
        self.layer2 = self.net.layer2
        self.layer3 = self.net.layer3
        self.layer4 = self.net.layer4

    def forward(self, X):
        """Return the stem activation and the output of each residual stage.

        Returns:
            An ``Outputs`` namedtuple with fields ``relu1`` (after
            conv1/bn1/relu, before max-pooling) and ``conv2`` .. ``conv5``
            (outputs of ``layer1`` .. ``layer4``).
        """
        h = self.relu(self.bn1(self.conv1(X)))
        h_relu1 = h
        h = self.layer1(self.maxpool(h))
        h_conv2 = h
        h = self.layer2(h)
        h_conv3 = h
        h = self.layer3(h)
        h_conv4 = h
        h = self.layer4(h)
        h_conv5 = h

        outputs = namedtuple("Outputs", ["relu1", "conv2", "conv3", "conv4", "conv5"])
        return outputs(h_relu1, h_conv2, h_conv3, h_conv4, h_conv5)


================================================
FILE: Open-Sora/eval/vae/flolpips/pwcnet.py
================================================
#!/usr/bin/env python

import math

import torch

# try:
from .correlation import correlation  # the custom cost volume layer

# except:
# 	sys.path.insert(0, './correlation'); import correlation # you should consider upgrading python
# end

##########################################################

# assert(int(str('').join(torch.__version__.split('.')[0:2])) >= 13) # requires at least pytorch version 1.3.0

# torch.set_grad_enabled(False) # make sure to not compute gradients for computational performance

# torch.backends.cudnn.enabled = True # make sure to use cudnn for computational performance

# ##########################################################

# arguments_strModel = 'default' # 'default', or 'chairs-things'
# arguments_strFirst = './images/first.png'
# arguments_strSecond = './images/second.png'
# arguments_strOut = './out.flo'

# for strOption, strArgument in getopt.getopt(sys.argv[1:], '', [ strParameter[2:] + '=' for strParameter in sys.argv[1::2] ])[0]:
# 	if strOption == '--model' and strArgument != '': arguments_strModel = strArgument # which model to use
# 	if strOption == '--first' and strArgument != '': arguments_strFirst = strArgument # path to the first frame
# 	if strOption == '--second' and strArgument != '': arguments_strSecond = strArgument # path to the second frame
# 	if strOption == '--out' and strArgument != '': arguments_strOut = strArgument # path to where the output should be stored
# end

##########################################################


def backwarp(tenInput, tenFlow):
    """Warp ``tenInput`` backwards along ``tenFlow`` with bilinear sampling.

    Args:
        tenInput: 4-D tensor to sample from, laid out as [B, C, H, W].
        tenFlow: 4-D flow tensor [B, 2, H, W]; channel 0 is the horizontal and
            channel 1 the vertical displacement, in pixels of ``tenInput``.

    Returns:
        The warped tensor with the same channel count as ``tenInput``.
        Locations whose sample touches the zero padding outside the input are
        set to zero.

    The sampling grid is created on ``tenFlow``'s device, so the function works
    on CPU as well as on CUDA tensors.
    """
    intBatch, _, intHeight, intWidth = tenFlow.shape

    # Normalised coordinates of the pixel centres (align_corners=False convention).
    tenHor = (
        torch.linspace(-1.0 + (1.0 / intWidth), 1.0 - (1.0 / intWidth), intWidth)
        .view(1, 1, 1, -1)
        .expand(-1, -1, intHeight, -1)
    )
    tenVer = (
        torch.linspace(-1.0 + (1.0 / intHeight), 1.0 - (1.0 / intHeight), intHeight)
        .view(1, 1, -1, 1)
        .expand(-1, -1, -1, intWidth)
    )
    tenGrid = torch.cat([tenHor, tenVer], 1).to(tenFlow.device)

    # Extra all-ones channel: after warping it reveals which samples hit the padding.
    tenPartial = tenFlow.new_ones([intBatch, 1, intHeight, intWidth])

    # Convert the pixel displacements into normalised grid units.
    tenFlow = torch.cat(
        [
            tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0),
            tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0),
        ],
        1,
    )
    tenInput = torch.cat([tenInput, tenPartial], 1)

    tenOutput = torch.nn.functional.grid_sample(
        input=tenInput,
        grid=(tenGrid + tenFlow).permute(0, 2, 3, 1),
        mode="bilinear",
        padding_mode="zeros",
        align_corners=False,
    )

    # Binarise the warped ones-channel into a validity mask (in place on the view).
    tenMask = tenOutput[:, -1:, :, :]
    tenMask[tenMask > 0.999] = 1.0
    tenMask[tenMask < 1.0] = 0.0

    return tenOutput[:, :-1, :, :] * tenMask


# end

##########################################################


class Network(torch.nn.Module):
    """PWC-Net style optical-flow network built from three nested sub-modules.

    The constructor defines ``Extractor`` (six-level feature pyramid),
    ``Decoder`` (one flow estimator per pyramid level) and ``Refiner``
    (dilated-convolution context network), instantiates them, and then loads
    pretrained weights downloaded through ``torch.hub``.

    NOTE: attribute names such as ``netOne`` or ``netUpflow`` become state-dict
    keys, so they must stay in sync with the downloaded checkpoint.
    """

    def __init__(self):
        super(Network, self).__init__()

        class Extractor(torch.nn.Module):
            """Six-level convolutional feature pyramid.

            Every level is one stride-2 convolution followed by two stride-1
            convolutions, each with LeakyReLU(0.1). Output channels per level:
            16, 32, 64, 96, 128, 196.
            """

            def __init__(self):
                super(Extractor, self).__init__()

                self.netOne = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )

                self.netTwo = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )

                self.netThr = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )

                self.netFou = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=64, out_channels=96, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )

                self.netFiv = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=96, out_channels=128, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )

                self.netSix = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=128, out_channels=196, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=196, out_channels=196, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=196, out_channels=196, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )

            # end

            def forward(self, tenInput):
                """Return the list of feature maps, finest level first, coarsest last."""
                tenOne = self.netOne(tenInput)
                tenTwo = self.netTwo(tenOne)
                tenThr = self.netThr(tenTwo)
                tenFou = self.netFou(tenThr)
                tenFiv = self.netFiv(tenFou)
                tenSix = self.netSix(tenFiv)

                return [tenOne, tenTwo, tenThr, tenFou, tenFiv, tenSix]

            # end

        # end

        class Decoder(torch.nn.Module):
            """Flow estimator for one pyramid level (``intLevel`` in 2..6).

            Levels below 6 additionally own transposed convolutions that
            upsample the flow and features coming from the coarser level.
            """

            def __init__(self, intLevel):
                super(Decoder, self).__init__()

                # Input channel count of the decoder one level coarser (intLevel + 1).
                # Each entry is: correlation volume (81 — presumably a 9x9 search
                # window, set by the correlation module; TODO confirm) + extractor
                # features of that level + 2 upsampled flow + 2 upsampled features.
                intPrevious = [
                    None,
                    None,
                    81 + 32 + 2 + 2,
                    81 + 64 + 2 + 2,
                    81 + 96 + 2 + 2,
                    81 + 128 + 2 + 2,
                    81,
                    None,
                ][intLevel + 1]
                # Input channel count of this level's decoder (same table, indexed by intLevel).
                intCurrent = [
                    None,
                    None,
                    81 + 32 + 2 + 2,
                    81 + 64 + 2 + 2,
                    81 + 96 + 2 + 2,
                    81 + 128 + 2 + 2,
                    81,
                    None,
                ][intLevel + 0]

                # The coarsest level (6) has no predecessor, hence no upsampling layers.
                if intLevel < 6:
                    self.netUpflow = torch.nn.ConvTranspose2d(
                        in_channels=2, out_channels=2, kernel_size=4, stride=2, padding=1
                    )
                if intLevel < 6:
                    # Input is the coarser level's dense feature stack:
                    # its own input plus the five densely connected conv outputs.
                    self.netUpfeat = torch.nn.ConvTranspose2d(
                        in_channels=intPrevious + 128 + 128 + 96 + 64 + 32,
                        out_channels=2,
                        kernel_size=4,
                        stride=2,
                        padding=1,
                    )
                if intLevel < 6:
                    # Per-level factor applied to the upsampled flow before warping.
                    self.fltBackwarp = [None, None, None, 5.0, 2.5, 1.25, 0.625, None][intLevel + 1]

                # Densely connected stack: each layer consumes the concatenation of
                # all previous outputs and the original input (see forward()).
                self.netOne = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=intCurrent, out_channels=128, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )

                self.netTwo = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=intCurrent + 128, out_channels=128, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )

                self.netThr = torch.nn.Sequential(
                    torch.nn.Conv2d(
                        in_channels=intCurrent + 128 + 128, out_channels=96, kernel_size=3, stride=1, padding=1
                    ),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )

                self.netFou = torch.nn.Sequential(
                    torch.nn.Conv2d(
                        in_channels=intCurrent + 128 + 128 + 96, out_channels=64, kernel_size=3, stride=1, padding=1
                    ),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )

                self.netFiv = torch.nn.Sequential(
                    torch.nn.Conv2d(
                        in_channels=intCurrent + 128 + 128 + 96 + 64,
                        out_channels=32,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                    ),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )

                # Final layer predicts the 2-channel flow; no activation.
                self.netSix = torch.nn.Sequential(
                    torch.nn.Conv2d(
                        in_channels=intCurrent + 128 + 128 + 96 + 64 + 32,
                        out_channels=2,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                    )
                )

            # end

            def forward(self, tenFirst, tenSecond, objPrevious):
                """Estimate flow at this level.

                Args:
                    tenFirst: extractor features of the first image at this level.
                    tenSecond: extractor features of the second image at this level.
                    objPrevious: output dict of the next-coarser decoder, or
                        None at the coarsest level.

                Returns:
                    dict with "tenFlow" (2-channel flow) and "tenFeat" (the
                    dense feature stack fed to the last convolution).
                """
                tenFlow = None
                tenFeat = None

                if objPrevious is None:
                    tenFlow = None
                    tenFeat = None

                    # Coarsest level: correlate the raw features, no warping.
                    tenVolume = torch.nn.functional.leaky_relu(
                        input=correlation.FunctionCorrelation(tenFirst=tenFirst, tenSecond=tenSecond),
                        negative_slope=0.1,
                        inplace=False,
                    )

                    tenFeat = torch.cat([tenVolume], 1)

                elif objPrevious is not None:
                    # Upsample the coarser estimate to this level's resolution.
                    tenFlow = self.netUpflow(objPrevious["tenFlow"])
                    tenFeat = self.netUpfeat(objPrevious["tenFeat"])

                    # Warp the second image's features with the scaled upsampled
                    # flow before correlating them with the first image's features.
                    tenVolume = torch.nn.functional.leaky_relu(
                        input=correlation.FunctionCorrelation(
                            tenFirst=tenFirst,
                            tenSecond=backwarp(tenInput=tenSecond, tenFlow=tenFlow * self.fltBackwarp),
                        ),
                        negative_slope=0.1,
                        inplace=False,
                    )

                    tenFeat = torch.cat([tenVolume, tenFirst, tenFlow, tenFeat], 1)

                # end

                # Dense connections: prepend each new output to the running stack.
                tenFeat = torch.cat([self.netOne(tenFeat), tenFeat], 1)
                tenFeat = torch.cat([self.netTwo(tenFeat), tenFeat], 1)
                tenFeat = torch.cat([self.netThr(tenFeat), tenFeat], 1)
                tenFeat = torch.cat([self.netFou(tenFeat), tenFeat], 1)
                tenFeat = torch.cat([self.netFiv(tenFeat), tenFeat], 1)

                tenFlow = self.netSix(tenFeat)

                return {"tenFlow": tenFlow, "tenFeat": tenFeat}

            # end

        # end

        class Refiner(torch.nn.Module):
            """Stack of dilated 3x3 convolutions producing a 2-channel flow residual.

            Its input channel count equals the dense feature stack of the
            level-2 decoder.
            """

            def __init__(self):
                super(Refiner, self).__init__()

                self.netMain = torch.nn.Sequential(
                    torch.nn.Conv2d(
                        in_channels=81 + 32 + 2 + 2 + 128 + 128 + 96 + 64 + 32,
                        out_channels=128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        dilation=1,
                    ),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=2, dilation=2),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=4, dilation=4),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=96, kernel_size=3, stride=1, padding=8, dilation=8),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=96, out_channels=64, kernel_size=3, stride=1, padding=16, dilation=16),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1, dilation=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=32, out_channels=2, kernel_size=3, stride=1, padding=1, dilation=1),
                )

            # end

            def forward(self, tenInput):
                """Apply the dilated-convolution stack to ``tenInput``."""
                return self.netMain(tenInput)

            # end

        # end

        self.netExtractor = Extractor()

        # One decoder per pyramid level, named after the level it handles.
        self.netTwo = Decoder(2)
        self.netThr = Decoder(3)
        self.netFou = Decoder(4)
        self.netFiv = Decoder(5)
        self.netSix = Decoder(6)

        self.netRefiner = Refiner()

        # Download the pretrained checkpoint and rename keys ("module" -> "net")
        # so they match the attribute names defined above.
        self.load_state_dict(
            {
                strKey.replace("module", "net"): tenWeight
                for strKey, tenWeight in torch.hub.load_state_dict_from_url(
                    url="http://content.sniklaus.com/github/pytorch-pwc/network-" + "default" + ".pytorch"
                ).items()
            }
        )

    # end

    def forward(self, tenFirst, tenSecond):
        """Estimate the flow between two image batches at their original resolution.

        Args:
            tenFirst: first image batch, indexed as [B, C, H, W] (C must be 3
                for the extractor's first convolution).
            tenSecond: second image batch with the same layout.

        Returns:
            2-channel flow tensor resized to the input height and width.
        """
        intWidth = tenFirst.shape[3]
        intHeight = tenFirst.shape[2]

        # Round both spatial sizes up to the next multiple of 64.
        intPreprocessedWidth = int(math.floor(math.ceil(intWidth / 64.0) * 64.0))
        intPreprocessedHeight = int(math.floor(math.ceil(intHeight / 64.0) * 64.0))

        tenPreprocessedFirst = torch.nn.functional.interpolate(
            input=tenFirst, size=(intPreprocessedHeight, intPreprocessedWidth), mode="bilinear", align_corners=False
        )
        tenPreprocessedSecond = torch.nn.functional.interpolate(
            input=tenSecond, size=(intPreprocessedHeight, intPreprocessedWidth), mode="bilinear", align_corners=False
        )

        # From here on tenFirst / tenSecond hold the feature pyramids (lists).
        tenFirst = self.netExtractor(tenPreprocessedFirst)
        tenSecond = self.netExtractor(tenPreprocessedSecond)

        # Coarse-to-fine decoding: start at the last (coarsest) pyramid entry.
        objEstimate = self.netSix(tenFirst[-1], tenSecond[-1], None)
        objEstimate = self.netFiv(tenFirst[-2], tenSecond[-2], objEstimate)
        objEstimate = self.netFou(tenFirst[-3], tenSecond[-3], objEstimate)
        objEstimate = self.netThr(tenFirst[-4], tenSecond[-4], objEstimate)
        objEstimate = self.netTwo(tenFirst[-5], tenSecond[-5], objEstimate)

        # Add the refiner's residual, resize to the input size and scale by 20.
        tenFlow = objEstimate["tenFlow"] + self.netRefiner(objEstimate["tenFeat"])
        tenFlow = 20.0 * torch.nn.functional.interpolate(
            input=tenFlow, size=(intHeight, intWidth), mode="bilinear", align_corners=False
        )
        # Compensate for the resize to multiples of 64 on each axis.
        tenFlow[:, 0, :, :] *= float(intWidth) / float(intPreprocessedWidth)
        tenFlow[:, 1, :, :] *= float(intHeight) / float(intPreprocessedHeight)

        return tenFlow

    # end


# end

netNetwork = None  # lazily created, module-wide Network instance used by estimate()

##########################################################


def estimate(tenFirst, tenSecond):
    """Estimate the optical flow between two single images on the GPU.

    Args:
        tenFirst: first image as a 3-D tensor indexed [C, H, W] with C == 3.
        tenSecond: second image with the same height and width.

    Returns:
        CPU tensor of shape [2, H, W] holding the flow at input resolution.

    ``Network.forward`` already resizes its inputs to multiples of 64, scales
    the prediction by 20 and rescales it to the original resolution. This
    helper therefore only adds the batch dimension and delegates; repeating
    those steps here would apply the scaling twice.
    """
    global netNetwork

    if netNetwork is None:
        netNetwork = Network().cuda().eval()
    # end

    assert tenFirst.shape[1] == tenSecond.shape[1]
    assert tenFirst.shape[2] == tenSecond.shape[2]

    intWidth = tenFirst.shape[2]
    intHeight = tenFirst.shape[1]

    assert (
        intWidth == 1024
    )  # remember that there is no guarantee for correctness, comment this line out if you acknowledge this and want to continue
    assert (
        intHeight == 436
    )  # remember that there is no guarantee for correctness, comment this line out if you acknowledge this and want to continue

    tenPreprocessedFirst = tenFirst.cuda().view(1, 3, intHeight, intWidth)
    tenPreprocessedSecond = tenSecond.cuda().view(1, 3, intHeight, intWidth)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        tenFlow = netNetwork(tenPreprocessedFirst, tenPreprocessedSecond)

    return tenFlow[0, :, :, :].cpu()


# end

##########################################################

# if __name__ == '__main__':
# 	tenFirst = torch.FloatTensor(numpy.ascontiguousarray(numpy.array(PIL.Image.open(arguments_strFirst))[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0)))
# 	tenSecond = torch.FloatTensor(numpy.ascontiguousarray(numpy.array(PIL.Image.open(arguments_strSecond))[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0)))

# 	tenOutput = estimate(tenFirst, tenSecond)

# 	objOutput = open(arguments_strOut, 'wb')

# 	numpy.array([ 80, 73, 69, 72 ], numpy.uint8).tofile(objOutput)
# 	numpy.array([ tenOutput.shape[2], tenOutput.shape[1] ], numpy.int32).tofile(objOutput)
# 	numpy.array(tenOutput.numpy().transpose(1, 2, 0), numpy.float32).tofile(objOutput)

# 	objOutput.close()
# end


================================================
FILE: Open-Sora/eval/vae/flolpips/utils.py
================================================
import cv2
import numpy as np
import torch


def normalize_tensor(in_feat, eps=1e-10):
    """Scale ``in_feat`` to unit L2 norm along dim 1.

    ``eps`` is added to the norm to avoid division by zero.
    """
    l2_norm = in_feat.pow(2).sum(dim=1, keepdim=True).sqrt()
    return in_feat / (l2_norm + eps)


def l2(p0, p1, range=255.0):
    """Return half the mean squared difference of ``p0`` and ``p1`` after dividing both by ``range``."""
    scaled0 = p0 / range
    scaled1 = p1 / range
    squared_error = (scaled0 - scaled1) ** 2
    return 0.5 * np.mean(squared_error)


def dssim(p0, p1, range=255.0):
    """Return the structural dissimilarity ``(1 - SSIM) / 2`` of two images.

    Args:
        p0: first image array with channels on the last axis.
        p1: second image array with the same shape as ``p0``.
        range: data range of the images, passed to SSIM as ``data_range``.

    Works across scikit-image releases: ``compare_ssim`` moved from
    ``skimage.measure`` to ``skimage.metrics.structural_similarity``, and the
    ``multichannel`` flag was replaced by ``channel_axis``.
    """
    import inspect

    try:
        from skimage.metrics import structural_similarity
    except ImportError:  # older scikit-image without skimage.metrics
        from skimage.measure import compare_ssim as structural_similarity

    # Inspect the signature instead of catching TypeError: the legacy function
    # accepts **kwargs and would silently ignore an unknown ``channel_axis``.
    if "channel_axis" in inspect.signature(structural_similarity).parameters:
        channel_kwargs = {"channel_axis": -1}
    else:
        channel_kwargs = {"multichannel": True}

    ssim = structural_similarity(p0, p1, data_range=range, **channel_kwargs)
    return (1 - ssim) / 2.0


def tensor2im(image_tensor, imtype=np.uint8, cent=1.0, factor=255.0 / 2.0):
    """Convert the first image of a batch tensor into an H x W x C array.

    Each value is mapped to ``(value + cent) * factor`` and cast to
    ``imtype``; with the defaults, -1 maps to 0 and +1 maps to 255.
    """
    chw = image_tensor[0].cpu().float().numpy()
    hwc = chw.transpose(1, 2, 0)
    rescaled = (hwc + cent) * factor
    return rescaled.astype(imtype)


def tensor2np(tensor_obj):
    """Return the first item of a batch tensor as a float H x W x C NumPy array."""
    chw = tensor_obj[0].cpu().float().numpy()
    return np.transpose(chw, (1, 2, 0))


def np2tensor(np_obj):
    """Turn an H x W x C NumPy array into a 1 x C x H x W tensor."""
    # Append a trailing axis, then move it to the front as the batch dimension.
    with_batch_axis = np_obj[:, :, :, np.newaxis]
    nchw = np.transpose(with_batch_axis, (3, 2, 0, 1))
    return torch.Tensor(nchw)


def tensor2tensorlab(image_tensor, to_norm=True, mc_only=False):
    """Convert an image tensor to a Lab tensor of shape 1 x 3 x H x W.

    Args:
        image_tensor: batch tensor; only its first item is converted (via ``tensor2im``).
        to_norm: when True and ``mc_only`` is False, subtract 50 from the L
            channel and divide all channels by 100.
        mc_only: when True, only subtract 50 from the L channel; this takes
            precedence over ``to_norm``.
    """
    from skimage import color

    lab = color.rgb2lab(tensor2im(image_tensor))

    # Both modes centre the L channel.
    if mc_only or to_norm:
        lab[:, :, 0] -= 50
    # Only the pure normalisation mode rescales the channels.
    if to_norm and not mc_only:
        lab = lab / 100.0

    return np2tensor(lab)


def read_frame_yuv2rgb(stream, width, height, iFrame, bit_depth, pix_fmt="420"):
    """Read one frame from a raw planar YUV stream and convert it to RGB.

    Each frame is read as a full-resolution Y plane followed by a U plane and
    a V plane. For ``pix_fmt="420"`` the chroma planes are half-size in both
    dimensions; for ``"444"`` they are full-size.

    Args:
        stream: Open binary file object positioned anywhere; it is seeked to
            the start of frame ``iFrame`` before reading.
        width: Luma width in samples.
        height: Luma height in samples.
        iFrame: Zero-based index of the frame to read.
        bit_depth: Bits per sample. ``8`` reads ``uint8`` samples; any other
            value reads ``uint16`` samples, which are rescaled to 8 bits using
            ``2**bit_depth - 1`` as full scale.
        pix_fmt: ``"420"`` or ``"444"``.

    Returns:
        The RGB image produced by ``cv2.cvtColor``, or ``None`` (after
        printing a message) when ``pix_fmt`` is not supported.
    """
    if pix_fmt == "420":
        uv_factor = 2
    elif pix_fmt == "444":
        uv_factor = 1
    else:
        print("Pixel format {} is not supported".format(pix_fmt))
        return

    datatype = np.uint8 if bit_depth == 8 else np.uint16
    chroma_shape = (height // uv_factor, width // uv_factor)
    luma_count = width * height
    chroma_count = chroma_shape[0] * chroma_shape[1]

    # Byte offset of the requested frame. It is built from integer plane
    # sizes because file objects reject float offsets in seek().
    frame_bytes = (luma_count + 2 * chroma_count) * np.dtype(datatype).itemsize
    stream.seek(int(iFrame * frame_bytes))

    Y = np.fromfile(stream, dtype=datatype, count=luma_count).reshape((height, width))
    # Chroma planes follow the luma plane: U first, then V.
    U = np.fromfile(stream, dtype=datatype, count=chroma_count).reshape(chroma_shape)
    V = np.fromfile(stream, dtype=datatype, count=chroma_count).reshape(chroma_shape)

    if pix_fmt == "420":
        # Repack the three planes into the single I420 buffer layout that
        # cv2.COLOR_YUV2RGB_I420 consumes: Y rows, then U rows, then V rows.
        yuv = np.empty((height * 3 // 2, width), dtype=datatype)
        yuv[0:height, :] = Y

        yuv[height : height + height // 4, :] = U.reshape(-1, width)
        yuv[height + height // 4 :, :] = V.reshape(-1, width)

        if bit_depth != 8:
            yuv = (yuv / (2**bit_depth - 1) * 255).astype(np.uint8)

        # convert to rgb
        rgb = cv2.cvtColor(yuv, cv2.COLOR_YUV2RGB_I420)

    else:
        # OpenCV's YCrCb ordering is (Y, Cr, Cb), i.e. (Y, V, U).
        yvu = np.stack([Y, V, U], axis=2)
        if bit_depth != 8:
            yvu = (yvu / (2**bit_depth - 1) * 255).astype(np.uint8)
        rgb = cv2.cvtColor(yvu, cv2.COLOR_YCrCb2RGB)

    return rgb


================================================
FILE: Open-Sora/eval/vae/script/eval.sh
================================================
# Run eval/eval_common_metric.py on the videos in --generated_video_dir with
# the videos in --real_video_dir as reference, requesting the ssim, psnr,
# lpips and flolpips metrics on the cuda device.
# NOTE(review): the script path is relative, so this presumably has to be
# launched from the directory that contains eval/ — confirm.
# NOTE(review): the meaning of the sampling/cropping flags is defined by
# eval_common_metric.py, which is not visible here.
python eval/eval_common_metric.py \
    --batch_size 2 \
    --real_video_dir ../test_eval/release/origin \
    --generated_video_dir ../test_eval/release \
    --device cuda \
    --sample_fps 10 \
    --crop_size 256 \
    --resolution 256 \
    --num_frames 17 \
    --sample_rate 1 \
    --subset_size 100 \
    --metric ssim psnr lpips flolpips


================================================
FILE: Open-Sora/eval/vbench/VBench_full_info.json
================================================
[
    {
        "prompt_en": "In a still frame, a stop sign",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "a toilet, frozen in time",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "a laptop, frozen in time",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of alley",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of bar",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of barn",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of bathroom",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of bedroom",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of cliff",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, courtyard",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, gas station",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of house",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "indoor gymnasium, frozen in time",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of indoor library",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of kitchen",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of palace",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, parking lot",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, phone booth",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of restaurant",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of tower",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a bowl",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of an apple",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a bench",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a bed",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a chair",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a cup",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a dining table",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, a pear",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a bunch of grapes",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a bowl on the kitchen counter",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a beautiful, handcrafted ceramic bowl",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of an antique bowl",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of an exquisite mahogany dining table",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a wooden bench in the park",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a beautiful wrought-iron bench surrounded by blooming flowers",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, a park bench with a view of the lake",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a vintage rocking chair was placed on the porch",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of the jail cell was small and dimly lit, with cold, steel bars",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of the phone booth was tucked away in a quiet alley",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "a dilapidated phone booth stood as a relic of a bygone era on the sidewalk, frozen in time",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of the old red barn stood weathered and iconic against the backdrop of the countryside",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a picturesque barn was painted a warm shade of red and nestled in a picturesque meadow",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm trees and a motionless, glassy pool of water",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, the Temple of Hephaestus, with its timeless Doric grace, stands stoically against the backdrop of a quiet Athens",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "static view on a desert scene with an oasis, palm trees, and a clear, calm pool of water",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of an ornate Victorian streetlamp standing on a cobblestone street corner, illuminating the empty night",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of in the desolate beauty of the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved fa\u00e7ades",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, amidst the cobblestone streets, an Art Nouveau lamppost stood tall",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of in the quaint village square, a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of in the Arizona desert, a massive stone bridge arched across a rugged canyon",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the otherwise simple and elegant space",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, amidst the hushed ambiance of the traditional tea room, a meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a country estate's library featured elegant wooden shelves",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of beneath the shade of a solitary oak tree, an old wooden park bench sat patiently",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of in the Zen garden, a perfectly raked gravel path led to a serene rock garden",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "In a still frame, within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of a peaceful orchid garden showcased a variety of delicate blooms",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time",
        "dimension": [
            "temporal_flickering"
        ]
    },
    {
        "prompt_en": "a bird and a cat",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "bird and cat"
            }
        }
    },
    {
        "prompt_en": "a cat and a dog",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "cat and dog"
            }
        }
    },
    {
        "prompt_en": "a dog and a horse",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "dog and horse"
            }
        }
    },
    {
        "prompt_en": "a horse and a sheep",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "horse and sheep"
            }
        }
    },
    {
        "prompt_en": "a sheep and a cow",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "sheep and cow"
            }
        }
    },
    {
        "prompt_en": "a cow and an elephant",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "cow and elephant"
            }
        }
    },
    {
        "prompt_en": "an elephant and a bear",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "elephant and bear"
            }
        }
    },
    {
        "prompt_en": "a bear and a zebra",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "bear and zebra"
            }
        }
    },
    {
        "prompt_en": "a zebra and a giraffe",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "zebra and giraffe"
            }
        }
    },
    {
        "prompt_en": "a giraffe and a bird",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "giraffe and bird"
            }
        }
    },
    {
        "prompt_en": "a chair and a couch",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "chair and couch"
            }
        }
    },
    {
        "prompt_en": "a couch and a potted plant",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "couch and potted plant"
            }
        }
    },
    {
        "prompt_en": "a potted plant and a tv",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "potted plant and tv"
            }
        }
    },
    {
        "prompt_en": "a tv and a laptop",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "tv and laptop"
            }
        }
    },
    {
        "prompt_en": "a laptop and a remote",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "laptop and remote"
            }
        }
    },
    {
        "prompt_en": "a remote and a keyboard",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "remote and keyboard"
            }
        }
    },
    {
        "prompt_en": "a keyboard and a cell phone",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "keyboard and cell phone"
            }
        }
    },
    {
        "prompt_en": "a cell phone and a book",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "cell phone and book"
            }
        }
    },
    {
        "prompt_en": "a book and a clock",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "book and clock"
            }
        }
    },
    {
        "prompt_en": "a clock and a backpack",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "clock and backpack"
            }
        }
    },
    {
        "prompt_en": "a backpack and an umbrella",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "backpack and umbrella"
            }
        }
    },
    {
        "prompt_en": "an umbrella and a handbag",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "umbrella and handbag"
            }
        }
    },
    {
        "prompt_en": "a handbag and a tie",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "handbag and tie"
            }
        }
    },
    {
        "prompt_en": "a tie and a suitcase",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "tie and suitcase"
            }
        }
    },
    {
        "prompt_en": "a suitcase and a vase",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "suitcase and vase"
            }
        }
    },
    {
        "prompt_en": "a vase and scissors",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "vase and scissors"
            }
        }
    },
    {
        "prompt_en": "scissors and a teddy bear",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "scissors and teddy bear"
            }
        }
    },
    {
        "prompt_en": "a teddy bear and a frisbee",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "teddy bear and frisbee"
            }
        }
    },
    {
        "prompt_en": "a frisbee and skis",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "frisbee and skis"
            }
        }
    },
    {
        "prompt_en": "skis and a snowboard",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "skis and snowboard"
            }
        }
    },
    {
        "prompt_en": "a snowboard and a sports ball",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "snowboard and sports ball"
            }
        }
    },
    {
        "prompt_en": "a sports ball and a kite",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "sports ball and kite"
            }
        }
    },
    {
        "prompt_en": "a kite and a baseball bat",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "kite and baseball bat"
            }
        }
    },
    {
        "prompt_en": "a baseball bat and a baseball glove",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "baseball bat and baseball glove"
            }
        }
    },
    {
        "prompt_en": "a baseball glove and a skateboard",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "baseball glove and skateboard"
            }
        }
    },
    {
        "prompt_en": "a skateboard and a surfboard",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "skateboard and surfboard"
            }
        }
    },
    {
        "prompt_en": "a surfboard and a tennis racket",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "surfboard and tennis racket"
            }
        }
    },
    {
        "prompt_en": "a tennis racket and a bottle",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "tennis racket and bottle"
            }
        }
    },
    {
        "prompt_en": "a bottle and a chair",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "bottle and chair"
            }
        }
    },
    {
        "prompt_en": "an airplane and a train",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "airplane and train"
            }
        }
    },
    {
        "prompt_en": "a train and a boat",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "train and boat"
            }
        }
    },
    {
        "prompt_en": "a boat and an airplane",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "boat and airplane"
            }
        }
    },
    {
        "prompt_en": "a bicycle and a car",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "bicycle and car"
            }
        }
    },
    {
        "prompt_en": "a car and a motorcycle",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "car and motorcycle"
            }
        }
    },
    {
        "prompt_en": "a motorcycle and a bus",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "motorcycle and bus"
            }
        }
    },
    {
        "prompt_en": "a bus and a traffic light",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "bus and traffic light"
            }
        }
    },
    {
        "prompt_en": "a traffic light and a fire hydrant",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "traffic light and fire hydrant"
            }
        }
    },
    {
        "prompt_en": "a fire hydrant and a stop sign",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "fire hydrant and stop sign"
            }
        }
    },
    {
        "prompt_en": "a stop sign and a parking meter",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "stop sign and parking meter"
            }
        }
    },
    {
        "prompt_en": "a parking meter and a truck",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "parking meter and truck"
            }
        }
    },
    {
        "prompt_en": "a truck and a bicycle",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "truck and bicycle"
            }
        }
    },
    {
        "prompt_en": "a toilet and a hair drier",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "toilet and hair drier"
            }
        }
    },
    {
        "prompt_en": "a hair drier and a toothbrush",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "hair drier and toothbrush"
            }
        }
    },
    {
        "prompt_en": "a toothbrush and a sink",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "toothbrush and sink"
            }
        }
    },
    {
        "prompt_en": "a sink and a toilet",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "sink and toilet"
            }
        }
    },
    {
        "prompt_en": "a wine glass and a chair",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "wine glass and chair"
            }
        }
    },
    {
        "prompt_en": "a cup and a couch",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "cup and couch"
            }
        }
    },
    {
        "prompt_en": "a fork and a potted plant",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "fork and potted plant"
            }
        }
    },
    {
        "prompt_en": "a knife and a tv",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "knife and tv"
            }
        }
    },
    {
        "prompt_en": "a spoon and a laptop",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "spoon and laptop"
            }
        }
    },
    {
        "prompt_en": "a bowl and a remote",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "bowl and remote"
            }
        }
    },
    {
        "prompt_en": "a banana and a keyboard",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "banana and keyboard"
            }
        }
    },
    {
        "prompt_en": "an apple and a cell phone",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "apple and cell phone"
            }
        }
    },
    {
        "prompt_en": "a sandwich and a book",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "sandwich and book"
            }
        }
    },
    {
        "prompt_en": "an orange and a clock",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "orange and clock"
            }
        }
    },
    {
        "prompt_en": "broccoli and a backpack",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "broccoli and backpack"
            }
        }
    },
    {
        "prompt_en": "a carrot and an umbrella",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "carrot and umbrella"
            }
        }
    },
    {
        "prompt_en": "a hot dog and a handbag",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "hot dog and handbag"
            }
        }
    },
    {
        "prompt_en": "a pizza and a tie",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "pizza and tie"
            }
        }
    },
    {
        "prompt_en": "a donut and a suitcase",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "donut and suitcase"
            }
        }
    },
    {
        "prompt_en": "a cake and a vase",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "cake and vase"
            }
        }
    },
    {
        "prompt_en": "an oven and scissors",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "oven and scissors"
            }
        }
    },
    {
        "prompt_en": "a toaster and a teddy bear",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "toaster and teddy bear"
            }
        }
    },
    {
        "prompt_en": "a microwave and a frisbee",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "microwave and frisbee"
            }
        }
    },
    {
        "prompt_en": "a refrigerator and skis",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "refrigerator and skis"
            }
        }
    },
    {
        "prompt_en": "a bicycle and an airplane",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "bicycle and airplane"
            }
        }
    },
    {
        "prompt_en": "a car and a train",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "car and train"
            }
        }
    },
    {
        "prompt_en": "a motorcycle and a boat",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "motorcycle and boat"
            }
        }
    },
    {
        "prompt_en": "a person and a toilet",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "person and toilet"
            }
        }
    },
    {
        "prompt_en": "a person and a hair drier",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "person and hair drier"
            }
        }
    },
    {
        "prompt_en": "a person and a toothbrush",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "person and toothbrush"
            }
        }
    },
    {
        "prompt_en": "a person and a sink",
        "dimension": [
            "multiple_objects"
        ],
        "auxiliary_info": {
            "multiple_objects": {
                "object": "person and sink"
            }
        }
    },
    {
        "prompt_en": "A person is riding a bike",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is marching",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is roller skating",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is tasting beer",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is clapping",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is drawing",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is petting animal (not cat)",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is eating watermelon",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is playing harp",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is wrestling",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is riding scooter",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is sweeping floor",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is skateboarding",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is dunking basketball",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is playing flute",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is stretching leg",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is tying tie",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is skydiving",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is shooting goal (soccer)",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is playing piano",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is finger snapping",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is canoeing or kayaking",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is laughing",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is digging",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is clay pottery making",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is shooting basketball",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is bending back",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is shaking hands",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is bandaging",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is push up",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is catching or throwing frisbee",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is playing trumpet",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is flying kite",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is filling eyebrows",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is shuffling cards",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is folding clothes",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is smoking",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is tai chi",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is squat",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is playing controller",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is throwing axe",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is giving or receiving award",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is air drumming",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is taking a shower",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is planting trees",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is sharpening knives",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is robot dancing",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is rock climbing",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is hula hooping",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is writing",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is bungee jumping",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is pushing cart",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is cleaning windows",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is cutting watermelon",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is cheerleading",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is washing hands",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is ironing",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is cutting nails",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is hugging",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is trimming or shaving beard",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is jogging",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is making bed",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is washing dishes",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is grooming dog",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is doing laundry",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is knitting",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is reading book",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is baby waking up",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is massaging legs",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is brushing teeth",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is crawling baby",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is motorcycling",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is driving car",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is sticking tongue out",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is shaking head",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is sword fighting",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is doing aerobics",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is strumming guitar",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is riding or walking with horse",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is archery",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is catching or throwing baseball",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is playing chess",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is rock scissors paper",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is using computer",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is arranging flowers",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is bending metal",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is ice skating",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is climbing a rope",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is crying",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is dancing ballet",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is getting a haircut",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is running on treadmill",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is kissing",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is counting money",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is barbequing",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is peeling apples",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is milking cow",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is shining shoes",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is making snowman",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "A person is sailing",
        "dimension": [
            "human_action"
        ]
    },
    {
        "prompt_en": "a person swimming in ocean",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a person giving a presentation to a room full of colleagues",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a person washing the dishes",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a person eating a burger",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a person walking in the snowstorm",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a person drinking coffee in a cafe",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a person playing guitar",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a bicycle leaning against a tree",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a bicycle gliding through a snowy field",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a bicycle slowing down to stop",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a bicycle accelerating to gain speed",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a car stuck in traffic during rush hour",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a car turning a corner",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a car slowing down to stop",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a car accelerating to gain speed",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a motorcycle cruising along a coastal highway",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a motorcycle turning a corner",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a motorcycle slowing down to stop",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a motorcycle gliding through a snowy field",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a motorcycle accelerating to gain speed",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "an airplane soaring through a clear blue sky",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "an airplane taking off",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "an airplane landing smoothly on a runway",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "an airplane accelerating to gain speed",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a bus turning a corner",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a bus stuck in traffic during rush hour",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a bus accelerating to gain speed",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a train speeding down the tracks",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a train crossing over a tall bridge",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a train accelerating to gain speed",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a truck turning a corner",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a truck anchored in a tranquil bay",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a truck stuck in traffic during rush hour",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a truck slowing down to stop",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a truck accelerating to gain speed",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a boat sailing smoothly on a calm lake",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a boat slowing down to stop",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a boat accelerating to gain speed",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a bird soaring gracefully in the sky",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a bird building a nest from twigs and leaves",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a bird flying over a snowy forest",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a cat grooming itself meticulously with its tongue",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a cat playing in park",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a cat drinking water",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a cat running happily",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a dog enjoying a peaceful walk",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a dog playing in park",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a dog drinking water",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a dog running happily",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a horse bending down to drink water from a river",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a horse galloping across an open field",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a horse taking a peaceful walk",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a horse running to join a herd of its kind",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a sheep bending down to drink water from a river",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a sheep taking a peaceful walk",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a sheep running to join a herd of its kind",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a cow bending down to drink water from a river",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a cow chewing cud while resting in a tranquil barn",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a cow running to join a herd of its kind",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "an elephant spraying itself with water using its trunk to cool down",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "an elephant taking a peaceful walk",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "an elephant running to join a herd of its kind",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a bear catching a salmon in its powerful jaws",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a bear sniffing the air for scents of food",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a bear climbing a tree",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a bear hunting for prey",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a zebra bending down to drink water from a river",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a zebra running to join a herd of its kind",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a zebra taking a peaceful walk",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a giraffe bending down to drink water from a river",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a giraffe taking a peaceful walk",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a giraffe running to join a herd of its kind",
        "dimension": [
            "subject_consistency",
            "dynamic_degree",
            "motion_smoothness"
        ]
    },
    {
        "prompt_en": "a person",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "person"
            }
        }
    },
    {
        "prompt_en": "a bicycle",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "bicycle"
            }
        }
    },
    {
        "prompt_en": "a car",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "car"
            }
        }
    },
    {
        "prompt_en": "a motorcycle",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "motorcycle"
            }
        }
    },
    {
        "prompt_en": "an airplane",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "airplane"
            }
        }
    },
    {
        "prompt_en": "a bus",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "bus"
            }
        }
    },
    {
        "prompt_en": "a train",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "train"
            }
        }
    },
    {
        "prompt_en": "a truck",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "truck"
            }
        }
    },
    {
        "prompt_en": "a boat",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "boat"
            }
        }
    },
    {
        "prompt_en": "a traffic light",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "traffic light"
            }
        }
    },
    {
        "prompt_en": "a fire hydrant",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "fire hydrant"
            }
        }
    },
    {
        "prompt_en": "a stop sign",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "stop sign"
            }
        }
    },
    {
        "prompt_en": "a parking meter",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "parking meter"
            }
        }
    },
    {
        "prompt_en": "a bench",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "bench"
            }
        }
    },
    {
        "prompt_en": "a bird",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "bird"
            }
        }
    },
    {
        "prompt_en": "a cat",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "cat"
            }
        }
    },
    {
        "prompt_en": "a dog",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "dog"
            }
        }
    },
    {
        "prompt_en": "a horse",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "horse"
            }
        }
    },
    {
        "prompt_en": "a sheep",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "sheep"
            }
        }
    },
    {
        "prompt_en": "a cow",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "cow"
            }
        }
    },
    {
        "prompt_en": "an elephant",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "elephant"
            }
        }
    },
    {
        "prompt_en": "a bear",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "bear"
            }
        }
    },
    {
        "prompt_en": "a zebra",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "zebra"
            }
        }
    },
    {
        "prompt_en": "a giraffe",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "giraffe"
            }
        }
    },
    {
        "prompt_en": "a backpack",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "backpack"
            }
        }
    },
    {
        "prompt_en": "an umbrella",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "umbrella"
            }
        }
    },
    {
        "prompt_en": "a handbag",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "handbag"
            }
        }
    },
    {
        "prompt_en": "a tie",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "tie"
            }
        }
    },
    {
        "prompt_en": "a suitcase",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "suitcase"
            }
        }
    },
    {
        "prompt_en": "a frisbee",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "frisbee"
            }
        }
    },
    {
        "prompt_en": "skis",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "skis"
            }
        }
    },
    {
        "prompt_en": "a snowboard",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "snowboard"
            }
        }
    },
    {
        "prompt_en": "a sports ball",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "sports ball"
            }
        }
    },
    {
        "prompt_en": "a kite",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "kite"
            }
        }
    },
    {
        "prompt_en": "a baseball bat",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "baseball bat"
            }
        }
    },
    {
        "prompt_en": "a baseball glove",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "baseball glove"
            }
        }
    },
    {
        "prompt_en": "a skateboard",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "skateboard"
            }
        }
    },
    {
        "prompt_en": "a surfboard",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "surfboard"
            }
        }
    },
    {
        "prompt_en": "a tennis racket",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "tennis racket"
            }
        }
    },
    {
        "prompt_en": "a bottle",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "bottle"
            }
        }
    },
    {
        "prompt_en": "a wine glass",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "wine glass"
            }
        }
    },
    {
        "prompt_en": "a cup",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "cup"
            }
        }
    },
    {
        "prompt_en": "a fork",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "fork"
            }
        }
    },
    {
        "prompt_en": "a knife",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "knife"
            }
        }
    },
    {
        "prompt_en": "a spoon",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "spoon"
            }
        }
    },
    {
        "prompt_en": "a bowl",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "bowl"
            }
        }
    },
    {
        "prompt_en": "a banana",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "banana"
            }
        }
    },
    {
        "prompt_en": "an apple",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "apple"
            }
        }
    },
    {
        "prompt_en": "a sandwich",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "sandwich"
            }
        }
    },
    {
        "prompt_en": "an orange",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "orange"
            }
        }
    },
    {
        "prompt_en": "broccoli",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "broccoli"
            }
        }
    },
    {
        "prompt_en": "a carrot",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "carrot"
            }
        }
    },
    {
        "prompt_en": "a hot dog",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "hot dog"
            }
        }
    },
    {
        "prompt_en": "a pizza",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "pizza"
            }
        }
    },
    {
        "prompt_en": "a donut",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "donut"
            }
        }
    },
    {
        "prompt_en": "a cake",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "cake"
            }
        }
    },
    {
        "prompt_en": "a chair",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "chair"
            }
        }
    },
    {
        "prompt_en": "a couch",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "couch"
            }
        }
    },
    {
        "prompt_en": "a potted plant",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "potted plant"
            }
        }
    },
    {
        "prompt_en": "a bed",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "bed"
            }
        }
    },
    {
        "prompt_en": "a dining table",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "dining table"
            }
        }
    },
    {
        "prompt_en": "a toilet",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "toilet"
            }
        }
    },
    {
        "prompt_en": "a tv",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "tv"
            }
        }
    },
    {
        "prompt_en": "a laptop",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "laptop"
            }
        }
    },
    {
        "prompt_en": "a remote",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "remote"
            }
        }
    },
    {
        "prompt_en": "a keyboard",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "keyboard"
            }
        }
    },
    {
        "prompt_en": "a cell phone",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "cell phone"
            }
        }
    },
    {
        "prompt_en": "a microwave",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "microwave"
            }
        }
    },
    {
        "prompt_en": "an oven",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "oven"
            }
        }
    },
    {
        "prompt_en": "a toaster",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "toaster"
            }
        }
    },
    {
        "prompt_en": "a sink",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "sink"
            }
        }
    },
    {
        "prompt_en": "a refrigerator",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "refrigerator"
            }
        }
    },
    {
        "prompt_en": "a book",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "book"
            }
        }
    },
    {
        "prompt_en": "a clock",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "clock"
            }
        }
    },
    {
        "prompt_en": "a vase",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "vase"
            }
        }
    },
    {
        "prompt_en": "scissors",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "scissors"
            }
        }
    },
    {
        "prompt_en": "a teddy bear",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "teddy bear"
            }
        }
    },
    {
        "prompt_en": "a hair drier",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "hair drier"
            }
        }
    },
    {
        "prompt_en": "a toothbrush",
        "dimension": [
            "object_class"
        ],
        "auxiliary_info": {
            "object_class": {
                "object": "toothbrush"
            }
        }
    },
    {
        "prompt_en": "a red bicycle",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "red"
            }
        }
    },
    {
        "prompt_en": "a green bicycle",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "green"
            }
        }
    },
    {
        "prompt_en": "a blue bicycle",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "blue"
            }
        }
    },
    {
        "prompt_en": "a yellow bicycle",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "yellow"
            }
        }
    },
    {
        "prompt_en": "an orange bicycle",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "orange"
            }
        }
    },
    {
        "prompt_en": "a purple bicycle",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "purple"
            }
        }
    },
    {
        "prompt_en": "a pink bicycle",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "pink"
            }
        }
    },
    {
        "prompt_en": "a black bicycle",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "black"
            }
        }
    },
    {
        "prompt_en": "a white bicycle",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "white"
            }
        }
    },
    {
        "prompt_en": "a red car",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "red"
            }
        }
    },
    {
        "prompt_en": "a green car",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "green"
            }
        }
    },
    {
        "prompt_en": "a blue car",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "blue"
            }
        }
    },
    {
        "prompt_en": "a yellow car",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "yellow"
            }
        }
    },
    {
        "prompt_en": "an orange car",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "orange"
            }
        }
    },
    {
        "prompt_en": "a purple car",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "purple"
            }
        }
    },
    {
        "prompt_en": "a pink car",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "pink"
            }
        }
    },
    {
        "prompt_en": "a black car",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "black"
            }
        }
    },
    {
        "prompt_en": "a white car",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "white"
            }
        }
    },
    {
        "prompt_en": "a red bird",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "red"
            }
        }
    },
    {
        "prompt_en": "a green bird",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "green"
            }
        }
    },
    {
        "prompt_en": "a blue bird",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "blue"
            }
        }
    },
    {
        "prompt_en": "a yellow bird",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "yellow"
            }
        }
    },
    {
        "prompt_en": "an orange bird",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "orange"
            }
        }
    },
    {
        "prompt_en": "a purple bird",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "purple"
            }
        }
    },
    {
        "prompt_en": "a pink bird",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "pink"
            }
        }
    },
    {
        "prompt_en": "a black bird",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "black"
            }
        }
    },
    {
        "prompt_en": "a white bird",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "white"
            }
        }
    },
    {
        "prompt_en": "a black cat",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "black"
            }
        }
    },
    {
        "prompt_en": "a white cat",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "white"
            }
        }
    },
    {
        "prompt_en": "an orange cat",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "orange"
            }
        }
    },
    {
        "prompt_en": "a yellow cat",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "yellow"
            }
        }
    },
    {
        "prompt_en": "a red umbrella",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "red"
            }
        }
    },
    {
        "prompt_en": "a green umbrella",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "green"
            }
        }
    },
    {
        "prompt_en": "a blue umbrella",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "blue"
            }
        }
    },
    {
        "prompt_en": "a yellow umbrella",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "yellow"
            }
        }
    },
    {
        "prompt_en": "an orange umbrella",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "orange"
            }
        }
    },
    {
        "prompt_en": "a purple umbrella",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "purple"
            }
        }
    },
    {
        "prompt_en": "a pink umbrella",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "pink"
            }
        }
    },
    {
        "prompt_en": "a black umbrella",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "black"
            }
        }
    },
    {
        "prompt_en": "a white umbrella",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "white"
            }
        }
    },
    {
        "prompt_en": "a red suitcase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "red"
            }
        }
    },
    {
        "prompt_en": "a green suitcase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "green"
            }
        }
    },
    {
        "prompt_en": "a blue suitcase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "blue"
            }
        }
    },
    {
        "prompt_en": "a yellow suitcase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "yellow"
            }
        }
    },
    {
        "prompt_en": "an orange suitcase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "orange"
            }
        }
    },
    {
        "prompt_en": "a purple suitcase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "purple"
            }
        }
    },
    {
        "prompt_en": "a pink suitcase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "pink"
            }
        }
    },
    {
        "prompt_en": "a black suitcase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "black"
            }
        }
    },
    {
        "prompt_en": "a white suitcase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "white"
            }
        }
    },
    {
        "prompt_en": "a red bowl",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "red"
            }
        }
    },
    {
        "prompt_en": "a green bowl",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "green"
            }
        }
    },
    {
        "prompt_en": "a blue bowl",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "blue"
            }
        }
    },
    {
        "prompt_en": "a yellow bowl",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "yellow"
            }
        }
    },
    {
        "prompt_en": "an orange bowl",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "orange"
            }
        }
    },
    {
        "prompt_en": "a purple bowl",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "purple"
            }
        }
    },
    {
        "prompt_en": "a pink bowl",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "pink"
            }
        }
    },
    {
        "prompt_en": "a black bowl",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "black"
            }
        }
    },
    {
        "prompt_en": "a white bowl",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "white"
            }
        }
    },
    {
        "prompt_en": "a red chair",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "red"
            }
        }
    },
    {
        "prompt_en": "a green chair",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "green"
            }
        }
    },
    {
        "prompt_en": "a blue chair",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "blue"
            }
        }
    },
    {
        "prompt_en": "a yellow chair",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "yellow"
            }
        }
    },
    {
        "prompt_en": "an orange chair",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "orange"
            }
        }
    },
    {
        "prompt_en": "a purple chair",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "purple"
            }
        }
    },
    {
        "prompt_en": "a pink chair",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "pink"
            }
        }
    },
    {
        "prompt_en": "a black chair",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "black"
            }
        }
    },
    {
        "prompt_en": "a white chair",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "white"
            }
        }
    },
    {
        "prompt_en": "a red clock",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "red"
            }
        }
    },
    {
        "prompt_en": "a green clock",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "green"
            }
        }
    },
    {
        "prompt_en": "a blue clock",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "blue"
            }
        }
    },
    {
        "prompt_en": "a yellow clock",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "yellow"
            }
        }
    },
    {
        "prompt_en": "an orange clock",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "orange"
            }
        }
    },
    {
        "prompt_en": "a purple clock",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "purple"
            }
        }
    },
    {
        "prompt_en": "a pink clock",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "pink"
            }
        }
    },
    {
        "prompt_en": "a black clock",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "black"
            }
        }
    },
    {
        "prompt_en": "a white clock",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "white"
            }
        }
    },
    {
        "prompt_en": "a red vase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "red"
            }
        }
    },
    {
        "prompt_en": "a green vase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "green"
            }
        }
    },
    {
        "prompt_en": "a blue vase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "blue"
            }
        }
    },
    {
        "prompt_en": "a yellow vase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "yellow"
            }
        }
    },
    {
        "prompt_en": "an orange vase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "orange"
            }
        }
    },
    {
        "prompt_en": "a purple vase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "purple"
            }
        }
    },
    {
        "prompt_en": "a pink vase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "pink"
            }
        }
    },
    {
        "prompt_en": "a black vase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "black"
            }
        }
    },
    {
        "prompt_en": "a white vase",
        "dimension": [
            "color"
        ],
        "auxiliary_info": {
            "color": {
                "color": "white"
            }
        }
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, Van Gogh style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "Van Gogh style"
            }
        }
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, oil painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "oil painting"
            }
        }
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "by Hokusai, in the style of Ukiyo"
            }
        }
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, black and white",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "black and white"
            }
        }
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pixel art",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "pixel art"
            }
        }
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, in cyberpunk style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "in cyberpunk style"
            }
        }
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, animated style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "animated style"
            }
        }
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, watercolor painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "watercolor painting"
            }
        }
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, surrealism style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "surrealism style"
            }
        }
    },
    {
        "prompt_en": "The bund Shanghai, Van Gogh style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "Van Gogh style"
            }
        }
    },
    {
        "prompt_en": "The bund Shanghai, oil painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "oil painting"
            }
        }
    },
    {
        "prompt_en": "The bund Shanghai by Hokusai, in the style of Ukiyo",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "by Hokusai, in the style of Ukiyo"
            }
        }
    },
    {
        "prompt_en": "The bund Shanghai, black and white",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "black and white"
            }
        }
    },
    {
        "prompt_en": "The bund Shanghai, pixel art",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "pixel art"
            }
        }
    },
    {
        "prompt_en": "The bund Shanghai, in cyberpunk style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "in cyberpunk style"
            }
        }
    },
    {
        "prompt_en": "The bund Shanghai, animated style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "animated style"
            }
        }
    },
    {
        "prompt_en": "The bund Shanghai, watercolor painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "watercolor painting"
            }
        }
    },
    {
        "prompt_en": "The bund Shanghai, surrealism style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "surrealism style"
            }
        }
    },
    {
        "prompt_en": "a shark is swimming in the ocean, Van Gogh style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "Van Gogh style"
            }
        }
    },
    {
        "prompt_en": "a shark is swimming in the ocean, oil painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "oil painting"
            }
        }
    },
    {
        "prompt_en": "a shark is swimming in the ocean by Hokusai, in the style of Ukiyo",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "by Hokusai, in the style of Ukiyo"
            }
        }
    },
    {
        "prompt_en": "a shark is swimming in the ocean, black and white",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "black and white"
            }
        }
    },
    {
        "prompt_en": "a shark is swimming in the ocean, pixel art",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "pixel art"
            }
        }
    },
    {
        "prompt_en": "a shark is swimming in the ocean, in cyberpunk style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "in cyberpunk style"
            }
        }
    },
    {
        "prompt_en": "a shark is swimming in the ocean, animated style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "animated style"
            }
        }
    },
    {
        "prompt_en": "a shark is swimming in the ocean, watercolor painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "watercolor painting"
            }
        }
    },
    {
        "prompt_en": "a shark is swimming in the ocean, surrealism style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "surrealism style"
            }
        }
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, Van Gogh style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "Van Gogh style"
            }
        }
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, oil painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "oil painting"
            }
        }
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris by Hokusai, in the style of Ukiyo",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "by Hokusai, in the style of Ukiyo"
            }
        }
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, black and white",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "black and white"
            }
        }
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, pixel art",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "pixel art"
            }
        }
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, in cyberpunk style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "in cyberpunk style"
            }
        }
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, animated style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "animated style"
            }
        }
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, watercolor painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "watercolor painting"
            }
        }
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, surrealism style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "surrealism style"
            }
        }
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, Van Gogh style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "Van Gogh style"
            }
        }
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, oil painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "oil painting"
            }
        }
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset by Hokusai, in the style of Ukiyo",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "by Hokusai, in the style of Ukiyo"
            }
        }
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, black and white",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "black and white"
            }
        }
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, pixel art",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "pixel art"
            }
        }
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, in cyberpunk style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "in cyberpunk style"
            }
        }
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, animated style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "animated style"
            }
        }
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, watercolor painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "watercolor painting"
            }
        }
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, surrealism style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "surrealism style"
            }
        }
    },
    {
        "prompt_en": "Gwen Stacy reading a book, Van Gogh style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "Van Gogh style"
            }
        }
    },
    {
        "prompt_en": "Gwen Stacy reading a book, oil painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "oil painting"
            }
        }
    },
    {
        "prompt_en": "Gwen Stacy reading a book by Hokusai, in the style of Ukiyo",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "by Hokusai, in the style of Ukiyo"
            }
        }
    },
    {
        "prompt_en": "Gwen Stacy reading a book, black and white",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "black and white"
            }
        }
    },
    {
        "prompt_en": "Gwen Stacy reading a book, pixel art",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "pixel art"
            }
        }
    },
    {
        "prompt_en": "Gwen Stacy reading a book, in cyberpunk style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "in cyberpunk style"
            }
        }
    },
    {
        "prompt_en": "Gwen Stacy reading a book, animated style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "animated style"
            }
        }
    },
    {
        "prompt_en": "Gwen Stacy reading a book, watercolor painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "watercolor painting"
            }
        }
    },
    {
        "prompt_en": "Gwen Stacy reading a book, surrealism style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "surrealism style"
            }
        }
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, Van Gogh style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "Van Gogh style"
            }
        }
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, oil painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "oil painting"
            }
        }
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Hokusai, in the style of Ukiyo",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "by Hokusai, in the style of Ukiyo"
            }
        }
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, black and white",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "black and white"
            }
        }
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pixel art",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "pixel art"
            }
        }
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in cyberpunk style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "in cyberpunk style"
            }
        }
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, animated style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "animated style"
            }
        }
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, watercolor painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "watercolor painting"
            }
        }
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, surrealism style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "surrealism style"
            }
        }
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, Van Gogh style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "Van Gogh style"
            }
        }
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, oil painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "oil painting"
            }
        }
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas by Hokusai, in the style of Ukiyo",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "by Hokusai, in the style of Ukiyo"
            }
        }
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, black and white",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "black and white"
            }
        }
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pixel art",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "pixel art"
            }
        }
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in cyberpunk style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "in cyberpunk style"
            }
        }
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, animated style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "animated style"
            }
        }
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, watercolor painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "watercolor painting"
            }
        }
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, surrealism style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "surrealism style"
            }
        }
    },
    {
        "prompt_en": "An astronaut flying in space, Van Gogh style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "Van Gogh style"
            }
        }
    },
    {
        "prompt_en": "An astronaut flying in space, oil painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "oil painting"
            }
        }
    },
    {
        "prompt_en": "An astronaut flying in space by Hokusai, in the style of Ukiyo",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "by Hokusai, in the style of Ukiyo"
            }
        }
    },
    {
        "prompt_en": "An astronaut flying in space, black and white",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "black and white"
            }
        }
    },
    {
        "prompt_en": "An astronaut flying in space, pixel art",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "pixel art"
            }
        }
    },
    {
        "prompt_en": "An astronaut flying in space, in cyberpunk style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "in cyberpunk style"
            }
        }
    },
    {
        "prompt_en": "An astronaut flying in space, animated style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "animated style"
            }
        }
    },
    {
        "prompt_en": "An astronaut flying in space, watercolor painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "watercolor painting"
            }
        }
    },
    {
        "prompt_en": "An astronaut flying in space, surrealism style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "surrealism style"
            }
        }
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, Van Gogh style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "Van Gogh style"
            }
        }
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, oil painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "oil painting"
            }
        }
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks by Hokusai, in the style of Ukiyo",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "by Hokusai, in the style of Ukiyo"
            }
        }
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, black and white",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "black and white"
            }
        }
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pixel art",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "pixel art"
            }
        }
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in cyberpunk style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "in cyberpunk style"
            }
        }
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, animated style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "animated style"
            }
        }
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, watercolor painting",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "watercolor painting"
            }
        }
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, surrealism style",
        "dimension": [
            "appearance_style"
        ],
        "auxiliary_info": {
            "appearance_style": {
                "appearance_style": "surrealism style"
            }
        }
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, in super slow motion",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, zoom in",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, zoom out",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pan left",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pan right",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, tilt up",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, tilt down",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, with an intense shaking effect",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, featuring a steady and smooth perspective",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, racking focus",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "The bund Shanghai, in super slow motion",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "The bund Shanghai, zoom in",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "The bund Shanghai, zoom out",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "The bund Shanghai, pan left",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "The bund Shanghai, pan right",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "The bund Shanghai, tilt up",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "The bund Shanghai, tilt down",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "The bund Shanghai, with an intense shaking effect",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "The bund Shanghai, featuring a steady and smooth perspective",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "The bund Shanghai, racking focus",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "a shark is swimming in the ocean, in super slow motion",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "a shark is swimming in the ocean, zoom in",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "a shark is swimming in the ocean, zoom out",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "a shark is swimming in the ocean, pan left",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "a shark is swimming in the ocean, pan right",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "a shark is swimming in the ocean, tilt up",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "a shark is swimming in the ocean, tilt down",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "a shark is swimming in the ocean, with an intense shaking effect",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "a shark is swimming in the ocean, featuring a steady and smooth perspective",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "a shark is swimming in the ocean, racking focus",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, in super slow motion",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, zoom in",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, zoom out",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, pan left",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, pan right",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, tilt up",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, tilt down",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, with an intense shaking effect",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, featuring a steady and smooth perspective",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris, racking focus",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, in super slow motion",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, zoom in",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, zoom out",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, pan left",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, pan right",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, tilt up",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, tilt down",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, with an intense shaking effect",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, featuring a steady and smooth perspective",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset, racking focus",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Gwen Stacy reading a book, in super slow motion",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Gwen Stacy reading a book, zoom in",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Gwen Stacy reading a book, zoom out",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Gwen Stacy reading a book, pan left",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Gwen Stacy reading a book, pan right",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Gwen Stacy reading a book, tilt up",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Gwen Stacy reading a book, tilt down",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Gwen Stacy reading a book, with an intense shaking effect",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Gwen Stacy reading a book, featuring a steady and smooth perspective",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Gwen Stacy reading a book, racking focus",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in super slow motion",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom in",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom out",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan left",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan right",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt up",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt down",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, with an intense shaking effect",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, featuring a steady and smooth perspective",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, racking focus",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in super slow motion",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom in",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom out",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan left",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan right",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt up",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt down",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, with an intense shaking effect",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, featuring a steady and smooth perspective",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, racking focus",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "An astronaut flying in space, in super slow motion",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "An astronaut flying in space, zoom in",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "An astronaut flying in space, zoom out",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "An astronaut flying in space, pan left",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "An astronaut flying in space, pan right",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "An astronaut flying in space, tilt up",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "An astronaut flying in space, tilt down",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "An astronaut flying in space, with an intense shaking effect",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "An astronaut flying in space, featuring a steady and smooth perspective",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "An astronaut flying in space, racking focus",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in super slow motion",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom in",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom out",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan left",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan right",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt up",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt down",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, with an intense shaking effect",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, featuring a steady and smooth perspective",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, racking focus",
        "dimension": [
            "temporal_style"
        ]
    },
    {
        "prompt_en": "Close up of grapes on a rotating table.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Turtle swimming in ocean.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A storm trooper vacuuming the beach.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A panda standing on a surfboard in the ocean in sunset.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "An astronaut feeding ducks on a sunny afternoon, reflection from the water.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Two pandas discussing an academic paper.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Sunset time lapse at the beach with moving clouds and colors in the sky.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A fat rabbit wearing a purple robe walking through a fantasy landscape.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A koala bear playing piano in the forest.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "An astronaut flying in space.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Fireworks.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "An animated painting of fluffy white clouds moving in sky.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Flying through fantasy landscapes.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A bigfoot walking in the snowstorm.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A squirrel eating a burger.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A cat wearing sunglasses and working as a lifeguard at a pool.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Splash of turquoise water in extreme slow motion, alpha channel included.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "an ice cream is melting on the table.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "a drone flying over a snowy forest.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "a shark is swimming in the ocean.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Aerial panoramic video from a drone of a fantasy land.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "a teddy bear is swimming in the ocean.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "time lapse of sunrise on mars.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "golden fish swimming in the ocean.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "An artist brush painting on a canvas close up.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A drone view of celebration with Christmas tree and fireworks, starry sky - background.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "happy dog wearing a yellow turtleneck, studio, portrait, facing camera, dark background",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Origami dancers in white paper, 3D render, on white background, studio shot, dancing modern dance.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Campfire at night in a snowy forest with starry sky in the background.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "a fantasy landscape",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A 3D model of a 1800s victorian house.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "this is how I do makeup in the morning.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A raccoon that looks like a turtle, digital art.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Robot dancing in Times Square.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Busy freeway at night.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Balloon full of water exploding in extreme slow motion.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "An astronaut is riding a horse in the space in a photorealistic style.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Macro slo-mo. Slow motion cropped closeup of roasted coffee beans falling into an empty bowl.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Sewing machine, old sewing machine working.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Motion colour drop in water, ink swirling in water, colourful ink in water, abstraction fancy dream cloud of ink.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Few big purple plums rotating on the turntable. water drops appear on the skin during rotation. isolated on the white background. close-up. macro.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Vampire makeup face of beautiful girl, red contact lenses.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Ashtray full of butts on table, smoke flowing on black background, close-up",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Pacific coast, carmel by the sea ocean and waves.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A teddy bear is playing drum kit in NYC Times Square.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A corgi is playing drum kit.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "An Iron man is playing the electronic guitar, high electronic guitar.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A raccoon is playing the electronic guitar.",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Vincent van Gogh",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A corgi's head depicted as an explosion of a nebula",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A fantasy landscape",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A future where humans have achieved teleportation technology",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A jellyfish floating through the ocean, with bioluminescent tentacles",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A Mars rover moving on Mars",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A panda drinking coffee in a cafe in Paris",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A space shuttle launching into orbit, with flames and smoke billowing out from the engines",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A steam train moving on a mountainside",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A super cool giant robot in Cyberpunk Beijing",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A tropical beach at sunrise, with palm trees and crystal-clear water in the foreground",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Cinematic shot of Van Gogh's selfie, Van Gogh style",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Gwen Stacy reading a book",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Iron Man flying in the sky",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "The bund Shanghai, oil painting",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Yoda playing guitar on the stage",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Vincent van Gogh",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A car moving slowly on an empty street, rainy evening",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A cat eating food out of a bowl",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A cat wearing sunglasses at a pool",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A confused panda in calculus class",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A cute fluffy panda eating Chinese food in a restaurant",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A cute happy Corgi playing in park, sunset",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A cute raccoon playing guitar in a boat on the ocean",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A happy fuzzy panda playing guitar nearby a campfire, snow mountain in the background",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A lightning striking atop of eiffel tower, dark clouds in the sky",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A modern art museum, with colorful paintings",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A panda cooking in the kitchen",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A panda playing on a swing set",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A polar bear is playing guitar",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A raccoon dressed in suit playing the trumpet, stage background",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A robot DJ is playing the turntable, in heavy raining futuristic tokyo rooftop cyberpunk night, sci-fi, fantasy",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A shark swimming in clear Caribbean ocean",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A super robot protecting city",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "A teddy bear washing the dishes",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "An epic tornado attacking above a glowing city at night, the tornado is made of smoke",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with umbrellas",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Clown fish swimming through the coral reef",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Hyper-realistic spaceship landing on Mars",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "The bund Shanghai, vibrant color",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Vincent van Gogh is painting in the room",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "Yellow flowers swing in the wind",
        "dimension": [
            "overall_consistency",
            "aesthetic_quality",
            "imaging_quality"
        ]
    },
    {
        "prompt_en": "alley",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "alley"
                }
            }
        }
    },
    {
        "prompt_en": "amusement park",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "amusement park"
                }
            }
        }
    },
    {
        "prompt_en": "aquarium",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "aquarium"
                }
            }
        }
    },
    {
        "prompt_en": "arch",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "arch"
                }
            }
        }
    },
    {
        "prompt_en": "art gallery",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "art gallery"
                }
            }
        }
    },
    {
        "prompt_en": "bathroom",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "bathroom"
                }
            }
        }
    },
    {
        "prompt_en": "bakery shop",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "bakery shop"
                }
            }
        }
    },
    {
        "prompt_en": "ballroom",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "ballroom"
                }
            }
        }
    },
    {
        "prompt_en": "bar",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "bar"
                }
            }
        }
    },
    {
        "prompt_en": "barn",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "barn"
                }
            }
        }
    },
    {
        "prompt_en": "basement",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "basement"
                }
            }
        }
    },
    {
        "prompt_en": "beach",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "beach"
                }
            }
        }
    },
    {
        "prompt_en": "bedroom",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "bedroom"
                }
            }
        }
    },
    {
        "prompt_en": "bridge",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "bridge"
                }
            }
        }
    },
    {
        "prompt_en": "botanical garden",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "botanical garden"
                }
            }
        }
    },
    {
        "prompt_en": "cafeteria",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "cafeteria"
                }
            }
        }
    },
    {
        "prompt_en": "campsite",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "campsite"
                }
            }
        }
    },
    {
        "prompt_en": "campus",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "campus"
                }
            }
        }
    },
    {
        "prompt_en": "carrousel",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "carrousel"
                }
            }
        }
    },
    {
        "prompt_en": "castle",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "castle"
                }
            }
        }
    },
    {
        "prompt_en": "cemetery",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "cemetery"
                }
            }
        }
    },
    {
        "prompt_en": "classroom",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "classroom"
                }
            }
        }
    },
    {
        "prompt_en": "cliff",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "cliff"
                }
            }
        }
    },
    {
        "prompt_en": "crosswalk",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "crosswalk"
                }
            }
        }
    },
    {
        "prompt_en": "construction site",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "construction site"
                }
            }
        }
    },
    {
        "prompt_en": "corridor",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "corridor"
                }
            }
        }
    },
    {
        "prompt_en": "courtyard",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "courtyard"
                }
            }
        }
    },
    {
        "prompt_en": "desert",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "desert"
                }
            }
        }
    },
    {
        "prompt_en": "downtown",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "downtown"
                }
            }
        }
    },
    {
        "prompt_en": "driveway",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "driveway"
                }
            }
        }
    },
    {
        "prompt_en": "farm",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "farm"
                }
            }
        }
    },
    {
        "prompt_en": "food court",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "food court"
                }
            }
        }
    },
    {
        "prompt_en": "football field",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "football field"
                }
            }
        }
    },
    {
        "prompt_en": "forest road",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "forest road"
                }
            }
        }
    },
    {
        "prompt_en": "fountain",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "fountain"
                }
            }
        }
    },
    {
        "prompt_en": "gas station",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "gas station"
                }
            }
        }
    },
    {
        "prompt_en": "glacier",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "glacier"
                }
            }
        }
    },
    {
        "prompt_en": "golf course",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "golf course"
                }
            }
        }
    },
    {
        "prompt_en": "indoor gymnasium",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "indoor gymnasium"
                }
            }
        }
    },
    {
        "prompt_en": "harbor",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "harbor"
                }
            }
        }
    },
    {
        "prompt_en": "highway",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "highway"
                }
            }
        }
    },
    {
        "prompt_en": "hospital",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "hospital"
                }
            }
        }
    },
    {
        "prompt_en": "house",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "house"
                }
            }
        }
    },
    {
        "prompt_en": "iceberg",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "iceberg"
                }
            }
        }
    },
    {
        "prompt_en": "industrial area",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "industrial area"
                }
            }
        }
    },
    {
        "prompt_en": "jail cell",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "jail cell"
                }
            }
        }
    },
    {
        "prompt_en": "junkyard",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "junkyard"
                }
            }
        }
    },
    {
        "prompt_en": "kitchen",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "kitchen"
                }
            }
        }
    },
    {
        "prompt_en": "indoor library",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "indoor library"
                }
            }
        }
    },
    {
        "prompt_en": "lighthouse",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "lighthouse"
                }
            }
        }
    },
    {
        "prompt_en": "laboratory",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "laboratory"
                }
            }
        }
    },
    {
        "prompt_en": "mansion",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "mansion"
                }
            }
        }
    },
    {
        "prompt_en": "marsh",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "marsh"
                }
            }
        }
    },
    {
        "prompt_en": "mountain",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "mountain"
                }
            }
        }
    },
    {
        "prompt_en": "indoor movie theater",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "indoor movie theater"
                }
            }
        }
    },
    {
        "prompt_en": "indoor museum",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "indoor museum"
                }
            }
        }
    },
    {
        "prompt_en": "music studio",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "music studio"
                }
            }
        }
    },
    {
        "prompt_en": "nursery",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "nursery"
                }
            }
        }
    },
    {
        "prompt_en": "ocean",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "ocean"
                }
            }
        }
    },
    {
        "prompt_en": "office",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "office"
                }
            }
        }
    },
    {
        "prompt_en": "palace",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "palace"
                }
            }
        }
    },
    {
        "prompt_en": "parking lot",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "parking lot"
                }
            }
        }
    },
    {
        "prompt_en": "pharmacy",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "pharmacy"
                }
            }
        }
    },
    {
        "prompt_en": "phone booth",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "phone booth"
                }
            }
        }
    },
    {
        "prompt_en": "raceway",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "raceway"
                }
            }
        }
    },
    {
        "prompt_en": "restaurant",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "restaurant"
                }
            }
        }
    },
    {
        "prompt_en": "river",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "river"
                }
            }
        }
    },
    {
        "prompt_en": "science museum",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "science museum"
                }
            }
        }
    },
    {
        "prompt_en": "shower",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "shower"
                }
            }
        }
    },
    {
        "prompt_en": "ski slope",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "ski slope"
                }
            }
        }
    },
    {
        "prompt_en": "sky",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "sky"
                }
            }
        }
    },
    {
        "prompt_en": "skyscraper",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "skyscraper"
                }
            }
        }
    },
    {
        "prompt_en": "baseball stadium",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "baseball stadium"
                }
            }
        }
    },
    {
        "prompt_en": "staircase",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "staircase"
                }
            }
        }
    },
    {
        "prompt_en": "street",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "street"
                }
            }
        }
    },
    {
        "prompt_en": "supermarket",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "supermarket"
                }
            }
        }
    },
    {
        "prompt_en": "indoor swimming pool",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "indoor swimming pool"
                }
            }
        }
    },
    {
        "prompt_en": "tower",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "tower"
                }
            }
        }
    },
    {
        "prompt_en": "outdoor track",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "outdoor track"
                }
            }
        }
    },
    {
        "prompt_en": "train railway",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "train railway"
                }
            }
        }
    },
    {
        "prompt_en": "train station platform",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "train station platform"
                }
            }
        }
    },
    {
        "prompt_en": "underwater coral reef",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "underwater coral reef"
                }
            }
        }
    },
    {
        "prompt_en": "valley",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "valley"
                }
            }
        }
    },
    {
        "prompt_en": "volcano",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "volcano"
                }
            }
        }
    },
    {
        "prompt_en": "waterfall",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "waterfall"
                }
            }
        }
    },
    {
        "prompt_en": "windmill",
        "dimension": [
            "scene",
            "background_consistency"
        ],
        "auxiliary_info": {
            "scene": {
                "scene": {
                    "scene": "windmill"
                }
            }
        }
    },
    {
        "prompt_en": "a bicycle on the left of a car, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "bicycle",
                    "object_b": "car",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a car on the right of a motorcycle, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "car",
                    "object_b": "motorcycle",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a motorcycle on the left of a bus, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "motorcycle",
                    "object_b": "bus",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a bus on the right of a traffic light, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "bus",
                    "object_b": "traffic light",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a traffic light on the left of a fire hydrant, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "traffic light",
                    "object_b": "fire hydrant",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a fire hydrant on the right of a stop sign, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "fire hydrant",
                    "object_b": "stop sign",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a stop sign on the left of a parking meter, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "stop sign",
                    "object_b": "parking meter",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a parking meter on the right of a bench, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "parking meter",
                    "object_b": "bench",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a bench on the left of a truck, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "bench",
                    "object_b": "truck",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a truck on the right of a bicycle, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "truck",
                    "object_b": "bicycle",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a bird on the left of a cat, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "bird",
                    "object_b": "cat",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a cat on the right of a dog, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "cat",
                    "object_b": "dog",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a dog on the left of a horse, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "dog",
                    "object_b": "horse",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a horse on the right of a sheep, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "horse",
                    "object_b": "sheep",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a sheep on the left of a cow, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "sheep",
                    "object_b": "cow",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a cow on the right of an elephant, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "cow",
                    "object_b": "elephant",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "an elephant on the left of a bear, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "elephant",
                    "object_b": "bear",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a bear on the right of a zebra, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "bear",
                    "object_b": "zebra",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a zebra on the left of a giraffe, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "zebra",
                    "object_b": "giraffe",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a giraffe on the right of a bird, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "giraffe",
                    "object_b": "bird",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a bottle on the left of a wine glass, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "bottle",
                    "object_b": "wine glass",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a wine glass on the right of a cup, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "wine glass",
                    "object_b": "cup",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a cup on the left of a fork, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "cup",
                    "object_b": "fork",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a fork on the right of a knife, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "fork",
                    "object_b": "knife",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a knife on the left of a spoon, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "knife",
                    "object_b": "spoon",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a spoon on the right of a bowl, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "spoon",
                    "object_b": "bowl",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a bowl on the left of a bottle, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "bowl",
                    "object_b": "bottle",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a potted plant on the left of a remote, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "potted plant",
                    "object_b": "remote",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a remote on the right of a clock, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "remote",
                    "object_b": "clock",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a clock on the left of a vase, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "clock",
                    "object_b": "vase",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a vase on the right of scissors, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "vase",
                    "object_b": "scissors",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "scissors on the left of a teddy bear, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "scissors",
                    "object_b": "teddy bear",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a teddy bear on the right of a potted plant, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "teddy bear",
                    "object_b": "potted plant",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a frisbee on the left of a sports ball, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "frisbee",
                    "object_b": "sports ball",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a sports ball on the right of a baseball bat, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "sports ball",
                    "object_b": "baseball bat",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a baseball bat on the left of a baseball glove, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "baseball bat",
                    "object_b": "baseball glove",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a baseball glove on the right of a tennis racket, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "baseball glove",
                    "object_b": "tennis racket",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a tennis racket on the left of a frisbee, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "tennis racket",
                    "object_b": "frisbee",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a toilet on the left of a hair drier, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "toilet",
                    "object_b": "hair drier",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a hair drier on the right of a toothbrush, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "hair drier",
                    "object_b": "toothbrush",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a toothbrush on the left of a sink, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "toothbrush",
                    "object_b": "sink",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a sink on the right of a toilet, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "sink",
                    "object_b": "toilet",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a chair on the left of a couch, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "chair",
                    "object_b": "couch",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a couch on the right of a bed, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "couch",
                    "object_b": "bed",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a bed on the left of a tv, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "bed",
                    "object_b": "tv",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a tv on the right of a dining table, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "tv",
                    "object_b": "dining table",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a dining table on the left of a chair, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "dining table",
                    "object_b": "chair",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "an airplane on the left of a train, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "airplane",
                    "object_b": "train",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "a train on the right of a boat, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "train",
                    "object_b": "boat",
                    "relationship": "on the right of"
                }
            }
        }
    },
    {
        "prompt_en": "a boat on the left of an airplane, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "boat",
                    "object_b": "airplane",
                    "relationship": "on the left of"
                }
            }
        }
    },
    {
        "prompt_en": "an oven on the top of a toaster, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "oven",
                    "object_b": "toaster",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "an oven on the bottom of a toaster, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "oven",
                    "object_b": "toaster",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "a toaster on the top of a microwave, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "toaster",
                    "object_b": "microwave",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "a toaster on the bottom of a microwave, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "toaster",
                    "object_b": "microwave",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "a microwave on the top of an oven, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "microwave",
                    "object_b": "oven",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "a microwave on the bottom of an oven, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "microwave",
                    "object_b": "oven",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "a banana on the top of an apple, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "banana",
                    "object_b": "apple",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "a banana on the bottom of an apple, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "banana",
                    "object_b": "apple",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "an apple on the top of a sandwich, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "apple",
                    "object_b": "sandwich",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "an apple on the bottom of a sandwich, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "apple",
                    "object_b": "sandwich",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "a sandwich on the top of an orange, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "sandwich",
                    "object_b": "orange",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "a sandwich on the bottom of an orange, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "sandwich",
                    "object_b": "orange",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "an orange on the top of a carrot, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "orange",
                    "object_b": "carrot",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "an orange on the bottom of a carrot, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "orange",
                    "object_b": "carrot",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "a carrot on the top of a hot dog, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "carrot",
                    "object_b": "hot dog",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "a carrot on the bottom of a hot dog, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "carrot",
                    "object_b": "hot dog",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "a hot dog on the top of a pizza, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "hot dog",
                    "object_b": "pizza",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "a hot dog on the bottom of a pizza, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "hot dog",
                    "object_b": "pizza",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "a pizza on the top of a donut, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "pizza",
                    "object_b": "donut",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "a pizza on the bottom of a donut, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "pizza",
                    "object_b": "donut",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "a donut on the top of broccoli, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "donut",
                    "object_b": "broccoli",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "a donut on the bottom of broccoli, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "donut",
                    "object_b": "broccoli",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "broccoli on the top of a banana, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "broccoli",
                    "object_b": "banana",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "broccoli on the bottom of a banana, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "broccoli",
                    "object_b": "banana",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "skis on the top of a snowboard, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "skis",
                    "object_b": "snowboard",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "skis on the bottom of a snowboard, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "skis",
                    "object_b": "snowboard",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "a snowboard on the top of a kite, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "snowboard",
                    "object_b": "kite",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "a snowboard on the bottom of a kite, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "snowboard",
                    "object_b": "kite",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "a kite on the top of a skateboard, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "kite",
                    "object_b": "skateboard",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "a kite on the bottom of a skateboard, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "kite",
                    "object_b": "skateboard",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "a skateboard on the top of a surfboard, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "skateboard",
                    "object_b": "surfboard",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "a skateboard on the bottom of a surfboard, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "skateboard",
                    "object_b": "surfboard",
                    "relationship": "on the bottom of"
                }
            }
        }
    },
    {
        "prompt_en": "a surfboard on the top of skis, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "surfboard",
                    "object_b": "skis",
                    "relationship": "on the top of"
                }
            }
        }
    },
    {
        "prompt_en": "a surfboard on the bottom of skis, front view",
        "dimension": [
            "spatial_relationship"
        ],
        "auxiliary_info": {
            "spatial_relationship": {
                "spatial_relationship": {
                    "object_a": "surfboard",
                    "object_b": "skis",
                    "relationship": "on the bottom of"
                }
            }
        }
    }
]


================================================
FILE: Open-Sora/eval/vbench/calc_vbench.py
================================================
import argparse
import os
import time

import torch
from vbench import VBench

# VBench prompt/metadata JSON handed to the VBench constructor below. The path is
# relative, so presumably the script has to be started from the project root.
full_info_path = "eval/vbench/VBench_full_info.json"
# Dimensions in evaluation order. `--start`/`--end` slice this list, so the order
# is part of the CLI contract. The lettered comments (a-h) match the index ranges
# that eval/vbench/launch_calc.sh hands to separate GPUs; the per-line times look
# like rough runtimes of each dimension (not verifiable from this file).
dimensions = [
    # a: 10min
    "subject_consistency",  # 4min
    "imaging_quality",  # 6min
    # b: 12min
    "background_consistency",  # 2min
    "motion_smoothness",  # 5min
    "overall_consistency",  # 2min
    "human_action",  # 3min
    # c: 14min
    "multiple_objects",  # 14min
    # d: 14min
    "spatial_relationship",  # 14min
    # e: 12min
    "object_class",  # 12min
    # f: 12min
    "color",  # 12min
    # g: 10.5min
    "aesthetic_quality",  # 2.5min
    "appearance_style",  # 6min
    "temporal_flickering",  # 2min
    # h: 9min
    "scene",  # 3min
    "temporal_style",  # 2min
    "dynamic_degree",  # 4min
]


def parse_args():
    """Parse the command-line arguments of the VBench scoring script.

    Returns:
        argparse.Namespace with ``video_folder``, ``model_ckpt``, ``start``
        and ``end``. ``end`` keeps the sentinel ``-1`` when it is not given;
        the caller is responsible for resolving it.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "video_folder",
        type=str,
        help="folder with the generated videos, e.g. samples/samples..._vbench/eval",
    )
    parser.add_argument(
        "model_ckpt",
        type=str,
        help="checkpoint directory; results are written to <model_ckpt>/vbench",
    )
    parser.add_argument(
        "--start",
        type=int,
        default=0,
        help="index of the first dimension to evaluate (inclusive)",
    )
    parser.add_argument(
        "--end",
        type=int,
        default=-1,
        help="index after the last dimension to evaluate (exclusive); -1 means through the last dimension",
    )

    args = parser.parse_args()
    return args


if __name__ == "__main__":
    # Evaluate the requested slice of `dimensions`, one VBench call per dimension,
    # writing the results below <model_ckpt>/vbench.
    cli = parse_args()
    results_dir = os.path.join(cli.model_ckpt, "vbench")
    os.makedirs(results_dir, exist_ok=True)

    # use VBench/evaluate.py default
    extra_options = {"imaging_quality_preprocessing_mode": "longer"}

    started_at = time.time()

    # NOTE: important to use torch.device("cuda"), else will have issue with object_class third_party module
    evaluator = VBench(torch.device("cuda"), full_info_path, results_dir)
    # -1 is the sentinel for "through the last dimension"
    stop = len(dimensions) if cli.end == -1 else cli.end
    for dimension in dimensions[cli.start : stop]:
        evaluator.evaluate(
            videos_path=cli.video_folder,
            name=dimension,
            local=False,
            read_frame=False,
            dimension_list=[dimension],
            mode="vbench_standard",
            **extra_options,
        )

    print("Runtime: %s seconds " % (time.time() - started_at))


================================================
FILE: Open-Sora/eval/vbench/launch.sh
================================================
#!/bin/bash
# Launch VBench sample generation, one background job per GPU.
#
# Usage:
#   bash eval/vbench/launch.sh CKPT NUM_FRAMES MODEL_NAME \
#       [RES ASP_RATIO [NUM_SAMPLING_STEPS [FLOW [LLM_REFINE]]]]
#
# Every job runs eval/sample.sh with task type -4 on its own prompt index range
# and writes its output to <dirname of CKPT>/eval/<task id>.log. The jobs are
# started in the background and this script does not wait for them.

if [ "$#" -lt 3 ]; then
    echo "Usage: $0 CKPT NUM_FRAMES MODEL_NAME [RES ASP_RATIO [NUM_SAMPLING_STEPS [FLOW [LLM_REFINE]]]]" >&2
    exit 1
fi

CKPT=$1
NUM_FRAMES=$2
MODEL_NAME=$3
RES=$4
ASP_RATIO=$5

NUM_SAMPLING_STEPS=$6
FLOW=$7
LLM_REFINE=$8

# NOTE: CKPT_BASE is computed here but not referenced anywhere below in this script.
if [[ $CKPT == *"ema"* ]]; then
    parentdir=$(dirname "$CKPT")
    CKPT_BASE=$(basename "$parentdir")_ema
else
    CKPT_BASE=$(basename "$CKPT")
fi
LOG_BASE=$(dirname "$CKPT")/eval
echo "Logging to $LOG_BASE"

# Make sure the eval directory exists before logs are redirected into it
mkdir -p "$LOG_BASE"

#GPUS=(0 1 2 3 4 5 6 7)
#TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only
#START_INDEX_LIST=(0 120 240 360 480 600 720 840)
#END_INDEX_LIST=(120 240 360 480 600 720 840 2000)

# Use 6 GPUs
GPUS=(0 1 2 3 4 5)
TASK_ID_LIST=(4a 4b 4c 4d 4e 4f)
# Split the 950 prompts into 6 ranges
START_INDEX_LIST=(0 158 316 474 632 790)
END_INDEX_LIST=(158 316 474 632 790 2000)

# Use 5 GPUs
#GPUS=(0 1 2 3 4)
#TASK_ID_LIST=(4a 4b 4c 4d 4e)
## Split the 950 prompts into 5 ranges
#START_INDEX_LIST=(0 190 380 570 760)
#END_INDEX_LIST=(190 380 570 760 2000)

## Modify the following to run on multiple machines for faster results
## 720p will take quite long on a single machine
# START_INDEX_LIST=(60 180 300 420 540 660 780 900)
# END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
# LOG_BASE=$(dirname $CKPT)/eval/last_60
# mkdir -p ${LOG_BASE}
# echo "Logging to $LOG_BASE"

# Optional trailing arguments for eval/sample.sh. They are positional, so an
# argument is only forwarded when every argument before it was given as well
# (RES and ASP_RATIO are only forwarded as a pair).
EXTRA_ARGS=()
if [ -n "${RES}" ] && [ -n "${ASP_RATIO}" ]; then
    EXTRA_ARGS+=("${RES}" "${ASP_RATIO}")
    if [ -n "${NUM_SAMPLING_STEPS}" ]; then
        EXTRA_ARGS+=("${NUM_SAMPLING_STEPS}")
        if [ -n "${FLOW}" ]; then
            EXTRA_ARGS+=("${FLOW}")
            if [ -n "${LLM_REFINE}" ]; then
                EXTRA_ARGS+=("${LLM_REFINE}")
            fi
        fi
    fi
fi

for i in "${!GPUS[@]}"; do
    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh "$CKPT" "${NUM_FRAMES}" "${MODEL_NAME}" -4 "${START_INDEX_LIST[i]}" "${END_INDEX_LIST[i]}" "${EXTRA_ARGS[@]}" > "${LOG_BASE}/${TASK_ID_LIST[i]}.log" 2>&1 &
done


================================================
FILE: Open-Sora/eval/vbench/launch_calc.sh
================================================
#!/bin/bash
# Compute the VBench metrics in parallel, one slice of dimensions per GPU.
#
# Usage: bash eval/vbench/launch_calc.sh VIDEO_DIR CKPT_DIR
#
# Every job runs eval/vbench/calc_vbench.py in the background and logs to
# CKPT_DIR/<task id>.log. This script does not wait for the jobs to finish.

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 VIDEO_DIR CKPT_DIR" >&2
    exit 1
fi

VIDEO_DIR=$1
CKPT_DIR=$2
LOG_BASE=$CKPT_DIR
mkdir -p "$LOG_BASE"
echo "Logging to $LOG_BASE"

GPUS=(0 1 2 3 4 5 6 7)
# Passed as --start/--end to calc_vbench.py, which uses them to slice its
# `dimensions` list; the eight ranges together cover indices 0 through 15.
START_INDEX_LIST=(0 2 6 7 8 9 10 13)
END_INDEX_LIST=(2 6 7 8 9 10 13 16)
TASK_ID_LIST=(calc_vbench_a calc_vbench_b calc_vbench_c calc_vbench_d calc_vbench_e calc_vbench_f calc_vbench_g calc_vbench_h) # for log records only

for i in "${!GPUS[@]}"; do
    CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench/calc_vbench.py "$VIDEO_DIR" "$CKPT_DIR" --start "${START_INDEX_LIST[i]}" --end "${END_INDEX_LIST[i]}" > "${LOG_BASE}/${TASK_ID_LIST[i]}.log" 2>&1 &
done


================================================
FILE: Open-Sora/eval/vbench/tabulate_vbench_scores.py
================================================
import argparse
import json
import os

# Relative weights of the semantic and the quality score in the total score.
SEMANTIC_WEIGHT = 1
QUALITY_WEIGHT = 4

# Dimensions averaged into the quality score.
QUALITY_LIST = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "aesthetic quality",
    "imaging quality",
    "dynamic degree",
]

# Dimensions averaged into the semantic score.
SEMANTIC_LIST = [
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency",
]

# Per-dimension bounds used to rescale a raw score: (score - Min) / (Max - Min).
NORMALIZE_DIC = {
    name: {"Min": low, "Max": high}
    for name, low, high in (
        ("subject consistency", 0.1462, 1.0),
        ("background consistency", 0.2615, 1.0),
        ("temporal flickering", 0.6293, 1.0),
        ("motion smoothness", 0.706, 0.9975),
        ("dynamic degree", 0.0, 1.0),
        ("aesthetic quality", 0.0, 1.0),
        ("imaging quality", 0.0, 1.0),
        ("object class", 0.0, 1.0),
        ("multiple objects", 0.0, 1.0),
        ("human action", 0.0, 1.0),
        ("color", 0.0, 1.0),
        ("spatial relationship", 0.0, 1.0),
        ("scene", 0.0, 0.8222),
        ("appearance style", 0.0009, 0.2855),
        ("temporal style", 0.0, 0.364),
        ("overall consistency", 0.0, 0.364),
    )
}

# Every dimension weighs 1 inside its group, except "dynamic degree" (0.5).
DIM_WEIGHT = dict.fromkeys(QUALITY_LIST + SEMANTIC_LIST, 1)
DIM_WEIGHT["dynamic degree"] = 0.5

# Output order of the scaled results: the three aggregate scores first, then
# the individual dimensions in the order they are listed in NORMALIZE_DIC.
ordered_scaled_res = ["total score", "quality score", "semantic score", *NORMALIZE_DIC]


def parse_args():
    """Parse the command line and return the resulting namespace.

    The only option is ``--score_dir``; it has no default, so the attribute
    is ``None`` when the option is omitted.
    """
    cli = argparse.ArgumentParser()
    # directory with the VBench result files, e.g. ckpt_dir/eval/vbench
    cli.add_argument("--score_dir", type=str)
    return cli.parse_args()


if __name__ == "__main__":
    # Gather the per-dimension VBench results found in --score_dir, rescale them,
    # aggregate quality/semantic/total scores and write two JSON summaries back
    # into the same directory.
    args = parse_args()
    score_dir = args.score_dir
    res_postfix = "_eval_results.json"
    info_postfix = "_full_info.json"

    res_files, info_files = [], []
    for file_name in os.listdir(score_dir):
        if res_postfix in file_name:
            res_files.append(file_name)
        if info_postfix in file_name:
            info_files.append(file_name)
    assert len(res_files) == len(info_files), f"got {len(res_files)} res files, but {len(info_files)} info files"

    full_results = {}
    for res_file in res_files:
        # sanity check first: the matching info file must list at least one video
        info_file = res_file.partition(res_postfix)[0] + info_postfix
        with open(os.path.join(score_dir, info_file), "r", encoding="utf-8") as f:
            info = json.load(f)
        assert len(info[0]["video_list"]) > 0, f"Error: {info_file} has 0 video list"
        # then read the scores; the first element of each value is kept, as a 4-decimal string
        with open(os.path.join(score_dir, res_file), "r", encoding="utf-8") as f:
            raw_scores = json.load(f)
        full_results.update({key: format(val[0], ".4f") for key, val in raw_scores.items()})

    scaled_results = {}
    dims = set()
    for key, val in full_results.items():
        # result keys use underscores, the lookup tables use spaces
        dim = key.replace("_", " ")
        bounds = NORMALIZE_DIC[dim]
        low, high = bounds["Min"], bounds["Max"]
        scaled_results[dim] = (float(val) - low) / (high - low) * DIM_WEIGHT[dim]
        dims.add(dim)

    assert len(dims) == len(NORMALIZE_DIC), f"{set(NORMALIZE_DIC.keys())-dims} not calculated yet"

    # weighted means over each group of dimensions
    quality_score = sum(scaled_results[d] for d in QUALITY_LIST) / sum(DIM_WEIGHT[d] for d in QUALITY_LIST)
    semantic_score = sum(scaled_results[d] for d in SEMANTIC_LIST) / sum(DIM_WEIGHT[d] for d in SEMANTIC_LIST)
    weighted_total = quality_score * QUALITY_WEIGHT + semantic_score * SEMANTIC_WEIGHT
    scaled_results["quality score"] = quality_score
    scaled_results["semantic score"] = semantic_score
    scaled_results["total score"] = weighted_total / (QUALITY_WEIGHT + SEMANTIC_WEIGHT)

    # one single-entry dict per score, as a percentage string, in the fixed output order
    formated_scaled_results = {
        "items": [{key: format(scaled_results[key] * 100, ".2f") + "%"} for key in ordered_scaled_res]
    }

    outputs = (
        ("all_results.json", full_results),
        ("scaled_results.json", formated_scaled_results),
    )
    for out_name, payload in outputs:
        out_path = os.path.join(score_dir, out_name)
        with open(out_path, "w") as outfile:
            json.dump(payload, outfile, indent=4, sort_keys=True)
        print(f"results saved to: {out_path}")


================================================
FILE: Open-Sora/eval/vbench_i2v/calc_vbench_i2v.py
================================================
import argparse
import os
import time

import torch
from vbench import VBench
from vbench2_beta_i2v import VBenchI2V

# VBench-I2V metadata file handed to both evaluators below. The path is relative,
# so presumably the script has to be started from the project root.
full_info_path = "eval/vbench_i2v/vbench2_i2v_full_info.json"
# Dimensions evaluated with the standard VBench evaluator when --calc_quality is set.
# --start/--end slice this list, so the order matters.
video_quality_dimensions = [
    "subject_consistency",
    "background_consistency",
    "motion_smoothness",
    "dynamic_degree",
    "aesthetic_quality",
    "imaging_quality",
    "temporal_flickering",
]
# Dimensions evaluated with VBenchI2V when --calc_i2v is set (sliced by the same --start/--end).
i2v_dimensions = ["i2v_subject", "i2v_background", "camera_motion"]


def str2bool(v):
    """Convert a command-line value to ``bool``.

    Booleans are returned unchanged. Strings are compared case-insensitively
    against the accepted spellings of true and false.

    Raises:
        argparse.ArgumentTypeError: if the string is not a recognised spelling.
    """
    if isinstance(v, bool):
        return v

    lowered = v.lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")


def parse_args():
    """Parse the command-line arguments of the VBench-I2V scoring script.

    ``--start``/``--end`` are used by the main block to slice the dimension
    list being evaluated; the same pair is applied to both the I2V list and
    the video-quality list.

    Returns:
        argparse.Namespace with ``video_folder``, ``model_ckpt``, ``start``,
        ``end``, ``calc_i2v`` and ``calc_quality``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "video_folder",
        type=str,
        help="folder with the generated videos, e.g. samples/samples..._vbench_i2v/",
    )
    parser.add_argument(
        "model_ckpt",
        type=str,
        help="checkpoint directory; results are written to <model_ckpt>/vbench_i2v",
    )
    parser.add_argument(
        "--start",
        type=int,
        default=0,
        help="index of the first dimension to evaluate (inclusive)",
    )
    parser.add_argument(
        "--end",
        type=int,
        default=-1,
        help="index after the last dimension to evaluate (exclusive); -1 means through the last dimension",
    )
    parser.add_argument(
        "--calc_i2v",
        type=str2bool,
        default=True,
        help="whether to evaluate the I2V dimensions (true/false)",
    )
    parser.add_argument(
        "--calc_quality",
        type=str2bool,
        default=True,
        help="whether to evaluate the video quality dimensions (true/false)",
    )
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    # Score generated image-to-video samples: first the I2V-specific dimensions,
    # then the generic video-quality dimensions, each gated by its own CLI switch.
    # NOTE: the same --start/--end pair slices both dimension lists.
    cli = parse_args()
    results_dir = os.path.join(cli.model_ckpt, "vbench_i2v")
    os.makedirs(results_dir, exist_ok=True)

    started_at = time.time()

    if cli.calc_i2v:
        i2v_evaluator = VBenchI2V(torch.device("cuda"), full_info_path, results_dir)
        # -1 is the sentinel for "through the last dimension"
        stop = len(i2v_dimensions) if cli.end == -1 else cli.end
        for dimension in i2v_dimensions[cli.start : stop]:
            i2v_evaluator.evaluate(
                videos_path=cli.video_folder,
                name=dimension,
                dimension_list=[dimension],
                resolution="1-1",
            )

    # use VBench/evaluate.py default
    quality_options = {"imaging_quality_preprocessing_mode": "longer"}

    if cli.calc_quality:
        quality_evaluator = VBench(torch.device("cuda"), full_info_path, results_dir)
        stop = len(video_quality_dimensions) if cli.end == -1 else cli.end
        for dimension in video_quality_dimensions[cli.start : stop]:
            quality_evaluator.evaluate(
                videos_path=cli.video_folder,
                name=dimension,
                dimension_list=[dimension],
                mode="vbench_standard",
                **quality_options,
            )

    print("Runtime: %s seconds " % (time.time() - started_at))


================================================
FILE: Open-Sora/eval/vbench_i2v/json_to_txt.py
================================================
import json
import os

# Convert the VBench-I2V metadata JSON into a prompt list with one line per
# entry: the English prompt followed by a JSON-like suffix that names the
# reference image and the mask strategy.
RESOLUTIONS = ["1-1", "16-9", "7-4", "8-5"]

# NOTE: hard-coded, machine-specific root of the cropped reference images.
cache_root = "/mnt/jfs-hdd/sora/data/vbench-i2v/crop"
resolution = RESOLUTIONS[0]
json_file = "vbench2_i2v_full_info.json"
save_path = "all_i2v.txt"

# Read through a context manager so the file handle is closed deterministically,
# and decode explicitly as UTF-8 instead of relying on the platform default.
with open(json_file, encoding="utf-8") as f:
    data = json.load(f)
txt = [
    f'{x["prompt_en"]}{{"reference_path": "{os.path.join(cache_root, resolution, x["image_name"])}", "mask_strategy": "0"}}'
    for x in data
]
with open(save_path, "w", encoding="utf-8") as f:
    f.write("\n".join(txt))


================================================
FILE: Open-Sora/eval/vbench_i2v/launch.sh
================================================
#!/bin/bash
# Launch VBench-I2V sample generation, one background job per GPU.
#
# Usage:
#   bash eval/vbench_i2v/launch.sh CKPT NUM_FRAMES MODEL_NAME \
#       [RES ASP_RATIO [NUM_SAMPLING_STEPS [FLOW [LLM_REFINE]]]]
#
# Every job runs eval/sample.sh with task type -5 on its own prompt index range
# and writes its output to <dirname of CKPT>/eval/<task id>.log. The jobs are
# started in the background and this script does not wait for them.

if [ "$#" -lt 3 ]; then
    echo "Usage: $0 CKPT NUM_FRAMES MODEL_NAME [RES ASP_RATIO [NUM_SAMPLING_STEPS [FLOW [LLM_REFINE]]]]" >&2
    exit 1
fi

CKPT=$1
NUM_FRAMES=$2
MODEL_NAME=$3
RES=$4
ASP_RATIO=$5

NUM_SAMPLING_STEPS=$6
FLOW=$7
LLM_REFINE=$8

# NOTE: CKPT_BASE is computed here but not referenced anywhere below in this script.
if [[ $CKPT == *"ema"* ]]; then
    parentdir=$(dirname "$CKPT")
    CKPT_BASE=$(basename "$parentdir")_ema
else
    CKPT_BASE=$(basename "$CKPT")
fi
LOG_BASE=$(dirname "$CKPT")/eval
echo "Logging to $LOG_BASE"

# The per-task logs are redirected into LOG_BASE; if the directory is missing the
# redirection fails and the job is never started, so create it up front.
mkdir -p "$LOG_BASE"

GPUS=(0 1 2 3 4 5 6 7)
TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only
START_INDEX_LIST=(0 140 280 420 560 700 840 980)
END_INDEX_LIST=(140 280 420 560 700 840 980 2000)

# Optional trailing arguments for eval/sample.sh. They are positional, so an
# argument is only forwarded when every argument before it was given as well
# (RES and ASP_RATIO are only forwarded as a pair).
EXTRA_ARGS=()
if [ -n "${RES}" ] && [ -n "${ASP_RATIO}" ]; then
    EXTRA_ARGS+=("${RES}" "${ASP_RATIO}")
    if [ -n "${NUM_SAMPLING_STEPS}" ]; then
        EXTRA_ARGS+=("${NUM_SAMPLING_STEPS}")
        if [ -n "${FLOW}" ]; then
            EXTRA_ARGS+=("${FLOW}")
            if [ -n "${LLM_REFINE}" ]; then
                EXTRA_ARGS+=("${LLM_REFINE}")
            fi
        fi
    fi
fi

for i in "${!GPUS[@]}"; do
    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh "$CKPT" "${NUM_FRAMES}" "${MODEL_NAME}" -5 "${START_INDEX_LIST[i]}" "${END_INDEX_LIST[i]}" "${EXTRA_ARGS[@]}" > "${LOG_BASE}/${TASK_ID_LIST[i]}.log" 2>&1 &
done


================================================
FILE: Open-Sora/eval/vbench_i2v/launch_calc.sh
================================================
#!/bin/bash
# Compute the VBench-I2V metrics in parallel, one slice of dimensions per GPU.
#
# Usage: bash eval/vbench_i2v/launch_calc.sh VIDEO_DIR CKPT_DIR
#
# Every job runs eval/vbench_i2v/calc_vbench_i2v.py in the background and logs
# to CKPT_DIR/<task id>.log. This script does not wait for the jobs to finish.

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 VIDEO_DIR CKPT_DIR" >&2
    exit 1
fi

VIDEO_DIR=$1
CKPT_DIR=$2
LOG_BASE=$CKPT_DIR
mkdir -p "$LOG_BASE"
echo "Logging to $LOG_BASE"

GPUS=(0 1 2 3 4 5 6 7)
# The first two jobs evaluate the I2V dimensions, the remaining six the video
# quality dimensions. START/END are passed as --start/--end, which slice the
# list selected by the two flags; an END of -1 means "through the last dimension".
CALC_I2V_LIST=(True True False False False False False False)
CALC_QUALITY_LIST=(False False True True True True True True)
START_INDEX_LIST=(0 2 0 2 3 4 5 6)
END_INDEX_LIST=(2 -1 2 3 4 5 6 -1)
TASK_ID_LIST=(calc_vbench_i2v_a calc_vbench_i2v_b calc_vbench_i2v_c calc_vbench_i2v_d calc_vbench_i2v_e calc_vbench_i2v_f calc_vbench_i2v_g calc_vbench_i2v_h) # for log records only


for i in "${!GPUS[@]}"; do
    CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench_i2v/calc_vbench_i2v.py "$VIDEO_DIR" "$CKPT_DIR" --calc_i2v "${CALC_I2V_LIST[i]}" --calc_quality "${CALC_QUALITY_LIST[i]}" --start "${START_INDEX_LIST[i]}" --end "${END_INDEX_LIST[i]}" > "${LOG_BASE}/${TASK_ID_LIST[i]}.log" 2>&1 &
done


================================================
FILE: Open-Sora/gradio/README.md
================================================
---
title: Open Sora
emoji: 🎥
colorFrom: red
colorTo: purple
sdk: gradio
sdk_version: 4.25.0
app_file: app.py
pinned: false
license: apache-2.0
preload_from_hub:
    - hpcai-tech/OpenSora-STDiT-v3
    - hpcai-tech/OpenSora-VAE-v1.2
    - DeepFloyd/t5-v1_1-xxl
---


# 🕹 Gradio Demo

We have provided a Gradio demo app for you to generate videos via a web interface. You can choose to run it locally or deploy it to Hugging Face by following the instructions given below.

## 🚀 Run Gradio Locally (Outdated)

We assume that you have already installed `opensora` based on the instructions given in the [main README](../README.md). Follow the steps below to run this app on your local machine.

1. First of all, you need to install `gradio` and `spaces`.

```bash
pip install gradio spaces
```

2. Afterwards, you can use the following command to launch the application. Remember to launch the command in the project root directory instead of the `gradio` folder.

```bash
# start the gradio app
python gradio/app.py

# run with a different port
python gradio/app.py --port 8000

# run with acceleration such as flash attention and fused norm
python gradio/app.py --enable-optimization

# run with a sharable Gradio link
python gradio/app.py --share
```

3. You should then be able to access this demo via the link which appears in your terminal.

## 📦 Deploy Gradio to Hugging Face Space (Outdated)

We have also tested this Gradio app on Hugging Face Spaces. You can follow the steps below.

1. Create a Space on Hugging Face, remember to choose `Gradio SDK` and GPU space hardware.

2. Clone the Space repository in your local machine.

3. Copy the `configs` folder and `gradio/app.py` and `gradio/requirements.txt` to the repository you just cloned. The file structure will look like:

```text
- configs
    - ...
- app.py
- requirements.txt
- README.md
- LICENSE
- ...
```

4. Push the files to your remote Hugging Face Spaces repository. The application will be built and run automatically.

## Advanced Usage

![Gradio Demo](../assets/readme/gradio_advanced.png)

For the "**FPS**" option: because the output video's FPS is currently fixed to 24, this option does not affect the output video's length. Content generated for a smaller FPS, which would normally play for longer, is sped up when played back at 24 FPS, so the video looks less smooth but faster. With a larger FPS, the video looks smoother but slower.

The "**Number of Loops**" option affects the output video's length and generation speed. For example, if you set the number of loops to 2, the output video will be twice as long as the original video. This is achieved by conditioning the next generation on 1/4 of the previously generated frames and then concatenating all the frames together.

A trick to give different text prompts for different parts of the video is to use the `|x|` symbol to separate the text prompts, where x is the start frame of the next text prompt. This format requires a `|0|` at the start of the prompt. For example, if you want to generate a video with the text prompt "A cat" for the first 2 generations and "A dog" for the remaining generations, you can use the text prompt "|0|A cat|2|A dog". You can still check the "**Enhance prompt with GPT4o**" option to refine the prompt of each part separately.


================================================
FILE: Open-Sora/gradio/app.py
================================================
#!/usr/bin/env python
"""
This script runs a Gradio App for the Open-Sora model.

Usage:
    python gradio/app.py [--model-type TYPE] [--output DIR] [--port PORT] [--host HOST] [--share] [--enable-optimization]
"""

import argparse
import datetime
import importlib
import os
import subprocess
import sys
from tempfile import NamedTemporaryFile

import spaces
import torch

import gradio as gr

# Model variants accepted by the --model-type option.
MODEL_TYPES = ["v1.2-stage3"]
# Relative path of the watermark image (presumably consumed by the watermark
# helper imported further down; not referenced in this part of the file).
WATERMARK_PATH = "./assets/images/watermark/watermark.png"
# Inference config file to load for each model type.
CONFIG_MAP = {
    "v1.2-stage3": "configs/opensora-v1-2/inference/sample.py",
}
# Identifier passed to STDiT3.from_pretrained for each model type.
HF_STDIT_MAP = {"v1.2-stage3": "hpcai-tech/OpenSora-STDiT-v3"}


# ============================
# Prepare Runtime Environment
# ============================
def install_dependencies(enable_optimization=False):
    """
    Install the optional acceleration packages for the demo if they are not already installed.

    Nothing happens unless ``enable_optimization`` is true. Installation is
    best-effort: the exit status of ``pip`` is not checked, so a failed
    install does not raise.

    Args:
        enable_optimization: when true, make sure flash-attn, apex, ninja and
            xformers are importable, installing each missing one with pip.
    """

    def _is_package_available(name) -> bool:
        # ModuleNotFoundError is a subclass of ImportError, so one clause covers both.
        try:
            importlib.import_module(name)
            return True
        except ImportError:
            return False

    def _pip_install(*pip_args, extra_env=None):
        # An argument list without a shell keeps working when the interpreter
        # path contains spaces and avoids any shell interpretation of the URLs.
        env = None
        if extra_env:
            # Extend the current environment instead of replacing it: passing
            # only the extra variables would drop PATH, HOME, CUDA settings, etc.
            env = {**os.environ, **extra_env}
        subprocess.run([sys.executable, "-m", "pip", "install", *pip_args], env=env)

    if not enable_optimization:
        return

    # install flash attention
    if not _is_package_available("flash_attn"):
        _pip_install(
            "flash-attn",
            "--no-build-isolation",
            extra_env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
        )

    # install apex for fused layernorm
    if not _is_package_available("apex"):
        _pip_install(
            "-v",
            "--disable-pip-version-check",
            "--no-cache-dir",
            "--no-build-isolation",
            "--config-settings",
            "--build-option=--cpp_ext",
            "--config-settings",
            "--build-option=--cuda_ext",
            "git+https://github.com/NVIDIA/apex.git",
        )

    # install ninja
    if not _is_package_available("ninja"):
        _pip_install("ninja")

    # install xformers
    if not _is_package_available("xformers"):
        _pip_install(
            "-v",
            "-U",
            "git+https://github.com/facebookresearch/xformers.git@main#egg=xformers",
        )


# ============================
# Model-related
# ============================
def read_config(config_path):
    """
    Load the configuration file at ``config_path`` with mmengine and return it.
    """
    # imported lazily, inside the function, as in the rest of this module
    from mmengine.config import Config

    loaded = Config.fromfile(config_path)
    return loaded


def build_models(model_type, config, enable_optimization=False):
    """
    Create the VAE, text encoder, STDiT3 model and scheduler described by ``config``.

    ``model_type`` selects the pretrained STDiT3 weights through ``HF_STDIT_MAP``.
    ``enable_optimization`` is accepted but not used inside this function.

    Returns:
        The tuple ``(vae, text_encoder, stdit, scheduler)``.
    """
    from opensora.registry import MODELS, build_module

    # VAE: build it and move it to the GPU
    vae = build_module(config.vae, MODELS).cuda()

    # text encoder: only the wrapped T5 model is moved to the GPU (T5 must be fp32)
    text_encoder = build_module(config.text_encoder, MODELS)
    text_encoder.t5.model = text_encoder.t5.model.cuda()

    # STDiT: the weights are loaded from HuggingFace directly so that no
    # download logic has to be handled inside a HuggingFace Space
    from opensora.models.stdit.stdit3 import STDiT3

    skipped_keys = ("type", "from_pretrained", "force_huggingface")
    model_kwargs = {}
    for name, value in config.model.items():
        if name not in skipped_keys:
            model_kwargs[name] = value
    stdit = STDiT3.from_pretrained(HF_STDIT_MAP[model_type], **model_kwargs)
    stdit = stdit.cuda()

    # scheduler
    from opensora.registry import SCHEDULERS

    scheduler = build_module(config.scheduler, SCHEDULERS)

    # hack for classifier-free guidance: the text encoder shares the STDiT's y_embedder
    text_encoder.y_embedder = stdit.y_embedder

    # switch everything to eval mode; VAE and STDiT are cast to bfloat16, T5 is not cast
    vae = vae.to(torch.bfloat16).eval()
    text_encoder.t5.model = text_encoder.t5.model.eval()
    stdit = stdit.to(torch.bfloat16).eval()

    # release cached CUDA memory before handing the models back
    torch.cuda.empty_cache()
    return vae, text_encoder, stdit, scheduler


def parse_args():
    """
    Parse the command-line options of the Gradio app.

    Returns:
        The ``argparse.Namespace`` with ``model_type``, ``output``, ``port``,
        ``host``, ``share`` and ``enable_optimization``.
    """
    cli = argparse.ArgumentParser()

    # which pretrained model to serve
    cli.add_argument(
        "--model-type",
        default="v1.2-stage3",
        choices=MODEL_TYPES,
        help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}",
    )

    # where results are written
    cli.add_argument("--output", default="./outputs", type=str, help="The path to the output folder")

    # server options
    cli.add_argument("--port", default=None, type=int, help="The port to run the Gradio App on.")
    cli.add_argument("--host", default="0.0.0.0", type=str, help="The host to run the Gradio App on.")
    cli.add_argument("--share", action="store_true", help="Whether to share this gradio demo.")

    # optional acceleration
    cli.add_argument(
        "--enable-optimization",
        action="store_true",
        help="Whether to enable optimization such as flash attention and fused layernorm",
    )

    parsed = cli.parse_args()
    return parsed


# ============================
# Main Gradio Script
# ============================
# `run_inference` is reached through callbacks wrapped by `spaces.GPU`, whose inputs
# are limited to the values coming from the UI, so the models cannot be handed to
# `run_inference` as arguments.
# Instead, they are created at module level and accessed as globals inside `run_inference`.

# parse the command line and read the config that matches the selected model type
args = parse_args()
config = read_config(CONFIG_MAP[args.model_type])
# allow TF32 for CUDA matmul and cuDNN
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# make the outputs dir (no error if it already exists)
os.makedirs(args.output, exist_ok=True)

# disable torch jit as it can cause failure in gradio SDK
# gradio sdk uses torch with cuda 11.3
torch.jit._state.disable()

# set up: install missing dependencies; this must run before the opensora imports below
install_dependencies(enable_optimization=args.enable_optimization)

# import after installation
from opensora.datasets import IMG_FPS, save_sample
from opensora.datasets.aspect import get_image_size, get_num_frames
from opensora.models.text_encoder.t5 import text_preprocessing
from opensora.utils.inference_utils import (
    add_watermark,
    append_generated,
    append_score_to_prompts,
    apply_mask_strategy,
    collect_references_batch,
    dframe_to_frame,
    extract_json_from_prompts,
    extract_prompts_loop,
    get_random_prompt_by_openai,
    has_openai_key,
    merge_prompt,
    prepare_multi_resolution_info,
    refine_prompts_by_openai,
    split_prompt,
)
from opensora.utils.misc import to_torch_dtype

# some global variables read by `run_inference`
dtype = to_torch_dtype(config.dtype)  # torch dtype derived from the config's `dtype` entry
device = torch.device("cuda")

# build the models once at import time; `run_inference` uses them as globals
vae, text_encoder, stdit, scheduler = build_models(
    args.model_type, config, enable_optimization=args.enable_optimization
)


def run_inference(
    mode,
    prompt_text,
    resolution,
    aspect_ratio,
    length,
    motion_strength,
    aesthetic_score,
    use_motion_strength,
    use_aesthetic_score,
    camera_motion,
    reference_image,
    refine_prompt,
    fps,
    num_loop,
    seed,
    sampling_steps,
    cfg_scale,
):
    """
    Generate an image or a video for one prompt and save it under ``args.output``.

    Uses the module-level ``config``, ``vae``, ``text_encoder``, ``stdit``,
    ``scheduler``, ``device`` and ``dtype`` objects.

    Args:
        mode: ``"Text2Image"`` or ``"Text2Video"``.
        prompt_text: the user prompt; ``None`` or ``""`` aborts with a UI warning.
        resolution: resolution name forwarded to ``get_image_size``.
        aspect_ratio: aspect-ratio name forwarded to ``get_image_size``.
        length: video length forwarded to ``get_num_frames`` (unused for images).
        motion_strength: motion score appended to the prompt when
            ``use_motion_strength`` is set and the mode is not ``"Text2Image"``.
        aesthetic_score: aesthetic score appended to the prompt when
            ``use_aesthetic_score`` is set.
        use_motion_strength: enables ``motion_strength``.
        use_aesthetic_score: enables ``aesthetic_score``.
        camera_motion: camera motion appended to the prompt; ``"none"`` or
            image mode disables it.
        reference_image: optional image array; in video mode it is written to a
            temporary PNG and used as reference with mask strategy ``"0"``.
        refine_prompt: refine the prompt through OpenAI when an API key is available.
        fps: frames per second; forwarded to ``prepare_multi_resolution_info``
            (replaced by ``IMG_FPS`` in image mode) and used to write the result.
        num_loop: number of generation loops whose clips are concatenated.
        seed: value passed to ``torch.manual_seed``.
        sampling_steps: overrides the scheduler's ``num_sampling_steps``.
        cfg_scale: overrides the scheduler's ``cfg_scale``.

    Returns:
        Path of the saved file (the watermarked copy when watermarking succeeds),
        or ``None`` when the prompt is empty.

    Raises:
        ValueError: if ``mode`` is neither ``"Text2Image"`` nor ``"Text2Video"``.
    """
    if prompt_text is None or prompt_text == "":
        gr.Warning("Your prompt is empty, please enter a valid prompt")
        return None

    torch.manual_seed(seed)
    with torch.inference_mode():
        # ======================
        # 1. Preparation arguments
        # ======================
        # parse the inputs
        # frame_interval must be 1 so we ignore it here
        image_size = get_image_size(resolution, aspect_ratio)

        # The result is written with the fps selected by the caller. It is captured
        # here because `fps` is replaced by IMG_FPS in image mode below.
        save_fps = fps

        # compute generation parameters
        if mode == "Text2Image":
            num_frames = 1
            fps = IMG_FPS
        else:
            num_frames = get_num_frames(length)

        condition_frame_length = int(num_frames / 17 * 5 / 3)
        condition_frame_edit = 0.0

        input_size = (num_frames, *image_size)
        latent_size = vae.get_latent_size(input_size)
        multi_resolution = "OpenSora"
        align = 5

        # == prepare mask strategy and reference ==
        temp_file = None
        try:
            if mode == "Text2Image":
                mask_strategy = [None]
                refs = [""]
            elif mode == "Text2Video":
                if reference_image is not None:
                    # save image to disk so that it can be referenced by path
                    from PIL import Image

                    mask_strategy = ["0"]
                    temp_file = NamedTemporaryFile(suffix=".png")
                    Image.fromarray(reference_image).save(temp_file.name)
                    refs = [temp_file.name]
                else:
                    mask_strategy = [None]
                    refs = [""]
            else:
                raise ValueError(f"Invalid mode: {mode}")

            # == get json from prompts ==
            batch_prompts = [prompt_text]
            batch_prompts, refs, mask_strategy = extract_json_from_prompts(batch_prompts, refs, mask_strategy)

            # == get reference for condition ==
            # from here on `refs` no longer holds the temporary file path
            refs = collect_references_batch(refs, vae, image_size)
        finally:
            # closing the NamedTemporaryFile also removes it from disk
            if temp_file is not None:
                temp_file.close()

        # == multi-resolution info ==
        model_args = prepare_multi_resolution_info(
            multi_resolution, len(batch_prompts), image_size, num_frames, fps, device, dtype
        )

        # == process prompts step by step ==
        # 0. split prompt
        # each prompt yields a (prompt_segment_list, loop_idx_list) pair
        batched_prompt_segment_list = []
        batched_loop_idx_list = []
        for prompt in batch_prompts:
            prompt_segment_list, loop_idx_list = split_prompt(prompt)
            batched_prompt_segment_list.append(prompt_segment_list)
            batched_loop_idx_list.append(loop_idx_list)

        # 1. refine prompt by openai
        if refine_prompt:
            # check if openai key is provided
            if not has_openai_key():
                gr.Warning("OpenAI API key is not provided, the prompt will not be enhanced.")
            else:
                batched_prompt_segment_list = [
                    refine_prompts_by_openai(segments) for segments in batched_prompt_segment_list
                ]

        # process scores: disabled scores are passed as None
        aesthetic_score = aesthetic_score if use_aesthetic_score else None
        motion_strength = motion_strength if use_motion_strength and mode != "Text2Image" else None
        camera_motion = None if camera_motion == "none" or mode == "Text2Image" else camera_motion

        # 2. append score
        batched_prompt_segment_list = [
            append_score_to_prompts(
                segments,
                aes=aesthetic_score,
                flow=motion_strength,
                camera_motion=camera_motion,
            )
            for segments in batched_prompt_segment_list
        ]

        # 3. clean prompt with T5
        batched_prompt_segment_list = [
            [text_preprocessing(segment) for segment in segments] for segments in batched_prompt_segment_list
        ]

        # 4. merge to obtain the final prompt
        batch_prompts = [
            merge_prompt(segments, loop_idx_list)
            for segments, loop_idx_list in zip(batched_prompt_segment_list, batched_loop_idx_list)
        ]

        # =========================
        # Generate image/video
        # =========================
        # hack to update num_sampling_steps and cfg_scale: the scheduler is
        # re-initialised in place with these arguments before every sampling call
        scheduler_kwargs = config.scheduler.copy()
        scheduler_kwargs.pop("type")
        scheduler_kwargs["num_sampling_steps"] = sampling_steps
        scheduler_kwargs["cfg_scale"] = cfg_scale

        video_clips = []
        for loop_i in range(num_loop):
            batch_prompts_loop = extract_prompts_loop(batch_prompts, loop_i)

            # == loop ==
            # later loops are conditioned on the clip generated by the previous one
            if loop_i > 0:
                refs, mask_strategy = append_generated(
                    vae, video_clips[-1], refs, mask_strategy, loop_i, condition_frame_length, condition_frame_edit
                )

            # == sampling ==
            z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
            masks = apply_mask_strategy(z, refs, mask_strategy, loop_i, align=align)

            # diffusion sampling
            scheduler.__init__(**scheduler_kwargs)
            samples = scheduler.sample(
                stdit,
                text_encoder,
                z=z,
                prompts=batch_prompts_loop,
                device=device,
                additional_args=model_args,
                progress=True,
                mask=masks,
            )
            samples = vae.decode(samples.to(dtype), num_frames=num_frames)
            video_clips.append(samples)

        # =========================
        # Save output
        # =========================
        # only one prompt is processed, so keep the first sample of every clip
        video_clips = [val[0] for val in video_clips]
        # drop the leading conditioning frames of every clip after the first one
        for i in range(1, num_loop):
            video_clips[i] = video_clips[i][:, dframe_to_frame(condition_frame_length) :]
        video = torch.cat(video_clips, dim=1)
        timestamp = datetime.datetime.now().timestamp()
        save_path = os.path.join(args.output, f"output_{timestamp}")
        saved_path = save_sample(video, save_path=save_path, fps=save_fps)
        torch.cuda.empty_cache()

        # add watermark
        # all watermarked videos should have a _watermarked suffix
        if mode != "Text2Image" and os.path.exists(WATERMARK_PATH):
            watermarked_path = saved_path.replace(".mp4", "_watermarked.mp4")
            if add_watermark(saved_path, WATERMARK_PATH, watermarked_path):
                return watermarked_path
        # fall back to the plain file when no watermark was applied
        return saved_path


@spaces.GPU(duration=200)
def run_image_inference(
    prompt_text,
    resolution,
    aspect_ratio,
    length,
    motion_strength,
    aesthetic_score,
    use_motion_strength,
    use_aesthetic_score,
    camera_motion,
    reference_image,
    refine_prompt,
    fps,
    num_loop,
    seed,
    sampling_steps,
    cfg_scale,
):
    """
    GPU-wrapped callback that runs ``run_inference`` in ``"Text2Image"`` mode.

    Every argument is forwarded unchanged; the return value is whatever
    ``run_inference`` returns.
    """
    return run_inference(
        mode="Text2Image",
        prompt_text=prompt_text,
        resolution=resolution,
        aspect_ratio=aspect_ratio,
        length=length,
        motion_strength=motion_strength,
        aesthetic_score=aesthetic_score,
        use_motion_strength=use_motion_strength,
        use_aesthetic_score=use_aesthetic_score,
        camera_motion=camera_motion,
        reference_image=reference_image,
        refine_prompt=refine_prompt,
        fps=fps,
        num_loop=num_loop,
        seed=seed,
        sampling_steps=sampling_steps,
        cfg_scale=cfg_scale,
    )


@spaces.GPU(duration=200)
def run_video_inference(
    prompt_text,
    resolution,
    aspect_ratio,
    length,
    motion_strength,
    aesthetic_score,
    use_motion_strength,
    use_aesthetic_score,
    camera_motion,
    reference_image,
    refine_prompt,
    fps,
    num_loop,
    seed,
    sampling_steps,
    cfg_scale,
):
    """
    GPU-wrapped callback that runs ``run_inference`` in ``"Text2Video"`` mode.

    Every argument is forwarded unchanged; the return value is whatever
    ``run_inference`` returns.
    """
    # NOTE: a guard that interrupted 480p/16s and 720p/8s-16s requests because of
    # CUDA out-of-memory concerns is currently disabled, so all combinations run.
    return run_inference(
        mode="Text2Video",
        prompt_text=prompt_text,
        resolution=resolution,
        aspect_ratio=aspect_ratio,
        length=length,
        motion_strength=motion_strength,
        aesthetic_score=aesthetic_score,
        use_motion_strength=use_motion_strength,
        use_aesthetic_score=use_aesthetic_score,
        camera_motion=camera_motion,
        reference_image=reference_image,
        refine_prompt=refine_prompt,
        fps=fps,
        num_loop=num_loop,
        seed=seed,
        sampling_steps=sampling_steps,
        cfg_scale=cfg_scale,
    )


def generate_random_prompt():
    """
    Return a random prompt obtained from OpenAI.

    Returns:
        The prompt from ``get_random_prompt_by_openai``, or ``None`` (after a UI
        warning) when ``OPENAI_API_KEY`` is not set in the environment.
    """
    if "OPENAI_API_KEY" in os.environ:
        return get_random_prompt_by_openai()

    gr.Warning("Your prompt is empty and the OpenAI API key is not provided, please enter a valid prompt")
    return None


def main():
    """
    Build the Gradio demo and launch it.

    The layout is a header row, a row with the settings column next to the
    output video, and a row with the two generate buttons. The server is
    launched with the port, host and share options taken from ``args``.
    """
    # the OpenAI-backed widgets are only enabled when an API key is available
    openai_enabled = has_openai_key()

    # create demo
    with gr.Blocks() as demo:
        # ---- header ----
        with gr.Row():
            with gr.Column():
                gr.HTML(
                    """
                <div style='text-align: center;'>
                    <p align="center">
                        <img src="https://github.com/hpcaitech/Open-Sora/raw/main/assets/readme/icon.png" width="250"/>
                    </p>
                    <div style="display: flex; gap: 10px; justify-content: center;">
                        <a href="https://github.com/hpcaitech/Open-Sora/stargazers"><img src="https://img.shields.io/github/stars/hpcaitech/Open-Sora?style=social"></a>
                        <a href="https://hpcaitech.github.io/Open-Sora/"><img src="https://img.shields.io/badge/Gallery-View-orange?logo=&amp"></a>
                        <a href="https://discord.gg/kZakZzrSUT"><img src="https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&amp"></a>
                        <a href="https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-247ipg9fk-KRRYmUl~u2ll2637WRURVA"><img src="https://img.shields.io/badge/Slack-ColossalAI-blueviolet?logo=slack&amp"></a>
                        <a href="https://twitter.com/yangyou1991/status/1769411544083996787?s=61&t=jT0Dsx2d-MS5vS9rNM5e5g"><img src="https://img.shields.io/badge/Twitter-Discuss-blue?logo=twitter&amp"></a>
                        <a href="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png"><img src="https://img.shields.io/badge/微信-小助手加群-green?logo=wechat&amp"></a>
                        <a href="https://hpc-ai.com/blog/open-sora-v1.0"><img src="https://img.shields.io/badge/Open_Sora-Blog-blue"></a>
                    </div>
                    <h1 style='margin-top: 5px;'>Open-Sora: Democratizing Efficient Video Production for All</h1>
                </div>
                """
                )

        # ---- settings (left) and output (right) ----
        with gr.Row():
            with gr.Column():
                # prompt widgets
                prompt_text = gr.Textbox(label="Prompt", placeholder="Describe your video here", lines=4)
                refine_prompt = gr.Checkbox(
                    value=openai_enabled, label="Refine prompt with GPT4o", interactive=openai_enabled
                )
                random_prompt_btn = gr.Button("Random Prompt By GPT4o", interactive=openai_enabled)

                gr.Markdown("## Basic Settings")
                resolution = gr.Radio(
                    choices=["144p", "240p", "360p", "480p", "720p"],
                    value="480p",
                    label="Resolution",
                )
                aspect_ratio = gr.Radio(
                    choices=["9:16", "16:9", "3:4", "4:3", "1:1"],
                    value="9:16",
                    label="Aspect Ratio (H:W)",
                )
                length = gr.Radio(
                    choices=["2s", "4s", "8s", "16s"],
                    value="2s",
                    label="Video Length",
                    info="only effective for video generation, 8s may fail as Hugging Face ZeroGPU has the limitation of max 200 seconds inference time.",
                )

                # sampling controls
                with gr.Row():
                    seed = gr.Slider(value=1024, minimum=1, maximum=2048, step=1, label="Seed")

                    sampling_steps = gr.Slider(value=30, minimum=1, maximum=200, step=1, label="Sampling steps")
                    cfg_scale = gr.Slider(value=7.0, minimum=0.0, maximum=10.0, step=0.1, label="CFG Scale")

                # prompt scores, each with its own enable checkbox
                with gr.Row():
                    with gr.Column():
                        motion_strength = gr.Slider(
                            value=5,
                            minimum=0,
                            maximum=100,
                            step=1,
                            label="Motion Strength",
                            info="only effective for video generation",
                        )
                        use_motion_strength = gr.Checkbox(value=False, label="Enable")

                    with gr.Column():
                        aesthetic_score = gr.Slider(
                            value=6.5,
                            minimum=4,
                            maximum=7,
                            step=0.1,
                            label="Aesthetic",
                            info="effective for text & video generation",
                        )
                        use_aesthetic_score = gr.Checkbox(value=True, label="Enable")

                camera_motion = gr.Radio(
                    value="none",
                    label="Camera Motion",
                    choices=["none", "pan right", "pan left", "tilt up", "tilt down", "zoom in", "zoom out", "static"],
                    interactive=True,
                )

                gr.Markdown("## Advanced Settings")
                with gr.Row():
                    fps = gr.Slider(
                        value=24,
                        minimum=1,
                        maximum=60,
                        step=1,
                        label="FPS",
                        info="This is the frames per seconds for video generation, keep it to 24 if you are not sure",
                    )
                    num_loop = gr.Slider(
                        value=1,
                        minimum=1,
                        maximum=20,
                        step=1,
                        label="Number of Loops",
                        info="This will change the length of the generated video, keep it to 1 if you are not sure",
                    )

                gr.Markdown("## Reference Image")
                reference_image = gr.Image(label="Image (optional)", show_download_button=True)

            with gr.Column():
                output_video = gr.Video(label="Output Video", height="100%")

        # ---- action buttons ----
        with gr.Row():
            image_gen_button = gr.Button("Generate image")
            video_gen_button = gr.Button("Generate video")

        # Both callbacks take the same components, in the positional order of
        # their signatures.
        inference_inputs = [
            prompt_text,
            resolution,
            aspect_ratio,
            length,
            motion_strength,
            aesthetic_score,
            use_motion_strength,
            use_aesthetic_score,
            camera_motion,
            reference_image,
            refine_prompt,
            fps,
            num_loop,
            seed,
            sampling_steps,
            cfg_scale,
        ]

        # the generated image is written into the reference-image component
        image_gen_button.click(fn=run_image_inference, inputs=inference_inputs, outputs=reference_image)
        video_gen_button.click(fn=run_video_inference, inputs=inference_inputs, outputs=output_video)
        random_prompt_btn.click(fn=generate_random_prompt, outputs=prompt_text)

    # launch
    demo.queue(max_size=5, default_concurrency_limit=1)
    demo.launch(server_port=args.port, server_name=args.host, share=args.share, max_threads=1)


# Script entry point: build the Gradio demo and launch it.
if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/gradio/requirements.txt
================================================
xformers
transformers
git+https://github.com/hpcaitech/Open-Sora.git


================================================
FILE: Open-Sora/notebooks/inference.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Inference for OpenSora"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define global variables. You should change the following variables according to your setting."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# global variables\n",
    "ROOT = \"..\"\n",
    "cfg_path = f\"{ROOT}/configs/opensora-v1-2/inference/sample.py\"\n",
    "ckpt_path = \"/home/lishenggui/projects/sora/Open-Sora-dev/outputs/207-STDiT3-XL-2/epoch0-global_step9000/\"\n",
    "vae_path = f\"{ROOT}/pretrained_models/vae-pipeline\"\n",
    "save_dir = f\"{ROOT}/samples/samples_notebook/\"\n",
    "device = \"cuda:0\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import necessary libraries and load the models."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from pprint import pformat\n",
    "\n",
    "import colossalai\n",
    "import torch\n",
    "import torch.distributed as dist\n",
    "from colossalai.cluster import DistCoordinator\n",
    "from mmengine.runner import set_random_seed\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "from opensora.acceleration.parallel_states import set_sequence_parallel_group\n",
    "from opensora.datasets import save_sample, is_img\n",
    "from opensora.datasets.aspect import get_image_size, get_num_frames\n",
    "from opensora.models.text_encoder.t5 import text_preprocessing\n",
    "from opensora.registry import MODELS, SCHEDULERS, build_module\n",
    "from opensora.utils.config_utils import read_config\n",
    "from opensora.utils.inference_utils import (\n",
    "    append_generated,\n",
    "    apply_mask_strategy,\n",
    "    collect_references_batch,\n",
    "    extract_json_from_prompts,\n",
    "    extract_prompts_loop,\n",
    "    get_save_path_name,\n",
    "    load_prompts,\n",
    "    prepare_multi_resolution_info,\n",
    ")\n",
    "from opensora.utils.misc import all_exists, create_logger, is_distributed, is_main_process, to_torch_dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "torch.set_grad_enabled(False)\n",
    "\n",
    "# == parse configs ==\n",
    "cfg = read_config(cfg_path)\n",
    "cfg.model.from_pretrained = ckpt_path\n",
    "cfg.vae.from_pretrained = vae_path\n",
    "\n",
    "# == device and dtype ==\n",
    "cfg_dtype = cfg.get(\"dtype\", \"fp32\")\n",
    "assert cfg_dtype in [\"fp16\", \"bf16\", \"fp32\"], f\"Unknown mixed precision {cfg_dtype}\"\n",
    "dtype = to_torch_dtype(cfg.get(\"dtype\", \"bf16\"))\n",
    "torch.backends.cuda.matmul.allow_tf32 = True\n",
    "torch.backends.cudnn.allow_tf32 = True\n",
    "\n",
    "set_random_seed(seed=cfg.get(\"seed\", 1024))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# == build text-encoder and vae ==\n",
    "text_encoder = build_module(cfg.text_encoder, MODELS, device=device)\n",
    "vae = build_module(cfg.vae, MODELS).to(device, dtype).eval()\n",
    "\n",
    "# == build diffusion model ==\n",
    "input_size = (None, None, None)\n",
    "latent_size = vae.get_latent_size(input_size)\n",
    "model = (\n",
    "    build_module(\n",
    "        cfg.model,\n",
    "        MODELS,\n",
    "        input_size=latent_size,\n",
    "        in_channels=vae.out_channels,\n",
    "        caption_channels=text_encoder.output_dim,\n",
    "        model_max_length=text_encoder.model_max_length,\n",
    "    )\n",
    "    .to(device, dtype)\n",
    "    .eval()\n",
    ")\n",
    "text_encoder.y_embedder = model.y_embedder  # HACK: for classifier-free guidance"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define inference function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_idx = 0\n",
    "multi_resolution = cfg.get(\"multi_resolution\", None)\n",
    "batch_size = cfg.get(\"batch_size\", 1)\n",
    "\n",
    "\n",
    "def inference(\n",
    "    prompts=cfg.get(\"prompt\", None),\n",
    "    image_size=None,\n",
    "    num_frames=None,\n",
    "    resolution=None,\n",
    "    aspect_ratio=None,\n",
    "    mask_strategy=None,\n",
    "    reference_path=None,\n",
    "    num_sampling_steps=None,\n",
    "    cfg_scale=None,\n",
    "    seed=None,\n",
    "    fps=cfg.fps,\n",
    "    num_sample=cfg.get(\"num_sample\", 1),\n",
    "    loop=cfg.get(\"loop\", 1),\n",
    "    condition_frame_length=cfg.get(\"condition_frame_length\", 5),\n",
    "    align=cfg.get(\"align\", None),\n",
    "    sample_name=cfg.get(\"sample_name\", None),\n",
    "    prompt_as_path=cfg.get(\"prompt_as_path\", False),\n",
    "    disable_progress=False,\n",
    "):\n",
    "    global start_idx\n",
    "    os.makedirs(save_dir, exist_ok=True)\n",
    "    if seed is not None:\n",
    "        set_random_seed(seed=seed)\n",
    "    if not isinstance(prompts, list):\n",
    "        prompts = [prompts]\n",
    "    if mask_strategy is None:\n",
    "        mask_strategy = [\"\"] * len(prompts)\n",
    "    if reference_path is None:\n",
    "        reference_path = [\"\"] * len(prompts)\n",
    "    save_fps = cfg.fps // cfg.get(\"frame_interval\", 1)\n",
    "    if num_sampling_steps is not None:\n",
    "        cfg.scheduler[\"num_sampling_steps\"] = num_sampling_steps\n",
    "    if cfg_scale is not None:\n",
    "        cfg.scheduler[\"scale\"] = cfg_scale\n",
    "    scheduler = build_module(cfg.scheduler, SCHEDULERS)\n",
    "    ret_path = []\n",
    "\n",
    "    # == prepare video size ==\n",
    "    if image_size is None:\n",
    "        assert (\n",
    "            resolution is not None and aspect_ratio is not None\n",
    "        ), \"resolution and aspect_ratio must be provided if image_size is not provided\"\n",
    "        image_size = get_image_size(resolution, aspect_ratio)\n",
    "    num_frames = get_num_frames(num_frames)\n",
    "    input_size = (num_frames, *image_size)\n",
    "    latent_size = vae.get_latent_size(input_size)\n",
    "\n",
    "    # == Iter over all samples ==\n",
    "    for i in tqdm(range(0, len(prompts), batch_size), disable=disable_progress):\n",
    "        # == prepare batch prompts ==\n",
    "        batch_prompts = prompts[i : i + batch_size]\n",
    "        ms = mask_strategy[i : i + batch_size]\n",
    "        refs = reference_path[i : i + batch_size]\n",
    "\n",
    "        batch_prompts, refs, ms = extract_json_from_prompts(batch_prompts, refs, ms)\n",
    "        refs = collect_references_batch(refs, vae, image_size)\n",
    "\n",
    "        # == multi-resolution info ==\n",
    "        model_args = prepare_multi_resolution_info(\n",
    "            multi_resolution, len(batch_prompts), image_size, num_frames, fps, device, dtype\n",
    "        )\n",
    "\n",
    "        # == Iter over number of sampling for one prompt ==\n",
    "        for k in range(num_sample):\n",
    "            # == prepare save paths ==\n",
    "            save_paths = [\n",
    "                get_save_path_name(\n",
    "                    save_dir,\n",
    "                    sample_name=sample_name,\n",
    "                    sample_idx=start_idx + idx,\n",
    "                    prompt=batch_prompts[idx],\n",
    "                    prompt_as_path=prompt_as_path,\n",
    "                    num_sample=num_sample,\n",
    "                    k=k,\n",
    "                )\n",
    "                for idx in range(len(batch_prompts))\n",
    "            ]\n",
    "\n",
    "            # NOTE: Skip if the sample already exists\n",
    "            # This is useful for resuming sampling VBench\n",
    "            if prompt_as_path and all_exists(save_paths):\n",
    "                continue\n",
    "\n",
    "            # == Iter over loop generation ==\n",
    "            video_clips = []\n",
    "            for loop_i in range(loop):\n",
    "                batch_prompts_loop = extract_prompts_loop(batch_prompts, loop_i)\n",
    "                batch_prompts_cleaned = [text_preprocessing(prompt) for prompt in batch_prompts_loop]\n",
    "\n",
    "                # == loop ==\n",
    "                if loop_i > 0:\n",
    "                    refs, ms = append_generated(vae, video_clips[-1], refs, ms, loop_i, condition_frame_length)\n",
    "\n",
    "                # == sampling ==\n",
    "                z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)\n",
    "                masks = apply_mask_strategy(z, refs, ms, loop_i, align=align)\n",
    "                samples = scheduler.sample(\n",
    "                    model,\n",
    "                    text_encoder,\n",
    "                    z=z,\n",
    "                    prompts=batch_prompts_cleaned,\n",
    "                    device=device,\n",
    "                    additional_args=model_args,\n",
    "                    progress=False,\n",
    "                    mask=masks,\n",
    "                )\n",
    "                samples = vae.decode(samples.to(dtype), num_frames=num_frames)\n",
    "                video_clips.append(samples)\n",
    "\n",
    "            # == save samples ==\n",
    "            if is_main_process():\n",
    "                for idx, batch_prompt in enumerate(batch_prompts):\n",
    "                    save_path = save_paths[idx]\n",
    "                    video = [video_clips[i][idx] for i in range(loop)]\n",
    "                    for i in range(1, loop):\n",
    "                        video[i] = video[i][:, condition_frame_length:]\n",
    "                    video = torch.cat(video, dim=1)\n",
    "                    path = save_sample(\n",
    "                        video,\n",
    "                        fps=save_fps,\n",
    "                        save_path=save_path,\n",
    "                        verbose=False,\n",
    "                    )\n",
    "                    ret_path.append(path)\n",
    "        start_idx += len(batch_prompts)\n",
    "    return ret_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import Video, Image, display\n",
    "\n",
    "\n",
    "def display_results(paths):\n",
    "    for path in paths:\n",
    "        if is_img(path):\n",
    "            display(Image(path))\n",
    "        else:\n",
    "            display(Video(path, embed=True))\n",
    "\n",
    "\n",
    "def reset_start_idx():\n",
    "    global start_idx\n",
    "    start_idx = 0\n",
    "\n",
    "\n",
    "ALL_ASPECT_RATIO = [\"1:1\", \"16:9\", \"9:16\", \"3:4\", \"4:3\", \"1:2\", \"2:1\"]\n",
    "\n",
    "\n",
    "def inference_all_aspects(prompts, resolution, num_frames, *args, **kwargs):\n",
    "    paths = []\n",
    "    for aspect_ratio in tqdm(ALL_ASPECT_RATIO):\n",
    "        paths.extend(\n",
    "            inference(\n",
    "                prompts,\n",
    "                resolution=resolution,\n",
    "                num_frames=num_frames,\n",
    "                aspect_ratio=aspect_ratio,\n",
    "                disable_progress=True,\n",
    "                *args,\n",
    "                **kwargs\n",
    "            )\n",
    "        )\n",
    "    return paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inference for OpenSora"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sample code for inference for OpenSora."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "paths = inference(\n",
    "    [\"a man.\", \"a woman\"],\n",
    "    resolution=\"240p\",\n",
    "    aspect_ratio=\"1:1\",\n",
    "    num_frames=\"1x\",\n",
    "    num_sampling_steps=30,\n",
    "    cfg_scale=7.0,\n",
    ")\n",
    "display_results(paths)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sample all aspect ratios."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PROMPT = \"a boy.\"\n",
    "paths = inference_all_aspects(\n",
    "    PROMPT,\n",
    "    resolution=\"240p\",\n",
    "    num_frames=\"1x\",\n",
    "    num_sampling_steps=30,\n",
    "    cfg_scale=7.0,\n",
    ")\n",
    "display_results(paths)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sample all resolutions and lengths."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PROMPT = \"a boy.\"\n",
    "sample_cfg = {\n",
    "    \"144p\": [1, \"1x\", \"2x\", \"4x\", \"8x\"],\n",
    "    \"240p\": [1, \"1x\", \"2x\", \"4x\", \"8x\"],\n",
    "    \"360p\": [1, \"1x\", \"2x\", \"4x\"],\n",
    "    \"480p\": [1, \"1x\", \"2x\", \"4x\"],\n",
    "    \"720p\": [1, \"1x\", \"2x\"],\n",
    "}\n",
    "all_paths = []\n",
    "for resolution, num_frames in sample_cfg.items():\n",
    "    for num_frame in num_frames:\n",
    "        print(f\"Resolution: {resolution}, Num Frames: {num_frame}\")\n",
    "        paths = inference(\n",
    "            PROMPT,\n",
    "            resolution=resolution,\n",
    "            num_frames=num_frame,\n",
    "            aspect_ratio=\"9:16\",\n",
    "            num_sampling_steps=30,\n",
    "            cfg_scale=7.0,\n",
    "            disable_progress=True,\n",
    "        )\n",
    "        display_results(paths)\n",
    "        all_paths.extend(paths)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sample all resolutions, lengths, and aspect ratios."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PROMPT = \"a boy.\"\n",
    "sample_cfg = {\n",
    "    \"144p\": [1, \"1x\", \"2x\", \"4x\", \"8x\"],\n",
    "    \"240p\": [1, \"1x\", \"2x\", \"4x\", \"8x\"],\n",
    "    \"360p\": [1, \"1x\", \"2x\", \"4x\"],\n",
    "    \"480p\": [1, \"1x\", \"2x\", \"4x\"],\n",
    "    \"720p\": [1, \"1x\", \"2x\"],\n",
    "}\n",
    "all_paths = []\n",
    "for resolution, num_frames in sample_cfg.items():\n",
    "    for num_frame in num_frames:\n",
    "        paths = inference_all_aspects(\n",
    "            PROMPT,\n",
    "            resolution=resolution,\n",
    "            num_frames=num_frame,\n",
    "            num_sampling_steps=30,\n",
    "            cfg_scale=7.0,\n",
    "        )\n",
    "        display_results(paths)\n",
    "        all_paths.extend(paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "opensora",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: Open-Sora/notebooks/launch.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Process Pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data Process Commands"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# TODO: change to your own project path!!!\n",
    "OPEN_SORA_HOME = \"/path/to/Open-Sora/\"\n",
    "\n",
    "\n",
    "def convert_dataset_cmd(input_dir, output_file, datatype=\"video\"):\n",
    "    commands = []\n",
    "    commands.append(f'echo \"Converting {input_dir} to {output_file}\"')\n",
    "    output_dir = os.path.dirname(output_file)\n",
    "\n",
    "    commands.append(f\"mkdir -p {output_dir}\")\n",
    "    commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
    "    commands.append(f\"python -m tools.datasets.convert {datatype} {input_dir} --output {output_file}\")\n",
    "    return \" && \".join(commands), output_file\n",
    "\n",
    "\n",
    "def get_video_info(input_file):\n",
    "    commands = []\n",
    "    base, ext = os.path.splitext(input_file)\n",
    "    output_file = f\"{base}_info{ext}\"\n",
    "    output_format = ext[1:]\n",
    "\n",
    "    commands.append(f'echo \"Getting info of {input_file} to {output_file}\"')\n",
    "    commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
    "    commands.append(\n",
    "        f\"python -m tools.datasets.datautil {input_file} --output {output_file} --format {output_format} --info --fmin 1\"\n",
    "    )\n",
    "    return \" && \".join(commands), output_file\n",
    "\n",
    "\n",
    "def get_video_info_torchvision(input_file):\n",
    "    commands = []\n",
    "    base, ext = os.path.splitext(input_file)\n",
    "    output_file = f\"{base}_info{ext}\"\n",
    "    output_format = ext[1:]\n",
    "\n",
    "    commands.append(f'echo \"Getting info of {input_file} to {output_file}\"')\n",
    "    commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
    "    commands.append(\n",
    "        f\"python -m tools.datasets.datautil {input_file} --output {output_file} --format {output_format} --video-info --fmin 1\"\n",
    "    )\n",
    "    return \" && \".join(commands), output_file\n",
    "\n",
    "\n",
    "def get_caption_llava7b_video(input_file):\n",
    "    commands = []\n",
    "    base, ext = os.path.splitext(input_file)\n",
    "    output_file = f\"{base}_caption{ext}\"\n",
    "    output_format = ext[1:]\n",
    "\n",
    "    commands.append(f'echo \"Getting info of {input_file} to {output_file}\"')\n",
    "    commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
    "    commands.append(f\"conda activate llava2\")\n",
    "    commands.append(\n",
    "        f\"torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava {input_file} --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video\"\n",
    "    )\n",
    "    commands.append(f\"conda activate opensora\")\n",
    "    commands.append(\n",
    "        f\"python -m tools.datasets.datautil {base}_caption_part*{ext} --output {output_file} --format {output_format} --intersection {input_file} --clean-caption --refine-llm-caption --remove-empty-caption\"\n",
    "    )\n",
    "    return \" && \".join(commands), output_file\n",
    "\n",
    "\n",
    "def get_caption_load(input_file):\n",
    "    commands = []\n",
    "    base, ext = os.path.splitext(input_file)\n",
    "    output_file = f\"{base}_caption{ext}\"\n",
    "    output_format = ext[1:]\n",
    "\n",
    "    commands.append(f'echo \"Getting caption of {input_file} to {output_file}\"')\n",
    "    commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
    "    commands.append(\n",
    "        f\"python -m tools.datasets.datautil {input_file} --output {output_file} --format {output_format} --load-caption json --remove-empty-caption --clean-caption\"\n",
    "    )\n",
    "    return \" && \".join(commands), output_file\n",
    "\n",
    "\n",
    "def get_aesthetic_score(input_file):\n",
    "    commands = []\n",
    "    base, ext = os.path.splitext(input_file)\n",
    "    output_file = f\"{base}_aes{ext}\"\n",
    "    output_format = ext[1:]\n",
    "\n",
    "    commands.append(f'echo \"Getting aesthetic score of {input_file} to {output_file}\"')\n",
    "    commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
    "    commands.append(f\"torchrun --standalone --nproc_per_node 8 -m tools.scoring.aesthetic.inference {input_file}\")\n",
    "    commands.append(\n",
    "        f\"python -m tools.datasets.datautil {base}_aes_part*{ext} --output {output_file} --format {output_format} --sort aes\"\n",
    "    )\n",
    "    return \" && \".join(commands), output_file\n",
    "\n",
    "\n",
    "def get_flow_score(input_file):\n",
    "    commands = []\n",
    "    base, ext = os.path.splitext(input_file)\n",
    "    output_file = f\"{base}_flow{ext}\"\n",
    "\n",
    "    commands.append(f'echo \"Getting flow score of {input_file} to {output_file}\"')\n",
    "    commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
    "    commands.append(f\"torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference {input_file}\")\n",
    "    return \" && \".join(commands), output_file\n",
    "\n",
    "\n",
    "def get_ocr(input_file):\n",
    "    commands = []\n",
    "    base, ext = os.path.splitext(input_file)\n",
    "    output_file = f\"{base}_match{ext}\"\n",
    "\n",
    "    commands.append(f'echo \"Getting match score of {input_file} to {output_file}\"')\n",
    "    commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
    "    commands.append(f\"torchrun --standalone --nproc_per_node 8 -m tools.scoring.ocr.inference {input_file}\")\n",
    "    return \" && \".join(commands), output_file\n",
    "\n",
    "    \n",
    "def get_match_score(input_file):\n",
    "    commands = []\n",
    "    base, ext = os.path.splitext(input_file)\n",
    "    output_file = f\"{base}_match{ext}\"\n",
    "\n",
    "    commands.append(f'echo \"Getting match score of {input_file} to {output_file}\"')\n",
    "    commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
    "    commands.append(f\"torchrun --standalone --nproc_per_node 8 -m tools.scoring.matching.inference {input_file}\")\n",
    "    return \" && \".join(commands), output_file\n",
    "\n",
    "\n",
    "def get_cmotion_score(input_file):\n",
    "    commands = []\n",
    "    base, ext = os.path.splitext(input_file)\n",
    "    output_file = f\"{base}_cmotion{ext}\"\n",
    "\n",
    "    commands.append(f'echo \"Getting cmotion score of {input_file} to {output_file}\"')\n",
    "    commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
    "    commands.append(f\"python -m tools.caption.camera_motion_detect {input_file}\")\n",
    "    return \" && \".join(commands), output_file\n",
    "\n",
    "\n",
    "def get_commands(job_list):\n",
    "    commands = []\n",
    "    output_file = None\n",
    "    for job in job_list:\n",
    "        cmd = job.pop(\"cmd\")\n",
    "        if output_file is None:\n",
    "            command, output_file = cmd(**job)\n",
    "            commands.append(command)\n",
    "        else:\n",
    "            job[\"input_file\"] = output_file\n",
    "            command, output_file = cmd(**job)\n",
    "            commands.append(command)\n",
    "    commands.append(f'echo \"All Done!\"')\n",
    "    return \" && \".join(commands), output_file"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Remote Launch via Paramiko\n",
    "\n",
    "First, add hosts to `~/.ssh/config`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import paramiko\n",
    "\n",
    "HOSTS = [\"host-0\", \"host-1\", \"host-2\", \"host-3\", \"host-4\", \"host-5\", \"host-6\", \"host-7\"]\n",
    "\n",
    "# load from ~/.ssh/config\n",
    "ssh_config = paramiko.SSHConfig()\n",
    "user_config_file = os.path.expanduser(\"~/.ssh/config\")\n",
    "if os.path.exists(user_config_file):\n",
    "    with open(user_config_file) as f:\n",
    "        ssh_config.parse(f)\n",
    "\n",
    "\n",
    "def get_ssh_config(hostname):\n",
    "    # get the configuration for the host\n",
    "    user_config = ssh_config.lookup(hostname)\n",
    "    cfg = {\n",
    "        \"hostname\": user_config[\"hostname\"],\n",
    "        \"username\": user_config[\"user\"],\n",
    "        \"port\": int(user_config[\"port\"]),\n",
    "        \"key_filename\": user_config[\"identityfile\"],\n",
    "    }\n",
    "    return cfg\n",
    "\n",
    "\n",
    "def connect(hostname):\n",
    "    cfg = get_ssh_config(hostname)\n",
    "    # connect\n",
    "    client = paramiko.SSHClient()\n",
    "    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n",
    "    client.connect(**cfg)\n",
    "    return client\n",
    "\n",
    "\n",
    "def run_command(command, hostname, nohup=False, log_file=None, sleep=None):\n",
    "    client = connect(hostname)\n",
    "    print(\"HOST:\", hostname)\n",
    "    if sleep:\n",
    "        command = f\"sleep {sleep}; {command}\"\n",
    "    command = f\"bash -ic '{command}'\"\n",
    "    if log_file:\n",
    "        command = f\"{command} >> {log_file} 2>&1\"\n",
    "    if nohup:\n",
    "        command = f\"nohup {command} &\"\n",
    "    print(\"COMMAND:\", command)\n",
    "    stdin, stdout, stderr = client.exec_command(command, get_pty=False)\n",
    "\n",
    "    stdout_str = stdout.read().decode()\n",
    "    stderr_str = stderr.read().decode()\n",
    "    if stdout_str:\n",
    "        print(\"==== STDOUT ====\\n\", stdout_str)\n",
    "    if stderr_str:\n",
    "        print(\"==== STDERR ====\\n\", stderr_str)\n",
    "\n",
    "    client.close()\n",
    "\n",
    "\n",
    "def run_command_all_hosts(command, hosts=HOSTS):\n",
    "    for hostname in hosts:\n",
    "        run_command(command, hostname)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here are tools to examine the machines' status."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def nvidia_smi(host):\n",
    "    if host:\n",
    "        run_command(\"nvidia-smi\", host)\n",
    "    else:\n",
    "        run_command_all_hosts(\"nvidia-smi\")\n",
    "\n",
    "\n",
    "def nvitop(host=None):\n",
    "    if host:\n",
    "        run_command(f\"/home/user/.local/bin/nvitop -1\", host)\n",
    "    else:\n",
    "        run_command_all_hosts(\"/home/user/.local/bin/nvitop -1\")\n",
    "\n",
    "\n",
    "def ps(host=None, interest=\"python|sleep|torchrun|colossal\", all=True):\n",
    "    cmd = \"ps aux\" if all else \"ps ux\"\n",
    "    if host:\n",
    "        if interest is None:\n",
    "            run_command(f\"{cmd} | cat\", host)\n",
    "        else:\n",
    "            run_command(f'{cmd} | cat | grep --color=never -E \"{interest}\"', host)\n",
    "    else:\n",
    "        if interest is None:\n",
    "            run_command_all_hosts(f\"{cmd} | cat\")\n",
    "        else:\n",
    "            run_command_all_hosts(f'{cmd} | cat | grep --color=never -E \"{interest}\"')\n",
    "\n",
    "\n",
    "def kill(pid, host):\n",
    "    run_command(f\"kill -KILL {pid}\", host)\n",
    "\n",
    "\n",
    "def pkill(interest, host):\n",
    "    run_command(f'pkill -9 -f \"{interest}\"', host)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example\n",
    "\n",
    "Remote launch via paramiko."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sleep = None\n",
    "run_command(cmd, host, log_file=log_file, nohup=True, sleep=sleep)\n",
    "ps(host)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use the following commands to monitor the status of the jobs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ps()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nvitop(host)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kill(, host)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def colossal_run(data_path, load_path=None):\n",
    "    commands = []\n",
    "    commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
    "    command = f\"colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora-v1-1/train/video.py --wandb True --data-path {data_path}\"\n",
    "    if load_path:\n",
    "        command = f\"{command} --load-path {load_path}\"\n",
    "    commands.append(command)\n",
    "    cmd = \" && \".join(commands)\n",
    "    return cmd\n",
    "\n",
    "\n",
    "def kill_all():\n",
    "    commands = []\n",
    "    commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
    "    commands.append('cat hostfile  | xargs -I \"{}\" ssh \"{}\" pkill -9 python')\n",
    "    cmd = \" && \".join(commands)\n",
    "    return cmd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "host = \"host-0\"\n",
    "log_file = os.path.join(OPEN_SORA_HOME, \"logs/train.log\")\n",
    "data_path = \"/path/to/meta.csv\"\n",
    "cmd = colossal_run(data_path)\n",
    "print(cmd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "run_command(cmd, host, log_file=log_file, nohup=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cmd = kill_all()\n",
    "run_command(cmd, host)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: Open-Sora/opensora/__init__.py
================================================


================================================
FILE: Open-Sora/opensora/acceleration/__init__.py
================================================


================================================
FILE: Open-Sora/opensora/acceleration/checkpoint.py
================================================
from collections.abc import Iterable

import torch.nn as nn
from torch.utils.checkpoint import checkpoint, checkpoint_sequential


def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1):
    """Tag ``model`` and all of its submodules for gradient checkpointing.

    Three attributes are written onto every module visited by ``model.apply``:
    ``grad_checkpointing`` (always ``True``), ``fp32_attention`` and
    ``grad_checkpointing_step``.

    Args:
        model: The ``nn.Module`` whose module tree is tagged in place.
        use_fp32_attention: Value stored as ``fp32_attention`` on each module.
        gc_step: Value stored as ``grad_checkpointing_step`` on each module.
    """
    assert isinstance(model, nn.Module)

    flags = {
        "grad_checkpointing": True,
        "fp32_attention": use_fp32_attention,
        "grad_checkpointing_step": gc_step,
    }

    def _tag(submodule):
        for attr_name, attr_value in flags.items():
            setattr(submodule, attr_name, attr_value)

    model.apply(_tag)


def auto_grad_checkpoint(module, *args, **kwargs):
    """Call ``module``, routing through activation checkpointing when flagged.

    If ``module`` has no truthy ``grad_checkpointing`` attribute it is called
    directly. Otherwise an iterable module is run with
    ``checkpoint_sequential`` (using the first child's
    ``grad_checkpointing_step`` as the second argument), and any other module
    is run with ``checkpoint``. Both use ``use_reentrant=False``.

    Args:
        module: The callable module to run.
        *args: Positional inputs forwarded to the module.
        **kwargs: Keyword inputs forwarded to the module / checkpoint call.

    Returns:
        Whatever the (possibly checkpointed) module call returns.
    """
    if not getattr(module, "grad_checkpointing", False):
        return module(*args, **kwargs)

    if isinstance(module, Iterable):
        step = module[0].grad_checkpointing_step
        return checkpoint_sequential(module, step, *args, use_reentrant=False, **kwargs)

    return checkpoint(module, *args, use_reentrant=False, **kwargs)


================================================
FILE: Open-Sora/opensora/acceleration/communications.py
================================================
import torch
import torch.distributed as dist


# ====================
# All-To-All
# ====================
def _all_to_all(
    input_: torch.Tensor,
    world_size: int,
    group: dist.ProcessGroup,
    scatter_dim: int,
    gather_dim: int,
):
    """Exchange chunks of ``input_`` between the ranks of ``group``.

    ``input_`` is cut into ``world_size`` pieces along ``scatter_dim``, the
    pieces are exchanged with ``dist.all_to_all``, and the received pieces are
    concatenated along ``gather_dim``.

    Returns:
        A contiguous tensor built from the received chunks.
    """
    pieces = torch.tensor_split(input_, world_size, scatter_dim)
    send_bufs = [piece.contiguous() for piece in pieces]
    # Every receive buffer is shaped like the first outgoing chunk.
    recv_bufs = [torch.empty_like(send_bufs[0]) for _ in range(world_size)]
    dist.all_to_all(recv_bufs, send_bufs, group=group)
    merged = torch.cat(recv_bufs, dim=gather_dim)
    return merged.contiguous()


class _AllToAll(torch.autograd.Function):
    """Differentiable all-to-all communication.

    The forward pass scatters along ``scatter_dim`` and gathers along
    ``gather_dim``; the backward pass runs the same exchange on the incoming
    gradient with the two dimensions swapped.

    Args:
        input_: input matrix
        process_group: communication group
        scatter_dim: scatter dimension
        gather_dim: gather dimension
    """

    @staticmethod
    def forward(ctx, input_, process_group, scatter_dim, gather_dim):
        # Stash everything backward needs to run the reverse exchange.
        ctx.process_group = process_group
        ctx.scatter_dim = scatter_dim
        ctx.gather_dim = gather_dim
        ctx.world_size = dist.get_world_size(process_group)
        return _all_to_all(input_, ctx.world_size, process_group, scatter_dim, gather_dim)

    @staticmethod
    def backward(ctx, grad_output):
        # Scatter/gather dims are swapped relative to forward.
        grad_input = _all_to_all(
            grad_output,
            ctx.world_size,
            ctx.process_group,
            ctx.gather_dim,
            ctx.scatter_dim,
        )
        # Only the tensor input receives a gradient.
        return grad_input, None, None, None


def all_to_all(
    input_: torch.Tensor,
    process_group: dist.ProcessGroup,
    scatter_dim: int = 2,
    gather_dim: int = 1,
):
    """Autograd-aware all-to-all: thin wrapper over ``_AllToAll.apply``.

    Args:
        input_: Tensor to exchange.
        process_group: Group over which the exchange runs.
        scatter_dim: Dimension split across ranks (default 2).
        gather_dim: Dimension along which received chunks are joined (default 1).
    """
    exchange = _AllToAll.apply
    return exchange(input_, process_group, scatter_dim, gather_dim)


def _gather(
    input_: torch.Tensor,
    world_size: int,
    group: dist.ProcessGroup,
    gather_dim: int,
):
    """Collect ``input_`` from every rank of ``group`` into a list.

    NOTE(review): this name is rebound by the later
    ``_gather(input_, pg, dim=-1)`` definition in this module, so this version
    is unreachable through the module-level name. The previous body read the
    local ``gather_list`` before assigning it (``UnboundLocalError``) and
    passed a ``gather_dim`` keyword to ``dist.gather``; it is rewritten here
    with ``all_gather`` so that it at least runs. Consider deleting it.

    Args:
        input_: Tensor contributed by the calling rank.
        world_size: Number of ranks in ``group``.
        group: Process group to gather over.
        gather_dim: Unused; kept so the signature is unchanged.

    Returns:
        A list of ``world_size`` tensors, one per rank.
    """
    gather_list = [torch.empty_like(input_) for _ in range(world_size)]
    dist.all_gather(gather_list, input_, group=group)
    return gather_list


# ====================
# Gather-Split
# ====================


def _split(input_, pg: dist.ProcessGroup, dim=-1):
    """Return the calling rank's slice of ``input_`` along ``dim``.

    With a single-rank group the input is returned untouched. Otherwise the
    size of ``dim`` must be divisible by the group size; the tensor is cut into
    equal pieces and the piece indexed by this rank is returned contiguous.

    Raises:
        AssertionError: If the size of ``dim`` is not a multiple of the group size.
    """
    group_size = dist.get_world_size(pg)
    my_rank = dist.get_rank(pg)

    # Nothing to split when only one rank is involved.
    if group_size == 1:
        return input_

    extent = input_.size(dim)
    assert extent % group_size == 0, (
        f"The dimension to split ({extent}) is not a multiple of world size ({group_size}), "
        f"cannot split tensor evenly"
    )

    shards = torch.split(input_, extent // group_size, dim=dim)
    return shards[my_rank].contiguous()


def _gather(input_, pg: dist.ProcessGroup, dim=-1):
    """All-gather ``input_`` across ``pg`` and concatenate along ``dim``.

    Args:
        input_: Tensor contributed by the calling rank.
        pg: Process group to gather over.
        dim: Dimension along which the gathered tensors are concatenated.

    Returns:
        The contiguous input itself when the group has a single rank,
        otherwise the contiguous concatenation of every rank's tensor.

    Raises:
        AssertionError: If more than one rank is involved and ``input_`` is
            not on a CUDA device.
    """
    input_ = input_.contiguous()
    world_size = dist.get_world_size(pg)

    # skip if only one rank involved
    if world_size == 1:
        return input_

    # NOTE(review): presumably required by the communication backend - confirm.
    assert input_.device.type == "cuda"

    # all gather
    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
    dist.all_gather(tensor_list, input_, group=pg)

    # concat
    return torch.cat(tensor_list, dim=dim).contiguous()


class _GatherForwardSplitBackward(torch.autograd.Function):
    """Gather in forward, split in backward.

    Forward concatenates the input from all ranks of the group along ``dim``;
    backward optionally rescales the gradient and keeps only this rank's slice.

    Args:
        input_: input matrix.
        process_group: parallel mode.
        dim: dimension
        grad_scale: ``"up"`` multiplies the gradient by the group size,
            ``"down"`` divides it; any other value leaves it unscaled.
    """

    @staticmethod
    def symbolic(graph, input_):
        # NOTE(review): `_gather` is called without a process group here and
        # would fail if this hook were ever invoked - confirm whether it is used.
        return _gather(input_)

    @staticmethod
    def forward(ctx, input_, process_group, dim, grad_scale):
        ctx.grad_scale = grad_scale
        ctx.dim = dim
        ctx.mode = process_group
        return _gather(input_, process_group, dim)

    @staticmethod
    def backward(ctx, grad_output):
        group, scaling = ctx.mode, ctx.grad_scale
        if scaling == "up":
            grad_output = grad_output * dist.get_world_size(group)
        elif scaling == "down":
            grad_output = grad_output / dist.get_world_size(group)

        grad_input = _split(grad_output, group, ctx.dim)
        # No gradients for process_group, dim and grad_scale.
        return grad_input, None, None, None


class _SplitForwardGatherBackward(torch.autograd.Function):
    """Split in forward, gather in backward.

    Forward keeps only the chunk of the input that corresponds to this rank;
    backward optionally rescales the gradient and gathers it across the group.

    Args:
        input_: input matrix.
        process_group: parallel mode.
        dim: dimension
        grad_scale: ``"up"`` multiplies the gradient by the group size,
            ``"down"`` divides it; any other value leaves it unscaled.
    """

    @staticmethod
    def symbolic(graph, input_):
        # NOTE(review): `_split` is called without a process group here and
        # would fail if this hook were ever invoked - confirm whether it is used.
        return _split(input_)

    @staticmethod
    def forward(ctx, input_, process_group, dim, grad_scale):
        ctx.grad_scale = grad_scale
        ctx.dim = dim
        ctx.mode = process_group
        return _split(input_, process_group, dim)

    @staticmethod
    def backward(ctx, grad_output):
        group, scaling = ctx.mode, ctx.grad_scale
        if scaling == "up":
            grad_output = grad_output * dist.get_world_size(group)
        elif scaling == "down":
            grad_output = grad_output / dist.get_world_size(group)

        grad_input = _gather(grad_output, group, ctx.dim)
        # No gradients for process_group, dim and grad_scale.
        return grad_input, None, None, None


def split_forward_gather_backward(input_, process_group, dim, grad_scale=1.0):
    """Split ``input_`` along ``dim`` in forward and gather its gradient in backward.

    ``grad_scale`` is forwarded unchanged to ``_SplitForwardGatherBackward``;
    its backward only rescales for the values ``"up"`` and ``"down"``, so the
    default ``1.0`` applies no scaling.
    """
    apply_fn = _SplitForwardGatherBackward.apply
    return apply_fn(input_, process_group, dim, grad_scale)


def gather_forward_split_backward(input_, process_group, dim, grad_scale=None):
    """Gather ``input_`` along ``dim`` in forward and split its gradient in backward.

    ``grad_scale`` is forwarded unchanged to ``_GatherForwardSplitBackward``;
    its backward only rescales for the values ``"up"`` and ``"down"``, so the
    default ``None`` applies no scaling.
    """
    apply_fn = _GatherForwardSplitBackward.apply
    return apply_fn(input_, process_group, dim, grad_scale)


================================================
FILE: Open-Sora/opensora/acceleration/parallel_states.py
================================================
import torch.distributed as dist

# Process-wide registry of parallel groups, keyed by "data" / "sequence".
_GLOBAL_PARALLEL_GROUPS = {}


def set_data_parallel_group(group: dist.ProcessGroup):
    """Store ``group`` as the data-parallel process group."""
    _GLOBAL_PARALLEL_GROUPS.update(data=group)


def get_data_parallel_group():
    """Return the stored data-parallel group, or ``dist.group.WORLD`` if none was set."""
    fallback = dist.group.WORLD
    return _GLOBAL_PARALLEL_GROUPS.get("data", fallback)


def set_sequence_parallel_group(group: dist.ProcessGroup):
    """Store ``group`` as the sequence-parallel process group."""
    _GLOBAL_PARALLEL_GROUPS.update(sequence=group)


def get_sequence_parallel_group():
    """Return the stored sequence-parallel group, or ``None`` if none was set."""
    return _GLOBAL_PARALLEL_GROUPS.get("sequence")


================================================
FILE: Open-Sora/opensora/acceleration/plugin.py
================================================
import random
from typing import Optional

import numpy as np
import torch
from colossalai.booster.plugin import LowLevelZeroPlugin
from colossalai.cluster import ProcessGroupMesh
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

DP_AXIS, SP_AXIS = 0, 1


class ZeroSeqParallelPlugin(LowLevelZeroPlugin):
    """``LowLevelZeroPlugin`` extended with a data-parallel x sequence-parallel mesh.

    The world is arranged as a ``(dp_size, sp_size)`` ``ProcessGroupMesh``
    where ``dp_size = world_size // sp_size``. The groups and this rank's
    coordinates along both axes are exposed as ``dp_group`` / ``sp_group`` and
    ``dp_rank`` / ``sp_rank``.

    Args:
        sp_size: Sequence-parallel size; must divide ``world_size``.

    All remaining constructor arguments are forwarded unchanged to
    ``LowLevelZeroPlugin.__init__``.
    """

    def __init__(
        self,
        sp_size: int = 1,
        stage: int = 2,
        precision: str = "fp16",
        initial_scale: float = 2**32,
        min_scale: float = 1,
        growth_factor: float = 2,
        backoff_factor: float = 0.5,
        growth_interval: int = 1000,
        hysteresis: int = 2,
        max_scale: float = 2**32,
        max_norm: float = 0.0,
        norm_type: float = 2.0,
        reduce_bucket_size_in_m: int = 12,
        communication_dtype: Optional[torch.dtype] = None,
        overlap_communication: bool = True,
        cpu_offload: bool = False,
        master_weights: bool = True,
        verbose: bool = False,
    ) -> None:
        super().__init__(
            stage=stage,
            precision=precision,
            initial_scale=initial_scale,
            min_scale=min_scale,
            growth_factor=growth_factor,
            backoff_factor=backoff_factor,
            growth_interval=growth_interval,
            hysteresis=hysteresis,
            max_scale=max_scale,
            max_norm=max_norm,
            norm_type=norm_type,
            reduce_bucket_size_in_m=reduce_bucket_size_in_m,
            communication_dtype=communication_dtype,
            overlap_communication=overlap_communication,
            cpu_offload=cpu_offload,
            master_weights=master_weights,
            verbose=verbose,
        )
        self.sp_size = sp_size
        # NOTE(review): `world_size` is presumably provided by the parent plugin - confirm.
        assert self.world_size % sp_size == 0, "world_size must be divisible by sp_size"
        self.dp_size = self.world_size // sp_size
        self.pg_mesh = ProcessGroupMesh(self.dp_size, self.sp_size)
        self.dp_group = self.pg_mesh.get_group_along_axis(DP_AXIS)
        self.sp_group = self.pg_mesh.get_group_along_axis(SP_AXIS)
        self.dp_rank = self.pg_mesh.coordinate(DP_AXIS)
        self.sp_rank = self.pg_mesh.coordinate(SP_AXIS)

    def __del__(self):
        """Destroy the process groups in ProcessGroupMesh"""
        # `pg_mesh` does not exist when __init__ raised before creating it
        # (e.g. the sp_size assertion); skip cleanup instead of raising
        # AttributeError from the finalizer.
        pg_mesh = getattr(self, "pg_mesh", None)
        if pg_mesh is not None:
            pg_mesh.destroy_mesh_process_groups()

    def prepare_dataloader(
        self,
        dataset,
        batch_size,
        shuffle=False,
        seed=1024,
        drop_last=False,
        pin_memory=False,
        num_workers=0,
        distributed_sampler_cls=None,
        **kwargs,
    ):
        """Build a ``DataLoader`` whose sampler shards ``dataset`` over the data-parallel ranks.

        Args:
            dataset: Dataset to load from.
            batch_size: Batch size passed to the ``DataLoader``.
            shuffle: Passed to the sampler.
            seed: Seed applied to NumPy, PyTorch and ``random`` in each worker.
            drop_last: Passed to the ``DataLoader``.
            pin_memory: Passed to the ``DataLoader``.
            num_workers: Passed to the ``DataLoader``.
            distributed_sampler_cls: Sampler class; defaults to ``DistributedSampler``.
            **kwargs: Extra keyword arguments forwarded to the ``DataLoader``.

        Returns:
            The configured ``DataLoader``.
        """
        _kwargs = kwargs.copy()
        distributed_sampler_cls = distributed_sampler_cls or DistributedSampler
        # The sampler is sharded by dp_size / dp_rank rather than the full world.
        sampler = distributed_sampler_cls(dataset, num_replicas=self.dp_size, rank=self.dp_rank, shuffle=shuffle)

        # Deterministic dataloader
        def seed_worker(worker_id):
            # worker_id is ignored: every worker is seeded with the same value.
            worker_seed = seed
            np.random.seed(worker_seed)
            torch.manual_seed(worker_seed)
            random.seed(worker_seed)

        return DataLoader(
            dataset,
            batch_size=batch_size,
            sampler=sampler,
            worker_init_fn=seed_worker,
            drop_last=drop_last,
            pin_memory=pin_memory,
            num_workers=num_workers,
            **_kwargs,
        )


================================================
FILE: Open-Sora/opensora/acceleration/shardformer/__init__.py
================================================


================================================
FILE: Open-Sora/opensora/acceleration/shardformer/modeling/__init__.py
================================================


================================================
FILE: Open-Sora/opensora/acceleration/shardformer/modeling/t5.py
================================================
import torch
import torch.nn as nn


class T5LayerNorm(nn.Module):
    """T5-style layer norm: scale-only RMS normalization (no mean subtraction, no bias)."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Create the norm with a learnable all-ones ``weight`` of shape ``hidden_size``
        and store ``eps`` as ``variance_epsilon``.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Root Mean Square Layer Normalization (https://arxiv.org/abs/1910.07467):
        # the variance is computed without subtracting the mean and no bias is added.
        # The squared mean is accumulated in fp32 so half-precision inputs stay accurate.
        as_fp32 = hidden_states.to(torch.float32)
        mean_square = as_fp32.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.variance_epsilon)
        normed = hidden_states * inv_rms

        # cast back to the weight's half-precision dtype if necessary
        if self.weight.dtype in (torch.float16, torch.bfloat16):
            normed = normed.to(self.weight.dtype)

        return self.weight * normed

    @staticmethod
    def from_native_module(module, *args, **kwargs):
        """Build a ``T5LayerNorm`` from a module whose class is named ``FusedRMSNorm``.

        The new layer takes ``normalized_shape`` and ``eps`` from ``module``,
        copies its weight values and is moved to the weight's device.
        """
        source_cls_name = module.__class__.__name__
        assert source_cls_name == "FusedRMSNorm", (
            "Recovering T5LayerNorm requires the original layer to be apex's Fused RMS Norm."
            "Apex's fused norm is automatically used by Hugging Face Transformers https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L265C5-L265C48"
        )

        rebuilt = T5LayerNorm(module.normalized_shape, eps=module.eps)
        rebuilt.weight.data.copy_(module.weight.data)
        return rebuilt.to(module.weight.device)


================================================
FILE: Open-Sora/opensora/acceleration/shardformer/policy/__init__.py
================================================


================================================
FILE: Open-Sora/opensora/acceleration/shardformer/policy/t5_encoder.py
================================================
from colossalai.shardformer.modeling.jit import get_jit_fused_dropout_add_func
from colossalai.shardformer.modeling.t5 import get_jit_fused_T5_layer_ff_forward, get_T5_layer_self_attention_forward
from colossalai.shardformer.policies.base_policy import Policy, SubModuleReplacementDescription


class T5EncoderPolicy(Policy):
    """Shardformer policy for the Hugging Face T5 encoder.

    Replaces the T5 layer norms with ``T5LayerNorm`` when that module can be imported,
    and installs the JIT-fused forward / dropout-add methods when
    ``shard_config.enable_jit_fused`` is set. Tensor parallelism and flash attention
    must both be disabled.
    """

    def config_sanity_check(self):
        """Reject shard configs that enable tensor parallelism or flash attention."""
        shard_config = self.shard_config
        assert not shard_config.enable_tensor_parallelism
        assert not shard_config.enable_flash_attention

    def preprocess(self):
        """Return the model untouched; no preprocessing is required."""
        return self.model

    def module_policy(self):
        """Assemble and return the replacement policy dict keyed by T5 module class."""
        from transformers.models.t5.modeling_t5 import T5LayerFF, T5LayerSelfAttention, T5Stack

        policy = {}

        # Layer-norm replacement is best-effort: it is skipped when the import fails.
        try:
            from opensora.acceleration.shardformer.modeling.t5 import T5LayerNorm

            norm_targets = (
                (T5LayerFF, "layer_norm"),
                (T5LayerSelfAttention, "layer_norm"),
                (T5Stack, "final_layer_norm"),
            )
            for target_cls, norm_suffix in norm_targets:
                self.append_or_create_submodule_replacement(
                    description=SubModuleReplacementDescription(suffix=norm_suffix, target_module=T5LayerNorm),
                    policy=policy,
                    target_key=target_cls,
                )
        except ImportError:  # ModuleNotFoundError is a subclass of ImportError
            pass

        # Optional JIT-fused operators.
        if self.shard_config.enable_jit_fused:
            forward_factories = (
                (T5LayerFF, get_jit_fused_T5_layer_ff_forward),
                (T5LayerSelfAttention, get_T5_layer_self_attention_forward),
            )
            for target_cls, make_forward in forward_factories:
                self.append_or_create_method_replacement(
                    description={
                        "forward": make_forward(),
                        "dropout_add": get_jit_fused_dropout_add_func(),
                    },
                    policy=policy,
                    target_key=target_cls,
                )

        return policy

    def postprocess(self):
        """Return the model untouched; no postprocessing is required."""
        return self.model


================================================
FILE: Open-Sora/opensora/datasets/__init__.py
================================================
from .datasets import IMG_FPS, BatchFeatureDataset, VariableVideoTextDataset, VideoTextDataset
from .utils import get_transforms_image, get_transforms_video, is_img, is_vid, save_sample


================================================
FILE: Open-Sora/opensora/datasets/aspect.py
================================================
import math


# computation
def get_h_w(a, ts, eps=1e-4):
    """Return an integer ``(h, w)`` for aspect ratio ``a`` (h / w) and pixel budget ``ts``.

    ``h`` starts from ``sqrt(ts * a)`` and ``w`` from ``h / a`` (using the already
    rounded ``h``). Each value is bumped by ``eps`` and then snapped: the ceiling is
    taken when it is even, otherwise the floor.
    """

    def _snap(value):
        # eps keeps values that are integers up to float error from landing just below one.
        value = value + eps
        upper = math.ceil(value)
        return upper if upper % 2 == 0 else math.floor(value)

    h = _snap((ts * a) ** 0.5)
    w = _snap(h / a)
    return h, w


def get_aspect_ratios_dict(ars, ts=360 * 640):
    """Map each aspect ratio in ``ars`` (formatted to two decimals) to ``get_h_w(a, ts)``.

    Ratios that format to the same key overwrite earlier ones.
    """
    est = {}
    for a in ars:
        est[f"{a:.2f}"] = get_h_w(a, ts)
    return est


def get_ar(ratio):
    """Convert an ``"H:W"`` string such as ``"9:16"`` into the float ``H / W``."""
    height_part, width_part = ratio.split(":")
    numerator = int(height_part)
    denominator = int(width_part)
    return numerator / denominator


# Maps a human-readable "H:W" aspect-ratio name to the two-decimal key used by the
# per-resolution ASPECT_RATIO_* tables in this module.
# NOTE(review): 50/27 is about 1.85, whereas "2.08" is about 25/12 (the inverse of
# "12:25"); confirm whether the "50:27" label is intended.
ASPECT_RATIO_MAP = {
    "3:8": "0.38",
    "9:21": "0.43",
    "12:25": "0.48",
    "1:2": "0.50",
    "9:17": "0.53",
    "27:50": "0.54",
    "9:16": "0.56",
    "5:8": "0.62",
    "2:3": "0.67",
    "3:4": "0.75",
    "1:1": "1.00",
    "4:3": "1.33",
    "3:2": "1.50",
    "16:9": "1.78",
    "17:9": "1.89",
    "2:1": "2.00",
    "50:27": "2.08",
}


# Float H / W value for every "H:W" name above, in the same order.
AR = [get_ar(ratio) for ratio in ASPECT_RATIO_MAP.keys()]

# computed from above code
# S = 8294400
# Keys are H/W ratios formatted to two decimals; values are (height, width).
# Entries are kept in ascending key order, like the sibling tables, because
# get_closest_ratio resolves exact ties by insertion order.
ASPECT_RATIO_4K = {
    "0.38": (1764, 4704),
    "0.43": (1886, 4400),
    "0.48": (1996, 4158),
    "0.50": (2036, 4072),
    "0.53": (2096, 3960),
    "0.54": (2118, 3918),
    "0.56": (2160, 3840),  # base
    "0.62": (2276, 3642),
    "0.67": (2352, 3528),
    "0.75": (2494, 3326),
    "1.00": (2880, 2880),
    "1.33": (3326, 2494),
    "1.50": (3528, 2352),
    "1.78": (3840, 2160),
    "1.89": (3958, 2096),
    "2.00": (4072, 2036),
    "2.08": (4156, 1994),
}

# S = 3686400
ASPECT_RATIO_2K = {
    "0.38": (1176, 3136),
    "0.43": (1256, 2930),
    "0.48": (1330, 2770),
    "0.50": (1358, 2716),
    "0.53": (1398, 2640),
    "0.54": (1412, 2612),
    "0.56": (1440, 2560),  # base
    "0.62": (1518, 2428),
    "0.67": (1568, 2352),
    "0.75": (1662, 2216),
    "1.00": (1920, 1920),
    "1.33": (2218, 1664),
    "1.50": (2352, 1568),
    "1.78": (2560, 1440),
    "1.89": (2638, 1396),
    "2.00": (2716, 1358),
    "2.08": (2772, 1330),
}

# S = 2073600
ASPECT_RATIO_1080P = {
    "0.38": (882, 2352),
    "0.43": (942, 2198),
    "0.48": (998, 2080),
    "0.50": (1018, 2036),
    "0.53": (1048, 1980),
    "0.54": (1058, 1958),
    "0.56": (1080, 1920),  # base
    "0.62": (1138, 1820),
    "0.67": (1176, 1764),
    "0.75": (1248, 1664),
    "1.00": (1440, 1440),
    "1.33": (1662, 1246),
    "1.50": (1764, 1176),
    "1.78": (1920, 1080),
    "1.89": (1980, 1048),
    "2.00": (2036, 1018),
    "2.08": (2078, 998),
}

# S = 921600
ASPECT_RATIO_720P = {
    "0.38": (588, 1568),
    "0.43": (628, 1466),
    "0.48": (666, 1388),
    "0.50": (678, 1356),
    "0.53": (698, 1318),
    "0.54": (706, 1306),
    "0.56": (720, 1280),  # base
    "0.62": (758, 1212),
    "0.67": (784, 1176),
    "0.75": (832, 1110),
    "1.00": (960, 960),
    "1.33": (1108, 832),
    "1.50": (1176, 784),
    "1.78": (1280, 720),
    "1.89": (1320, 698),
    "2.00": (1358, 680),
    "2.08": (1386, 666),
}

# S = 409920
ASPECT_RATIO_480P = {
    "0.38": (392, 1046),
    "0.43": (420, 980),
    "0.48": (444, 925),
    "0.50": (452, 904),
    "0.53": (466, 880),
    "0.54": (470, 870),
    "0.56": (480, 854),  # base
    "0.62": (506, 810),
    "0.67": (522, 784),
    "0.75": (554, 738),
    "1.00": (640, 640),
    "1.33": (740, 555),
    "1.50": (784, 522),
    "1.78": (854, 480),
    "1.89": (880, 466),
    "2.00": (906, 454),
    "2.08": (924, 444),
}

# S = 230400
ASPECT_RATIO_360P = {
    "0.38": (294, 784),
    "0.43": (314, 732),
    "0.48": (332, 692),
    "0.50": (340, 680),
    "0.53": (350, 662),
    "0.54": (352, 652),
    "0.56": (360, 640),  # base
    "0.62": (380, 608),
    "0.67": (392, 588),
    "0.75": (416, 554),
    "1.00": (480, 480),
    "1.33": (554, 416),
    "1.50": (588, 392),
    "1.78": (640, 360),
    "1.89": (660, 350),
    "2.00": (678, 340),
    "2.08": (692, 332),
}

# S = 102240
ASPECT_RATIO_240P = {
    "0.38": (196, 522),
    "0.43": (210, 490),
    "0.48": (222, 462),
    "0.50": (226, 452),
    "0.53": (232, 438),
    "0.54": (236, 436),
    "0.56": (240, 426),  # base
    "0.62": (252, 404),
    "0.67": (262, 393),
    "0.75": (276, 368),
    "1.00": (320, 320),
    "1.33": (370, 278),
    "1.50": (392, 262),
    "1.78": (426, 240),
    "1.89": (440, 232),
    "2.00": (452, 226),
    "2.08": (462, 222),
}

# S = 36864
ASPECT_RATIO_144P = {
    "0.38": (117, 312),
    "0.43": (125, 291),
    "0.48": (133, 277),
    "0.50": (135, 270),
    "0.53": (139, 262),
    "0.54": (141, 260),
    "0.56": (144, 256),  # base
    "0.62": (151, 241),
    "0.67": (156, 234),
    "0.75": (166, 221),
    "1.00": (192, 192),
    "1.33": (221, 165),
    "1.50": (235, 156),
    "1.78": (256, 144),
    "1.89": (263, 139),
    "2.00": (271, 135),
    "2.08": (277, 132),
}

# Bucket sizes from PixArt: keys are approximate H/W ratios, values are (height, width).
# S = 8294400
ASPECT_RATIO_2880 = {
    "0.25": (1408, 5760),
    "0.26": (1408, 5568),
    "0.27": (1408, 5376),
    "0.28": (1408, 5184),
    "0.32": (1600, 4992),
    "0.33": (1600, 4800),
    "0.34": (1600, 4672),
    "0.40": (1792, 4480),
    "0.42": (1792, 4288),
    "0.47": (1920, 4096),
    "0.49": (1920, 3904),
    "0.51": (1920, 3776),
    "0.55": (2112, 3840),
    "0.59": (2112, 3584),
    "0.68": (2304, 3392),
    "0.72": (2304, 3200),
    "0.78": (2496, 3200),
    "0.83": (2496, 3008),
    "0.89": (2688, 3008),
    "0.93": (2688, 2880),
    "1.00": (2880, 2880),
    "1.07": (2880, 2688),
    "1.12": (3008, 2688),
    "1.21": (3008, 2496),
    "1.28": (3200, 2496),
    "1.39": (3200, 2304),
    "1.47": (3392, 2304),
    "1.70": (3584, 2112),
    "1.82": (3840, 2112),
    "2.03": (3904, 1920),
    "2.13": (4096, 1920),
    "2.39": (4288, 1792),
    "2.50": (4480, 1792),
    "2.92": (4672, 1600),
    "3.00": (4800, 1600),
    "3.12": (4992, 1600),
    "3.68": (5184, 1408),
    "3.82": (5376, 1408),
    "3.95": (5568, 1408),
    "4.00": (5760, 1408),
}

# S = 4194304
ASPECT_RATIO_2048 = {
    "0.25": (1024, 4096),
    "0.26": (1024, 3968),
    "0.27": (1024, 3840),
    "0.28": (1024, 3712),
    "0.32": (1152, 3584),
    "0.33": (1152, 3456),
    "0.35": (1152, 3328),
    "0.40": (1280, 3200),
    "0.42": (1280, 3072),
    "0.48": (1408, 2944),
    "0.50": (1408, 2816),
    "0.52": (1408, 2688),
    "0.57": (1536, 2688),
    "0.60": (1536, 2560),
    "0.68": (1664, 2432),
    "0.72": (1664, 2304),
    "0.78": (1792, 2304),
    "0.82": (1792, 2176),
    "0.88": (1920, 2176),
    "0.94": (1920, 2048),
    "1.00": (2048, 2048),
    "1.07": (2048, 1920),
    "1.13": (2176, 1920),
    "1.21": (2176, 1792),
    "1.29": (2304, 1792),
    "1.38": (2304, 1664),
    "1.46": (2432, 1664),
    "1.67": (2560, 1536),
    "1.75": (2688, 1536),
    "2.00": (2816, 1408),
    "2.09": (2944, 1408),
    "2.40": (3072, 1280),
    "2.50": (3200, 1280),
    "2.89": (3328, 1152),
    "3.00": (3456, 1152),
    "3.11": (3584, 1152),
    "3.62": (3712, 1024),
    "3.75": (3840, 1024),
    "3.88": (3968, 1024),
    "4.00": (4096, 1024),
}

# S = 1048576
ASPECT_RATIO_1024 = {
    "0.25": (512, 2048),
    "0.26": (512, 1984),
    "0.27": (512, 1920),
    "0.28": (512, 1856),
    "0.32": (576, 1792),
    "0.33": (576, 1728),
    "0.35": (576, 1664),
    "0.40": (640, 1600),
    "0.42": (640, 1536),
    "0.48": (704, 1472),
    "0.50": (704, 1408),
    "0.52": (704, 1344),
    "0.57": (768, 1344),
    "0.60": (768, 1280),
    "0.68": (832, 1216),
    "0.72": (832, 1152),
    "0.78": (896, 1152),
    "0.82": (896, 1088),
    "0.88": (960, 1088),
    "0.94": (960, 1024),
    "1.00": (1024, 1024),
    "1.07": (1024, 960),
    "1.13": (1088, 960),
    "1.21": (1088, 896),
    "1.29": (1152, 896),
    "1.38": (1152, 832),
    "1.46": (1216, 832),
    "1.67": (1280, 768),
    "1.75": (1344, 768),
    "2.00": (1408, 704),
    "2.09": (1472, 704),
    "2.40": (1536, 640),
    "2.50": (1600, 640),
    "2.89": (1664, 576),
    "3.00": (1728, 576),
    "3.11": (1792, 576),
    "3.62": (1856, 512),
    "3.75": (1920, 512),
    "3.88": (1984, 512),
    "4.00": (2048, 512),
}

# S = 262144
ASPECT_RATIO_512 = {
    "0.25": (256, 1024),
    "0.26": (256, 992),
    "0.27": (256, 960),
    "0.28": (256, 928),
    "0.32": (288, 896),
    "0.33": (288, 864),
    "0.35": (288, 832),
    "0.40": (320, 800),
    "0.42": (320, 768),
    "0.48": (352, 736),
    "0.50": (352, 704),
    "0.52": (352, 672),
    "0.57": (384, 672),
    "0.60": (384, 640),
    "0.68": (416, 608),
    "0.72": (416, 576),
    "0.78": (448, 576),
    "0.82": (448, 544),
    "0.88": (480, 544),
    "0.94": (480, 512),
    "1.00": (512, 512),
    "1.07": (512, 480),
    "1.13": (544, 480),
    "1.21": (544, 448),
    "1.29": (576, 448),
    "1.38": (576, 416),
    "1.46": (608, 416),
    "1.67": (640, 384),
    "1.75": (672, 384),
    "2.00": (704, 352),
    "2.09": (736, 352),
    "2.40": (768, 320),
    "2.50": (800, 320),
    "2.89": (832, 288),
    "3.00": (864, 288),
    "3.11": (896, 288),
    "3.62": (928, 256),
    "3.75": (960, 256),
    "3.88": (992, 256),
    "4.00": (1024, 256),
}

# S = 65536
ASPECT_RATIO_256 = {
    "0.25": (128, 512),
    "0.26": (128, 496),
    "0.27": (128, 480),
    "0.28": (128, 464),
    "0.32": (144, 448),
    "0.33": (144, 432),
    "0.35": (144, 416),
    "0.40": (160, 400),
    "0.42": (160, 384),
    "0.48": (176, 368),
    "0.50": (176, 352),
    "0.52": (176, 336),
    "0.57": (192, 336),
    "0.60": (192, 320),
    "0.68": (208, 304),
    "0.72": (208, 288),
    "0.78": (224, 288),
    "0.82": (224, 272),
    "0.88": (240, 272),
    "0.94": (240, 256),
    "1.00": (256, 256),
    "1.07": (256, 240),
    "1.13": (272, 240),
    "1.21": (272, 224),
    "1.29": (288, 224),
    "1.38": (288, 208),
    "1.46": (304, 208),
    "1.67": (320, 192),
    "1.75": (336, 192),
    "2.00": (352, 176),
    "2.09": (368, 176),
    "2.40": (384, 160),
    "2.50": (400, 160),
    "2.89": (416, 144),
    "3.00": (432, 144),
    "3.11": (448, 144),
    "3.62": (464, 128),
    "3.75": (480, 128),
    "3.88": (496, 128),
    "4.00": (512, 128),
}


def get_closest_ratio(height: float, width: float, ratios: dict):
    """Return the key of ``ratios`` whose float value is nearest to ``height / width``.

    On an exact tie the key that appears first in ``ratios`` is returned.
    """
    target = height / width

    def _distance(ratio_key):
        return abs(float(ratio_key) - target)

    return min(ratios.keys(), key=_distance)


# Resolution name -> (pixel count, aspect-ratio table for that resolution).
# Entries are listed by ascending pixel count; "2880" and "4k" share the same
# pixel count (8294400) but use different tables.
ASPECT_RATIOS = {
    "144p": (36864, ASPECT_RATIO_144P),
    "256": (65536, ASPECT_RATIO_256),
    "240p": (102240, ASPECT_RATIO_240P),
    "360p": (230400, ASPECT_RATIO_360P),
    "512": (262144, ASPECT_RATIO_512),
    "480p": (409920, ASPECT_RATIO_480P),
    "720p": (921600, ASPECT_RATIO_720P),
    "1024": (1048576, ASPECT_RATIO_1024),
    "1080p": (2073600, ASPECT_RATIO_1080P),
    "2k": (3686400, ASPECT_RATIO_2K),
    "2048": (4194304, ASPECT_RATIO_2048),
    "2880": (8294400, ASPECT_RATIO_2880),
    "4k": (8294400, ASPECT_RATIO_4K),
}


def get_num_pixels(name):
    """Return the pixel count registered for resolution ``name`` in ``ASPECT_RATIOS``."""
    entry = ASPECT_RATIOS[name]
    return entry[0]


def get_image_size(resolution, ar_ratio):
    """Look up the (height, width) pair for a resolution name and an aspect ratio.

    ``ar_ratio`` may be an ``"H:W"`` name from ``ASPECT_RATIO_MAP`` or a key of the
    resolution's table directly (e.g. ``"0.56"``).
    """
    # Translate "H:W" names; anything else is used as the table key as-is.
    ar_key = ASPECT_RATIO_MAP.get(ar_ratio, ar_ratio)
    _, rs_dict = ASPECT_RATIOS[resolution]
    assert ar_key in rs_dict, f"Aspect ratio {ar_ratio} not found for resolution {resolution}"
    return rs_dict[ar_key]


# Named clip lengths -> number of frames. The "Nx" and "Ns" families are aliases of
# the same frame counts ("1x" == "2s" == 51, doubling at each step).
NUM_FRAMES_MAP = {
    "1x": 51,
    "2x": 102,
    "4x": 204,
    "8x": 408,
    "16x": 816,
    "2s": 51,
    "4s": 102,
    "8s": 204,
    "16s": 408,
    "32s": 816,
}


def get_num_frames(num_frames):
    """Resolve ``num_frames`` to an int: a ``NUM_FRAMES_MAP`` name, or anything ``int()`` accepts."""
    named = NUM_FRAMES_MAP.get(num_frames)
    if named is not None:
        return named
    return int(num_frames)


================================================
FILE: Open-Sora/opensora/datasets/bucket.py
================================================
from collections import OrderedDict

import numpy as np

from opensora.utils.misc import get_logger

from .aspect import ASPECT_RATIOS, get_closest_ratio


def find_approximate_hw(hw, hw_dict, approx=0.8):
    """Return the first key of ``hw_dict`` whose value times ``approx`` does not exceed ``hw``.

    Keys are tested in the dict's iteration order; ``None`` is returned when none qualifies.
    """
    matches = (name for name, pixels in hw_dict.items() if hw >= pixels * approx)
    return next(matches, None)


def find_closet_smaller_bucket(t, t_dict, frame_interval):
    """Pick the time-bucket key for a sample with ``t`` frames, or ``None`` if none fits.

    A sample with ``t == 1`` (an image) only matches the key ``1``. Otherwise the first
    entry (in iteration order) with ``t >= value * frame_interval`` and ``value != 1`` wins.
    """
    if t == 1:
        # Images go to the dedicated single-frame bucket, if it exists.
        return 1 if 1 in t_dict else None

    candidates = (key for key, frames in t_dict.items() if t >= frames * frame_interval and frames != 1)
    return next(candidates, None)


class Bucket:
    """Three-level bucket table: resolution name -> time key -> aspect-ratio key.

    ``bucket_config`` maps a resolution name (a key of ``ASPECT_RATIOS``) to a dict of
    time keys; each time entry is indexed with ``[0]`` for a probability and ``[1]`` for
    a batch size.
    """

    def __init__(self, bucket_config):
        for key in bucket_config:
            assert key in ASPECT_RATIOS, f"Aspect ratio {key} not found."
        # wrap config with OrderedDict
        bucket_probs = OrderedDict()
        bucket_bs = OrderedDict()
        # Resolutions are ordered by pixel count, largest first.
        bucket_names = sorted(bucket_config.keys(), key=lambda x: ASPECT_RATIOS[x][0], reverse=True)
        for key in bucket_names:
            # Time keys are ordered largest first as well.
            bucket_time_names = sorted(bucket_config[key].keys(), key=lambda x: x, reverse=True)
            bucket_probs[key] = OrderedDict({k: bucket_config[key][k][0] for k in bucket_time_names})
            bucket_bs[key] = OrderedDict({k: bucket_config[key][k][1] for k in bucket_time_names})

        # first level: HW
        num_bucket = 0
        hw_criteria = dict()
        t_criteria = dict()
        ar_criteria = dict()
        bucket_id = OrderedDict()
        bucket_id_cnt = 0
        for k1, v1 in bucket_probs.items():
            hw_criteria[k1] = ASPECT_RATIOS[k1][0]
            t_criteria[k1] = dict()
            ar_criteria[k1] = dict()
            bucket_id[k1] = dict()
            for k2, _ in v1.items():
                # second level: T; each (resolution, time) pair gets a running integer id
                t_criteria[k1][k2] = k2
                bucket_id[k1][k2] = bucket_id_cnt
                bucket_id_cnt += 1
                ar_criteria[k1][k2] = dict()
                for k3, v3 in ASPECT_RATIOS[k1][1].items():
                    # third level: aspect ratio; num_bucket counts every (HW, T, AR) triple
                    ar_criteria[k1][k2][k3] = v3
                    num_bucket += 1

        self.bucket_probs = bucket_probs
        self.bucket_bs = bucket_bs
        self.bucket_id = bucket_id
        self.hw_criteria = hw_criteria
        self.t_criteria = t_criteria
        self.ar_criteria = ar_criteria
        self.num_bucket = num_bucket
        get_logger().info("Number of buckets: %s", num_bucket)

    def get_bucket_id(self, T, H, W, frame_interval=1, seed=None):
        """Choose a ``(hw_id, t_id, ar_id)`` bucket for a sample, or ``None`` if none accepts it.

        Random draws use ``np.random.default_rng(seed + <integer id of the (hw, t) pair>)``.
        NOTE(review): with the default ``seed=None`` that addition raises ``TypeError``, so
        callers presumably always pass an integer seed; confirm against the sampler.
        """
        resolution = H * W
        approx = 0.8

        fail = True
        # NOTE: despite its name, ``t_criteria`` here holds the per-time-key probabilities
        # from ``self.bucket_probs`` (iterated from the largest resolution down).
        for hw_id, t_criteria in self.bucket_probs.items():
            # Skip resolutions the sample is too small for (below 80% of the pixel count).
            if resolution < self.hw_criteria[hw_id] * approx:
                continue

            # if sample is an image
            if T == 1:
                if 1 in t_criteria:
                    rng = np.random.default_rng(seed + self.bucket_id[hw_id][1])
                    if rng.random() < t_criteria[1]:
                        fail = False
                        t_id = 1
                        break
                    # A failed draw falls through to the video search below.
                else:
                    continue

            # otherwise, find suitable t_id for video
            t_fail = True
            for t_id, prob in t_criteria.items():
                rng = np.random.default_rng(seed + self.bucket_id[hw_id][t_id])
                # A tuple probability carries a second value used as a per-time-key
                # keep probability; a failed draw skips this time key.
                if isinstance(prob, tuple):
                    prob_t = prob[1]
                    if rng.random() > prob_t:
                        continue
                if T > t_id * frame_interval and t_id != 1:
                    t_fail = False
                    break
            if t_fail:
                continue

            # leave the loop if prob is high enough
            # ``prob`` and ``rng`` are the ones left over from the matching time key above.
            if isinstance(prob, tuple):
                prob = prob[0]
            if prob >= 1 or rng.random() < prob:
                fail = False
                break
        if fail:
            return None

        # get aspect ratio id
        ar_criteria = self.ar_criteria[hw_id][t_id]
        ar_id = get_closest_ratio(H, W, ar_criteria)
        return hw_id, t_id, ar_id

    def get_thw(self, bucket_id):
        """Return ``(T, H, W)`` for a ``(hw_id, t_id, ar_id)`` triple."""
        assert len(bucket_id) == 3
        T = self.t_criteria[bucket_id[0]][bucket_id[1]]
        H, W = self.ar_criteria[bucket_id[0]][bucket_id[1]][bucket_id[2]]
        return T, H, W

    def get_prob(self, bucket_id):
        """Return the configured probability entry for the bucket's (hw_id, t_id) pair."""
        return self.bucket_probs[bucket_id[0]][bucket_id[1]]

    def get_batch_size(self, bucket_id):
        """Return the configured batch size for the bucket's (hw_id, t_id) pair."""
        return self.bucket_bs[bucket_id[0]][bucket_id[1]]

    def __len__(self):
        """Return the total number of (HW, T, AR) buckets."""
        return self.num_bucket


def closet_smaller_bucket(value, bucket):
    """Return the element just before the first ``bucket[i]`` (``i >= 1``) exceeding ``value``.

    ``bucket[0]`` is never compared, so it is the result whenever ``value < bucket[1]``.
    The last element is returned when no later element exceeds ``value``.
    """
    for previous, current in zip(bucket, bucket[1:]):
        if value < current:
            return previous
    return bucket[-1]


================================================
FILE: Open-Sora/opensora/datasets/dataloader.py
================================================
import collections
import random
from typing import Optional

import numpy as np
import torch
from torch.distributed import ProcessGroup
from torch.distributed.distributed_c10d import _get_default_group
from torch.utils.data import DataLoader

from .datasets import BatchFeatureDataset, VariableVideoTextDataset, VideoTextDataset
from .sampler import BatchDistributedSampler, StatefulDistributedSampler, VariableVideoBatchSampler


# Deterministic dataloader
def get_seed_worker(seed):
    """Return a ``worker_init_fn`` that seeds NumPy, PyTorch and ``random`` with ``seed``.

    The worker id is ignored, so every worker receives the same seed.
    """

    def seed_worker(worker_id):
        for reseed in (np.random.seed, torch.manual_seed, random.seed):
            reseed(seed)

    return seed_worker


def prepare_dataloader(
    dataset,
    batch_size=None,
    shuffle=False,
    seed=1024,
    drop_last=False,
    pin_memory=False,
    num_workers=0,
    process_group: Optional[ProcessGroup] = None,
    bucket_config=None,
    num_bucket_build_workers=1,
    prefetch_factor=None,
    **kwargs,
):
    """Build a distributed ``DataLoader`` and its sampler for one of the supported datasets.

    Args:
        dataset: a ``VariableVideoTextDataset``, ``VideoTextDataset`` or ``BatchFeatureDataset``.
        batch_size: per-step batch size; only used for ``VideoTextDataset``.
        shuffle: forwarded to the sampler (not used for ``BatchFeatureDataset``).
        seed: seeds every worker, and the bucket sampler for variable-size data.
        drop_last: forwarded to the bucket sampler or the ``DataLoader``.
        pin_memory, num_workers, prefetch_factor: forwarded to ``DataLoader``.
        process_group: group whose size and rank shard the data. When ``None`` the default
            process group is used, for every dataset type.
        bucket_config, num_bucket_build_workers: only used for ``VariableVideoTextDataset``.
        **kwargs: extra keyword arguments forwarded to ``DataLoader``.

    Returns:
        ``(dataloader, sampler)``; for ``VariableVideoTextDataset`` the second item is the
        batch sampler.

    Raises:
        ValueError: if ``dataset`` is not one of the supported types.
    """
    _kwargs = kwargs.copy()
    # VariableVideoTextDataset subclasses VideoTextDataset, so it must be tested first.
    if isinstance(dataset, VariableVideoTextDataset):
        process_group = process_group or _get_default_group()
        batch_sampler = VariableVideoBatchSampler(
            dataset,
            bucket_config,
            num_replicas=process_group.size(),
            rank=process_group.rank(),
            shuffle=shuffle,
            seed=seed,
            drop_last=drop_last,
            verbose=True,
            num_bucket_build_workers=num_bucket_build_workers,
        )
        return (
            DataLoader(
                dataset,
                batch_sampler=batch_sampler,
                worker_init_fn=get_seed_worker(seed),
                pin_memory=pin_memory,
                num_workers=num_workers,
                collate_fn=collate_fn_default,
                prefetch_factor=prefetch_factor,
                **_kwargs,
            ),
            batch_sampler,
        )
    elif isinstance(dataset, VideoTextDataset):
        process_group = process_group or _get_default_group()
        sampler = StatefulDistributedSampler(
            dataset,
            num_replicas=process_group.size(),
            rank=process_group.rank(),
            shuffle=shuffle,
        )
        return (
            DataLoader(
                dataset,
                batch_size=batch_size,
                sampler=sampler,
                worker_init_fn=get_seed_worker(seed),
                drop_last=drop_last,
                pin_memory=pin_memory,
                num_workers=num_workers,
                collate_fn=collate_fn_default,
                prefetch_factor=prefetch_factor,
                **_kwargs,
            ),
            sampler,
        )
    elif isinstance(dataset, BatchFeatureDataset):
        process_group = process_group or _get_default_group()
        sampler = BatchDistributedSampler(
            dataset,
            num_replicas=process_group.size(),
            rank=process_group.rank(),
        )
        # Each dataset item is already a full batch, hence batch_size=1 here.
        return (
            DataLoader(
                dataset,
                batch_size=1,
                sampler=sampler,
                worker_init_fn=get_seed_worker(seed),
                pin_memory=pin_memory,
                num_workers=num_workers,
                collate_fn=collate_fn_batch,
                prefetch_factor=prefetch_factor,
                **_kwargs,
            ),
            sampler,
        )
    else:
        raise ValueError(f"Unsupported dataset type: {type(dataset)}")


def collate_fn_default(batch):
    """Collate samples with ``default_collate`` after dropping ``None`` entries.

    HACK for pre-computed text features: when the first sample's ``"mask"`` is an
    ``int``, ``"mask"`` and ``"text"`` are popped from every sample before collation;
    the masks are returned as a plain list and the texts concatenated along dim 1.
    """
    samples = [item for item in batch if item is not None]

    first = samples[0]
    has_text_features = "mask" in first and isinstance(first["mask"], int)
    if has_text_features:
        masks = [item.pop("mask") for item in samples]
        texts = torch.cat([item.pop("text") for item in samples], dim=1)

    collated = torch.utils.data.default_collate(samples)

    if has_text_features:
        collated["mask"] = masks
        collated["text"] = texts
    return collated


def collate_fn_batch(batch):
    """
    Used only with BatchDistributedSampler.

    Drops ``None`` entries, runs ``default_collate`` and then squeezes the leading
    dimension that collation adds to every tensor in the result.
    """
    kept = [item for item in batch if item is not None]
    res = torch.utils.data.default_collate(kept)

    def _drop_leading(value):
        # Non-tensor values are passed through unchanged.
        return value.squeeze(0) if isinstance(value, torch.Tensor) else value

    if isinstance(res, collections.abc.Mapping):
        for name in res:
            res[name] = _drop_leading(res[name])
        return res
    if isinstance(res, collections.abc.Sequence):
        return [_drop_leading(item) for item in res]
    if isinstance(res, torch.Tensor):
        return res.squeeze(0)
    raise TypeError


================================================
FILE: Open-Sora/opensora/datasets/datasets.py
================================================
import os
from glob import glob

import numpy as np
import torch
from PIL import ImageFile
from torchvision.datasets.folder import IMG_EXTENSIONS, pil_loader

from opensora.registry import DATASETS

from .read_video import read_video
from .utils import VID_EXTENSIONS, get_transforms_image, get_transforms_video, read_file, temporal_random_crop

# Process-wide PIL setting: allow truncated image files to be loaded.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# FPS value the datasets below report for still images.
IMG_FPS = 120


@DATASETS.register_module()
class VideoTextDataset(torch.utils.data.Dataset):
    """Fixed-size video/image dataset driven by a table read from ``data_path``.

    Args:
        data_path: location handed to ``read_file``; the table must have a ``path``
            column and may have a ``text`` column.
        num_frames (int): frames sampled per video; images are repeated this many times.
        frame_interval (int): forwarded to ``temporal_random_crop``.
        image_size (tuple): size handed to the transform builders.
        transform_name (str): transform name handed to the transform builders.
    """

    def __init__(
        self,
        data_path=None,
        num_frames=16,
        frame_interval=1,
        image_size=(256, 256),
        transform_name="center",
    ):
        self.data_path = data_path
        self.data = read_file(data_path)
        # Captions are returned only when the table provides them.
        self.get_text = "text" in self.data.columns
        self.num_frames = num_frames
        self.frame_interval = frame_interval
        self.image_size = image_size
        image_transform = get_transforms_image(transform_name, image_size)
        video_transform = get_transforms_video(transform_name, image_size)
        self.transforms = {"image": image_transform, "video": video_transform}

    def _print_data_number(self):
        """Print how many entries of the table are videos and how many are images."""
        paths = self.data["path"]
        num_videos = sum(1 for path in paths if self.get_type(path) == "video")
        num_images = len(paths) - num_videos
        print(f"Dataset contains {num_videos} videos and {num_images} images.")

    def get_type(self, path):
        """Classify ``path`` as ``"video"`` or ``"image"`` from its lower-cased extension."""
        ext = os.path.splitext(path)[-1].lower()
        if ext in VID_EXTENSIONS:
            return "video"
        assert ext in IMG_EXTENSIONS, f"Unsupported file format: {ext}"
        return "image"

    def getitem(self, index):
        """Load and transform one sample; returns a dict with ``video``, ``fps`` and optionally ``text``."""
        row = self.data.iloc[index]
        path = row["path"]

        if self.get_type(path) == "video":
            vframes, vinfo = read_video(path, backend="av")
            # Fall back to 24 fps when the reader reports none.
            video_fps = vinfo["video_fps"] if "video_fps" in vinfo else 24

            clip = temporal_random_crop(vframes, self.num_frames, self.frame_interval)
            video = self.transforms["video"](clip)  # T C H W
        else:
            image = pil_loader(path)
            video_fps = IMG_FPS
            image = self.transforms["image"](image)

            # A still image becomes a clip by repeating it along a new time axis.
            video = image.unsqueeze(0).repeat(self.num_frames, 1, 1, 1)

        # TCHW -> CTHW
        ret = {"video": video.permute(1, 0, 2, 3), "fps": video_fps}
        if self.get_text:
            ret["text"] = row["text"]
        return ret

    def __getitem__(self, index):
        """Return sample ``index``; on failure report it and retry with random indices (10 tries)."""
        attempts_left = 10
        while attempts_left > 0:
            attempts_left -= 1
            try:
                return self.getitem(index)
            except Exception as e:
                path = self.data.iloc[index]["path"]
                print(f"data {path}: {e}")
                index = np.random.randint(len(self))
        raise RuntimeError("Too many bad data.")

    def __len__(self):
        """Number of rows in the table."""
        return len(self.data)


@DATASETS.register_module()
class VariableVideoTextDataset(VideoTextDataset):
    """Video/image dataset whose clip length and frame size are chosen per sample.

    The target ``(num_frames, height, width)`` is not fixed at construction time: it is
    encoded into the index string passed to ``__getitem__`` (see ``getitem``).

    Args:
        data_path: location handed to the parent class; ``get_data_info`` additionally
            reads the ``num_frames``, ``height`` and ``width`` columns.
        num_frames, frame_interval, image_size: forwarded to the parent class.
        transform_name: transform name used to build a transform per sample.
        dummy_text_feature (bool): when set, ``text`` is replaced by a zero tensor of
            shape ``(1, 50, 1152)`` and ``mask`` is set to ``50``.
    """

    def __init__(
        self,
        data_path=None,
        num_frames=None,
        frame_interval=1,
        image_size=(None, None),
        transform_name=None,
        dummy_text_feature=False,
    ):
        # The parent is given transform_name=None; transforms are built per sample in getitem.
        super().__init__(data_path, num_frames, frame_interval, image_size, transform_name=None)
        self.transform_name = transform_name
        self.data["id"] = np.arange(len(self.data))
        self.dummy_text_feature = dummy_text_feature

    def get_data_info(self, index):
        """Return the ``(num_frames, height, width)`` recorded in the table for row ``index``."""
        row = self.data.iloc[index]
        return row["num_frames"], row["height"], row["width"]

    def getitem(self, index):
        """Load one sample described by an ``"<row>-<num_frames>-<height>-<width>"`` string."""
        # a hack to pass in the (time, height, width) info from sampler
        index, num_frames, height, width = [int(val) for val in index.split("-")]

        sample = self.data.iloc[index]
        path = sample["path"]
        file_type = self.get_type(path)
        ar = height / width

        if file_type == "video":
            # loading
            vframes, vinfo = read_video(path, backend="av")
            video_fps = vinfo["video_fps"] if "video_fps" in vinfo else 24

            # Sampling video frames
            video = temporal_random_crop(vframes, num_frames, self.frame_interval)
            # Clone the crop and drop the reference to the full decoded frames.
            video = video.clone()
            del vframes

            # Sampled frames are frame_interval apart, so the reported fps is divided by it.
            video_fps = video_fps // self.frame_interval

            # transform
            transform = get_transforms_video(self.transform_name, (height, width))
            video = transform(video)  # T C H W
        else:
            # loading
            image = pil_loader(path)
            video_fps = IMG_FPS

            # transform
            transform = get_transforms_image(self.transform_name, (height, width))
            image = transform(image)

            # An image becomes a single-frame clip (no repetition).
            video = image.unsqueeze(0)

        # TCHW -> CTHW
        video = video.permute(1, 0, 2, 3)
        ret = {
            "video": video,
            "num_frames": num_frames,
            "height": height,
            "width": width,
            "ar": ar,
            "fps": video_fps,
        }
        if self.get_text:
            ret["text"] = sample["text"]
        if self.dummy_text_feature:
            text_len = 50
            ret["text"] = torch.zeros((1, text_len, 1152))
            ret["mask"] = text_len
        return ret

    def __getitem__(self, index):
        """Return the sample, or ``None`` when loading fails for any ordinary error.

        Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt`` and
        ``SystemExit`` still propagate.
        """
        try:
            return self.getitem(index)
        except Exception:
            return None


@DATASETS.register_module()
class BatchFeatureDataset(torch.utils.data.Dataset):
    """
    Dataset backed by a collection of .bin files, each holding a list of batches (a buffer).

    Every .bin file is expected to contain the same number of batches. An item is one
    batch taken from the currently loaded buffer; a different buffer is loaded only
    when the requested index falls into another file.
    Avoid loading the same .bin on two different GPUs, i.e., one .bin is assigned to one GPU only.
    """

    def __init__(self, data_path=None):
        self.path_list = sorted(glob(data_path + "/**/*.bin"))

        # The first file defines the buffer length assumed for all files.
        self._len_buffer = len(torch.load(self.path_list[0]))
        self._num_buffers = len(self.path_list)
        self.num_samples = self._len_buffer * self._num_buffers

        # No buffer is loaded until the first item is requested.
        self.cur_file_idx = -1
        self.cur_buffer = None

    @property
    def num_buffers(self):
        """Number of .bin files found."""
        return self._num_buffers

    @property
    def len_buffer(self):
        """Number of batches per .bin file."""
        return self._len_buffer

    def _load_buffer(self, idx):
        """Make sure the buffer containing global item ``idx`` is the one in memory."""
        file_idx = idx // self.len_buffer
        if file_idx == self.cur_file_idx:
            return
        self.cur_file_idx = file_idx
        self.cur_buffer = torch.load(self.path_list[file_idx])

    def __len__(self):
        """Total number of batches across all files."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return batch ``idx`` as a dict with the stored keys renamed for the training code."""
        self._load_buffer(idx)

        batch = self.cur_buffer[idx % self.len_buffer]

        # output key -> key under which the value is stored in the .bin batch
        stored_names = {
            "video": "x",
            "text": "y",
            "mask": "mask",
            "fps": "fps",
            "height": "height",
            "width": "width",
            "num_frames": "num_frames",
        }
        return {out_key: batch[stored_key] for out_key, stored_key in stored_names.items()}


================================================
FILE: Open-Sora/opensora/datasets/read_video.py
================================================
import gc
import math
import os
import re
import warnings
from fractions import Fraction
from typing import Any, Dict, List, Optional, Tuple, Union

import av
import cv2
import numpy as np
import torch
from torchvision import get_video_backend
from torchvision.io.video import _check_av_available

# Fallback frame count used by read_video_av to size its preallocated frame buffer
# when the container reports 0 frames for the video stream.
MAX_NUM_FRAMES = 2500


def read_video_av(
    filename: str,
    start_pts: Union[float, Fraction] = 0,
    end_pts: Optional[Union[float, Fraction]] = None,
    pts_unit: str = "pts",
    output_format: str = "THWC",
) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]:
    """
    Reads the video frames of a file with PyAV; audio is never decoded.

    This method is modified from torchvision.io.video.read_video, with the following changes:

    1. will not extract audio frames and return empty for aframes
    2. remove checks and only support pyav
    3. add container.close() and gc.collect() to avoid thread leakage
    4. try our best to avoid memory leak

    The file is opened twice: once to probe fps / frame size / frame count, and once
    to decode the frames into a preallocated buffer.

    Args:
        filename (str): path to the video file
        start_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional):
            The start presentation time of the video
        end_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional):
            The end presentation time. ``None`` means "until the end".
        pts_unit (str, optional): unit in which start_pts and end_pts values will be interpreted,
            either 'pts' or 'sec'. Defaults to 'pts'.
        output_format (str, optional): The format of the output video tensors. Can be either "THWC" (default) or "TCHW".

    Returns:
        vframes (Tensor[T, H, W, C] or Tensor[T, C, H, W]): the `T` video frames (uint8)
        aframes (Tensor[1, 0]): always an empty float32 tensor
        info (Dict): metadata; contains ``video_fps`` (float) when the stream reports an average rate

    Raises:
        ValueError: if ``output_format`` is invalid or ``end_pts < start_pts``.
        RuntimeError: if ``filename`` does not exist.
        AssertionError: if the torchvision video backend is not "pyav".
    """
    # format
    output_format = output_format.upper()
    if output_format not in ("THWC", "TCHW"):
        raise ValueError(f"output_format should be either 'THWC' or 'TCHW', got {output_format}.")
    # file existence
    if not os.path.exists(filename):
        raise RuntimeError(f"File not found: {filename}")
    # backend check
    assert get_video_backend() == "pyav", "pyav backend is required for read_video_av"
    _check_av_available()
    # end_pts check
    if end_pts is None:
        end_pts = float("inf")
    if end_pts < start_pts:
        raise ValueError(f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}")

    # == get video info ==
    info = {}
    # TODO: creating an container leads to memory leak (1G for 8 workers 1 GPU)
    container = av.open(filename, metadata_errors="ignore")
    try:
        # fps
        video_fps = container.streams.video[0].average_rate
        # guard against potentially corrupted files
        if video_fps is not None:
            info["video_fps"] = float(video_fps)
        # decode a single frame only to learn the frame size
        iter_video = container.decode(**{"video": 0})
        frame = next(iter_video).to_rgb().to_ndarray()
        height, width = frame.shape[:2]
        total_frames = container.streams.video[0].frames
        if total_frames == 0:
            total_frames = MAX_NUM_FRAMES
            warnings.warn(f"total_frames is 0, using {MAX_NUM_FRAMES} as a fallback")
    finally:
        # always release the probing container, even when probing itself fails
        container.close()
        del container

    # HACK: must create before iterating stream
    # use np.zeros will not actually allocate memory
    # use np.ones will lead to a little memory leak
    frame_buffer = np.zeros((total_frames, height, width, 3), dtype=np.uint8)
    # NOTE(review): if re-opening the file below raises av.AVError, the full all-zero
    # buffer is what gets returned (original behaviour, kept as is) -- confirm this is intended.
    video_frames = frame_buffer

    # == read ==
    try:
        # TODO: The reading has memory leak (4G for 8 workers 1 GPU)
        container = av.open(filename, metadata_errors="ignore")
        assert container.streams.video is not None
        decoded = _read_from_stream(
            frame_buffer,
            container,
            start_pts,
            end_pts,
            pts_unit,
            container.streams.video[0],
            {"video": 0},
            filename=filename,
        )
        # The reader may hand back an empty result (e.g. an empty list after a failed seek),
        # which torch.from_numpy cannot convert; use a correctly shaped zero-frame array instead.
        video_frames = decoded if len(decoded) > 0 else frame_buffer[:0]
    except av.AVError as e:
        print(f"[Warning] Error while reading video {filename}: {e}")

    # clone so the returned tensor does not keep the (possibly large) numpy buffer alive
    vframes = torch.from_numpy(video_frames).clone()
    del video_frames
    del frame_buffer
    if output_format == "TCHW":
        # [T,H,W,C] --> [T,C,H,W]
        vframes = vframes.permute(0, 3, 1, 2)

    aframes = torch.empty((1, 0), dtype=torch.float32)
    return vframes, aframes, info


def _read_from_stream(
    video_frames,
    container: "av.container.Container",
    start_offset: float,
    end_offset: float,
    pts_unit: str,
    stream: "av.stream.Stream",
    stream_name: Dict[str, Optional[Union[int, Tuple[int, ...], List[int]]]],
    filename: Optional[str] = None,
) -> List["av.frame.Frame"]:
    if pts_unit == "sec":
        # TODO: we should change all of this from ground up to simply take
        # sec and convert to MS in C++
        start_offset = int(math.floor(start_offset * (1 / stream.time_base)))
        if end_offset != float("inf"):
            end_offset = int(math.ceil(end_offset * (1 / stream.time_base)))
    else:
        warnings.warn("The pts_unit 'pts' gives wrong results. Please use pts_unit 'sec'.")

    should_buffer = True
    max_buffer_size = 5
    if stream.type == "video":
        # DivX-style packed B-frames can have out-of-order pts (2 frames in a single pkt)
        # so need to buffer some extra frames to sort everything
        # properly
        extradata = stream.codec_context.extradata
        # overly complicated way of finding if `divx_packed` is set, following
        # https://github.com/FFmpeg/FFmpeg/commit/d5a21172283572af587b3d939eba0091484d3263
        if extradata and b"DivX" in extradata:
            # can't use regex directly because of some weird characters sometimes...
            pos = extradata.find(b"DivX")
            d = extradata[pos:]
            o = re.search(rb"DivX(\d+)Build(\d+)(\w)", d)
            if o is None:
                o = re.search(rb"DivX(\d+)b(\d+)(\w)", d)
            if o is not None:
                should_buffer = o.group(3) == b"p"
    seek_offset = start_offset
    # some files don't seek to the right location, so better be safe here
    seek_offset = max(seek_offset - 1, 0)
    if should_buffer:
        # FIXME this is kind of a hack, but we will jump to the previous keyframe
        # so this will be safe
        seek_offset = max(seek_offset - max_buffer_size, 0)
    try:
        # TODO check if stream needs to always be the video stream here or not
        container.seek(seek_offset, any_frame=False, backward=True, stream=stream)
    except av.AVError as e:
        print(f"[Warning] Error while seeking video {filename}: {e}")
        return []

    # == main ==
    buffer_count = 0
    frames_pts = []
    cnt = 0
    try:
        for _idx, frame in enumerate(container.decode(**stream_name)):
            frames_pts.append(frame.pts)
            video_frames[cnt] = frame.to_rgb().to_ndarray()
            cnt += 1
            if cnt >= len(video_frames):
                break
            if frame.pts >= end_offset:
                if should_buffer and buffer_count < max_buffer_size:
                    buffer_count += 1
                    continue
                break
    except av.AVError as e:
        print(f"[Warning] Error while reading video {filename}: {e}")

    # garbage collection for thread leakage
    container.close()
    del container
    # NOTE: manually garbage collect to close pyav threads
    gc.collect()

    # ensure that the results are sorted wrt the pts
    # NOTE: here we assert frames_pts is sorted
    start_ptr = 0
    end_ptr = cnt
    while start_ptr < end_ptr and frames_pts[start_ptr] < start_offset:
        start_ptr += 1
    while start_ptr < end_ptr and frames_pts[end_ptr - 1] > end_offset:
        end_ptr -= 1
    if start_offset > 0 and start_offset not in frames_pts[start_ptr:end_ptr]:
        # if there is no frame that exactly matches the pts of start_offset
        # add the last frame smaller than start_offset, to guarantee that
        # we will have all the necessary data. This is most useful for audio
        if start_ptr > 0:
            start_ptr -= 1
    result = video_frames[start_ptr:end_ptr].copy()
    return result


def read_video_cv2(video_path):
    """
    Read every frame of a video with OpenCV.

    Args:
        video_path: path handed to ``cv2.VideoCapture``.

    Returns:
        frames (Tensor[T, C=3, H, W]): all decoded frames, converted from BGR to RGB.
        vinfo (dict): ``{"video_fps": <value of cv2.CAP_PROP_FPS>}``.

    Raises:
        ValueError: if the video cannot be opened or no frame could be decoded.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise ValueError(f"Unable to open video: {video_path}")

        vinfo = {
            "video_fps": cap.get(cv2.CAP_PROP_FPS),
        }

        frames = []
        while True:
            # Read a frame from the video
            ret, frame = cap.read()

            # If frame is not read correctly (normally: end of stream), stop
            if not ret:
                break

            frames.append(frame[:, :, ::-1])  # BGR to RGB
    finally:
        # Release the capture on every path. No GUI calls (waitKey / destroyAllWindows)
        # are made: no window is ever created here, and they only add a per-frame wait
        # or fail on OpenCV builds without GUI support.
        cap.release()

    if not frames:
        raise ValueError(f"No frames could be decoded from video: {video_path}")

    # np.stack copies, which also turns the reversed-channel views into a contiguous array
    frames = np.stack(frames)
    frames = torch.from_numpy(frames)  # [T, H, W, C=3]
    frames = frames.permute(0, 3, 1, 2)
    return frames, vinfo


def read_video(video_path, backend="av"):
    """
    Read a whole video with the selected backend.

    Args:
        video_path: path to the video file.
        backend: "av" (PyAV via ``read_video_av``, frames returned as TCHW) or
            "cv2" (OpenCV via ``read_video_cv2``).

    Returns:
        vframes: the frame tensor produced by the backend.
        vinfo (dict): metadata produced by the backend (e.g. ``video_fps``).

    Raises:
        ValueError: if ``backend`` is not one of the supported names.
    """
    if backend == "cv2":
        vframes, vinfo = read_video_cv2(video_path)
    elif backend == "av":
        vframes, _, vinfo = read_video_av(filename=video_path, pts_unit="sec", output_format="TCHW")
    else:
        raise ValueError(f"Unsupported video backend: {backend!r} (expected 'av' or 'cv2')")

    return vframes, vinfo


================================================
FILE: Open-Sora/opensora/datasets/sampler.py
================================================
from collections import OrderedDict, defaultdict
from pprint import pformat
from typing import Iterator, List, Optional

import numpy as np
import torch
import torch.distributed as dist
from torch.utils.data import Dataset, DistributedSampler

from opensora.utils.misc import format_numel_str, get_logger

from .aspect import get_num_pixels
from .bucket import Bucket
from .datasets import VariableVideoTextDataset


# use pandarallel to accelerate bucket processing
# NOTE: pandarallel should only access local variables
def apply(data, method=None, frame_interval=None, seed=None, num_bucket=None):
    """
    Row-wise adapter: call ``method`` with the size fields of one sample.

    ``data`` must provide "num_frames", "height", "width" and "id". The seed passed
    on is offset per sample as ``seed + id * num_bucket``.
    Returns whatever ``method`` returns.
    """
    sample_shape = (data["num_frames"], data["height"], data["width"])
    sample_seed = seed + data["id"] * num_bucket
    return method(*sample_shape, frame_interval, sample_seed)


class StatefulDistributedSampler(DistributedSampler):
    """
    ``DistributedSampler`` that can skip the first ``start_index`` indices of this rank,
    so iteration can be resumed part-way through an epoch.
    """

    def __init__(
        self,
        dataset: Dataset,
        num_replicas: Optional[int] = None,
        rank: Optional[int] = None,
        shuffle: bool = True,
        seed: int = 0,
        drop_last: bool = False,
    ) -> None:
        super().__init__(
            dataset,
            num_replicas=num_replicas,
            rank=rank,
            shuffle=shuffle,
            seed=seed,
            drop_last=drop_last,
        )
        # number of leading per-rank indices to skip on the next iteration
        self.start_index: int = 0

    def __iter__(self) -> Iterator:
        # materialise the parent's indices for this rank, then drop the consumed prefix
        rank_indices = list(super().__iter__())
        return iter(rank_indices[self.start_index :])

    def __len__(self) -> int:
        remaining = self.num_samples - self.start_index
        return remaining

    def reset(self) -> None:
        """Forget the resume position; the next iteration starts from the beginning."""
        self.start_index = 0

    def state_dict(self, step) -> dict:
        """Return the state to checkpoint; ``step`` becomes the stored ``start_index``."""
        return dict(start_index=step)

    def load_state_dict(self, state_dict: dict) -> None:
        """Copy every entry of ``state_dict`` onto this sampler as an attribute."""
        vars(self).update(state_dict)


class VariableVideoBatchSampler(DistributedSampler):
    """
    Distributed batch sampler that groups samples into buckets and yields one
    micro-batch per iteration for the current rank.

    Every sample is assigned a bucket id by ``Bucket.get_bucket_id``; each bucket has
    its own per-GPU batch size. At every iteration ``num_replicas`` buckets are taken
    from a (optionally shuffled) access order, one per rank, and this rank's slice of
    its bucket is yielded. Yielded items are strings ``"{idx}-{t}-{h}-{w}"`` so the
    dataset can recover the target size of each sample.
    """

    def __init__(
        self,
        dataset: VariableVideoTextDataset,
        bucket_config: dict,
        num_replicas: Optional[int] = None,
        rank: Optional[int] = None,
        shuffle: bool = True,
        seed: int = 0,
        drop_last: bool = False,
        verbose: bool = False,
        num_bucket_build_workers: int = 1,
    ) -> None:
        super().__init__(
            dataset=dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle, seed=seed, drop_last=drop_last
        )
        self.dataset = dataset
        self.bucket = Bucket(bucket_config)
        self.verbose = verbose
        # number of micro-batches (over all ranks) already consumed in this epoch
        self.last_micro_batch_access_index = 0
        # filled by _print_bucket_info
        self.approximate_num_batch = None

        # bucket assignment computed by get_num_batch, reused once by the next __iter__
        self._get_num_batch_cached_bucket_sample_dict = None
        self.num_bucket_build_workers = num_bucket_build_workers

    def __iter__(self) -> Iterator[List[int]]:
        if self._get_num_batch_cached_bucket_sample_dict is not None:
            bucket_sample_dict = self._get_num_batch_cached_bucket_sample_dict
            self._get_num_batch_cached_bucket_sample_dict = None
        else:
            bucket_sample_dict = self.group_by_bucket()
            if self.verbose:
                self._print_bucket_info(bucket_sample_dict)

        g = torch.Generator()
        g.manual_seed(self.seed + self.epoch)
        bucket_micro_batch_count = OrderedDict()
        bucket_last_consumed = OrderedDict()

        # process the samples
        for bucket_id, data_list in bucket_sample_dict.items():
            # handle droplast
            bs_per_gpu = self.bucket.get_batch_size(bucket_id)
            remainder = len(data_list) % bs_per_gpu

            if remainder > 0:
                if not self.drop_last:
                    # if there is remainder, we pad to make it divisible
                    # NOTE(review): a bucket with fewer samples than the padding needed stays
                    # non-divisible; its tail is then ignored by the floor division below.
                    data_list += data_list[: bs_per_gpu - remainder]
                else:
                    # we just drop the remainder to make it divisible
                    data_list = data_list[:-remainder]
            bucket_sample_dict[bucket_id] = data_list

            # handle shuffle
            if self.shuffle:
                data_indices = torch.randperm(len(data_list), generator=g).tolist()
                data_list = [data_list[i] for i in data_indices]
                bucket_sample_dict[bucket_id] = data_list

            # compute how many micro-batches each bucket has
            num_micro_batches = len(data_list) // bs_per_gpu
            bucket_micro_batch_count[bucket_id] = num_micro_batches

        # compute the bucket access order
        # each bucket may have more than one batch of data
        # thus bucket_id may appear more than 1 time
        bucket_id_access_order = []
        for bucket_id, num_micro_batch in bucket_micro_batch_count.items():
            bucket_id_access_order.extend([bucket_id] * num_micro_batch)

        # randomize the access order
        if self.shuffle:
            bucket_id_access_order_indices = torch.randperm(len(bucket_id_access_order), generator=g).tolist()
            bucket_id_access_order = [bucket_id_access_order[i] for i in bucket_id_access_order_indices]

        # make the number of bucket accesses divisible by dp size
        remainder = len(bucket_id_access_order) % self.num_replicas
        if remainder > 0:
            if self.drop_last:
                bucket_id_access_order = bucket_id_access_order[: len(bucket_id_access_order) - remainder]
            else:
                bucket_id_access_order += bucket_id_access_order[: self.num_replicas - remainder]

        # prepare each batch from its bucket
        # according to the predefined bucket access order
        num_iters = len(bucket_id_access_order) // self.num_replicas
        start_iter_idx = self.last_micro_batch_access_index // self.num_replicas

        # re-compute the micro-batch consumption
        # this is useful when resuming from a state dict with a different number of GPUs
        self.last_micro_batch_access_index = start_iter_idx * self.num_replicas
        for i in range(self.last_micro_batch_access_index):
            bucket_id = bucket_id_access_order[i]
            bucket_bs = self.bucket.get_batch_size(bucket_id)
            bucket_last_consumed[bucket_id] = bucket_last_consumed.get(bucket_id, 0) + bucket_bs

        for i in range(start_iter_idx, num_iters):
            bucket_access_list = bucket_id_access_order[i * self.num_replicas : (i + 1) * self.num_replicas]
            self.last_micro_batch_access_index += self.num_replicas

            # compute the data samples consumed by each access
            bucket_access_boundaries = []
            for bucket_id in bucket_access_list:
                bucket_bs = self.bucket.get_batch_size(bucket_id)
                last_consumed_index = bucket_last_consumed.get(bucket_id, 0)
                bucket_access_boundaries.append([last_consumed_index, last_consumed_index + bucket_bs])

                # update consumption
                bucket_last_consumed[bucket_id] = last_consumed_index + bucket_bs

            # compute the range of data accessed by each GPU
            bucket_id = bucket_access_list[self.rank]
            boundary = bucket_access_boundaries[self.rank]
            cur_micro_batch = bucket_sample_dict[bucket_id][boundary[0] : boundary[1]]

            # encode t, h, w into the sample index
            real_t, real_h, real_w = self.bucket.get_thw(bucket_id)
            cur_micro_batch = [f"{idx}-{real_t}-{real_h}-{real_w}" for idx in cur_micro_batch]
            yield cur_micro_batch

        # epoch finished: the next epoch starts from the first micro-batch again
        self.reset()

    def __len__(self) -> int:
        # NOTE(review): divides by the global world size rather than self.num_replicas and
        # therefore needs an initialised process group -- confirm both always agree.
        return self.get_num_batch() // dist.get_world_size()

    def group_by_bucket(self) -> dict:
        """Assign every dataset sample to a bucket; returns ``{bucket_id: [sample index, ...]}``."""
        bucket_sample_dict = OrderedDict()

        from pandarallel import pandarallel

        pandarallel.initialize(nb_workers=self.num_bucket_build_workers, progress_bar=False)
        get_logger().info("Building buckets...")
        bucket_ids = self.dataset.data.parallel_apply(
            apply,
            axis=1,
            method=self.bucket.get_bucket_id,
            frame_interval=self.dataset.frame_interval,
            seed=self.seed + self.epoch,
            num_bucket=self.bucket.num_bucket,
        )

        # group by bucket
        # each data sample is put into a bucket with a similar image/video size
        for i in range(len(self.dataset)):
            bucket_id = bucket_ids[i]
            # samples without a bucket are left out
            if bucket_id is None:
                continue
            if bucket_id not in bucket_sample_dict:
                bucket_sample_dict[bucket_id] = []
            bucket_sample_dict[bucket_id].append(i)
        return bucket_sample_dict

    def get_num_batch(self) -> int:
        """
        Build the buckets (cached for the next ``__iter__``) and return the total number
        of full micro-batches over all buckets.
        """
        bucket_sample_dict = self.group_by_bucket()
        self._get_num_batch_cached_bucket_sample_dict = bucket_sample_dict

        # calculate the number of batches
        # This must run even when not verbose: it is what sets approximate_num_batch.
        # Logging inside is still gated by self.verbose.
        self._print_bucket_info(bucket_sample_dict)
        return self.approximate_num_batch

    def _print_bucket_info(self, bucket_sample_dict: dict) -> None:
        """
        Count samples/batches per bucket, store the batch total in
        ``self.approximate_num_batch`` and, when verbose, log the statistics on rank 0.
        """
        # collect statistics
        total_samples = 0
        total_batch = 0
        num_aspect_dict = defaultdict(lambda: [0, 0])
        num_hwt_dict = defaultdict(lambda: [0, 0])
        for k, v in bucket_sample_dict.items():
            # k[:-1] is used as the size key and k[-1] as the aspect-ratio key
            size = len(v)
            num_batch = size // self.bucket.get_batch_size(k[:-1])

            total_samples += size
            total_batch += num_batch

            num_aspect_dict[k[-1]][0] += size
            num_aspect_dict[k[-1]][1] += num_batch
            num_hwt_dict[k[:-1]][0] += size
            num_hwt_dict[k[:-1]][1] += num_batch

        self.approximate_num_batch = total_batch
        if not self.verbose:
            # nothing to log; skip the sorting and the rank query
            return

        # sort
        num_aspect_dict = dict(sorted(num_aspect_dict.items(), key=lambda x: x[0]))
        num_hwt_dict = dict(
            sorted(num_hwt_dict.items(), key=lambda x: (get_num_pixels(x[0][0]), x[0][1]), reverse=True)
        )
        # second element of the size key == 1 is reported as an image bucket, > 1 as video
        num_hwt_img_dict = {k: v for k, v in num_hwt_dict.items() if k[1] == 1}
        num_hwt_vid_dict = {k: v for k, v in num_hwt_dict.items() if k[1] > 1}

        # log
        if dist.get_rank() == 0:
            get_logger().info("Bucket Info:")
            get_logger().info(
                "Bucket [#sample, #batch] by aspect ratio:\n%s", pformat(num_aspect_dict, sort_dicts=False)
            )
            get_logger().info(
                "Image Bucket [#sample, #batch] by HxWxT:\n%s", pformat(num_hwt_img_dict, sort_dicts=False)
            )
            get_logger().info(
                "Video Bucket [#sample, #batch] by HxWxT:\n%s", pformat(num_hwt_vid_dict, sort_dicts=False)
            )
            get_logger().info(
                "#training batch: %s, #training sample: %s, #non empty bucket: %s",
                format_numel_str(total_batch),
                format_numel_str(total_samples),
                len(bucket_sample_dict),
            )

    def reset(self):
        """Restart micro-batch accounting from the beginning of the access order."""
        self.last_micro_batch_access_index = 0

    def state_dict(self, num_steps: int) -> dict:
        # the last_micro_batch_access_index in the __iter__ is often
        # not accurate during multi-workers and data prefetching
        # thus, we need the user to pass the actual steps which have been executed
        # to calculate the correct last_micro_batch_access_index
        return {"seed": self.seed, "epoch": self.epoch, "last_micro_batch_access_index": num_steps * self.num_replicas}

    def load_state_dict(self, state_dict: dict) -> None:
        """Copy every entry of ``state_dict`` onto this sampler as an attribute."""
        self.__dict__.update(state_dict)


class BatchDistributedSampler(DistributedSampler):
    """
    Sampler for buffer-based datasets exposing ``num_buffers`` and ``len_buffer``.
    Each rank walks a contiguous range of indices, i.e. its own set of buffers.

    Suppose len_buffer == 5, num_buffers == 6, #GPUs == 3, then
           | buffer {i}          | buffer {i+1}
    ------ | ------------------- | -------------------
    rank 0 |  0,  1,  2,  3,  4, |  5,  6,  7,  8,  9
    rank 1 | 10, 11, 12, 13, 14, | 15, 16, 17, 18, 19
    rank 2 | 20, 21, 22, 23, 24, | 25, 26, 27, 28, 29
    """

    def __init__(self, dataset: Dataset, **kwargs):
        super().__init__(dataset, **kwargs)
        # position inside this rank's range at which iteration starts
        self.start_index = 0

    def __iter__(self):
        total_buffers = self.dataset.num_buffers
        buffer_len = self.dataset.len_buffer

        # buffers that do not divide evenly among the ranks are left unused
        buffers_per_rank = total_buffers // self.num_replicas
        samples_per_rank = buffer_len * buffers_per_rank

        local_positions = np.arange(self.start_index, samples_per_rank)
        rank_offset = self.rank * samples_per_rank
        return iter((local_positions + rank_offset).tolist())

    def reset(self):
        """Start the next iteration from the beginning of this rank's range."""
        self.start_index = 0

    def state_dict(self, step) -> dict:
        """Return the state to checkpoint; ``step`` is stored as ``start_index``."""
        return dict(start_index=step)

    def load_state_dict(self, state_dict: dict):
        # resume one position after the stored index
        self.start_index = 1 + state_dict["start_index"]


================================================
FILE: Open-Sora/opensora/datasets/utils.py
================================================
import os
import re

import numpy as np
import pandas as pd
import requests
import torch
import torchvision
import torchvision.transforms as transforms
from PIL import Image
from torchvision.datasets.folder import IMG_EXTENSIONS, pil_loader
from torchvision.io import write_video
from torchvision.utils import save_image

from . import video_transforms

# File extensions (with leading dot) that are treated as video files;
# compared against the lower-cased extension of a path.
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")

# Case-insensitive pattern used by is_url to decide whether a string is a URL.
regex = re.compile(
    r"^(?:http|ftp)s?://"  # scheme: http://, https://, ftp:// or ftps://
    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain...
    r"localhost|"  # localhost...
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
    r"(?::\d+)?"  # optional port
    r"(?:/?|[/?]\S+)$",  # optional path / query, no whitespace
    re.IGNORECASE,
)


def is_img(path):
    """Return True when the (case-insensitive) extension of ``path`` is in ``IMG_EXTENSIONS``."""
    _, extension = os.path.splitext(path)
    return extension.lower() in IMG_EXTENSIONS


def is_vid(path):
    """Return True when the (case-insensitive) extension of ``path`` is in ``VID_EXTENSIONS``."""
    _, extension = os.path.splitext(path)
    return extension.lower() in VID_EXTENSIONS


def is_url(url):
    """Return True when ``url`` matches the module-level URL pattern ``regex``."""
    matched = regex.match(url)
    return matched is not None


def read_file(input_path):
    """
    Load a table into a pandas DataFrame, choosing the reader from the file suffix.

    Supports ``.csv`` and ``.parquet``; any other suffix raises NotImplementedError.
    """
    # (suffix, reader) pairs, checked in order
    readers = (
        (".csv", pd.read_csv),
        (".parquet", pd.read_parquet),
    )
    for suffix, reader in readers:
        if input_path.endswith(suffix):
            return reader(input_path)
    raise NotImplementedError(f"Unsupported file format: {input_path}")


def download_url(input_path, output_dir="cache", timeout=None):
    """
    Download ``input_path`` into ``output_dir`` and return the local file path.

    The local file is named after the last path component of the URL.

    Args:
        input_path: URL to fetch.
        output_dir: directory the file is written to (created if missing).
        timeout: forwarded to ``requests.get``; ``None`` (default) waits indefinitely.

    Returns:
        Path of the downloaded file.

    Raises:
        requests.HTTPError: if the server answers with an error status, so that an
            error page is never saved as if it were the requested media file.
    """
    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.basename(input_path)
    output_path = os.path.join(output_dir, base_name)
    response = requests.get(input_path, timeout=timeout)
    # fail here rather than later when the bogus file is decoded
    response.raise_for_status()
    with open(output_path, "wb") as handler:
        handler.write(response.content)
    print(f"URL {input_path} downloaded to {output_path}")
    return output_path


def temporal_random_crop(vframes, num_frames, frame_interval):
    """
    Pick ``num_frames`` frames from a randomly placed temporal window of ``vframes``.

    The window is chosen by ``video_transforms.TemporalRandomCrop`` with size
    ``num_frames * frame_interval``; frame indices are spread evenly over it.
    """
    window_size = num_frames * frame_interval
    window_sampler = video_transforms.TemporalRandomCrop(window_size)
    first, last = window_sampler(len(vframes))
    assert (
        last - first >= num_frames
    ), f"Not enough frames to sample, {last} - {first} < {num_frames}"
    # evenly spaced indices inside [first, last - 1]
    picked = np.linspace(first, last - 1, num_frames, dtype=int)
    return vframes[picked]


def get_transforms_video(name="center", image_size=(256, 256)):
    """
    Build the preprocessing pipeline for video clips.

    ``name`` selects the spatial step: "center" (square center crop, requires a square
    ``image_size``) or "resize_crop". ``None`` returns ``None``; any other name raises
    NotImplementedError. The spatial step is preceded by ``ToTensorVideo`` and followed
    by an in-place normalisation with mean 0.5 / std 0.5 per channel.
    """
    if name is None:
        return None

    if name == "center":
        assert image_size[0] == image_size[1], "image_size must be square for center crop"
        # video_transforms.RandomHorizontalFlipVideo() is intentionally not part of the pipeline
        spatial_step = video_transforms.UCFCenterCropVideo(image_size[0])
    elif name == "resize_crop":
        spatial_step = video_transforms.ResizeCrop(image_size)
    else:
        raise NotImplementedError(f"Transform {name} not implemented")

    pipeline = [
        video_transforms.ToTensorVideo(),  # TCHW
        spatial_step,
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
    ]
    return transforms.Compose(pipeline)


def get_transforms_image(name="center", image_size=(256, 256)):
    """
    Build the preprocessing pipeline for PIL images.

    ``name`` selects the spatial step: "center" (``center_crop_arr`` to a square,
    requires a square ``image_size``) or "resize_crop" (``resize_crop_to_fill``).
    ``None`` returns ``None``; any other name raises NotImplementedError.
    The spatial step is followed by ``ToTensor`` and an in-place normalisation with
    mean 0.5 / std 0.5 per channel.
    """
    if name is None:
        return None

    if name == "center":
        assert image_size[0] == image_size[1], "Image size must be square for center crop"
        # transforms.RandomHorizontalFlip() is intentionally not part of the pipeline
        spatial_step = transforms.Lambda(lambda pil_image: center_crop_arr(pil_image, image_size[0]))
    elif name == "resize_crop":
        spatial_step = transforms.Lambda(lambda pil_image: resize_crop_to_fill(pil_image, image_size))
    else:
        raise NotImplementedError(f"Transform {name} not implemented")

    pipeline = [
        spatial_step,
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
    ]
    return transforms.Compose(pipeline)


def read_image_from_path(path, transform=None, transform_name="center", num_frames=1, image_size=(256, 256)):
    """
    Load an image and turn it into a clip by repeating it ``num_frames`` times.

    When ``transform`` is None, the pipeline from ``get_transforms_image`` is used.
    The transformed image gets a new leading axis, is repeated along it, and the
    first two axes are then swapped (presumably [T, C, H, W] -> [C, T, H, W]; confirm
    against the transform output).
    """
    pil_image = pil_loader(path)
    if transform is None:
        transform = get_transforms_image(image_size=image_size, name=transform_name)
    frame = transform(pil_image)
    repeated = frame.unsqueeze(0).repeat(num_frames, 1, 1, 1)
    return repeated.permute(1, 0, 2, 3)


def read_video_from_path(path, transform=None, transform_name="center", image_size=(256, 256)):
    """
    Load a video with ``torchvision.io.read_video`` (TCHW frames) and preprocess it.

    When ``transform`` is None, the pipeline from ``get_transforms_video`` is used.
    The first two axes of the transformed clip are swapped before returning.
    Audio frames and metadata returned by the reader are discarded.
    """
    vframes, _aframes, _info = torchvision.io.read_video(filename=path, pts_unit="sec", output_format="TCHW")
    if transform is None:
        transform = get_transforms_video(image_size=image_size, name=transform_name)
    clip = transform(vframes)  # T C H W
    return clip.permute(1, 0, 2, 3)


def read_from_path(path, image_size, transform_name="center"):
    """
    Load an image or video from a local path or URL and preprocess it.

    URLs are downloaded first via ``download_url``. The reader is chosen from the
    lower-cased file extension: video extensions go to ``read_video_from_path``,
    everything else must be an image extension and goes to ``read_image_from_path``.
    """
    if is_url(path):
        path = download_url(path)
    ext = os.path.splitext(path)[-1].lower()
    if ext in VID_EXTENSIONS:
        return read_video_from_path(path, image_size=image_size, transform_name=transform_name)
    assert ext in IMG_EXTENSIONS, f"Unsupported file format: {ext}"
    return read_image_from_path(path, image_size=image_size, transform_name=transform_name)


def save_sample(x, save_path=None, fps=8, normalize=True, value_range=(-1, 1), force_video=False, verbose=True):
    """
    Save a sample as a PNG image (single frame) or an H.264 MP4 video.

    The input tensor is never modified.

    Args:
        x (Tensor): shape [C, T, H, W]
        save_path (str): output path WITHOUT extension; ".png" or ".mp4" is appended.
            Must be provided despite the ``None`` default.
        fps (int): frame rate of the written video.
        normalize (bool): map ``value_range`` to [0, 1] before saving.
        value_range (tuple): (low, high) range of the values in ``x``.
        force_video (bool): write a video even when T == 1.
        verbose (bool): print the output path.

    Returns:
        The path of the written file, including the extension.
    """
    assert x.ndim == 4

    if not force_video and x.shape[1] == 1:  # T = 1: save as image
        save_path += ".png"
        x = x.squeeze(1)
        save_image([x], save_path, normalize=normalize, value_range=value_range)
    else:
        save_path += ".mp4"
        if normalize:
            low, high = value_range
            # clamp() returns a new tensor, so the in-place ops that follow
            # operate on that copy and leave the caller's tensor untouched
            x = x.clamp(min=low, max=high).sub_(low).div_(max(high - low, 1e-5))

        # [C, T, H, W] in [0, 1] -> [T, H, W, C] uint8 on CPU
        x = x.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 3, 0).to("cpu", torch.uint8)
        write_video(save_path, x, fps=fps, video_codec="h264")
    if verbose:
        print(f"Saved to {save_path}")
    return save_path


def center_crop_arr(pil_image, image_size):
    """
    Resize a PIL image so its short side equals ``image_size`` and center-crop a square.

    Center cropping implementation from ADM.
    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
    """
    # halve with a box filter for as long as the short side is at least twice the target
    while min(*pil_image.size) >= 2 * image_size:
        halved = tuple(side // 2 for side in pil_image.size)
        pil_image = pil_image.resize(halved, resample=Image.BOX)

    # final bicubic resize that brings the short side to the target
    scale = image_size / min(*pil_image.size)
    scaled = tuple(round(side * scale) for side in pil_image.size)
    pil_image = pil_image.resize(scaled, resample=Image.BICUBIC)

    pixels = np.array(pil_image)
    top = (pixels.shape[0] - image_size) // 2
    left = (pixels.shape[1] - image_size) // 2
    return Image.fromarray(pixels[top : top + image_size, left : left + image_size])


def resize_crop_to_fill(pil_image, image_size):
    """
    Resize a PIL image (keeping aspect ratio) until it covers ``image_size`` = (height, width),
    then center-crop the overflowing dimension.
    """
    src_w, src_h = pil_image.size  # PIL is (W, H)
    th, tw = image_size
    ratio_h = th / src_h
    ratio_w = tw / src_w

    if ratio_h > ratio_w:
        # height is the limiting side: match it and crop the width
        new_h, new_w = th, round(src_w * ratio_h)
        top, left = 0, int(round((new_w - tw) / 2.0))
    else:
        # width is the limiting side: match it and crop the height
        new_h, new_w = round(src_h * ratio_w), tw
        top, left = int(round((new_h - th) / 2.0)), 0

    resized = pil_image.resize((new_w, new_h), Image.BICUBIC)
    pixels = np.array(resized)
    assert top + th <= pixels.shape[0] and left + tw <= pixels.shape[1]
    return Image.fromarray(pixels[top : top + th, left : left + tw])


================================================
FILE: Open-Sora/opensora/datasets/video_transforms.py
================================================
# Copyright 2024 Vchitect/Latte

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Modified from Latte

# - This file is adapted from https://github.com/Vchitect/Latte/blob/main/datasets/video_transforms.py


import numbers
import random

import numpy as np
import torch


def _is_tensor_video_clip(clip):
    if not torch.is_tensor(clip):
        raise TypeError("clip should be Tensor. Got %s" % type(clip))

    if not clip.ndimension() == 4:
        raise ValueError("clip should be 4D. Got %dD" % clip.dim())

    return True


def crop(clip, i, j, h, w):
    """
    Cut the spatial window of height ``h`` and width ``w`` whose top-left corner is (i, j).

    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
    """
    rank = len(clip.size())
    if rank != 4:
        raise ValueError("clip should be a 4D tensor")
    rows = slice(i, i + h)
    cols = slice(j, j + w)
    return clip[..., rows, cols]


def resize(clip, target_size, interpolation_mode):
    """Interpolate ``clip`` to exactly ``target_size`` = (height, width) with the given mode."""
    if len(target_size) == 2:
        return torch.nn.functional.interpolate(
            clip,
            size=target_size,
            mode=interpolation_mode,
            align_corners=False,
        )
    raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")


def resize_scale(clip, target_size, interpolation_mode):
    """
    Rescale a clip uniformly so that its short edge is scaled towards ``target_size[0]``.

    The scale factor is ``target_size[0] / min(H, W)``; ``target_size[1]`` is
    only used for the length check. The output size itself is derived by
    ``torch.nn.functional.interpolate`` from that scale factor.

    Args:
        clip (torch.tensor): Input whose last two dimensions are (H, W).
        target_size: Two-element (height, width) size.
        interpolation_mode (str): ``mode`` forwarded to ``interpolate``
            (always called with ``align_corners=False``).
    Raises:
        ValueError: if ``target_size`` does not have exactly two elements.
    """
    if len(target_size) != 2:
        raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
    short_edge = min(clip.size(-2), clip.size(-1))
    factor = target_size[0] / short_edge
    return torch.nn.functional.interpolate(
        clip,
        scale_factor=factor,
        mode=interpolation_mode,
        align_corners=False,
    )


def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
    """
    Crop a spatial window from the clip, then resize the window to ``size``.

    Args:
        clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        i (int): Row of the upper left corner of the window.
        j (int): Column of the upper left corner of the window.
        h (int): Height of the cropped region.
        w (int): Width of the cropped region.
        size (tuple(int, int)): Height and width of the resized clip.
        interpolation_mode (str): Interpolation mode handed to ``resize``.
    Returns:
        clip (torch.tensor): Cropped and resized clip. Size is (T, C, H, W)
    """
    valid = _is_tensor_video_clip(clip)
    if not valid:
        raise ValueError("clip should be a 4D torch.tensor")
    window = crop(clip, i, j, h, w)
    return resize(window, size, interpolation_mode)


def center_crop(clip, crop_size):
    """
    Crop a centered (th, tw) window from the clip.

    Args:
        clip (torch.tensor): 4D video clip; the last two dimensions are (H, W).
        crop_size: Pair (th, tw) giving the window height and width.
    Raises:
        ValueError: if the clip is smaller than the requested window in either
            spatial dimension.
    """
    valid = _is_tensor_video_clip(clip)
    if not valid:
        raise ValueError("clip should be a 4D torch.tensor")

    height = clip.size(-2)
    width = clip.size(-1)
    crop_h, crop_w = crop_size
    if height < crop_h or width < crop_w:
        raise ValueError("height and width must be no smaller than crop_size")

    # Split the margin evenly between the two sides.
    top = int(round((height - crop_h) / 2.0))
    left = int(round((width - crop_w) / 2.0))
    return crop(clip, top, left, crop_h, crop_w)


def center_crop_using_short_edge(clip):
    """
    Crop a centered square whose side equals the shorter spatial edge.

    Args:
        clip (torch.tensor): 4D video clip; the last two dimensions are (H, W).
    Returns:
        torch.tensor: Clip cropped to (min(H, W), min(H, W)) spatially.
    """
    valid = _is_tensor_video_clip(clip)
    if not valid:
        raise ValueError("clip should be a 4D torch.tensor")

    height, width = clip.size(-2), clip.size(-1)
    side = min(height, width)
    # The offset along the short edge is round(0 / 2.0) == 0, so only the long
    # edge is actually trimmed.
    top = int(round((height - side) / 2.0))
    left = int(round((width - side) / 2.0))
    return crop(clip, top, left, side, side)


def resize_crop_to_fill(clip, target_size):
    """
    Resize the clip so it fully covers ``target_size`` and center-crop the excess.

    The aspect ratio is preserved: the clip is scaled (bilinear) by the larger
    of the two ratios ``th / h`` and ``tw / w`` so that one edge matches the
    target exactly and the other is at least as large, then the overflowing
    edge is cropped symmetrically.

    Args:
        clip (torch.tensor): 4D video clip; the last two dimensions are (H, W).
        target_size: Indexable with (height, width) at positions 0 and 1.
    Returns:
        torch.tensor: Clip whose last two dimensions are (th, tw).
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    h, w = clip.size(-2), clip.size(-1)
    th, tw = target_size[0], target_size[1]
    rh, rw = th / h, tw / w
    if rh > rw:
        # Height is the limiting edge: match it and crop the surplus width.
        sh, sw = th, round(w * rh)
        clip = resize(clip, (sh, sw), "bilinear")
        i = 0
        # Round the half-margin (not the margin) so the offset agrees with
        # center_crop and with the image variant of this function.
        j = int(round((sw - tw) / 2.0))
    else:
        # Width is the limiting edge: match it and crop the surplus height.
        sh, sw = round(h * rw), tw
        clip = resize(clip, (sh, sw), "bilinear")
        i = int(round((sh - th) / 2.0))
        j = 0
    assert i + th <= clip.size(-2) and j + tw <= clip.size(-1)
    return crop(clip, i, j, th, tw)


def random_shift_crop(clip):
    """
    Crop a square with side min(H, W) at a random position.

    Offsets for both axes are drawn with ``torch.randint``; along the short
    edge the only possible offset is 0, so the square effectively slides along
    the long edge.
    """
    valid = _is_tensor_video_clip(clip)
    if not valid:
        raise ValueError("clip should be a 4D torch.tensor")

    height, width = clip.size(-2), clip.size(-1)
    side = min(height, width)

    # Row offset is drawn first, then the column offset.
    top = torch.randint(0, height - side + 1, size=(1,)).item()
    left = torch.randint(0, width - side + 1, size=(1,)).item()
    return crop(clip, top, left, side, side)


def to_tensor(clip):
    """
    Convert a uint8 clip to float and rescale its values by 1 / 255.

    The dimension order is left untouched.

    Args:
        clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
    Return:
        clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
    Raises:
        TypeError: if ``clip`` is not of dtype ``torch.uint8``.
    """
    _is_tensor_video_clip(clip)
    if clip.dtype != torch.uint8:
        raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
    as_float = clip.float()
    return as_float / 255.0


def normalize(clip, mean, std, inplace=False):
    """
    Normalize a clip per channel: ``(clip - mean) / std``.

    Args:
        clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
        mean (tuple): per-channel mean. Size is (C)
        std (tuple): per-channel standard deviation. Size is (C)
        inplace (bool): when False (default) the input is cloned first and left
            unmodified; when True the input tensor itself is modified.
    Returns:
        normalized clip (torch.tensor): Size is (T, C, H, W)
    """
    if not _is_tensor_video_clip(clip):
        raise ValueError("clip should be a 4D torch.tensor")
    if not inplace:
        clip = clip.clone()
    mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
    std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
    # Channels live in dim 1 of a (T, C, H, W) clip, so the statistics must
    # broadcast as (1, C, 1, 1); indexing with [:, None, None, None] would line
    # them up with the frame dimension instead.
    clip.sub_(mean[None, :, None, None]).div_(std[None, :, None, None])
    return clip


def hflip(clip):
    """
    Mirror a clip horizontally by reversing its last (width) dimension.

    Args:
        clip (torch.tensor): Video clip to be flipped. Size is (T, C, H, W)
    Returns:
        flipped clip (torch.tensor): Size is (T, C, H, W)
    """
    valid = _is_tensor_video_clip(clip)
    if not valid:
        raise ValueError("clip should be a 4D torch.tensor")
    mirrored = clip.flip(-1)
    return mirrored


class ResizeCrop:
    """
    Transform wrapper around ``resize_crop_to_fill``.

    Args:
        size: A single number (expanded to a square ``(int(size), int(size))``)
            or any other value, which is stored unchanged and later handed to
            ``resize_crop_to_fill`` as the target size.
    """

    def __init__(self, size):
        is_scalar = isinstance(size, numbers.Number)
        self.size = (int(size), int(size)) if is_scalar else size

    def __call__(self, clip):
        return resize_crop_to_fill(clip, self.size)

    def __repr__(self) -> str:
        return f"{type(self).__name__}(size={self.size})"


class RandomCropVideo:
    """
    Crop a window of fixed size at a random spatial position.

    Args:
        size: A single number (expanded to a square ``(int(size), int(size))``)
            or a (height, width) pair stored unchanged.
    """

    def __init__(self, size):
        is_scalar = isinstance(size, numbers.Number)
        self.size = (int(size), int(size)) if is_scalar else size

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: randomly cropped video clip.
                size is (T, C, OH, OW)
        """
        top, left, crop_h, crop_w = self.get_params(clip)
        return crop(clip, top, left, crop_h, crop_w)

    def get_params(self, clip):
        """
        Pick the crop window for ``clip``.

        Returns:
            tuple: ``(i, j, h, w)`` - upper left corner and size of the window.
        Raises:
            ValueError: if the clip is smaller than the crop size.
        """
        height, width = clip.shape[-2:]
        crop_h, crop_w = self.size

        if height < crop_h or width < crop_w:
            raise ValueError(f"Required crop size {(crop_h, crop_w)} is larger than input image size {(height, width)}")

        # Exact fit: nothing to randomize (and no random numbers are consumed).
        if width == crop_w and height == crop_h:
            return 0, 0, height, width

        # Row offset is drawn first, then the column offset.
        top = torch.randint(0, height - crop_h + 1, size=(1,)).item()
        left = torch.randint(0, width - crop_w + 1, size=(1,)).item()

        return top, left, crop_h, crop_w

    def __repr__(self) -> str:
        return f"{type(self).__name__}(size={self.size})"


class CenterCropResizeVideo:
    """
    First use the short side for cropping length,
    center crop video, then resize to the specified size

    Args:
        size: A (height, width) tuple, or a single value used for both.
        interpolation_mode (str): Interpolation mode used for the resize.
    Raises:
        ValueError: if ``size`` is a tuple that does not have two elements.
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: center cropped / resized video clip.
                size is (T, C, self.size[0], self.size[1])
        """
        clip_center_crop = center_crop_using_short_edge(clip)
        clip_center_crop_resize = resize(
            clip_center_crop, target_size=self.size, interpolation_mode=self.interpolation_mode
        )
        return clip_center_crop_resize

    def __repr__(self) -> str:
        # The closing parenthesis was missing from the original format string.
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"


class UCFCenterCropVideo:
    """
    First scale to the specified size in equal proportion to the short edge,
    then center cropping

    Args:
        size: A (height, width) tuple, or a single value used for both.
        interpolation_mode (str): Interpolation mode used for the rescale.
    Raises:
        ValueError: if ``size`` is a tuple that does not have two elements.
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: scale resized / center cropped video clip.
                size is (T, C, crop_size, crop_size)
        """
        clip_resize = resize_scale(clip=clip, target_size=self.size, interpolation_mode=self.interpolation_mode)
        clip_center_crop = center_crop(clip_resize, self.size)
        return clip_center_crop

    def __repr__(self) -> str:
        # The closing parenthesis was missing from the original format string.
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"


class KineticsRandomCropResizeVideo:
    """
    Slide along the long edge, with the short edge as crop size. And resize to the desired size.

    Args:
        size: A (height, width) tuple, or a single value used for both.
        interpolation_mode (str): Interpolation mode used for the resize.
    Raises:
        ValueError: if ``size`` is a tuple that does not have two elements.
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Randomly crop a short-edge square from ``clip`` and resize it to ``self.size``.
        """
        clip_random_crop = random_shift_crop(clip)
        clip_resize = resize(clip_random_crop, self.size, self.interpolation_mode)
        return clip_resize

    def __repr__(self) -> str:
        # Added for parity with the other transforms in this module.
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"


class CenterCropVideo:
    """
    Center crop a video clip to a fixed size (no resizing is performed).

    Args:
        size: A (height, width) tuple, or a single value used for both.
        interpolation_mode (str): Stored on the instance and shown in the repr;
            it is not used by ``__call__``.
    Raises:
        ValueError: if ``size`` is a tuple that does not have two elements.
    """

    def __init__(
        self,
        size,
        interpolation_mode="bilinear",
    ):
        if isinstance(size, tuple):
            if len(size) != 2:
                raise ValueError(f"size should be tuple (height, width), instead got {size}")
            self.size = size
        else:
            self.size = (size, size)

        self.interpolation_mode = interpolation_mode

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
        Returns:
            torch.tensor: center cropped video clip.
                size is (T, C, crop_size, crop_size)
        """
        clip_center_crop = center_crop(clip, self.size)
        return clip_center_crop

    def __repr__(self) -> str:
        # The closing parenthesis was missing from the original format string.
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"


class NormalizeVideo:
    """
    Callable wrapper that applies ``normalize`` with fixed statistics.

    Args:
        mean (3-tuple): pixel RGB mean
        std (3-tuple): pixel RGB standard deviation
        inplace (boolean): whether do in-place normalization
    """

    def __init__(self, mean, std, inplace=False):
        self.mean, self.std = mean, std
        self.inplace = inplace

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): video clip to normalize, in the layout that
                ``normalize`` expects.
        """
        normalized = normalize(clip, self.mean, self.std, self.inplace)
        return normalized

    def __repr__(self) -> str:
        return f"{type(self).__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})"


class ToTensorVideo:
    """
    Callable wrapper around ``to_tensor``: converts a uint8 clip to float and
    divides its values by 255.0.
    """

    def __init__(self):
        """No configuration is required."""

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
        Return:
            clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
        """
        converted = to_tensor(clip)
        return converted

    def __repr__(self) -> str:
        return type(self).__name__


class RandomHorizontalFlipVideo:
    """
    Mirror the video clip horizontally with a given probability.

    Args:
        p (float): probability of the clip being flipped. Default value is 0.5
    """

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Size is (T, C, H, W)
        Return:
            clip (torch.tensor): Size is (T, C, H, W)
        """
        # One draw from Python's ``random`` module per call decides the flip.
        should_flip = random.random() < self.p
        return hflip(clip) if should_flip else clip

    def __repr__(self) -> str:
        return f"{type(self).__name__}(p={self.p})"


#  ------------------------------------------------------------
#  ---------------------  Sampling  ---------------------------
#  ------------------------------------------------------------
class TemporalRandomCrop(object):
    """Pick a random contiguous window of frame indices.

    Args:
            size (int): Desired length of frames will be seen in the model.
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, total_frames):
        """Return ``(begin_index, end_index)`` for a window of at most ``size`` frames.

        The start is drawn uniformly from ``[0, max(0, total_frames - size - 1)]``
        and the end is clipped to ``total_frames``.
        """
        latest_start = max(0, total_frames - self.size - 1)
        start = random.randint(0, latest_start)
        stop = start + self.size
        if stop > total_frames:
            stop = total_frames
        return start, stop


if __name__ == "__main__":
    # Manual smoke test: read a sample video, run it through the transform
    # pipeline, and write the result back as a video plus individual frames.
    import os

    import numpy as np
    import torchvision.io as io
    from torchvision import transforms
    from torchvision.utils import save_image

    vframes, aframes, info = io.read_video(filename="./v_Archery_g01_c03.avi", pts_unit="sec", output_format="TCHW")

    trans = transforms.Compose(
        [
            ToTensorVideo(),
            RandomHorizontalFlipVideo(),
            UCFCenterCropVideo(512),
            # NormalizeVideo(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ]
    )

    target_video_len = 32
    frame_interval = 1
    total_frames = len(vframes)
    print(total_frames)

    temporal_sample = TemporalRandomCrop(target_video_len * frame_interval)

    # Sampling video frames
    start_frame_ind, end_frame_ind = temporal_sample(total_frames)
    # print(start_frame_ind)
    # print(end_frame_ind)
    assert end_frame_ind - start_frame_ind >= target_video_len
    frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, target_video_len, dtype=int)
    print(frame_indice)

    select_vframes = vframes[frame_indice]
    print(select_vframes.shape)
    print(select_vframes.dtype)

    select_vframes_trans = trans(select_vframes)
    print(select_vframes_trans.shape)
    print(select_vframes_trans.dtype)

    # Undo the [-1, 1] normalization before converting back to uint8.
    select_vframes_trans_int = ((select_vframes_trans * 0.5 + 0.5) * 255).to(dtype=torch.uint8)
    print(select_vframes_trans_int.dtype)
    print(select_vframes_trans_int.permute(0, 2, 3, 1).shape)

    io.write_video("./test.avi", select_vframes_trans_int.permute(0, 2, 3, 1), fps=8)

    # The frame dump directory is not guaranteed to exist; create it up front
    # so saving the first frame does not fail on a missing path.
    frame_dir = "./test000"
    os.makedirs(frame_dir, exist_ok=True)
    for i in range(target_video_len):
        save_image(
            select_vframes_trans[i], os.path.join(frame_dir, "%04d.png" % i), normalize=True, value_range=(-1, 1)
        )


================================================
FILE: Open-Sora/opensora/models/__init__.py
================================================
from .dit import *
from .latte import *
from .pixart import *
from .stdit import *
from .text_encoder import *
from .vae import *


================================================
FILE: Open-Sora/opensora/models/cache_functions/__init__.py
================================================
from .cache_cutfresh import cache_cutfresh
from .fresh_ratio_scheduler import fresh_ratio_scheduler
from .score_evaluate import score_evaluate
from .global_force_fresh import global_force_fresh
from .cache_cutfresh import cache_cutfresh
from .update_cache import update_cache
from .force_init import force_init
from .attention import cached_attention_forward
from .cache_init import cache_init

================================================
FILE: Open-Sora/opensora/models/cache_functions/attention.py
================================================
# Besides, re-arrange the attention module
from torch.jit import Final
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Union
from xformers.ops.fmha.attn_bias import BlockDiagonalMask
def cached_attention_forward(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: "Optional[Union[torch.Tensor, BlockDiagonalMask]]" = None,
    p: float = 0.0,
    scale: Optional[float] = None
) -> "tuple[torch.Tensor, torch.Tensor]":
    """
    Explicit scaled dot-product attention that also returns the attention map.

    Dimensions 1 and 2 of ``query``/``key``/``value`` are swapped before the
    matmuls and swapped back on the output (presumably the inputs use the
    xformers (batch, tokens, heads, head_dim) layout - confirm against callers).

    Args:
        query, key, value: 4D tensors with matching layout.
        attn_bias: Optional additive bias for the attention logits. A plain
            tensor is added as-is; any other object is expected to provide
            ``materialize(shape=..., dtype=..., device=...)`` returning a tensor.
        p: Dropout probability applied to the attention weights. Note that
            ``F.dropout`` is called with its default ``training=True``.
        scale: Multiplier applied to ``query``. Defaults to
            ``1 / sqrt(query.shape[-1])`` when ``None``.

    Returns:
        A tuple ``(output, attn_map)``: the attention output in the input layout
        (contiguous), and the softmax attention weights (before dropout)
        averaged over dim 1 of the transposed layout.
    """
    # Honour a caller-supplied scale; previously it was unconditionally overwritten.
    if scale is None:
        scale = 1.0 / query.shape[-1] ** 0.5
    query = (query * scale).transpose(1, 2)
    key = key.transpose(1, 2)
    value = value.transpose(1, 2)

    attn = torch.matmul(query, key.transpose(-2, -1))
    if attn_bias is not None:
        # Only mask objects need materializing; tensors have no such method.
        if not isinstance(attn_bias, torch.Tensor):
            attn_bias = attn_bias.materialize(shape=attn.shape, dtype=attn.dtype, device=attn.device)
        attn = attn + attn_bias

    attn_map = attn.softmax(-1)
    attn = F.dropout(attn_map, p)
    attn = torch.matmul(attn, value)

    return attn.transpose(1, 2).contiguous(), attn_map.mean(dim=1)

================================================
FILE: Open-Sora/opensora/models/cache_functions/cache_cutfresh.py
================================================
from .fresh_ratio_scheduler import fresh_ratio_scheduler
from .score_evaluate import score_evaluate
#from .token_merge import token_merge
import torch
def cache_cutfresh(cache_dic, tokens, current):
    '''
    Select the highest-scoring ("fresh") tokens and update the staleness counters.

    cache_dic: dict, the cache dictionary; its 'cache_index' counters are updated in place.
    tokens: torch.Tensor, the input tokens; fresh tokens are gathered along dim 1.
    current: dict, the current step, layer, module and flag information.

    Returns (fresh_indices, fresh_tokens).
    Raises ValueError when the module is not one of 'mlp', 'attn', 'cross-attn'
    (the counters have already been updated at that point).
    '''
    layer_id = current['layer']
    module_name = current['module']

    # Scheduled ratio of tokens to recompute, clamped into [0, 1].
    ratio = fresh_ratio_scheduler(cache_dic, current)
    ratio = torch.clamp(torch.tensor(ratio, device=tokens.device), min=0, max=1)

    # Rank tokens by score (highest first) and keep the leading share as fresh.
    score = score_evaluate(cache_dic, tokens, current)
    ranking = score.argsort(dim=-1, descending=True)
    num_fresh = int(ratio * score.shape[1])
    fresh_indices = ranking[:, :num_fresh]
    # (B, fresh_ratio * N)

    # Counter update: every token ages by one, then fresh tokens are reset to 0.
    # Applied first to the per-flag/per-layer counter, then to the 'layer_index' one.
    reset_values = torch.zeros_like(fresh_indices, dtype=torch.int, device=fresh_indices.device)
    index_table = cache_dic['cache_index']
    for counters in (index_table[current['flag']][layer_id], index_table['layer_index']):
        counters[module_name] += 1
        counters[module_name].scatter_(dim=1, index=fresh_indices, src=reset_values)

    # Broadcast the indices over the feature dimension for gathering.
    gather_index = fresh_indices.unsqueeze(-1).expand(-1, -1, tokens.shape[-1])

    if module_name not in ('mlp', 'attn', 'cross-attn'):
        raise ValueError("Unrecognized module?", module_name)

    fresh_tokens = torch.gather(input=tokens, dim=1, index=gather_index)
    return fresh_indices, fresh_tokens
    
import torch
from einops import rearrange

def local_selection_with_space_time_bonus(cache_dic, score, bonus_ratio, grid_size=2, time_mean = False):
    """
    Boost the best-scoring token inside every grid_size x grid_size spatial block.

    Scores are softmax-normalized over the spatial positions of each (batch, time)
    slice, then the maximum of every block receives an extra ``max * bonus_ratio``.

    Args:
        cache_dic: dict providing ``cache_dic['dynamic_size']`` as (B, T, H, W).
        score: tensor laid out as (B, T*H*W).
        bonus_ratio: multiplier applied to each block maximum to form its bonus.
        grid_size: side length of the square spatial blocks.
        time_mean: when True, the result is averaged over T and broadcast back
            to every time step.

    Returns:
        Tensor of shape (B, T*H*W) holding the normalized, bonus-adjusted scores.
    """
    # Get the shape of the tensor from cache_dic
    B, T, H, W = cache_dic['dynamic_size']

    # Reshape the score to [B, T, H, W]
    score = rearrange(score, "B (T H W) -> B T H W", T=T, H=H, W=W)

    # Calculate the padding size to make H and W divisible by grid_size
    pad_h = (grid_size - H % grid_size) % grid_size  # Number of zeros to pad in H dimension
    pad_w = (grid_size - W % grid_size) % grid_size  # Number of zeros to pad in W dimension

    # Pad the H and W dimensions with zeros
    if pad_h > 0 or pad_w > 0:
        score = torch.nn.functional.pad(score, (0, pad_w, 0, pad_h))  # pads only the right side of W and the bottom of H

    # Update H and W after padding
    H_padded, W_padded = score.shape[2], score.shape[3]

    # Step 1: Normalize along the H*W dimension so that information from different time steps has equal weight
    # NOTE: the padding is applied before this softmax, so padded zeros take part in the normalization.
    score = score.view(B, T, -1)  # Merge H and W into one dimension [B, T, H*W]
    score = torch.nn.functional.softmax(score, dim=-1)  # Normalize along H*W dimension
    score = score.view(B, T, H_padded, W_padded)  # Restore to [B, T, H_padded, W_padded] shape

    # Step 2: Perform block-wise operation on each spatial slice (each T time step)
    block_size = grid_size * grid_size
    assert (H_padded * W_padded) % block_size == 0, f"H_padded * W_padded must be divisible by block size, shape: {B},{T},{H_padded},{W_padded}; block:{grid_size}*{grid_size};" 

    # Reshape the score into block-wise grouped shape
    score_reshaped = score.view(B, T, H_padded // grid_size, grid_size, W_padded // grid_size, grid_size)
    score_reshaped = score_reshaped.permute(0, 1, 2, 4, 3, 5).contiguous()  # [B, T, H//grid_size, W//grid_size, grid_size, grid_size]
    score_reshaped = score_reshaped.view(B, T, -1, block_size)  # [B, T, num_blocks, block_size]

    # Step 3: Find the maximum score in each block
    max_scores, max_indices = score_reshaped.max(dim=-1, keepdim=True)  # [B, T, num_blocks, 1]

    # Step 4: Create a mask to identify the token with the maximum score
    mask = torch.zeros_like(score_reshaped)
    mask.scatter_(-1, max_indices, 1)  # Set the mask to 1 at the index of the maximum score

    # Step 5: Apply the bonus only to the token with the maximum score
    score_reshaped = score_reshaped + (mask * max_scores * bonus_ratio)  # Apply bonus only to the maximum score

    # Step 6: Restore the score to its original shape
    # (same permutation as in Step 2: swapping axes 3 and 4 again undoes the block grouping)
    score_modified = score_reshaped.view(B, T, H_padded // grid_size, W_padded // grid_size, grid_size, grid_size)
    score_modified = score_modified.permute(0, 1, 2, 4, 3, 5).contiguous()
    score_modified = score_modified.view(B, T, H_padded, W_padded)

    # Step 7: Remove the padded zeros
    if pad_h > 0 or pad_w > 0:
        score_modified = score_modified[:, :, :H, :W]  # Remove the padded zeros

    # Optionally replace every time step by the average over T
    if time_mean:
        score_modified = score_modified.mean(dim = 1)
        score_modified = score_modified.unsqueeze(1).expand(B, T, H, W)

    # Finally, reshape the score back to the original shape [B, (T H W)]
    score_modified = rearrange(score_modified, "B T H W -> B (T H W)")

    return score_modified


================================================
FILE: Open-Sora/opensora/models/cache_functions/cache_init.py
================================================
def cache_init(model_kwargs, num_steps, num_layers=28):
    '''
    Build the empty cache dictionary and the per-run ``current`` state.

    model_kwargs: dict providing 'cache_type', 'ratio_scheduler', 'fresh_ratio',
        'fresh_threshold', 'force_fresh' and 'soft_fresh_weight'.
    num_steps: total number of sampling steps, stored as current['num_steps'].
    num_layers: number of layers to pre-create entries for (default 28, the
        previously hard-coded value).

    Returns (cache_dic, current). Every per-layer store is keyed first by the
    branch flag (-1 or 0, matched against current['flag'] by the other cache
    functions) and then by layer index; each leaf starts as an empty dict.
    '''
    branch_flags = (-1, 0)

    def _per_layer_store():
        # Independent {flag: {layer: {}}} tree for one kind of cached data.
        return {flag: {layer: {} for layer in range(num_layers)} for flag in branch_flags}

    cache_index = _per_layer_store()
    cache_index['layer_index'] = {}

    cache_dic = {
        'attn_map': _per_layer_store(),
        'cross_attn_map': _per_layer_store(),
        'cache_type': model_kwargs['cache_type'],
        'cache_index': cache_index,
        'cache': _per_layer_store(),
        # Previously the loop reset indices_cache[flag] to {} on every iteration,
        # so no per-layer entries were ever created; build them like the others.
        'indices_cache': _per_layer_store(),
        'fresh_ratio_schedule': model_kwargs['ratio_scheduler'],
        'fresh_ratio': model_kwargs['fresh_ratio'],
        'fresh_threshold': model_kwargs['fresh_threshold'],
        'force_fresh': model_kwargs['force_fresh'],
        'soft_fresh_weight': model_kwargs['soft_fresh_weight'],
    }

    current = {'num_steps': num_steps}
    return cache_dic, current
    

================================================
FILE: Open-Sora/opensora/models/cache_functions/force_init.py
================================================
import torch
from .force_scheduler import force_scheduler
def force_init(cache_dic, current, tokens):
    '''
    Reset the staleness counters for the current flag/layer/module to zero.

    A fresh int tensor of shape (tokens.shape[0], tokens.shape[1]) is stored in
    cache_dic['cache_index'], the force schedule is (re)computed, and for layer 0
    the module's 'layer_index' counter is reset as well.
    '''
    counter_shape = (tokens.shape[0], tokens.shape[1])
    module_name = current['module']
    index_table = cache_dic['cache_index']

    index_table[current['flag']][current['layer']][module_name] = torch.zeros(
        *counter_shape, dtype=torch.int, device=tokens.device
    )
    force_scheduler(cache_dic, current)

    if current['layer'] == 0:
        index_table['layer_index'][module_name] = torch.zeros(
            *counter_shape, dtype=torch.int, device=tokens.device
        )

================================================
FILE: Open-Sora/opensora/models/cache_functions/force_scheduler.py
================================================
import torch
def force_scheduler(cache_dic, current):
    '''
    Store the per-module force-refresh cycles in cache_dic['cal_threshold'].

    cache_dic: dict, receives a new 'cal_threshold' dict on every call.
    current: dict, accepted for interface compatibility; not read.

    The cycles are fixed constants. The previous step-dependent threshold
    computation used a weight of 0.0 in every branch and its result was never
    used, so it has been removed.
    '''
    # Here we set force activation cycles for different modules separately.
    cache_dic['cal_threshold'] = {
        'spat-attn': 3,
        'temp-attn': 3,
        'cross-attn': 6,
        'mlp': 3,
    }

================================================
FILE: Open-Sora/opensora/models/cache_functions/fresh_ratio_scheduler.py
================================================
import torch
def fresh_ratio_scheduler(cache_dic, current):
    '''
    Compute the fresh ratio for the current step / layer / module.

    The base value cache_dic['fresh_ratio'] is modulated according to
    cache_dic['fresh_ratio_schedule']: 'constant', 'linear', 'exp',
    'linear-mode', 'layerwise' or 'ToCa'. The result is not clamped here.
    Raises ValueError for any other schedule name.
    '''
    base_ratio = cache_dic['fresh_ratio']
    schedule = cache_dic['fresh_ratio_schedule']
    step = current['step']
    num_steps = current['num_steps']
    threshold = cache_dic['fresh_threshold']
    weight = 0.9

    if schedule == 'constant':
        return base_ratio

    if schedule == 'linear':
        # Decays linearly from (1 + weight) to (1 - weight) over the steps.
        return base_ratio * (1 + weight - 2 * weight * step / num_steps)

    if schedule == 'exp':
        return base_ratio * (weight ** (step / num_steps))

    if schedule == 'linear-mode':
        # Linear decay plus a small term depending on the position inside the
        # current fresh_threshold cycle.
        mode = (step % threshold)/threshold - 0.5
        mode_weight = 0.1
        return base_ratio * (1 + weight - 2 * weight * step / num_steps + mode_weight * mode)

    if schedule == 'layerwise':
        # Same linear decay, but over the layer index (27 is the last index used).
        return base_ratio * (1 + weight - 2 * weight * current['layer'] / 27)

    if schedule == 'ToCa':
        # With the weights below, only the module factor differs from 1.
        step_weight = 0.0
        step_factor = 1 + step_weight - 2 * step_weight * step / num_steps

        layer_weight = 0.0
        layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27

        module_weight = 1.5
        module_time_weight = 0.33
        if current['module'] == 'cross-attn':
            module_factor = (1 - (1-module_time_weight) * module_weight)
        else:
            module_factor = (1 + module_time_weight * module_weight)

        # Factor for the branch selected by current['flag'].
        type_weight = 0.0
        if current['flag'] == -1:
            type_factor = 1 + type_weight
        else:
            type_factor = 1 - type_weight

        return base_ratio * layer_factor * step_factor * module_factor * type_factor

    raise ValueError("unrecognized fresh ratio schedule", schedule)


================================================
FILE: Open-Sora/opensora/models/cache_functions/global_force_fresh.py
================================================
from .force_scheduler import force_scheduler
def global_force_fresh(cache_dic, current):
    '''
    Decide whether a full (forced) recomputation is due at the current step.

    For the 'global' strategy a dict with boolean entries 'attn', 'cross-attn'
    and 'mlp' is returned; for 'local' and 'none' a single bool (True only on
    step 0) is returned. Any other strategy raises ValueError.
    '''
    step = current['step']
    is_first_step = (step == 0)
    # Note the fact that for OpenSora series models, the first 3 steps is with great importance!!!
    in_warmup = (step <= 2)
    strategy = cache_dic['force_fresh']

    # On the first step every module uses the global threshold; afterwards the
    # per-module cycles stored in cache_dic['cal_threshold'] apply.
    module_names = ('spat-attn', 'temp-attn', 'cross-attn', 'mlp')
    if is_first_step:
        cycles = {name: cache_dic['fresh_threshold'] for name in module_names}
    else:
        cycles = {name: cache_dic['cal_threshold'][name] for name in module_names}

    if strategy in ('local', 'none'):
        return is_first_step
    if strategy != 'global':
        raise ValueError("unrecognized force fresh strategy", strategy)

    def _is_due(name):
        # Always refresh during the first three steps, then once per cycle.
        return (in_warmup or (current['step'] % cycles[name] == 0))

    # Flag -1 follows the 'temp-attn' cycle, any other flag the 'spat-attn' one.
    attn_name = 'temp-attn' if current['flag'] == -1 else 'spat-attn'
    return {
        'attn': _is_due(attn_name),
        'cross-attn': _is_due('cross-attn'),
        'mlp': _is_due('mlp'),
    }

================================================
FILE: Open-Sora/opensora/models/cache_functions/score_evaluate.py
================================================
import torch
import torch.nn as nn
from .scores import attn_score, similarity_score, norm_score
def score_evaluate(cache_dic, tokens, current) -> torch.Tensor:
    '''
    Return the score tensor (B, N) for the given tokens.

    The base score is selected by ``cache_dic['cache_type']``:

    * ``'random'``     - uniform random scores, generated for half of the batch
                         and duplicated for the other half.
    * ``'straight'``   - all ones.
    * ``'attention'``  - ``attn_score(cache_dic, current)``.
    * ``'similarity'`` - ``similarity_score(cache_dic, current, tokens)``.
    * ``'norm'``       - ``norm_score(cache_dic, current, tokens)``.
    * ``'compress'``   - equal mix of duplicated random scores and the
                         max-normalized column sums of the cached attention map.

    When ``cache_dic['force_fresh'] == 'global'`` a "soft" bonus proportional to
    ``cache_index / fresh_threshold`` (weighted by ``soft_fresh_weight``) is added.

    Args:
        cache_dic: cache state dictionary (reads 'cache_type', 'force_fresh' and,
            depending on the branch, 'attn_map', 'cache_index', 'fresh_threshold',
            'soft_fresh_weight').
        tokens: tensor whose first two dimensions give the score shape.
        current: dictionary with the current 'flag', 'layer' and 'module'.

    Raises:
        ValueError: if ``cache_dic['cache_type']`` is not a known strategy.
    '''

    #if ((not current['is_force_fresh']) and (cache_dic['force_fresh'] == 'local')):
    ## abandoned branch, if you want to explore the local force fresh strategy, this may help.
    #    force_fresh_mask = torch.as_tensor((cache_dic['cache_index'][-1][current['layer']][current['module']] >= 2 * cache_dic['fresh_threshold']), dtype = int) # 2 because the threshold is for step, not module
    #    force_len = force_fresh_mask.sum(dim=1)
    #    force_indices = force_fresh_mask.argsort(dim = -1, descending = True)[:, :force_len.min()]
    #
    #    force_indices = force_indices[:, torch.randperm(force_indices.shape[1])]

    cache_type = cache_dic['cache_type']

    if cache_type == 'random':
        # One random draw per half batch, then duplicated so both halves share scores.
        score = torch.rand(int(tokens.shape[0]*0.5), tokens.shape[1], device=tokens.device)
        score = torch.cat([score, score], dim=0).to(tokens.device)

    elif cache_type == 'straight':
        # Allocate directly on the target device instead of CPU-then-copy.
        score = torch.ones(tokens.shape[0], tokens.shape[1], device=tokens.device)

    elif cache_type == 'attention':
        score = attn_score(cache_dic, current)

    elif cache_type == 'similarity':
        score = similarity_score(cache_dic, current, tokens)

    elif cache_type == 'norm':
        score = norm_score(cache_dic, current, tokens)

    elif cache_type == 'compress':
        # Random part is drawn on the default device (as before) and then moved.
        score1 = torch.rand(int(tokens.shape[0]*0.5), tokens.shape[1])
        score1 = torch.cat([score1, score1], dim=0).to(tokens.device)
        score2 = cache_dic['attn_map'][current['flag']][current['layer']].sum(dim=1)#.mean(dim=0) # (B, N)
        # normalize
        score2 = score2 / score2.max(dim=1, keepdim=True)[0]
        score = 0.5 * score1 + 0.5 * score2

    else:
        # Previously an unknown strategy fell through and crashed later with an
        # UnboundLocalError on `score`; fail early with a clear message instead.
        raise ValueError("unrecognized cache type", cache_type)

    # abandon the branch, if you want to explore the local force fresh strategy, this may help.
    #if ((not current['is_force_fresh']) and (cache_dic['force_fresh'] == 'local')): # current['is_force_fresh'] is False, cause when it is True, no cut and fresh are needed
    #        #print(torch.ones_like(force_indices, dtype=float, device=force_indices.device).dtype)
    #    score.scatter_(dim=1, index=force_indices, src=torch.ones_like(force_indices, dtype=torch.float32, 
    #                                                                       device=force_indices.device))

    if cache_dic['force_fresh'] == 'global':
        # Tokens with a larger cache index get a proportionally larger bonus.
        soft_step_score = cache_dic['cache_index'][current['flag']][current['layer']][current['module']].float() / (cache_dic['fresh_threshold'])
        #soft_layer_score = cache_dic['cache_index']['layer_index'][current['module']].float() / (27)
        score = score + cache_dic['soft_fresh_weight'] * soft_step_score #+ 0.1 *soft_layer_score

    return score.to(tokens.device)

================================================
FILE: Open-Sora/opensora/models/cache_functions/scores.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

def attn_score(cache_dic, current):
    """
    Score tokens by the entropy of their cached cross-attention rows.

    The cached cross-attention map for the current flag/layer is split in two
    along dim 0 (named cond / uncond here), the halves are averaged with equal
    weight, and the entropy over the last dimension is computed. ``1 + entropy``
    is L2-normalized along dim 1 and the result is repeated twice along dim 0 so
    it lines up with the full batch again.

    Alternatives explored earlier (self-attention diagonal, attention column
    sums, thresholded cross-attention sums) are not used.
    """
    cross_map = cache_dic['cross_attn_map'][current['flag']][current['layer']]
    half_batch = len(cross_map) // 2
    cond_cmap, uncond_cmap = torch.split(cross_map, half_batch, dim=0)

    cond_weight = 0.5
    blended = cond_weight * cond_cmap + (1 - cond_weight) * uncond_cmap

    # Small epsilon keeps log() finite for zero attention weights.
    entropy = -torch.sum(blended * torch.log(blended + 1e-7), dim=-1)
    half_score = F.normalize(1 + entropy, dim=1, p=2)

    return half_score.repeat(2, 1)

def similarity_score(cache_dic, current, tokens):
    """
    Score tokens by how much they differ from their cached counterparts.

    Computes ``1 - cosine_similarity`` between ``tokens`` and the cached tensor
    for the current flag/layer/module along the last dimension, then
    L2-normalizes the result along its last dimension.
    """
    cached_tokens = cache_dic['cache'][current['flag']][current['layer']][current['module']]
    dissimilarity = 1 - F.cosine_similarity(tokens, cached_tokens, dim=-1)
    return F.normalize(dissimilarity, dim=-1, p=2)

def norm_score(cache_dic, current, tokens):
    """
    Score tokens by their L2 norm.

    The per-token L2 norm (over the last dimension) is itself L2-normalized
    along the remaining last dimension. ``cache_dic`` and ``current`` are
    accepted for signature parity with the other score functions but are unused.
    """
    token_norms = tokens.norm(dim=-1, p=2)
    return F.normalize(token_norms, dim=-1, p=2)


================================================
FILE: Open-Sora/opensora/models/cache_functions/token_merge.py
================================================
import torch
def token_merge(cache_dic, tokens, current, fresh_indices, stale_indices):
    '''
    An abandoned branch in exploring if token merge helps. The answer is no, at least no for training-free strategy.

    For every stale token the best-matching fresh token is found; stale tokens
    whose match value exceeds 0.995 are recorded for merging. The number kept is
    the minimum such count across the batch so that the index tensors stay
    rectangular. Results are written to ``cache_dic['merged_stale_fresh_indices']``
    and ``cache_dic['merged_stale_sequence']``; nothing is returned.
    '''
    # Layer filter kept from the original experiment (`% 1` accepts every integer layer).
    if current['layer'] % 1 != 0:
        return

    feature_dim = tokens.shape[-1]
    fresh_gather_index = fresh_indices.unsqueeze(-1).expand(-1, -1, feature_dim)
    stale_gather_index = stale_indices.unsqueeze(-1).expand(-1, -1, feature_dim)
    fresh_tokens = torch.gather(input=tokens, dim=1, index=fresh_gather_index)
    stale_tokens = torch.gather(input=tokens, dim=1, index=stale_gather_index)

    # Matching criterion is hard-coded; 'distance' is retained for experimentation.
    method = 'similarity'
    if method == 'similarity':
        descending = True
        fresh_unit = torch.nn.functional.normalize(fresh_tokens, p=2, dim=-1)
        stale_unit = torch.nn.functional.normalize(stale_tokens, p=2, dim=-1)
        pairwise = stale_unit @ fresh_unit.transpose(1, 2)
        match_value, match_index = torch.max(pairwise, dim=2)
    elif method == 'distance':
        descending = False
        pairwise = torch.cdist(stale_tokens, fresh_tokens, p=1)
        match_value, match_index = torch.min(pairwise, dim=2)

    # Smallest per-sample count of matches above the threshold.
    num_merged = int((match_value > 0.995).sum(dim=1).min())
    best_first = torch.sort(match_value, dim=1, descending=descending)[1]
    selected = best_first[:, :num_merged]

    matched_fresh_positions = match_index.gather(1, selected)
    cache_dic['merged_stale_fresh_indices'] = fresh_indices.gather(1, matched_fresh_positions)
    cache_dic['merged_stale_sequence'] = stale_indices.gather(1, selected)


================================================
FILE: Open-Sora/opensora/models/cache_functions/update_cache.py
================================================
import torch
def update_cache(fresh_indices, fresh_tokens, cache_dic, current, fresh_attn_map=None):
    '''
    Update the cache with the fresh tokens.

    The rows of ``cache_dic['cache'][flag][layer][module]`` selected by
    ``fresh_indices`` (along dim 1) are overwritten in place with
    ``fresh_tokens``. For the 'attn' and 'cross-attn' modules the matching rows
    of the cached attention map ('attn_map' / 'cross_attn_map') are overwritten
    with ``fresh_attn_map`` as well.

    Args:
        fresh_indices: integer index tensor; indexes dim 1 of the cached tensors.
        fresh_tokens: replacement values for the token cache.
        cache_dic: cache state dictionary, modified in place.
        current: dictionary providing 'flag', 'layer' and 'module'.
        fresh_attn_map: replacement attention-map rows; required when the
            module is 'attn' or 'cross-attn'.

    Raises:
        ValueError: if the module needs an attention-map update but
            ``fresh_attn_map`` is None.
    '''
    flag = current['flag']
    layer = current['layer']
    module = current['module']

    # Which cached attention map (if any) accompanies this module.
    attn_map_key = {'attn': 'attn_map', 'cross-attn': 'cross_attn_map'}.get(module)
    if attn_map_key is not None:
        if fresh_attn_map is None:
            raise ValueError("fresh_attn_map is required to update module", module)
        map_index = fresh_indices.unsqueeze(-1).expand(-1, -1, fresh_attn_map.shape[-1])
        cache_dic[attn_map_key][flag][layer].scatter_(dim=1, index=map_index, src=fresh_attn_map)

    # The token cache write is the same for every module, so it no longer
    # depends on a variable that was only bound for three specific names.
    token_index = fresh_indices.unsqueeze(-1).expand(-1, -1, fresh_tokens.shape[-1])
    cache_dic['cache'][flag][layer][module].scatter_(dim=1, index=token_index, src=fresh_tokens)


================================================
FILE: Open-Sora/opensora/models/dit/__init__.py
================================================
from .dit import DiT, DiT_XL_2, DiT_XL_2x2


================================================
FILE: Open-Sora/opensora/models/dit/dit.py
================================================
# Modified from Meta DiT

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DiT:   https://github.com/facebookresearch/DiT/tree/main
# GLIDE: https://github.com/openai/glide-text2im
# MAE:   https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------

import numpy as np
import torch
import torch.nn as nn
import torch.utils.checkpoint
from einops import rearrange
from timm.models.vision_transformer import Mlp

from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.models.layers.blocks import (
    Attention,
    CaptionEmbedder,
    FinalLayer,
    LabelEmbedder,
    PatchEmbed3D,
    TimestepEmbedder,
    approx_gelu,
    get_1d_sincos_pos_embed,
    get_2d_sincos_pos_embed,
    get_layernorm,
    modulate,
)
from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint


class DiTBlock(nn.Module):
    """
    Transformer block conditioned through adaptive layer norm (adaLN-Zero).

    A conditioning vector is projected to six modulation tensors (shift, scale
    and gate for the attention branch and for the MLP branch). Each branch is
    applied as a gated residual on top of the modulated, normalized input.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.enable_flash_attn = enable_flash_attn

        # Both norms are non-affine: scale/shift come from the conditioning instead.
        norm_options = dict(eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)

        self.norm1 = get_layernorm(hidden_size, **norm_options)
        self.attn = Attention(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
        )
        self.norm2 = get_layernorm(hidden_size, **norm_options)
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=int(hidden_size * mlp_ratio),
            act_layer=approx_gelu,
            drop=0,
        )
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True),
        )

    def forward(self, x, c):
        """Apply the attention and MLP branches to ``x`` under conditioning ``c``."""
        modulation = self.adaLN_modulation(c).chunk(6, dim=1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation

        attn_branch = self.attn(modulate(self.norm1, x, shift_msa, scale_msa))
        x = x + gate_msa.unsqueeze(1) * attn_branch

        mlp_branch = self.mlp(modulate(self.norm2, x, shift_mlp, scale_mlp))
        x = x + gate_mlp.unsqueeze(1) * mlp_branch
        return x


@MODELS.register_module()
class DiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    The input video latent is patchified by a 3D patch embedder, enriched with
    fixed sin-cos spatial (and optionally temporal) position embeddings, and
    processed by ``depth`` adaLN-conditioned :class:`DiTBlock` layers. The
    conditioning vector is the sum of the timestep embedding and the label /
    caption embedding.
    """

    def __init__(
        self,
        input_size=(16, 32, 32),
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        learn_sigma=True,
        condition="text",
        no_temporal_pos_emb=False,
        caption_channels=512,
        model_max_length=77,
        dtype=torch.float32,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.learn_sigma = learn_sigma
        self.in_channels = in_channels
        # When sigma is learned the network predicts twice as many channels.
        self.out_channels = in_channels * 2 if learn_sigma else in_channels
        self.hidden_size = hidden_size
        self.patch_size = patch_size
        self.input_size = input_size
        num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)])
        self.num_patches = num_patches
        self.num_temporal = input_size[0] // patch_size[0]
        self.num_spatial = num_patches // self.num_temporal
        self.num_heads = num_heads
        self.dtype = dtype
        # Any condition string starting with "label" selects the class-label
        # embedder (e.g. "label_<num_classes>"); everything else uses captions.
        self.use_text_encoder = not condition.startswith("label")
        if enable_flash_attn:
            assert dtype in [
                torch.float16,
                torch.bfloat16,
            ], f"Flash attention only supports float16 and bfloat16, but got {self.dtype}"
        self.no_temporal_pos_emb = no_temporal_pos_emb
        self.mlp_ratio = mlp_ratio
        self.depth = depth
        assert enable_sequence_parallelism is False, "Sequence parallelism is not supported in DiT"

        # Fixed (non-trainable) position embeddings are stored as buffers.
        self.register_buffer("pos_embed_spatial", self.get_spatial_pos_embed())
        self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed())

        self.x_embedder = PatchEmbed3D(patch_size, in_channels, embed_dim=hidden_size)
        if not self.use_text_encoder:
            # The number of classes is encoded after the last underscore.
            num_classes = int(condition.split("_")[-1])
            self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        else:
            self.y_embedder = CaptionEmbedder(
                in_channels=caption_channels,
                hidden_size=hidden_size,
                uncond_prob=class_dropout_prob,
                act_layer=approx_gelu,
                token_num=1,  # pooled token
            )
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.blocks = nn.ModuleList(
            [
                DiTBlock(
                    hidden_size,
                    num_heads,
                    mlp_ratio=mlp_ratio,
                    enable_flash_attn=enable_flash_attn,
                    enable_layernorm_kernel=enable_layernorm_kernel,
                )
                for _ in range(depth)
            ]
        )
        self.final_layer = FinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels)

        self.initialize_weights()
        self.enable_flash_attn = enable_flash_attn
        self.enable_layernorm_kernel = enable_layernorm_kernel

    def get_spatial_pos_embed(self):
        """Build the fixed 2D sin-cos spatial position embedding, shape (1, ...)."""
        # NOTE(review): only the height grid size is passed, which presumably
        # assumes a square spatial grid - confirm against get_2d_sincos_pos_embed.
        pos_embed = get_2d_sincos_pos_embed(
            self.hidden_size,
            self.input_size[1] // self.patch_size[1],
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def get_temporal_pos_embed(self):
        """Build the fixed 1D sin-cos temporal position embedding, shape (1, ...)."""
        pos_embed = get_1d_sincos_pos_embed(
            self.hidden_size,
            self.input_size[0] // self.patch_size[0],
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def unpatchify(self, x):
        """Fold per-patch predictions back into a (N, C, T, H, W) tensor."""
        c = self.out_channels
        t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        pt, ph, pw = self.patch_size

        x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
        # Interleave each patch axis with its grid axis before merging them.
        x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
        return imgs

    def forward(self, x, t, y):
        """
        Forward pass of DiT.
        x: (B, C, T, H, W) tensor of inputs
        t: (B,) tensor of diffusion timesteps
        y: conditioning input for ``y_embedder`` (cast to ``self.dtype`` when the
           text/caption embedder is used)
        """
        # origin inputs should be float32, cast to specified dtype
        x = x.to(self.dtype)
        if self.use_text_encoder:
            y = y.to(self.dtype)

        # embedding
        x = self.x_embedder(x)  # (B, N, D)
        x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
        x = x + self.pos_embed_spatial
        if not self.no_temporal_pos_emb:
            # Temporal embedding broadcasts over the spatial axis, so swap axes first.
            x = rearrange(x, "b t s d -> b s t d")
            x = x + self.pos_embed_temporal
            x = rearrange(x, "b s t d -> b (t s) d")
        else:
            x = rearrange(x, "b t s d -> b (t s) d")

        t = self.t_embedder(t, dtype=x.dtype)  # (N, D)
        y = self.y_embedder(y, self.training)  # (N, D)
        if self.use_text_encoder:
            y = y.squeeze(1).squeeze(1)
        condition = t + y

        # blocks
        for block in self.blocks:
            c = condition
            x = auto_grad_checkpoint(block, x, c)  # (B, N, D)

        # final process
        x = self.final_layer(x, condition)  # (B, N, num_patches * out_channels)
        x = self.unpatchify(x)  # (B, out_channels, T, H, W)

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x

    def initialize_weights(self):
        """Initialize all sub-module weights following the DiT recipe."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                # `requires_grad` is the boolean flag; the previous code tested the
                # bound method `requires_grad_`, which is always truthy.
                if module.weight.requires_grad:
                    torch.nn.init.xavier_uniform_(module.weight)
                    if module.bias is not None:
                        nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        nn.init.constant_(self.x_embedder.proj.bias, 0)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

        # Initialize the caption (text) embedding MLP with a small normal
        # distribution (these layers are NOT zeroed):
        if self.use_text_encoder:
            nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
            nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)


@MODELS.register_module("DiT-XL/2")
def DiT_XL_2(from_pretrained=None, **kwargs):
    """Build the DiT-XL model with patch size (1, 2, 2).

    Extra keyword arguments are forwarded to :class:`DiT`. When
    ``from_pretrained`` is given, the checkpoint is loaded into the model
    before it is returned.
    """
    net = DiT(
        depth=28,
        hidden_size=1152,
        patch_size=(1, 2, 2),
        num_heads=16,
        **kwargs,
    )
    if from_pretrained is None:
        return net

    load_checkpoint(net, from_pretrained)
    return net


@MODELS.register_module("DiT-XL/2x2")
def DiT_XL_2x2(from_pretrained=None, **kwargs):
    """Build the DiT-XL model with patch size (2, 2, 2).

    Extra keyword arguments are forwarded to :class:`DiT`. When
    ``from_pretrained`` is given, the checkpoint is loaded into the model
    before it is returned.
    """
    net = DiT(
        depth=28,
        hidden_size=1152,
        patch_size=(2, 2, 2),
        num_heads=16,
        **kwargs,
    )
    if from_pretrained is None:
        return net

    load_checkpoint(net, from_pretrained)
    return net


================================================
FILE: Open-Sora/opensora/models/latte/__init__.py
================================================
from .latte import Latte, Latte_XL_2, Latte_XL_2x2


================================================
FILE: Open-Sora/opensora/models/latte/latte.py
================================================
# Copyright 2024 Vchitect/Latte
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from Latte
#
# This file is modified from https://github.com/Vchitect/Latte/blob/main/models/latte.py
#
# With references to:
# Latte:  https://github.com/Vchitect/Latte
# DiT:    https://github.com/facebookresearch/DiT/tree/main


import torch
from einops import rearrange, repeat

from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.models.dit import DiT
from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint


@MODELS.register_module()
class Latte(DiT):
    """DiT variant whose blocks alternate between spatial and temporal attention."""

    def forward(self, x, t, y):
        """
        Forward pass of Latte.
        x: (B, C, T, H, W) tensor of inputs
        t: (B,) tensor of diffusion timesteps
        y: conditioning input passed to ``y_embedder``

        Even-indexed blocks see the tokens grouped per frame (spatial), odd-indexed
        blocks see them grouped per spatial location (temporal). The temporal
        position embedding is added once, right before the first temporal block.
        """
        # origin inputs should be float32, cast to specified dtype
        x = x.to(self.dtype)

        # embedding
        x = self.x_embedder(x)  # (B, N, D)
        x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
        x = x + self.pos_embed_spatial
        x = rearrange(x, "b t s d -> b (t s) d")

        t = self.t_embedder(t, dtype=x.dtype)  # (N, D)
        y = self.y_embedder(y, self.training)  # (N, D)
        if self.use_text_encoder:
            y = y.squeeze(1).squeeze(1)
        condition = t + y
        # The condition is replicated to match the folded batch of each block type.
        condition_spatial = repeat(condition, "b d -> (b t) d", t=self.num_temporal)
        condition_temporal = repeat(condition, "b d -> (b s) d", s=self.num_spatial)

        # blocks
        for index, layer in enumerate(self.blocks):
            is_spatial = index % 2 == 0
            if is_spatial:
                # fold time into the batch, attend over space
                x = rearrange(x, "b (t s) d -> (b t) s d", t=self.num_temporal, s=self.num_spatial)
                x = auto_grad_checkpoint(layer, x, condition_spatial)  # (B, N, D)
                x = rearrange(x, "(b t) s d -> b (t s) d", t=self.num_temporal, s=self.num_spatial)
            else:
                # fold space into the batch, attend over time
                x = rearrange(x, "b (t s) d -> (b s) t d", t=self.num_temporal, s=self.num_spatial)
                if index == 1:
                    x = x + self.pos_embed_temporal
                x = auto_grad_checkpoint(layer, x, condition_temporal)  # (B, N, D)
                x = rearrange(x, "(b s) t d -> b (t s) d", t=self.num_temporal, s=self.num_spatial)

        # final process
        x = self.final_layer(x, condition)  # (B, N, num_patches * out_channels)
        x = self.unpatchify(x)  # (B, out_channels, T, H, W)

        # cast to float32 for better accuracy
        return x.to(torch.float32)


@MODELS.register_module("Latte-XL/2")
def Latte_XL_2(from_pretrained=None, **kwargs):
    """Build the Latte-XL model with patch size (1, 2, 2).

    Extra keyword arguments are forwarded to :class:`Latte`. When
    ``from_pretrained`` is given, the checkpoint is loaded into the model
    before it is returned.
    """
    net = Latte(
        depth=28,
        hidden_size=1152,
        patch_size=(1, 2, 2),
        num_heads=16,
        **kwargs,
    )
    if from_pretrained is None:
        return net

    load_checkpoint(net, from_pretrained)
    return net


@MODELS.register_module("Latte-XL/2x2")
def Latte_XL_2x2(from_pretrained=None, **kwargs):
    """Build the Latte-XL model with patch size (2, 2, 2).

    Extra keyword arguments are forwarded to :class:`Latte`. When
    ``from_pretrained`` is given, the checkpoint is loaded into the model
    before it is returned.
    """
    net = Latte(
        depth=28,
        hidden_size=1152,
        patch_size=(2, 2, 2),
        num_heads=16,
        **kwargs,
    )
    if from_pretrained is None:
        return net

    load_checkpoint(net, from_pretrained)
    return net


================================================
FILE: Open-Sora/opensora/models/layers/__init__.py
================================================


================================================
FILE: Open-Sora/opensora/models/layers/blocks.py
================================================
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# PixArt: https://github.com/PixArt-alpha/PixArt-alpha
# Latte:  https://github.com/Vchitect/Latte
# DiT:    https://github.com/facebookresearch/DiT/tree/main
# GLIDE:  https://github.com/openai/glide-text2im
# MAE:    https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------

import functools
import math
from typing import Optional

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
import xformers.ops
from einops import rearrange
from timm.models.vision_transformer import Mlp

from opensora.acceleration.communications import all_to_all, split_forward_gather_backward
from opensora.acceleration.parallel_states import get_sequence_parallel_group

from ..cache_functions.attention import cached_attention_forward

def approx_gelu():
    """Return a new ``nn.GELU`` module that uses the tanh approximation.

    Used as an ``act_layer`` factory: each call creates a fresh module.
    """
    return nn.GELU(approximate="tanh")


class LlamaRMSNorm(nn.Module):
    """Root-mean-square layer norm with a learnable per-channel weight."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalize over the last dimension in float32, then apply the weight."""
        original_dtype = hidden_states.dtype
        states_fp32 = hidden_states.to(torch.float32)
        # Mean of squares over the feature axis (no mean subtraction).
        mean_square = states_fp32.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.variance_epsilon)
        normalized = states_fp32 * inv_rms
        return self.weight * normalized.to(original_dtype)


def get_layernorm(hidden_size: int, eps: float, affine: bool, use_kernel: bool):
    """Create a layer-norm module over ``hidden_size`` features.

    Args:
        hidden_size: size of the normalized (last) dimension.
        eps: numerical-stability epsilon.
        affine: whether the norm has learnable elementwise scale/bias.
        use_kernel: if True, use apex's ``FusedLayerNorm``; otherwise
            ``torch.nn.LayerNorm``.

    Raises:
        RuntimeError: if ``use_kernel`` is True but apex cannot be imported.
    """
    if not use_kernel:
        return nn.LayerNorm(hidden_size, eps, elementwise_affine=affine)

    try:
        from apex.normalization import FusedLayerNorm

        # Construction stays inside the try block so that an ImportError raised
        # while building the fused module is reported the same way.
        return FusedLayerNorm(hidden_size, elementwise_affine=affine, eps=eps)
    except ImportError as err:
        raise RuntimeError("FusedLayerNorm not available. Please install apex.") from err


def modulate(norm_func, x, shift, scale):
    """Normalize ``x`` in float32 and apply a per-sample scale and shift.

    Suppose x is (B, N, D), shift is (B, D), scale is (B, D). The result has the
    same dtype as ``x``.
    """
    input_dtype = x.dtype
    normalized = norm_func(x.to(torch.float32)).to(input_dtype)
    # Broadcast the (B, D) modulation tensors across the token axis.
    modulated = normalized * (scale.unsqueeze(1) + 1) + shift.unsqueeze(1)
    return modulated.to(input_dtype)


def t2i_modulate(x, shift, scale):
    """Apply ``x * (1 + scale) + shift`` with ordinary broadcasting."""
    scaled = x * (1 + scale)
    return scaled + shift


# ===============================================
# General-purpose Layers
# ===============================================


class PatchEmbed3D(nn.Module):
    """Video to Patch Embedding.

    A strided 3D convolution turns every non-overlapping patch into one
    embedding vector. Inputs whose depth/height/width are not multiples of the
    patch size are zero-padded at the end of the corresponding axis.

    Args:
        patch_size (int): Patch token size. Default: (2,4,4).
        in_chans (int): Number of input video channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
        flatten (bool): If True, return tokens as (B, N, C) instead of the
            (B, C, T, H, W) feature volume. Default: True.
    """

    def __init__(
        self,
        patch_size=(2, 4, 4),
        in_chans=3,
        embed_dim=96,
        norm_layer=None,
        flatten=True,
    ):
        super().__init__()
        self.patch_size = patch_size
        self.flatten = flatten

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        # Kernel == stride, so patches do not overlap.
        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer is not None else None

    def forward(self, x):
        """Forward function."""
        _, _, depth, height, width = x.size()

        # Amount of trailing padding needed on each axis (0 when already aligned).
        pad_d = (-depth) % self.patch_size[0]
        pad_h = (-height) % self.patch_size[1]
        pad_w = (-width) % self.patch_size[2]
        if pad_d or pad_h or pad_w:
            # F.pad takes pairs starting from the last dimension: W, H, then D.
            x = F.pad(x, (0, pad_w, 0, pad_h, 0, pad_d))

        x = self.proj(x)  # (B C T H W)

        if self.norm is not None:
            out_d, out_h, out_w = x.size(2), x.size(3), x.size(4)
            tokens = self.norm(x.flatten(2).transpose(1, 2))
            x = tokens.transpose(1, 2).view(-1, self.embed_dim, out_d, out_h, out_w)

        if not self.flatten:
            return x
        return x.flatten(2).transpose(1, 2)  # BCTHW -> BNC


class Attention(nn.Module):
    """Multi-head self-attention.

    Supports optional query/key normalization, an optional rotary embedding
    callable applied to queries and keys, and an optional flash-attn code path.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = LlamaRMSNorm,
        enable_flash_attn: bool = False,
        rope=None,
        qk_norm_legacy: bool = False,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        head_dim = dim // num_heads

        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.scale = head_dim**-0.5
        self.enable_flash_attn = enable_flash_attn

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        # Without qk_norm the "norms" are pass-through identities.
        self.q_norm = norm_layer(head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(head_dim) if qk_norm else nn.Identity()
        self.qk_norm_legacy = qk_norm_legacy
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        # `self.rope` is a boolean flag; the callable itself lives in `rotary_emb`.
        self.rope = rope is not None
        if self.rope:
            self.rotary_emb = rope

        self.is_causal = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply self-attention to ``x`` of shape (B, N, C) and return the same shape."""
        B, N, C = x.shape
        # flash attn is not memory efficient for small sequences, this is empirical
        use_flash = self.enable_flash_attn and (N > B)

        packed = self.qkv(x).view((B, N, 3, self.num_heads, self.head_dim))
        q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)  # each (B, #heads, N, #dim)

        # Legacy ordering applies the rotary embedding BEFORE q/k normalization
        # (WARNING: this may be a bug); the default ordering applies it after.
        if self.qk_norm_legacy and self.rope:
            q = self.rotary_emb(q)
            k = self.rotary_emb(k)
        q, k = self.q_norm(q), self.k_norm(k)
        if self.rope and not self.qk_norm_legacy:
            q = self.rotary_emb(q)
            k = self.rotary_emb(k)

        if use_flash:
            from flash_attn import flash_attn_func

            # (B, #heads, N, #dim) -> (B, N, #heads, #dim)
            out = flash_attn_func(
                q.permute(0, 2, 1, 3),
                k.permute(0, 2, 1, 3),
                v.permute(0, 2, 1, 3),
                dropout_p=self.attn_drop.p if self.training else 0.0,
                softmax_scale=self.scale,
                causal=self.is_causal,
            )
        else:
            working_dtype = q.dtype
            # Softmax is evaluated in float32 and cast back afterwards.
            scores = torch.matmul(q * self.scale, k.transpose(-2, -1)).to(torch.float32)
            if self.is_causal:
                causal_mask = torch.tril(torch.ones_like(scores), diagonal=0)
                causal_mask = torch.where(causal_mask.bool(), 0, float('-inf'))
                scores += causal_mask
            weights = scores.softmax(dim=-1).to(working_dtype)
            weights = self.attn_drop(weights)
            # (B, #heads, N, #dim) -> (B, N, #heads, #dim) to match the flash layout
            out = torch.matmul(weights, v).transpose(1, 2)

        out = out.reshape((B, N, C))
        return self.proj_drop(self.proj(out))


class KVCompressAttention(nn.Module):
    """Multi-head self-attention whose keys/values can be spatially down-sampled.

    When ``sr_ratio > 1`` the key and value token sequences are reduced with the
    strategy named by ``sampling`` before attention is computed; queries keep
    their full length, so the output always has the same shape as the input.

    Args:
        dim: channel width of the input tokens.
        num_heads: number of attention heads (must divide ``dim``).
        qkv_bias: whether the fused QKV projection has a bias.
        qk_norm: apply ``norm_layer`` to per-head queries and keys.
        attn_drop: dropout probability on the attention weights.
        proj_drop: dropout probability after the output projection.
        norm_layer: normalisation class used when ``qk_norm`` is set.
        enable_flash_attn: use ``flash_attn_func`` when the sequence is long enough.
        sampling: one of ``"conv"``, ``"ave"``, ``"uniform"``, ``"uniform_every"`` or ``None``.
        sr_ratio: spatial reduction factor for keys/values (1 disables compression).
        mem_eff_attention: use ``xformers.ops.memory_efficient_attention``.
        attn_half: keep attention logits in the input dtype instead of float32.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = LlamaRMSNorm,
        enable_flash_attn: bool = False,
        sampling="conv",
        sr_ratio=1,
        mem_eff_attention=False,
        attn_half=False,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        self.enable_flash_attn = enable_flash_attn

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)

        self.sr_ratio = sr_ratio
        self.sampling = sampling
        if sr_ratio > 1 and sampling == "conv":
            # Depth-wise conv initialised so that it starts out as average pooling.
            self.sr = nn.Conv2d(dim, dim, groups=dim, kernel_size=sr_ratio, stride=sr_ratio)
            self.sr.weight.data.fill_(1 / sr_ratio**2)
            self.sr.bias.data.zero_()
            self.norm = nn.LayerNorm(dim)

        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.mem_eff_attention = mem_eff_attention
        self.attn_half = attn_half

    def downsample_2d(self, tensor, H, W, scale_factor, sampling=None):
        """Reduce a ``(B, N, C)`` token sequence and report its new length.

        Args:
            tensor: tokens of shape ``(B, N, C)``; for the 2-D strategies ``N == H * W``.
            H, W: spatial grid size used to fold the tokens back into an image.
            scale_factor: reduction factor.
            sampling: reduction strategy; ``None`` (or ``scale_factor == 1``) is a no-op.

        Returns:
            ``(tokens, new_N)``. The pair is returned on every path so that
            callers can always unpack it.

        Raises:
            ValueError: if ``sampling`` is not a recognised strategy.
        """
        if sampling is None or scale_factor == 1:
            # Previously returned the bare tensor, which callers then tried to
            # unpack as a pair.
            return tensor, tensor.shape[1]
        B, N, C = tensor.shape

        if sampling == "uniform_every":
            picked = tensor[:, ::scale_factor]
            # Report the real length: strided slicing keeps ceil(N / scale_factor)
            # tokens, which differs from N // scale_factor when N is not a multiple.
            return picked, picked.shape[1]

        tensor = tensor.reshape(B, H, W, C).permute(0, 3, 1, 2)
        new_H, new_W = int(H / scale_factor), int(W / scale_factor)
        new_N = new_H * new_W

        if sampling == "ave":
            tensor = F.interpolate(tensor, scale_factor=1 / scale_factor, mode="nearest").permute(0, 2, 3, 1)
        elif sampling == "uniform":
            tensor = tensor[:, :, ::scale_factor, ::scale_factor].permute(0, 2, 3, 1)
        elif sampling == "conv":
            tensor = self.sr(tensor).reshape(B, C, -1).permute(0, 2, 1)
            tensor = self.norm(tensor)
        else:
            raise ValueError

        return tensor.reshape(B, new_N, C).contiguous(), new_N

    def forward(self, x: torch.Tensor, mask=None, HW=None, block_id=None, **kwargs) -> torch.Tensor:
        """Apply self-attention to ``x`` of shape ``(B, N, C)``.

        Args:
            x: input tokens.
            mask: optional mask; only consulted on the xformers path.
            HW: ``(H, W)`` grid size, required only when ``sr_ratio > 1``.
            block_id: accepted for interface compatibility; unused here.

        Returns:
            Tensor of shape ``(B, N, C)``.
        """
        B, N, C = x.shape
        new_N = N
        # flash attn is not memory efficient for small sequences, this is empirical
        enable_flash_attn = self.enable_flash_attn and (N > B)

        qkv = self.qkv(x).reshape(B, N, 3, C)
        q, k, v = qkv.unbind(2)
        dtype = q.dtype
        # KV compression
        if self.sr_ratio > 1:
            # Unpack lazily so HW may stay None when nothing is compressed.
            H, W = HW
            k, new_N = self.downsample_2d(k, H, W, self.sr_ratio, sampling=self.sampling)
            v, new_N = self.downsample_2d(v, H, W, self.sr_ratio, sampling=self.sampling)

        q = q.reshape(B, N, self.num_heads, C // self.num_heads).to(dtype)
        k = k.reshape(B, new_N, self.num_heads, C // self.num_heads).to(dtype)
        v = v.reshape(B, new_N, self.num_heads, C // self.num_heads).to(dtype)

        q, k = self.q_norm(q), self.k_norm(k)

        # Dropout must only be active while training; nn.Dropout handles this
        # itself, the fused kernels need it spelled out.
        fused_dropout_p = self.attn_drop.p if self.training else 0.0

        if enable_flash_attn:
            from flash_attn import flash_attn_func

            # Output layout: (B, N, #heads, #dim)
            x = flash_attn_func(
                q,
                k,
                v,
                dropout_p=fused_dropout_p,
                softmax_scale=self.scale,
            )

        elif self.mem_eff_attention:
            attn_bias = None
            if mask is not None:
                attn_bias = torch.zeros([B * self.num_heads, q.shape[1], k.shape[1]], dtype=q.dtype, device=q.device)
                attn_bias.masked_fill_(mask.squeeze(1).repeat(self.num_heads, 1, 1) == 0, float("-inf"))
            # Output layout: (B, N, #heads, #dim) -- same as the inputs.
            x = xformers.ops.memory_efficient_attention(q, k, v, p=fused_dropout_p, attn_bias=attn_bias)
        else:
            # (B, N, #heads, #dim) -> (B, #heads, N, #dim)
            q = q.permute(0, 2, 1, 3)
            k = k.permute(0, 2, 1, 3)
            v = v.permute(0, 2, 1, 3)
            dtype = q.dtype
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            if not self.attn_half:
                attn = attn.to(torch.float32)  # softmax in float32 for stability
            attn = attn.softmax(dim=-1)
            attn = attn.to(dtype)  # cast back attn to original dtype
            attn = self.attn_drop(attn)
            x = attn @ v
            # Only this branch yields (B, #heads, N, #dim); bring it back to
            # (B, N, #heads, #dim). Transposing the xformers output as well (as
            # was done before) interleaved heads and tokens.
            x = x.transpose(1, 2)

        x = x.reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class SeqParallelAttention(Attention):
    """Self-attention for sequence-parallel execution.

    Each rank receives a slice of the token sequence. An ``all_to_all`` gathers
    the full sequence while scattering the heads, attention runs on the local
    heads, and a second ``all_to_all`` restores the original layout.
    Rotary embeddings are rejected by the constructor.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = LlamaRMSNorm,
        enable_flash_attn: bool = False,
        rope=None,
    ) -> None:
        assert rope is None, "Rope is not supported in SeqParallelAttention"
        parent_kwargs = dict(
            dim=dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            norm_layer=norm_layer,
            enable_flash_attn=enable_flash_attn,
        )
        super().__init__(**parent_kwargs)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Attend over ``x`` of shape ``(B, N, C)`` where ``N`` is the local sequence length."""
        B, N, C = x.shape
        use_flash = self.enable_flash_attn

        fused = self.qkv(x).view((B, N, 3, self.num_heads, self.head_dim))
        sp_group = get_sequence_parallel_group()

        # Gather the sequence axis and scatter the head axis across ranks:
        # [B, SUB_N, 3, NUM_HEAD, HEAD_DIM] -> [B, N, 3, NUM_HEAD_PER_DEVICE, HEAD_DIM]
        fused = all_to_all(fused, sp_group, scatter_dim=3, gather_dim=1)

        if use_flash:
            layout = (2, 0, 1, 3, 4)  # [3, B, N, NUM_HEAD_PER_DEVICE, HEAD_DIM]
        else:
            layout = (2, 0, 3, 1, 4)  # [3, B, NUM_HEAD_PER_DEVICE, N, HEAD_DIM]
        q, k, v = fused.permute(layout).unbind(0)

        # NOTE(review): the original code carries an "ERROR: Should qk_norm first"
        # marker at this point; the ordering is kept as-is.
        q = self.q_norm(q)
        k = self.k_norm(k)

        if use_flash:
            from flash_attn import flash_attn_func

            x = flash_attn_func(
                q,
                k,
                v,
                dropout_p=self.attn_drop.p if self.training else 0.0,
                softmax_scale=self.scale,
            )
        else:
            in_dtype = q.dtype
            logits = (q * self.scale) @ k.transpose(-2, -1)
            # softmax is evaluated in float32, then cast back to the input dtype
            weights = logits.to(torch.float32).softmax(dim=-1).to(in_dtype)
            weights = self.attn_drop(weights)
            # [B, NUM_HEAD_PER_DEVICE, N, HEAD_DIM] -> [B, N, NUM_HEAD_PER_DEVICE, HEAD_DIM]
            x = (weights @ v).transpose(1, 2)

        # Gather the heads back and scatter the sequence again:
        # [B, N, NUM_HEAD_PER_DEVICE, HEAD_DIM] -> [B, SUB_N, NUM_HEAD, HEAD_DIM]
        x = all_to_all(x, sp_group, scatter_dim=1, gather_dim=2)

        # Merge heads back into the channel axis: [B, N, C]
        x = x.reshape((B, N, C))
        return self.proj_drop(self.proj(x))


class MultiHeadCrossAttention(nn.Module):
    """Cross-attention that also returns the attention map.

    Queries are projected from ``x``; keys and values are projected from
    ``cond``. The batch is flattened into a single packed sequence before the
    attention call, and a block-diagonal bias built from ``mask`` keeps the
    samples separate.
    """

    def __init__(self, d_model, num_heads, attn_drop=0.0, proj_drop=0.0):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.kv_linear = nn.Linear(d_model, 2 * d_model)  # fused key/value projection
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(d_model, d_model)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, cond, mask=None):
        """Attend from ``x`` to ``cond``.

        Args:
            x: query tokens of shape ``(B, N, C)``.
            cond: conditioning tokens; reshaped into one packed sequence.
            mask: per-sample key lengths, passed as the second argument of
                ``BlockDiagonalMask.from_seqlens``; ``None`` disables the bias.

        Returns:
            ``(out, cross_attn_map)`` where ``out`` has shape ``(B, -1, C)`` and
            ``cross_attn_map`` is the map returned by ``cached_attention_forward``
            reshaped to ``(B, -1, last_dim)``.
        """
        B, N, C = x.shape
        heads, head_dim = self.num_heads, self.head_dim

        q = self.q_linear(x).view(1, -1, heads, head_dim)
        k, v = self.kv_linear(cond).view(1, -1, 2, heads, head_dim).unbind(2)

        if mask is None:
            attn_bias = None
        else:
            # every sample contributes N queries; `mask` lists the key lengths
            attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([N] * B, mask)

        out, cross_attn_map = cached_attention_forward(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)

        # undo the packing: split the flat sequence back into B samples
        out = out.view(B, -1, C)
        cross_attn_map = cross_attn_map.view(B, -1, cross_attn_map.shape[-1])

        out = self.proj_drop(self.proj(out))
        return out, cross_attn_map


class SeqParallelMultiHeadCrossAttention(MultiHeadCrossAttention):
    """Sequence-parallel variant of the cross-attention layer.

    Queries are exchanged with ``all_to_all`` so each rank sees the whole
    sequence for a subset of heads; keys/values are split along the head axis.
    Unlike the parent class' ``forward``, this one returns only the output
    tensor (no attention map).
    """

    def __init__(
        self,
        d_model,
        num_heads,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        super().__init__(d_model=d_model, num_heads=num_heads, attn_drop=attn_drop, proj_drop=proj_drop)

    def forward(self, x, cond, mask=None):
        """Attend from the local slice ``x`` ``(B, SUB_N, C)`` to ``cond``.

        ``mask`` is forwarded as the key lengths of the block-diagonal bias.
        Returns a tensor viewed as ``(B, -1, C)``.
        """
        sp_group = get_sequence_parallel_group()
        sp_size = dist.get_world_size(sp_group)
        B, SUB_N, C = x.shape  # [B, TS/p, C]
        N = SUB_N * sp_size
        heads_per_rank = self.num_heads // sp_size
        head_dim = self.head_dim

        # q: [B, SUB_N, NUM_HEADS, HEAD_DIM]; kv is packed into one flat sequence
        q = self.q_linear(x).view(B, -1, self.num_heads, head_dim)
        kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, head_dim)
        kv = split_forward_gather_backward(kv, get_sequence_parallel_group(), dim=3, grad_scale="down")
        k, v = kv.unbind(2)

        # gather the sequence axis of q and scatter its head axis
        q = all_to_all(q, sp_group, scatter_dim=2, gather_dim=1)

        # flatten everything into a single packed sequence for xformers
        q, k, v = (t.view(1, -1, heads_per_rank, head_dim) for t in (q, k, v))

        if mask is None:
            attn_bias = None
        else:
            attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([N] * B, mask)
        out = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)

        # gather the heads back and scatter the sequence again
        out = out.view(B, -1, heads_per_rank, head_dim)
        out = all_to_all(out, sp_group, scatter_dim=1, gather_dim=2)

        # output projection
        out = out.view(B, -1, C)
        return self.proj_drop(self.proj(out))


class FinalLayer(nn.Module):
    """
    Output head of DiT: a non-affine LayerNorm modulated by a conditioning
    vector, followed by a linear map to ``num_patch * out_channels`` features.
    """

    def __init__(self, hidden_size, num_patch, out_channels):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, num_patch * out_channels, bias=True)
        # produces the (shift, scale) pair from the conditioning vector
        to_shift_scale = nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), to_shift_scale)

    def forward(self, x, c):
        """Modulate ``x`` with ``c`` and project it to the output width."""
        shift_scale = self.adaLN_modulation(c)
        shift, scale = shift_scale.chunk(2, dim=1)
        modulated = modulate(self.norm_final, x, shift, scale)
        return self.linear(modulated)


class T2IFinalLayer(nn.Module):
    """
    Output head of PixArt: LayerNorm, a learned shift/scale table offset by the
    timestep embedding, then a linear projection to ``num_patch * out_channels``.
    """

    def __init__(self, hidden_size, num_patch, out_channels, d_t=None, d_s=None):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, num_patch * out_channels, bias=True)
        self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size**0.5)
        self.out_channels = out_channels
        # default temporal / spatial token counts used when forward() gets none
        self.d_t = d_t
        self.d_s = d_s

    def _shift_scale(self, emb):
        """Return the (shift, scale) pair for one timestep embedding."""
        table = self.scale_shift_table[None] + emb[:, None]
        return table.chunk(2, dim=1)

    def t_mask_select(self, x_mask, x, masked_x, T, S):
        """Pick ``x`` where ``x_mask`` is true and ``masked_x`` elsewhere, per frame.

        x, masked_x: [B, (T S), C]; x_mask: [B, T].
        """
        unmasked = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
        fallback = rearrange(masked_x, "B (T S) C -> B T S C", T=T, S=S)
        frame_mask = x_mask[:, :, None, None]  # broadcast over S and C
        chosen = torch.where(frame_mask, unmasked, fallback)
        return rearrange(chosen, "B T S C -> B (T S) C")

    def forward(self, x, t, x_mask=None, t0=None, T=None, S=None):
        """Modulate and project ``x``; ``t0`` is only read when ``x_mask`` is given."""
        T = self.d_t if T is None else T
        S = self.d_s if S is None else S

        shift, scale = self._shift_scale(t)
        x = t2i_modulate(self.norm_final(x), shift, scale)

        if x_mask is not None:
            shift_zero, scale_zero = self._shift_scale(t0)
            # NOTE(review): `x` has already been modulated above, so the t0 branch
            # normalises the modulated tensor rather than the raw input. This
            # mirrors the existing behaviour -- confirm whether it is intended.
            x_zero = t2i_modulate(self.norm_final(x), shift_zero, scale_zero)
            x = self.t_mask_select(x_mask, x, x_zero, T, S)

        return self.linear(x)


# ===============================================
# Embedding Layers for Timesteps and Class Labels
# ===============================================


class TimestepEmbedder(nn.Module):
    """
    Maps scalar timesteps to vectors: a fixed sinusoidal encoding followed by a
    two-layer MLP with a SiLU in between.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        first = nn.Linear(frequency_embedding_size, hidden_size, bias=True)
        second = nn.Linear(hidden_size, hidden_size, bias=True)
        self.mlp = nn.Sequential(first, nn.SiLU(), second)
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Build sinusoidal embeddings for a batch of timesteps.
        :param t: a 1-D Tensor of N indices, one per batch element (may be fractional).
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor laid out as [cos | sin], zero-padded by one
                 column when ``dim`` is odd.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        steps = torch.arange(start=0, end=half, dtype=torch.float32)
        # geometric frequency ladder from 1 down towards 1 / max_period
        freqs = torch.exp(-math.log(max_period) * steps / half).to(device=t.device)
        angles = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
        if dim % 2:
            pad = torch.zeros_like(embedding[:, :1])
            embedding = torch.cat([embedding, pad], dim=-1)
        return embedding

    def forward(self, t, dtype):
        """Embed timesteps ``t``; the sinusoidal features are cast to ``dtype`` before the MLP."""
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
        if t_freq.dtype != dtype:
            t_freq = t_freq.to(dtype)
        return self.mlp(t_freq)


class LabelEmbedder(nn.Module):
    """
    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.

    When ``dropout_prob > 0`` the table gets one extra row (index
    ``num_classes``) that stands for the dropped / unconditional label.
    """

    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        use_cfg_embedding = dropout_prob > 0
        self.embedding_table = nn.Embedding(num_classes + int(use_cfg_embedding), hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """
        Replace some labels with the unconditional index ``num_classes``.

        :param labels: 1-D integer tensor of class labels.
        :param force_drop_ids: optional tensor; entries equal to 1 are dropped.
            When omitted, each label is dropped with probability ``dropout_prob``.
        :return: labels with the dropped entries replaced.
        """
        if force_drop_ids is None:
            # Sample on the labels' own device instead of hard-coding CUDA so the
            # module also works on CPU and on non-default accelerators.
            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
        else:
            drop_ids = force_drop_ids == 1
        labels = torch.where(drop_ids, self.num_classes, labels)
        return labels

    def forward(self, labels, train, force_drop_ids=None):
        """Look up label embeddings, applying label dropout when training or forced."""
        use_dropout = self.dropout_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            labels = self.token_drop(labels, force_drop_ids)
        return self.embedding_table(labels)


class SizeEmbedder(TimestepEmbedder):
    """
    Embeds one or more scalar values per sample with the sinusoidal + MLP
    pipeline of ``TimestepEmbedder`` and concatenates the per-value embeddings
    along the channel axis.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__(hidden_size=hidden_size, frequency_embedding_size=frequency_embedding_size)
        # The MLP is constructed again here and replaces the one the parent
        # constructor just created.
        in_proj = nn.Linear(frequency_embedding_size, hidden_size, bias=True)
        out_proj = nn.Linear(hidden_size, hidden_size, bias=True)
        self.mlp = nn.Sequential(in_proj, nn.SiLU(), out_proj)
        self.frequency_embedding_size = frequency_embedding_size
        self.outdim = hidden_size

    def forward(self, s, bs):
        """Embed ``s`` (shape ``(b,)`` or ``(b, d)``) for a batch of size ``bs``.

        If ``s`` has fewer rows than ``bs`` it is tiled ``bs // rows`` times.
        Returns a tensor of shape ``(bs, d * hidden_size)``.
        """
        if s.ndim == 1:
            s = s[:, None]
        assert s.ndim == 2
        if s.shape[0] != bs:
            s = s.repeat(bs // s.shape[0], 1)
            assert s.shape[0] == bs
        b, dims = s.shape[0], s.shape[1]

        # embed every scalar independently, then regroup them per sample
        flat = rearrange(s, "b d -> (b d)")
        freq = self.timestep_embedding(flat, self.frequency_embedding_size).to(self.dtype)
        embedded = self.mlp(freq)
        return rearrange(embedded, "(b d) d2 -> b (d d2)", b=b, d=dims, d2=self.outdim)

    @property
    def dtype(self):
        """dtype of this module's first parameter."""
        first_param = next(self.parameters())
        return first_param.dtype


class CaptionEmbedder(nn.Module):
    """
    Projects caption features into the model width. Also handles caption dropout for classifier-free guidance.

    A fixed (non-trainable) buffer ``y_embedding`` of shape
    ``(token_num, in_channels)`` replaces the caption of dropped samples.
    """

    # NOTE(review): the default `act_layer` is a module *instance*; callers in
    # this file pass a factory instead -- confirm what `Mlp` expects.
    def __init__(
        self,
        in_channels,
        hidden_size,
        uncond_prob,
        act_layer=nn.GELU(approximate="tanh"),
        token_num=120,
    ):
        super().__init__()
        self.y_proj = Mlp(
            in_features=in_channels,
            hidden_features=hidden_size,
            out_features=hidden_size,
            act_layer=act_layer,
            drop=0,
        )
        self.register_buffer(
            "y_embedding",
            torch.randn(token_num, in_channels) / in_channels**0.5,
        )
        self.uncond_prob = uncond_prob

    def token_drop(self, caption, force_drop_ids=None):
        """
        Replace the captions of some samples with the unconditional embedding.

        :param caption: 4-D caption tensor whose first axis is the batch.
        :param force_drop_ids: optional tensor; samples whose entry equals 1 are
            dropped. When omitted, each sample is dropped with probability
            ``uncond_prob``.
        :return: caption tensor with dropped samples replaced by ``y_embedding``.
        """
        if force_drop_ids is None:
            # Sample on the caption's own device instead of hard-coding CUDA so
            # the module also works on CPU and on non-default accelerators.
            drop_ids = torch.rand(caption.shape[0], device=caption.device) < self.uncond_prob
        else:
            drop_ids = force_drop_ids == 1
        # broadcast the per-sample decision over the remaining three axes
        caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
        return caption

    def forward(self, caption, train, force_drop_ids=None):
        """Optionally drop captions, then project them with ``y_proj``."""
        if train:
            assert caption.shape[2:] == self.y_embedding.shape
        use_dropout = self.uncond_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            caption = self.token_drop(caption, force_drop_ids)
        caption = self.y_proj(caption)
        return caption


class PositionEmbedding2D(nn.Module):
    """Sinusoidal 2-D position embedding computed on demand and memoised.

    The output has ``dim`` channels: the first half is computed from the
    width-axis coordinate of each token and the second half from the
    height-axis coordinate (the meshgrid is called with w first). Each half is
    laid out as ``[sin | cos]``.
    """

    def __init__(self, dim: int) -> None:
        super().__init__()
        self.dim = dim
        assert dim % 4 == 0, "dim must be divisible by 4"
        half_dim = dim // 2
        exponents = torch.arange(0, half_dim, 2).float() / half_dim
        inv_freq = 1.0 / (10000**exponents)
        # non-persistent: rebuilt from `dim`, never stored in checkpoints
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def _get_sin_cos_emb(self, t: torch.Tensor):
        """Encode 1-D positions ``t`` as ``[sin | cos]`` features."""
        phase = torch.einsum("i,d->id", t, self.inv_freq)
        return torch.cat((torch.sin(phase), torch.cos(phase)), dim=-1)

    @functools.lru_cache(maxsize=512)
    def _get_cached_emb(
        self,
        device: torch.device,
        dtype: torch.dtype,
        h: int,
        w: int,
        scale: float = 1.0,
        base_size: Optional[int] = None,
    ):
        """Build (or fetch from the LRU cache) the embedding of an ``h`` x ``w`` grid."""
        rows = torch.arange(h, device=device) / scale
        cols = torch.arange(w, device=device) / scale
        if base_size is not None:
            rows *= base_size / h
            cols *= base_size / w
        # here w goes first
        first, second = torch.meshgrid(cols, rows, indexing="ij")
        first = first.t().reshape(-1)
        second = second.t().reshape(-1)
        parts = [self._get_sin_cos_emb(first), self._get_sin_cos_emb(second)]
        return torch.cat(parts, dim=-1).unsqueeze(0).to(dtype)

    def forward(
        self,
        x: torch.Tensor,
        h: int,
        w: int,
        scale: Optional[float] = 1.0,
        base_size: Optional[int] = None,
    ) -> torch.Tensor:
        """Return a ``(1, h * w, dim)`` embedding on ``x``'s device and dtype."""
        device, dtype = x.device, x.dtype
        return self._get_cached_emb(device, dtype, h, w, scale, base_size)


# ===============================================
# Sine/Cosine Positional Embedding Functions
# ===============================================
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py


def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, scale=1.0, base_size=None):
    """
    Build a fixed 2-D sine/cosine position embedding.

    grid_size: int (square grid) or tuple of (height, width)
    scale: divisor applied to the coordinates
    base_size: when given, coordinates are additionally multiplied by base_size / grid extent
    return:
    pos_embed: [H*W, embed_dim], or [extra_tokens+H*W, embed_dim] with zero rows
               prepended when cls_token is set and extra_tokens > 0
    """
    if not isinstance(grid_size, tuple):
        grid_size = (grid_size, grid_size)
    size_h, size_w = grid_size[0], grid_size[1]

    coords_h = np.arange(size_h, dtype=np.float32) / scale
    coords_w = np.arange(size_w, dtype=np.float32) / scale
    if base_size is not None:
        coords_h *= base_size / size_h
        coords_w *= base_size / size_w

    # here w goes first
    mesh = np.meshgrid(coords_w, coords_h)
    grid = np.stack(mesh, axis=0).reshape([2, 1, size_w, size_h])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token and extra_tokens > 0:
        zero_rows = np.zeros([extra_tokens, embed_dim])
        pos_embed = np.concatenate([zero_rows, pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """
    Encode a stacked pair of coordinate grids.

    Each of grid[0] and grid[1] is encoded with embed_dim // 2 channels and the
    two results are concatenated along the channel axis: (H*W, D).
    """
    assert embed_dim % 2 == 0

    per_axis_dim = embed_dim // 2
    halves = [get_1d_sincos_pos_embed_from_grid(per_axis_dim, grid[axis]) for axis in (0, 1)]  # 2 x (H*W, D/2)
    return np.concatenate(halves, axis=1)  # (H*W, D)


def get_1d_sincos_pos_embed(embed_dim, length, scale=1.0):
    """Sine/cosine embedding of the positions 0..length-1, each divided by ``scale``: (length, embed_dim)."""
    positions = np.arange(0, length)[..., None]
    return get_1d_sincos_pos_embed_from_grid(embed_dim, positions / scale)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    Sine/cosine encoding of arbitrary positions.

    embed_dim: output dimension for each position (must be even)
    pos: array of positions; it is flattened to size (M,)
    out: (M, D), laid out as [sin | cos]
    """
    assert embed_dim % 2 == 0
    exponent = np.arange(embed_dim // 2, dtype=np.float64)
    exponent /= embed_dim / 2.0
    omega = 1.0 / 10000**exponent  # (D/2,)

    flat_pos = pos.reshape(-1)  # (M,)
    angles = np.outer(flat_pos, omega)  # (M, D/2)

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)


================================================
FILE: Open-Sora/opensora/models/pixart/pixart.py
================================================
# Adapted from PixArt
#
# Copyright (C) 2023  PixArt-alpha/PixArt-alpha
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# PixArt: https://github.com/PixArt-alpha/PixArt-alpha
# DiT:    https://github.com/facebookresearch/DiT/tree/main
# --------------------------------------------------------

import numpy as np
import torch
import torch.nn as nn
from einops import rearrange
from timm.models.layers import DropPath
from timm.models.vision_transformer import Mlp

# from .builder import MODELS
from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.models.layers.blocks import (
    Attention,
    CaptionEmbedder,
    MultiHeadCrossAttention,
    PatchEmbed3D,
    SeqParallelAttention,
    SeqParallelMultiHeadCrossAttention,
    SizeEmbedder,
    T2IFinalLayer,
    TimestepEmbedder,
    approx_gelu,
    get_1d_sincos_pos_embed,
    get_2d_sincos_pos_embed,
    get_layernorm,
    t2i_modulate,
)
from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint


class PixArtBlock(nn.Module):
    """
    A PixArt block with adaptive layer norm (adaLN-single) conditioning.

    The block applies, in order: modulated self-attention, cross-attention to
    the caption tokens, and a modulated MLP, each added residually.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self._enable_sequence_parallelism = enable_sequence_parallelism

        # Pick the attention implementations matching the parallelism mode.
        if enable_sequence_parallelism:
            self.attn_cls = SeqParallelAttention
            self.mha_cls = SeqParallelMultiHeadCrossAttention
        else:
            self.attn_cls = Attention
            self.mha_cls = MultiHeadCrossAttention

        self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.attn = self.attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
        )
        self.cross_attn = self.mha_cls(hidden_size, num_heads)
        self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.mlp = Mlp(
            in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        # Learned per-block offsets for the six modulation vectors.
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

    def forward(self, x, y, t, mask=None):
        """Run one block.

        Args:
            x: tokens of shape ``(B, N, C)``.
            y: conditioning tokens handed to the cross-attention layer.
            t: timestep modulation, reshaped here to ``(B, 6, -1)``.
            mask: forwarded unchanged to the cross-attention layer.

        Returns:
            Tensor with the same shape as ``x``.
        """
        B, N, C = x.shape

        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + t.reshape(B, 6, -1)
        ).chunk(6, dim=1)
        x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C))

        # MultiHeadCrossAttention.forward returns (output, attention_map) while the
        # sequence-parallel variant returns only the output; adding the raw tuple
        # to `x` raised a TypeError, so keep just the output tensor.
        cross_out = self.cross_attn(x, y, mask)
        if isinstance(cross_out, tuple):
            cross_out = cross_out[0]
        x = x + cross_out

        x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))

        return x


@MODELS.register_module()
class PixArt(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    Patches of the input are embedded, given fixed sine/cosine spatial and
    (optionally) temporal position embeddings, processed by ``depth``
    ``PixArtBlock`` layers conditioned on the timestep and the caption, and
    finally projected back and un-patchified.
    """

    def __init__(
        self,
        input_size=(1, 32, 32),
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        pred_sigma=True,
        drop_path: float = 0.0,
        no_temporal_pos_emb=False,
        caption_channels=4096,
        model_max_length=120,
        dtype=torch.float32,
        freeze=None,
        space_scale=1.0,
        time_scale=1.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
        base_size=None,
    ):
        super().__init__()
        assert enable_sequence_parallelism is False, "Sequence parallelism is not supported in this version."
        self.pred_sigma = pred_sigma
        self.in_channels = in_channels
        # predicting sigma doubles the number of output channels
        self.out_channels = in_channels * 2 if pred_sigma else in_channels
        self.hidden_size = hidden_size
        self.patch_size = patch_size
        self.input_size = input_size
        # token counts: total, along the first (temporal) axis, and per temporal step
        num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)])
        self.num_patches = num_patches
        self.num_temporal = input_size[0] // patch_size[0]
        self.num_spatial = num_patches // self.num_temporal
        if base_size is None:
            self.base_size = int(np.sqrt(self.num_spatial))
        else:
            self.base_size = base_size // patch_size[1]
        self.num_heads = num_heads
        self.dtype = dtype
        self.no_temporal_pos_emb = no_temporal_pos_emb
        self.depth = depth
        self.mlp_ratio = mlp_ratio
        self.enable_flash_attn = enable_flash_attn
        self.enable_layernorm_kernel = enable_layernorm_kernel
        self.space_scale = space_scale
        self.time_scale = time_scale

        self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size)
        self.t_embedder = TimestepEmbedder(hidden_size)
        # maps the timestep embedding to the six modulation vectors shared by all blocks
        self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
        self.y_embedder = CaptionEmbedder(
            in_channels=caption_channels,
            hidden_size=hidden_size,
            uncond_prob=class_dropout_prob,
            act_layer=approx_gelu,
            token_num=model_max_length,
        )

        # fixed (non-trainable) position embeddings
        self.register_buffer("pos_embed", self.get_spatial_pos_embed())
        self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed())

        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList(
            [
                PixArtBlock(
                    hidden_size,
                    num_heads,
                    mlp_ratio=mlp_ratio,
                    drop_path=drop_path[i],
                    enable_flash_attn=enable_flash_attn,
                    enable_layernorm_kernel=enable_layernorm_kernel,
                )
                for i in range(depth)
            ]
        )
        self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels)

        self.initialize_weights()
        if freeze is not None:
            assert freeze in ["text"]
            if freeze == "text":
                self.freeze_text()

    def forward(self, x, timestep, y, mask=None, **kwargs):
        """
        Forward pass of PixArt.
        x: tensor of spatial inputs (images or latent representations of images);
           it is un-patchified back to (N, out_channels, T, H, W) at the end
        timestep: (N,) tensor of diffusion timesteps
        y: caption features; presumably (N, 1, L, C) -- TODO confirm against caller
        mask: optional caption mask; non-zero entries mark the tokens that are kept
        Returns a float32 tensor.
        """
        # run in the dtype of the patch-embedding weights
        dtype = self.x_embedder.proj.weight.dtype
        B = x.size(0)
        x = x.to(dtype)
        timestep = timestep.to(dtype)
        y = y.to(dtype)

        # embedding
        x = self.x_embedder(x)  # (B, N, D)
        x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
        x = x + self.pos_embed
        if not self.no_temporal_pos_emb:
            # bring the temporal axis next to the channels so the temporal embedding broadcasts
            x = rearrange(x, "b t s d -> b s t d")
            x = x + self.pos_embed_temporal
            x = rearrange(x, "b s t d -> b (t s) d")
        else:
            x = rearrange(x, "b t s d -> b (t s) d")

        t = self.t_embedder(timestep, dtype=x.dtype)  # (N, D)
        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # keep only the unmasked caption tokens, packed into one flat sequence
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            # no mask: every sample keeps all of its caption tokens
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])

        # blocks
        for block in self.blocks:
            # y_lens is passed in the position of the block's `mask` argument
            x = auto_grad_checkpoint(block, x, y, t0, y_lens)

        # final process
        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x

    def unpatchify(self, x):
        """Fold per-patch outputs back into a (N, C, T, H, W) tensor."""
        c = self.out_channels
        t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        pt, ph, pw = self.patch_size

        x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
        # interleave each patch axis with its grid axis
        x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
        return imgs

    def get_spatial_pos_embed(self, grid_size=None):
        """Return the 2-D sine/cosine embedding for ``grid_size`` (defaults to ``input_size[1:]``), with a leading batch axis."""
        if grid_size is None:
            grid_size = self.input_size[1:]
        pos_embed = get_2d_sincos_pos_embed(
            self.hidden_size,
            (grid_size[0] // self.patch_size[1], grid_size[1] // self.patch_size[2]),
            scale=self.space_scale,
            base_size=self.base_size,
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def get_temporal_pos_embed(self):
        """Return the 1-D sine/cosine embedding over the temporal patches, with a leading batch axis."""
        pos_embed = get_1d_sincos_pos_embed(
            self.hidden_size,
            self.input_size[0] // self.patch_size[0],
            scale=self.time_scale,
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def freeze_text(self):
        """Disable gradients for every parameter whose name contains "cross_attn"."""
        for n, p in self.named_parameters():
            if "cross_attn" in n:
                p.requires_grad = False

    def initialize_weights(self):
        """Initialise all weights; later steps override the generic Xavier init."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)

        # Initialize caption embedding MLP:
        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)

        # Zero-out the cross-attention output projection in every PixArt block:
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.proj.weight, 0)
            nn.init.constant_(block.cross_attn.proj.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

@MODELS.register_module()
class PixArtMS(PixArt):
    """PixArt variant whose timestep embedding is augmented with size and aspect-ratio embeddings.

    Two ``SizeEmbedder`` modules (each built with ``hidden_size // 3``) embed the
    ``"hw"`` and ``"ar"`` entries of ``data_info``; their outputs are concatenated
    and added to the timestep embedding before the blocks run.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        assert self.hidden_size % 3 == 0, "hidden_size must be divisible by 3"
        self.csize_embedder = SizeEmbedder(self.hidden_size // 3)
        self.ar_embedder = SizeEmbedder(self.hidden_size // 3)

    def forward(self, x, timestep, y, mask=None, data_info=None):
        """
        Forward pass of PixArtMS.

        Args:
            x: tensor of spatial inputs (images or latent representations); its last two
                dimensions are used as the grid size for the spatial position embedding.
            timestep: tensor of diffusion timesteps.
            y: caption features passed to ``y_embedder``
                (presumably (N, 1, L, C) -- TODO confirm against the caller).
            mask: optional caption-token mask; entries equal to 0 are dropped from ``y``.
                When it has fewer rows than ``y`` it is repeated along dim 0.
            data_info: mapping providing ``"hw"`` and ``"ar"``. Required despite the
                ``None`` default, which is kept only for signature compatibility.

        Returns:
            The unpatchified model output cast to ``torch.float32``.

        Raises:
            TypeError: if ``data_info`` is ``None``.
        """
        # The original code subscripted ``data_info`` unconditionally, so omitting it failed
        # with an opaque "'NoneType' object is not subscriptable". Fail early and clearly,
        # keeping the same exception type.
        if data_info is None:
            raise TypeError("PixArtMS.forward() requires `data_info` with 'hw' and 'ar' entries, got None")

        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)

        c_size = data_info["hw"]
        ar = data_info["ar"]
        # Position embedding is rebuilt per call from the actual input resolution.
        pos_embed = self.get_spatial_pos_embed((x.shape[-2], x.shape[-1])).to(x.dtype)

        # embedding
        x = self.x_embedder(x)  # (B, N, D)
        x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
        x = x + pos_embed.to(x.device)
        if not self.no_temporal_pos_emb:
            x = rearrange(x, "b t s d -> b s t d")
            x = x + self.pos_embed_temporal
            x = rearrange(x, "b s t d -> b (t s) d")
        else:
            x = rearrange(x, "b t s d -> b (t s) d")

        t = self.t_embedder(timestep, dtype=x.dtype)  # (N, D)
        B = x.shape[0]
        csize = self.csize_embedder(c_size, B)
        ar = self.ar_embedder(ar, B)
        # Size and aspect-ratio embeddings are concatenated on dim 1 and added to t.
        t = t + torch.cat([csize, ar], dim=1)

        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # Keep only unmasked caption tokens, packed into a single sequence.
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])

        # blocks
        for block in self.blocks:
            x = block(x, y, t0, y_lens)

        # final process
        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x


@MODELS.register_module("PixArt-XL/2")
def PixArt_XL_2(from_pretrained=None, **kwargs):
    """Build a PixArt model (depth 28, width 1152, 16 heads, patch (1, 2, 2)).

    Args:
        from_pretrained: optional checkpoint reference handed to ``load_checkpoint``.
        **kwargs: extra keyword arguments forwarded to ``PixArt``.

    Returns:
        The constructed model, with the checkpoint loaded when one was given.
    """
    model = PixArt(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return model
    load_checkpoint(model, from_pretrained)
    return model


@MODELS.register_module("PixArt-1B/2")
def PixArt_1B_2(from_pretrained=None, **kwargs):
    """Build a PixArt model (depth 28, width 1872, 26 heads, patch (1, 2, 2)).

    Args:
        from_pretrained: optional checkpoint reference handed to ``load_checkpoint``.
        **kwargs: extra keyword arguments forwarded to ``PixArt``.

    Returns:
        The constructed model, with the checkpoint loaded when one was given.
    """
    model = PixArt(depth=28, hidden_size=1872, patch_size=(1, 2, 2), num_heads=26, **kwargs)
    if from_pretrained is None:
        return model
    load_checkpoint(model, from_pretrained)
    return model


@MODELS.register_module("PixArtMS-XL/2")
def PixArtMS_XL_2(from_pretrained=None, **kwargs):
    """Build a PixArtMS model (depth 28, width 1152, 16 heads, patch (1, 2, 2)).

    Args:
        from_pretrained: optional checkpoint reference handed to ``load_checkpoint``.
        **kwargs: extra keyword arguments forwarded to ``PixArtMS``.

    Returns:
        The constructed model, with the checkpoint loaded when one was given.
    """
    model = PixArtMS(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return model
    load_checkpoint(model, from_pretrained)
    return model


================================================
FILE: Open-Sora/opensora/models/stdit/__init__.py
================================================
from .stdit import STDiT
from .stdit2 import STDiT2
from .stdit3 import STDiT3


================================================
FILE: Open-Sora/opensora/models/stdit/stdit.py
================================================
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
from einops import rearrange
from timm.models.layers import DropPath
from timm.models.vision_transformer import Mlp

from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.acceleration.communications import gather_forward_split_backward, split_forward_gather_backward
from opensora.acceleration.parallel_states import get_sequence_parallel_group
from opensora.models.layers.blocks import (
    Attention,
    CaptionEmbedder,
    MultiHeadCrossAttention,
    PatchEmbed3D,
    SeqParallelAttention,
    SeqParallelMultiHeadCrossAttention,
    T2IFinalLayer,
    TimestepEmbedder,
    approx_gelu,
    get_1d_sincos_pos_embed,
    get_2d_sincos_pos_embed,
    get_layernorm,
    t2i_modulate,
)
from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint


class STDiTBlock(nn.Module):
    """Transformer block applying spatial attention, temporal attention, cross attention and an MLP.

    Tokens arrive flattened as ``[B, (T S), C]`` with ``T = d_t`` and ``S = d_s``. A learned
    ``scale_shift_table`` plus the conditioning ``t`` supplies shift/scale/gate values for the
    attention and MLP branches. When sequence parallelism is enabled, ``d_t`` is divided by
    the sequence-parallel world size.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        d_s=None,
        d_t=None,
        mlp_ratio=4.0,
        drop_path=0.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self._enable_sequence_parallelism = enable_sequence_parallelism

        # Choose the attention implementations once; both are kept as attributes.
        self.attn_cls, self.mha_cls = (
            (SeqParallelAttention, SeqParallelMultiHeadCrossAttention)
            if enable_sequence_parallelism
            else (Attention, MultiHeadCrossAttention)
        )

        norm_kwargs = dict(eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        attn_kwargs = dict(num_heads=num_heads, qkv_bias=True, enable_flash_attn=enable_flash_attn)

        # NOTE: submodules/parameters are created in a fixed order so that parameter
        # registration and random initialisation stay reproducible.
        self.norm1 = get_layernorm(hidden_size, **norm_kwargs)
        self.attn = self.attn_cls(hidden_size, **attn_kwargs)
        self.cross_attn = self.mha_cls(hidden_size, num_heads)
        self.norm2 = get_layernorm(hidden_size, **norm_kwargs)
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=int(hidden_size * mlp_ratio),
            act_layer=approx_gelu,
            drop=0,
        )
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

        # temporal attention
        self.d_s = d_s
        self.d_t = d_t

        if self._enable_sequence_parallelism:
            world = dist.get_world_size(get_sequence_parallel_group())
            # the temporal length must split evenly across sequence-parallel ranks
            assert d_t % world == 0
            self.d_t = d_t // world

        self.attn_temp = self.attn_cls(hidden_size, **attn_kwargs)

    def t_mask_select(self, x, masked_x, x_mask):
        """Pick ``x`` where ``x_mask`` is true and ``masked_x`` elsewhere, frame by frame.

        x, masked_x: [B, (T S), C]; x_mask: [B, T]. Returns a tensor shaped like ``x``.
        """
        unflatten = "B (T S) C -> B T S C"
        primary = rearrange(x, unflatten, T=self.d_t, S=self.d_s)
        fallback = rearrange(masked_x, unflatten, T=self.d_t, S=self.d_s)
        frame_mask = x_mask[:, :, None, None]
        return rearrange(torch.where(frame_mask, primary, fallback), "B T S C -> B (T S) C")

    def _modulated(self, norm, x, shift, scale, shift_zero, scale_zero, x_mask):
        """Normalise and modulate ``x``; with a mask, blend in the ``*_zero`` modulation."""
        out = t2i_modulate(norm(x), shift, scale)
        if x_mask is None:
            return out
        out_zero = t2i_modulate(norm(x), shift_zero, scale_zero)
        return self.t_mask_select(out, out_zero, x_mask)

    def _gated(self, branch_out, gate, gate_zero, x_mask):
        """Scale a branch output by ``gate``; with a mask, blend in ``gate_zero`` scaling."""
        if x_mask is None:
            return gate * branch_out
        zero_gated = gate_zero * branch_out
        gated = gate * branch_out
        return self.t_mask_select(gated, zero_gated, x_mask)

    def forward(self, x, y, t, mask=None, tpe=None, x_mask=None, t0=None):
        """Run the block.

        Args:
            x: tokens of shape [B, (T S), C].
            y: conditioning passed to cross attention.
            t: modulation input, reshaped to [B, 6, -1].
            mask: forwarded unchanged to cross attention.
            tpe: optional tensor added to the temporal-branch input.
            x_mask: optional [B, T] boolean mask; frames where it is false use the
                modulation derived from ``t0``.
            t0: modulation input used together with ``x_mask``.

        Returns:
            Tensor with the same layout as ``x``.
        """
        B, _, _ = x.shape
        T, S = self.d_t, self.d_s

        def split_modulation(cond):
            return (self.scale_shift_table[None] + cond.reshape(B, 6, -1)).chunk(6, dim=1)

        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = split_modulation(t)
        shift_msa_zero = scale_msa_zero = gate_msa_zero = None
        shift_mlp_zero = scale_mlp_zero = gate_mlp_zero = None

        x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
        if x_mask is not None:
            (
                shift_msa_zero,
                scale_msa_zero,
                gate_msa_zero,
                shift_mlp_zero,
                scale_mlp_zero,
                gate_mlp_zero,
            ) = split_modulation(t0)
            x_m_zero = t2i_modulate(self.norm1(x), shift_msa_zero, scale_msa_zero)
            x_m = self.t_mask_select(x_m, x_m_zero, x_mask)

        # spatial branch: attention over S within each frame
        spatial = rearrange(x_m, "B (T S) C -> (B T) S C", T=T, S=S)
        spatial = self.attn(spatial)
        spatial = rearrange(spatial, "(B T) S C -> B (T S) C", T=T, S=S)
        x = x + self.drop_path(self._gated(spatial, gate_msa, gate_msa_zero, x_mask))

        # temporal branch: attention over T at each spatial location
        temporal = rearrange(x, "B (T S) C -> (B S) T C", T=T, S=S)
        if tpe is not None:
            temporal = temporal + tpe
        temporal = self.attn_temp(temporal)
        temporal = rearrange(temporal, "(B S) T C -> B (T S) C", T=T, S=S)
        # the temporal branch reuses gate_msa and is not mask-blended
        x = x + self.drop_path(gate_msa * temporal)

        # cross attn
        x = x + self.cross_attn(x, y, mask)

        # mlp
        mlp_in = self._modulated(self.norm2, x, shift_mlp, scale_mlp, shift_mlp_zero, scale_mlp_zero, x_mask)
        mlp_out = self._gated(self.mlp(mlp_in), gate_mlp, gate_mlp_zero, x_mask)
        return x + self.drop_path(mlp_out)


@MODELS.register_module()
class STDiT(nn.Module):
    """Diffusion transformer over video latents built from a stack of ``STDiTBlock`` layers.

    The input is patchified, given a spatial position embedding, processed by the blocks
    under timestep and caption conditioning, projected by the final layer and unpatchified.
    """

    def __init__(
        self,
        input_size=(1, 32, 32),
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        pred_sigma=True,
        drop_path=0.0,
        no_temporal_pos_emb=False,
        caption_channels=4096,
        model_max_length=120,
        dtype=torch.float32,
        space_scale=1.0,
        time_scale=1.0,
        freeze=None,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        super().__init__()
        self.pred_sigma = pred_sigma
        self.in_channels = in_channels
        # when sigma is predicted too, the output carries twice the input channels
        if pred_sigma:
            self.out_channels = in_channels * 2
        else:
            self.out_channels = in_channels
        self.hidden_size = hidden_size
        self.patch_size = patch_size
        self.input_size = input_size
        patch_count = np.prod([input_size[i] // patch_size[i] for i in range(3)])
        self.num_patches = patch_count
        self.num_temporal = input_size[0] // patch_size[0]
        self.num_spatial = patch_count // self.num_temporal
        self.num_heads = num_heads
        self.dtype = dtype
        self.no_temporal_pos_emb = no_temporal_pos_emb
        self.depth = depth
        self.mlp_ratio = mlp_ratio
        self.enable_flash_attn = enable_flash_attn
        self.enable_layernorm_kernel = enable_layernorm_kernel
        self.space_scale = space_scale
        self.time_scale = time_scale

        # fixed (non-trainable) position embeddings
        self.register_buffer("pos_embed", self.get_spatial_pos_embed())
        self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed())

        self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
        self.y_embedder = CaptionEmbedder(
            in_channels=caption_channels,
            hidden_size=hidden_size,
            uncond_prob=class_dropout_prob,
            act_layer=approx_gelu,
            token_num=model_max_length,
        )

        # drop-path rate grows linearly from 0 to `drop_path` across the blocks
        drop_rates = [rate.item() for rate in torch.linspace(0, drop_path, depth)]
        self.blocks = nn.ModuleList(
            [
                STDiTBlock(
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=self.mlp_ratio,
                    drop_path=drop_rates[layer_idx],
                    enable_flash_attn=self.enable_flash_attn,
                    enable_layernorm_kernel=self.enable_layernorm_kernel,
                    enable_sequence_parallelism=enable_sequence_parallelism,
                    d_t=self.num_temporal,
                    d_s=self.num_spatial,
                )
                for layer_idx in range(self.depth)
            ]
        )
        self.final_layer = T2IFinalLayer(
            hidden_size,
            np.prod(self.patch_size),
            self.out_channels,
            d_t=self.num_temporal,
            d_s=self.num_spatial,
        )

        # init model
        self.initialize_weights()
        self.initialize_temporal()
        if freeze is not None:
            assert freeze in ["not_temporal", "text"]
            if freeze == "text":
                self.freeze_text()
            elif freeze == "not_temporal":
                self.freeze_not_temporal()

        # sequence parallel related configs
        self.enable_sequence_parallelism = enable_sequence_parallelism
        self.sp_rank = dist.get_rank(get_sequence_parallel_group()) if enable_sequence_parallelism else None

    def _first_block_tpe(self):
        """Return the temporal position embedding handed to the first block.

        With sequence parallelism the embedding is chunked along dim 1 and only the
        slice for this rank is returned.
        """
        if not self.enable_sequence_parallelism:
            return self.pos_embed_temporal
        world = dist.get_world_size(get_sequence_parallel_group())
        return torch.chunk(self.pos_embed_temporal, world, dim=1)[self.sp_rank].contiguous()

    def forward(self, x, timestep, y, mask=None, x_mask=None, **kwargs):
        """
        Forward pass of STDiT.

        Args:
            x (torch.Tensor): latent representation of video; of shape [B, C, T, H, W]
            timestep (torch.Tensor): diffusion time steps; of shape [B]
            y (torch.Tensor): representation of prompts; of shape [B, 1, N_token, C]
            mask (torch.Tensor): mask for selecting prompt tokens; of shape [B, N_token]
            x_mask (torch.Tensor): optional mask handed to every block and to the final
                layer; when given, a second conditioning built from timestep 0 is
                computed and passed along with it.
            **kwargs: accepted and ignored.

        Returns:
            x (torch.Tensor): output latent representation; of shape [B, C, T, H, W]
        """
        # run in the dtype of the patch-embedding weights
        param_dtype = self.x_embedder.proj.weight.dtype
        x = x.to(param_dtype)
        timestep = timestep.to(param_dtype)
        y = y.to(param_dtype)

        # embedding
        x = self.x_embedder(x)  # [B, N, C]
        x = rearrange(x, "B (T S) C -> B T S C", T=self.num_temporal, S=self.num_spatial)
        x = rearrange(x + self.pos_embed, "B T S C -> B (T S) C")

        # shard over the sequence dim if sp is enabled
        if self.enable_sequence_parallelism:
            x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="down")

        # timestep conditioning (plus the timestep-0 variant when x_mask is provided)
        t = self.t_embedder(timestep, dtype=x.dtype)  # [B, C]
        t_mlp = self.t_block(t)  # [B, C]
        t0 = None
        t0_mlp = None
        if x_mask is not None:
            t0 = self.t_embedder(torch.zeros_like(timestep), dtype=x.dtype)
            t0_mlp = self.t_block(t0)
        y = self.y_embedder(y, self.training)  # [B, 1, N_token, C]

        # pack caption tokens into a single sequence and record per-sample lengths
        if mask is None:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])
        else:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()

        # blocks: only the first one receives the temporal position embedding
        for layer_idx, block in enumerate(self.blocks):
            tpe = self._first_block_tpe() if layer_idx == 0 else None
            x = auto_grad_checkpoint(block, x, y, t_mlp, y_lens, tpe, x_mask, t0_mlp)

        if self.enable_sequence_parallelism:
            x = gather_forward_split_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="up")
        # x.shape: [B, N, C]

        # final process
        x = self.final_layer(x, t, x_mask, t0)  # [B, N, C=T_p * H_p * W_p * C_out]
        x = self.unpatchify(x)  # [B, C_out, T, H, W]

        # cast to float32 for better accuracy
        return x.to(torch.float32)

    def unpatchify(self, x):
        """
        Args:
            x (torch.Tensor): of shape [B, N, C]

        Return:
            x (torch.Tensor): of shape [B, C_out, T, H, W]
        """
        grid_dims = {
            axis: self.input_size[i] // self.patch_size[i] for i, axis in enumerate(("N_t", "N_h", "N_w"))
        }
        T_p, H_p, W_p = self.patch_size
        return rearrange(
            x,
            "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)",
            T_p=T_p,
            H_p=H_p,
            W_p=W_p,
            C_out=self.out_channels,
            **grid_dims,
        )

    def unpatchify_old(self, x):
        """Earlier reshape-based unpatchify: [B, N, C] -> [B, C_out, T, H, W]."""
        channels = self.out_channels
        n_t, n_h, n_w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        p_t, p_h, p_w = self.patch_size

        patches = x.reshape(shape=(x.shape[0], n_t, n_h, n_w, p_t, p_h, p_w, channels))
        patches = rearrange(patches, "n t h w r p q c -> n c t r h p w q")
        return patches.reshape(shape=(patches.shape[0], channels, n_t * p_t, n_h * p_h, n_w * p_w))

    def get_spatial_pos_embed(self, grid_size=None):
        """Build the 2D sin-cos spatial position embedding.

        Args:
            grid_size: (height, width) before patching; defaults to ``input_size[1:]``.

        Returns:
            torch.Tensor: float tensor with a leading singleton dimension, gradients off.
        """
        if grid_size is None:
            grid_size = self.input_size[1:]
        rows = grid_size[0] // self.patch_size[1]
        cols = grid_size[1] // self.patch_size[2]
        table = get_2d_sincos_pos_embed(self.hidden_size, (rows, cols), scale=self.space_scale)
        return torch.from_numpy(table).float().unsqueeze(0).requires_grad_(False)

    def get_temporal_pos_embed(self):
        """Build the 1D sin-cos temporal position embedding (leading singleton dim, gradients off)."""
        num_steps = self.input_size[0] // self.patch_size[0]
        table = get_1d_sincos_pos_embed(self.hidden_size, num_steps, scale=self.time_scale)
        return torch.from_numpy(table).float().unsqueeze(0).requires_grad_(False)

    def freeze_not_temporal(self):
        """Turn off gradients for every parameter whose name lacks ``attn_temp``."""
        for name, param in self.named_parameters():
            if "attn_temp" in name:
                continue
            param.requires_grad = False

    def freeze_text(self):
        """Turn off gradients for every parameter whose name contains ``cross_attn``."""
        for name, param in self.named_parameters():
            if "cross_attn" not in name:
                continue
            param.requires_grad = False

    def initialize_temporal(self):
        """Zero the output projection of every block's temporal attention."""
        for block in self.blocks:
            temporal_proj = block.attn_temp.proj
            nn.init.constant_(temporal_proj.weight, 0)
            nn.init.constant_(temporal_proj.bias, 0)

    def initialize_weights(self):
        """Apply the weight initialisation scheme in place.

        Order: Xavier-uniform on all ``nn.Linear`` layers (zero bias), Xavier-uniform on the
        flattened patch-embedding kernel, normal(std=0.02) on timestep/caption MLP weights,
        then zeros for each block's cross-attention output projection and the final layer.
        """

        def _init_linear(layer):
            if not isinstance(layer, nn.Linear):
                return
            torch.nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)

        self.apply(_init_linear)

        # Treat the patch-embedding kernel as a 2D (out_channels, -1) matrix, like nn.Linear.
        patch_kernel = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(patch_kernel.view([patch_kernel.shape[0], -1]))

        # Timestep-embedding MLP, modulation head, then caption-embedding MLP.
        # The call order is kept fixed so seeded runs draw the same values.
        normal_init_layers = (
            self.t_embedder.mlp[0],
            self.t_embedder.mlp[2],
            self.t_block[1],
            self.y_embedder.y_proj.fc1,
            self.y_embedder.y_proj.fc2,
        )
        for layer in normal_init_layers:
            nn.init.normal_(layer.weight, std=0.02)

        # Zero the cross-attention output projection of every block.
        for block in self.blocks:
            out_proj = block.cross_attn.proj
            nn.init.constant_(out_proj.weight, 0)
            nn.init.constant_(out_proj.bias, 0)

        # Zero the output layer.
        head = self.final_layer.linear
        nn.init.constant_(head.weight, 0)
        nn.init.constant_(head.bias, 0)


@MODELS.register_module("STDiT-XL/2")
def STDiT_XL_2(from_pretrained=None, **kwargs):
    """Build an STDiT model (depth 28, width 1152, 16 heads, patch (1, 2, 2)).

    Args:
        from_pretrained: optional checkpoint reference handed to ``load_checkpoint``.
        **kwargs: extra keyword arguments forwarded to ``STDiT``.

    Returns:
        The constructed model, with the checkpoint loaded when one was given.
    """
    model = STDiT(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    if from_pretrained is None:
        return model
    load_checkpoint(model, from_pretrained)
    return model


================================================
FILE: Open-Sora/opensora/models/stdit/stdit2.py
================================================
import os

import numpy as np
import torch
import torch.nn as nn
from einops import rearrange
from rotary_embedding_torch import RotaryEmbedding
from timm.models.layers import DropPath
from timm.models.vision_transformer import Mlp
from transformers import PretrainedConfig, PreTrainedModel

from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.models.layers.blocks import (
    Attention,
    CaptionEmbedder,
    MultiHeadCrossAttention,
    PatchEmbed3D,
    PositionEmbedding2D,
    SizeEmbedder,
    T2IFinalLayer,
    TimestepEmbedder,
    approx_gelu,
    get_2d_sincos_pos_embed,
    get_layernorm,
    t2i_modulate,
)
from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint


class STDiT2Block(nn.Module):
    """Transformer block with separately modulated spatial, temporal and MLP branches.

    Tokens are flattened as ``[B, (T S), C]``; ``T`` and ``S`` are supplied per call.
    ``scale_shift_table`` (6 rows) modulates the spatial-attention and MLP branches,
    ``scale_shift_table_temporal`` (3 rows) modulates the temporal-attention branch.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
        rope=None,
        qk_norm=False,
        qk_norm_legacy=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self._enable_sequence_parallelism = enable_sequence_parallelism

        norm_kwargs = dict(eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        attn_kwargs = dict(
            num_heads=num_heads,
            qkv_bias=True,
            enable_flash_attn=enable_flash_attn,
            qk_norm=qk_norm,
            qk_norm_legacy=qk_norm_legacy,
        )

        # NOTE: submodules/parameters are created in a fixed order so that parameter
        # registration and random initialisation stay reproducible.

        # spatial branch
        self.norm1 = get_layernorm(hidden_size, **norm_kwargs)
        self.attn = Attention(hidden_size, **attn_kwargs)
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

        # cross attn
        self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads)

        # mlp branch
        self.norm2 = get_layernorm(hidden_size, **norm_kwargs)
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=int(hidden_size * mlp_ratio),
            act_layer=approx_gelu,
            drop=0,
        )
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        # temporal branch (only this attention receives the rotary embedding)
        self.norm_temp = get_layernorm(hidden_size, **norm_kwargs)
        self.attn_temp = Attention(hidden_size, rope=rope, **attn_kwargs)
        self.scale_shift_table_temporal = nn.Parameter(torch.randn(3, hidden_size) / hidden_size**0.5)

    def t_mask_select(self, x_mask, x, masked_x, T, S):
        """Pick ``x`` where ``x_mask`` is true and ``masked_x`` elsewhere, frame by frame.

        x, masked_x: [B, (T S), C]; x_mask: [B, T]. Returns a tensor shaped like ``x``.
        """
        unflatten = "B (T S) C -> B T S C"
        primary = rearrange(x, unflatten, T=T, S=S)
        fallback = rearrange(masked_x, unflatten, T=T, S=S)
        frame_mask = x_mask[:, :, None, None]
        return rearrange(torch.where(frame_mask, primary, fallback), "B T S C -> B (T S) C")

    def _modulated(self, norm, x, mods, mods_zero, x_mask, T, S):
        """Normalise and modulate ``x`` with (shift, scale); with a mask, blend in ``mods_zero``."""
        shift, scale = mods
        out = t2i_modulate(norm(x), shift, scale)
        if x_mask is None:
            return out
        shift_zero, scale_zero = mods_zero
        out_zero = t2i_modulate(norm(x), shift_zero, scale_zero)
        return self.t_mask_select(x_mask, out, out_zero, T, S)

    def _gated(self, branch_out, gate, gate_zero, x_mask, T, S):
        """Scale a branch output by ``gate``; with a mask, blend in ``gate_zero`` scaling."""
        if x_mask is None:
            return gate * branch_out
        zero_gated = gate_zero * branch_out
        gated = gate * branch_out
        return self.t_mask_select(x_mask, gated, zero_gated, T, S)

    def forward(self, x, y, t, t_tmp, mask=None, x_mask=None, t0=None, t0_tmp=None, T=None, S=None):
        """Run the block.

        Args:
            x: tokens of shape [B, (T S), C].
            y: conditioning passed to cross attention.
            t: modulation input for the spatial/MLP branches, reshaped to [B, 6, -1].
            t_tmp: modulation input for the temporal branch, reshaped to [B, 3, -1].
            mask: forwarded unchanged to cross attention.
            x_mask: optional [B, T] boolean mask; frames where it is false use the
                modulation derived from ``t0`` / ``t0_tmp``.
            t0, t0_tmp: modulation inputs used together with ``x_mask``.
            T, S: temporal and spatial token counts used to unflatten ``x``.

        Returns:
            Tensor with the same layout as ``x``.
        """
        B, _, _ = x.shape

        def split(table, cond, parts):
            return (table[None] + cond.reshape(B, parts, -1)).chunk(parts, dim=1)

        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = split(self.scale_shift_table, t, 6)
        shift_tmp, scale_tmp, gate_tmp = split(self.scale_shift_table_temporal, t_tmp, 3)

        msa_zero = tmp_zero = mlp_zero = (None, None)
        gate_msa_zero = gate_tmp_zero = gate_mlp_zero = None
        if x_mask is not None:
            shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = split(
                self.scale_shift_table, t0, 6
            )
            shift_tmp_zero, scale_tmp_zero, gate_tmp_zero = split(self.scale_shift_table_temporal, t0_tmp, 3)
            msa_zero = (shift_msa_zero, scale_msa_zero)
            tmp_zero = (shift_tmp_zero, scale_tmp_zero)
            mlp_zero = (shift_mlp_zero, scale_mlp_zero)

        # spatial branch: attention over S within each frame
        spatial_in = self._modulated(self.norm1, x, (shift_msa, scale_msa), msa_zero, x_mask, T, S)
        spatial = rearrange(spatial_in, "B (T S) C -> (B T) S C", T=T, S=S)
        spatial = self.attn(spatial)
        spatial = rearrange(spatial, "(B T) S C -> B (T S) C", T=T, S=S)
        x = x + self.drop_path(self._gated(spatial, gate_msa, gate_msa_zero, x_mask, T, S))

        # temporal branch: attention over T at each spatial location
        temporal_in = self._modulated(self.norm_temp, x, (shift_tmp, scale_tmp), tmp_zero, x_mask, T, S)
        temporal = rearrange(temporal_in, "B (T S) C -> (B S) T C", T=T, S=S)
        temporal = self.attn_temp(temporal)
        temporal = rearrange(temporal, "(B S) T C -> B (T S) C", T=T, S=S)
        x = x + self.drop_path(self._gated(temporal, gate_tmp, gate_tmp_zero, x_mask, T, S))

        # cross attn
        x = x + self.cross_attn(x, y, mask)

        # mlp
        mlp_in = self._modulated(self.norm2, x, (shift_mlp, scale_mlp), mlp_zero, x_mask, T, S)
        mlp_out = self._gated(self.mlp(mlp_in), gate_mlp, gate_mlp_zero, x_mask, T, S)
        return x + self.drop_path(mlp_out)


class STDiT2Config(PretrainedConfig):
    """Configuration object for STDiT2.

    Every named constructor argument is stored as an attribute of the same name;
    any remaining keyword arguments are forwarded to ``PretrainedConfig``.
    """

    model_type = "STDiT2"

    def __init__(
        self,
        input_size=(None, None, None),
        input_sq_size=32,
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        pred_sigma=True,
        drop_path=0.0,
        no_temporal_pos_emb=False,
        caption_channels=4096,
        model_max_length=120,
        freeze=None,
        qk_norm=False,
        qk_norm_legacy=False,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        **kwargs,
    ):
        # Collected in declaration order so attributes are assigned in the same sequence.
        own_settings = {
            "input_size": input_size,
            "input_sq_size": input_sq_size,
            "in_channels": in_channels,
            "patch_size": patch_size,
            "hidden_size": hidden_size,
            "depth": depth,
            "num_heads": num_heads,
            "mlp_ratio": mlp_ratio,
            "class_dropout_prob": class_dropout_prob,
            "pred_sigma": pred_sigma,
            "drop_path": drop_path,
            "no_temporal_pos_emb": no_temporal_pos_emb,
            "caption_channels": caption_channels,
            "model_max_length": model_max_length,
            "freeze": freeze,
            "qk_norm": qk_norm,
            "qk_norm_legacy": qk_norm_legacy,
            "enable_flash_attn": enable_flash_attn,
            "enable_layernorm_kernel": enable_layernorm_kernel,
        }
        for attr_name, attr_value in own_settings.items():
            setattr(self, attr_name, attr_value)
        # The base-class initialiser runs last, after the model-specific fields are set.
        super().__init__(**kwargs)


@MODELS.register_module()
class STDiT2(PreTrainedModel):
    config_class = STDiT2Config

    def __init__(self, config):
        """Build the STDiT2 network from an ``STDiT2Config``.

        Creates the patch, timestep, caption and size embedders, ``config.depth``
        ``STDiT2Block`` layers that all receive the same rotary-embedding
        callable, and the final projection layer. Afterwards the weights are
        initialized and, if ``config.freeze`` is set, the matching freeze
        policy (``"not_temporal"`` or ``"text"``) is applied.

        Args:
            config (STDiT2Config): model hyper-parameters.
        """
        super().__init__(config)
        self.pred_sigma = config.pred_sigma
        self.in_channels = config.in_channels
        # twice the input channel count when pred_sigma is set, otherwise the same count
        self.out_channels = config.in_channels * 2 if config.pred_sigma else config.in_channels
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_heads
        self.no_temporal_pos_emb = config.no_temporal_pos_emb
        self.depth = config.depth
        self.mlp_ratio = config.mlp_ratio
        self.enable_flash_attn = config.enable_flash_attn
        self.enable_layernorm_kernel = config.enable_layernorm_kernel

        # support dynamic input
        self.patch_size = config.patch_size
        self.input_size = config.input_size
        self.input_sq_size = config.input_sq_size
        self.pos_embed = PositionEmbedding2D(config.hidden_size)

        self.x_embedder = PatchEmbed3D(config.patch_size, config.in_channels, config.hidden_size)
        self.t_embedder = TimestepEmbedder(config.hidden_size)
        # maps the conditioning vector to 6 * hidden_size values for the blocks
        self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(config.hidden_size, 6 * config.hidden_size, bias=True))
        # second conditioning head producing 3 * hidden_size values
        self.t_block_temp = nn.Sequential(
            nn.SiLU(), nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=True)
        )  # new
        self.y_embedder = CaptionEmbedder(
            in_channels=config.caption_channels,
            hidden_size=config.hidden_size,
            uncond_prob=config.class_dropout_prob,
            act_layer=approx_gelu,
            token_num=config.model_max_length,
        )

        # per-block drop-path rate, rising linearly from 0 to config.drop_path
        drop_path = [x.item() for x in torch.linspace(0, config.drop_path, config.depth)]
        # a single rotary embedding whose rotate function is handed to every block
        self.rope = RotaryEmbedding(dim=self.hidden_size // self.num_heads)  # new
        self.blocks = nn.ModuleList(
            [
                STDiT2Block(
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=self.mlp_ratio,
                    drop_path=drop_path[i],
                    enable_flash_attn=self.enable_flash_attn,
                    enable_layernorm_kernel=self.enable_layernorm_kernel,
                    rope=self.rope.rotate_queries_or_keys,
                    qk_norm=config.qk_norm,
                    qk_norm_legacy=config.qk_norm_legacy,
                )
                for i in range(self.depth)
            ]
        )
        self.final_layer = T2IFinalLayer(config.hidden_size, np.prod(self.patch_size), self.out_channels)

        # multi_res
        # NOTE(review): the size and aspect-ratio embedders are each built with
        # hidden_size // 3; presumably their concatenation in forward() is meant
        # to add up to hidden_size - confirm against SizeEmbedder.
        assert self.hidden_size % 3 == 0, "hidden_size must be divisible by 3"
        self.csize_embedder = SizeEmbedder(self.hidden_size // 3)
        self.ar_embedder = SizeEmbedder(self.hidden_size // 3)
        self.fl_embedder = SizeEmbedder(self.hidden_size)  # new
        self.fps_embedder = SizeEmbedder(self.hidden_size)  # new

        # init model
        self.initialize_weights()
        self.initialize_temporal()
        # optional parameter freezing; only the two listed policies are accepted
        if config.freeze is not None:
            assert config.freeze in ["not_temporal", "text"]
            if config.freeze == "not_temporal":
                self.freeze_not_temporal()
            elif config.freeze == "text":
                self.freeze_text()

    def get_dynamic_size(self, x):
        """Return the patch-grid size ``(T, H, W)`` for a 5-D input ``x``.

        Each of the last three extents of ``x`` is rounded up to the next
        multiple of the matching ``self.patch_size`` entry and then divided by
        it, i.e. the number of patches needed to cover that axis.
        """
        _, _, frames, height, width = x.size()
        grid = []
        for axis, extent in enumerate((frames, height, width)):
            patch = self.patch_size[axis]
            leftover = extent % patch
            if leftover != 0:
                # pad up to a whole number of patches
                extent += patch - leftover
            grid.append(extent // patch)
        return tuple(grid)

    def forward(
        self, x, timestep, y, mask=None, x_mask=None, num_frames=None, height=None, width=None, ar=None, fps=None
    ):
        """
        Forward pass of STDiT.
        Args:
            x (torch.Tensor): latent representation of video; of shape [B, C, T, H, W]
            timestep (torch.Tensor): diffusion time steps; of shape [B]
            y (torch.Tensor): representation of prompts; of shape [B, 1, N_token, C]
            mask (torch.Tensor): mask for selecting prompt tokens; of shape [B, N_token]
            x_mask (torch.Tensor, optional): when given, a second set of modulation
                parameters is computed for timestep 0 and passed to the blocks and the
                final layer together with this mask (presumably a per-frame mask - confirm
                against STDiT2Block).
            num_frames, height, width, ar, fps (torch.Tensor): per-sample conditioning
                values. They default to None but are used unconditionally, so callers
                must supply them.

        Returns:
            x (torch.Tensor): output latent representation; of shape [B, C, T, H, W]
        """
        B = x.shape[0]
        # run everything in the dtype of the patch-embedding weights
        dtype = self.x_embedder.proj.weight.dtype
        x = x.to(dtype)
        timestep = timestep.to(dtype)
        y = y.to(dtype)

        # === process data info ===
        # 1. get dynamic size
        hw = torch.cat([height[:, None], width[:, None]], dim=1)
        # resolution is taken from the first sample only and reused for the whole batch
        rs = (height[0].item() * width[0].item()) ** 0.5
        csize = self.csize_embedder(hw, B)

        # 2. get aspect ratio
        ar = ar.unsqueeze(1)
        ar = self.ar_embedder(ar, B)
        data_info = torch.cat([csize, ar], dim=1)

        # 3. get number of frames
        fl = num_frames.unsqueeze(1)
        fps = fps.unsqueeze(1)
        fl = self.fl_embedder(fl, B)
        fl = fl + self.fps_embedder(fps, B)

        # === get dynamic shape size ===
        # Tx/Hx/Wx are the original extents, used at the end to crop the padding
        _, _, Tx, Hx, Wx = x.size()
        T, H, W = self.get_dynamic_size(x)
        S = H * W
        scale = rs / self.input_sq_size
        base_size = round(S**0.5)
        pos_emb = self.pos_embed(x, H, W, scale=scale, base_size=base_size)

        # embedding
        x = self.x_embedder(x)  # [B, N, C]
        x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
        x = x + pos_emb
        x = rearrange(x, "B T S C -> B (T S) C")

        # prepare adaIN
        t = self.t_embedder(timestep, dtype=x.dtype)  # [B, C]
        t_spc = t + data_info  # [B, C]
        t_tmp = t + fl  # [B, C]
        t_spc_mlp = self.t_block(t_spc)  # [B, 6*C]
        t_tmp_mlp = self.t_block_temp(t_tmp)  # [B, 3*C]
        if x_mask is not None:
            # same conditioning, recomputed for timestep 0
            t0_timestep = torch.zeros_like(timestep)
            t0 = self.t_embedder(t0_timestep, dtype=x.dtype)
            t0_spc = t0 + data_info
            t0_tmp = t0 + fl
            t0_spc_mlp = self.t_block(t0_spc)
            t0_tmp_mlp = self.t_block_temp(t0_tmp)
        else:
            t0_spc = None
            t0_tmp = None
            t0_spc_mlp = None
            t0_tmp_mlp = None

        # prepare y
        y = self.y_embedder(y, self.training)  # [B, 1, N_token, C]

        if mask is not None:
            # tile the mask along the batch dimension when it has fewer rows than y
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # keep only the selected prompt tokens, packed into one sequence;
            # y_lens records how many tokens each sample contributed
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])

        # blocks
        for _, block in enumerate(self.blocks):
            x = auto_grad_checkpoint(
                block,
                x,
                y,
                t_spc_mlp,
                t_tmp_mlp,
                y_lens,
                x_mask,
                t0_spc_mlp,
                t0_tmp_mlp,
                T,
                S,
            )
            # x.shape: [B, N, C]

        # final process
        x = self.final_layer(x, t, x_mask, t0_spc, T, S)  # [B, N, C=T_p * H_p * W_p * C_out]
        x = self.unpatchify(x, T, H, W, Tx, Hx, Wx)  # [B, C_out, T, H, W]

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x

    def unpatchify(self, x, N_t, N_h, N_w, R_t, R_h, R_w):
        """
        Fold patch tokens back into a video-shaped tensor, then crop it.

        Args:
            x (torch.Tensor): of shape [B, N, C]
            N_t, N_h, N_w (int): number of patches along time / height / width
            R_t, R_h, R_w (int): extents kept along time / height / width

        Return:
            x (torch.Tensor): of shape [B, C_out, T, H, W]
        """
        T_p, H_p, W_p = self.patch_size
        axis_sizes = dict(
            N_t=N_t,
            N_h=N_h,
            N_w=N_w,
            T_p=T_p,
            H_p=H_p,
            W_p=W_p,
            C_out=self.out_channels,
        )
        video = rearrange(
            x,
            "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)",
            **axis_sizes,
        )
        # unpad: drop whatever lies beyond the requested extents
        return video[:, :, :R_t, :R_h, :R_w]

    def unpatchify_old(self, x):
        """Un-patchify ``x`` using the fixed grid derived from ``self.input_size``.

        The patch grid is ``self.input_size[i] // self.patch_size[i]`` per axis,
        so this variant needs a fully specified ``input_size``.
        """
        channels = self.out_channels
        grid_t, grid_h, grid_w = [self.input_size[axis] // self.patch_size[axis] for axis in range(3)]
        patch_t, patch_h, patch_w = self.patch_size

        batch = x.shape[0]
        tokens = x.reshape(shape=(batch, grid_t, grid_h, grid_w, patch_t, patch_h, patch_w, channels))
        # interleave every grid axis with its patch axis
        tokens = rearrange(tokens, "n t h w r p q c -> n c t r h p w q")
        return tokens.reshape(shape=(batch, channels, grid_t * patch_t, grid_h * patch_h, grid_w * patch_w))

    def get_spatial_pos_embed(self, H, W, scale=1.0, base_size=None):
        """Return the 2-D sin-cos positional embedding for an ``H`` x ``W`` grid.

        The array from ``get_2d_sincos_pos_embed`` is converted to a float
        tensor with a leading dimension of size 1 and gradients disabled.
        """
        grid_embed = get_2d_sincos_pos_embed(self.hidden_size, (H, W), scale=scale, base_size=base_size)
        embed = torch.from_numpy(grid_embed).float()
        embed = embed.unsqueeze(0)
        return embed.requires_grad_(False)

    def freeze_not_temporal(self):
        """Disable gradients for every parameter whose name lacks ``"attn_temp"``."""
        for name, param in self.named_parameters():
            if "attn_temp" in name:
                continue
            param.requires_grad = False

    def freeze_text(self):
        """Disable gradients for every parameter whose name contains ``"cross_attn"``."""
        for name, param in self.named_parameters():
            if "cross_attn" not in name:
                continue
            param.requires_grad = False

    def initialize_temporal(self):
        """Zero the weight and bias of each block's ``attn_temp.proj`` layer."""
        for blk in self.blocks:
            out_proj = blk.attn_temp.proj
            nn.init.constant_(out_proj.weight, 0)
            nn.init.constant_(out_proj.bias, 0)

    def initialize_weights(self):
        """Initialize all weights of the model.

        Every ``nn.Linear`` gets Xavier-uniform weights and a zero bias, the
        patch-embedding kernel is Xavier-initialized as a flattened matrix,
        the timestep and caption MLP weights are redrawn from N(0, 0.02), and
        the cross-attention output projections and final linear layer are
        zeroed. The random draws happen in the order written here.
        """

        # Initialize transformer layers:
        def _xavier_linear(layer):
            if not isinstance(layer, nn.Linear):
                return
            torch.nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)

        self.apply(_xavier_linear)

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        patch_kernel = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(patch_kernel.view([patch_kernel.shape[0], -1]))

        # Timestep embedding MLP and modulation heads, then caption embedding MLP:
        gaussian_weights = (
            self.t_embedder.mlp[0].weight,
            self.t_embedder.mlp[2].weight,
            self.t_block[1].weight,
            self.t_block_temp[1].weight,
            self.y_embedder.y_proj.fc1.weight,
            self.y_embedder.y_proj.fc2.weight,
        )
        for weight in gaussian_weights:
            nn.init.normal_(weight, std=0.02)

        # Zero-out the cross-attention output projection of every block:
        for blk in self.blocks:
            cross_proj = blk.cross_attn.proj
            nn.init.constant_(cross_proj.weight, 0)
            nn.init.constant_(cross_proj.bias, 0)

        # Zero-out output layers:
        head = self.final_layer.linear
        nn.init.constant_(head.weight, 0)
        nn.init.constant_(head.bias, 0)


@MODELS.register_module("STDiT2-XL/2")
def STDiT2_XL_2(from_pretrained=None, **kwargs):
    """Create the STDiT2-XL/2 model, optionally loading weights.

    Args:
        from_pretrained: ``None`` for a freshly initialized model, a local file
            or directory to load a checkpoint from, or any other value, which is
            passed to ``STDiT2.from_pretrained`` (hub loading).
        **kwargs: extra ``STDiT2Config`` fields; they are not used on the
            hub-loading path.
    """
    if from_pretrained is None:
        checkpoint_is_local = False
    else:
        checkpoint_is_local = os.path.isdir(from_pretrained) or os.path.isfile(from_pretrained)
        if not checkpoint_is_local:
            # neither a directory nor a file: load the model from hugging face hub
            return STDiT2.from_pretrained(from_pretrained)

    config = STDiT2Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    model = STDiT2(config)
    if checkpoint_is_local:
        # a directory or a file: load the checkpoint manually
        load_checkpoint(model, from_pretrained)
    return model


================================================
FILE: Open-Sora/opensora/models/stdit/stdit3.py
================================================
import os

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from rotary_embedding_torch import RotaryEmbedding
from timm.models.layers import DropPath
from timm.models.vision_transformer import Mlp
from transformers import PretrainedConfig, PreTrainedModel

from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.acceleration.communications import gather_forward_split_backward, split_forward_gather_backward
from opensora.acceleration.parallel_states import get_sequence_parallel_group
from opensora.models.layers.blocks import (
    Attention,
    CaptionEmbedder,
    MultiHeadCrossAttention,
    PatchEmbed3D,
    PositionEmbedding2D,
    SeqParallelAttention,
    SeqParallelMultiHeadCrossAttention,
    SizeEmbedder,
    T2IFinalLayer,
    TimestepEmbedder,
    approx_gelu,
    get_layernorm,
    t2i_modulate,
)
from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint

from ...models.cache_functions import global_force_fresh, cache_cutfresh, update_cache, force_init, score_evaluate

class STDiT3Block(nn.Module):
    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        drop_path=0.0,
        rope=None,
        qk_norm=False,
        temporal=False,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
    ):
        """Build one STDiT3 block: self-attention, cross-attention and an MLP.

        Args:
            hidden_size: channel width of the token features.
            num_heads: number of heads for both attention modules.
            mlp_ratio: MLP hidden width as a multiple of ``hidden_size``.
            drop_path: drop-path rate; ``0`` replaces drop-path with identity.
            rope: optional rotary-embedding callable passed to self-attention.
            qk_norm: passed to self-attention.
            temporal: if True the block attends along the frame axis in
                ``forward`` instead of the spatial axis.
            enable_flash_attn: passed to self-attention.
            enable_layernorm_kernel: passed to ``get_layernorm`` as ``use_kernel``.
            enable_sequence_parallelism: selects the sequence-parallel attention
                classes, but only for non-temporal blocks.
        """
        super().__init__()
        self.temporal = temporal
        self.hidden_size = hidden_size
        self.enable_flash_attn = enable_flash_attn
        self.enable_sequence_parallelism = enable_sequence_parallelism

        # temporal blocks always use the regular attention classes
        if self.enable_sequence_parallelism and not temporal:
            attn_cls = SeqParallelAttention
            mha_cls = SeqParallelMultiHeadCrossAttention
        else:
            attn_cls = Attention
            mha_cls = MultiHeadCrossAttention

        self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.attn = attn_cls(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            qk_norm=qk_norm,
            rope=rope,
            enable_flash_attn=enable_flash_attn,
        )
        self.cross_attn = mha_cls(hidden_size, num_heads)
        self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
        self.mlp = Mlp(
            in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        # learnable [6, hidden_size] table added to the timestep modulation in forward();
        # random init scaled by 1 / sqrt(hidden_size)
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

    def t_mask_select(self, x_mask, x, masked_x, T, S):
        """Pick, frame by frame, between ``x`` and ``masked_x``.

        Frames where ``x_mask`` is True take their tokens from ``x``; all other
        frames take them from ``masked_x``.
        """
        # x: [B, (T, S), C]
        # masked_x: [B, (T, S), C]
        # x_mask: [B, T]
        per_frame = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
        per_frame_alt = rearrange(masked_x, "B (T S) C -> B T S C", T=T, S=S)
        # broadcast the frame mask over the patch and channel axes
        frame_mask = x_mask[:, :, None, None]
        merged = torch.where(frame_mask, per_frame, per_frame_alt)
        return rearrange(merged, "B T S C -> B (T S) C")

    def forward(
        self,
        x,
        y,
        t,
        current,
        cache_dic,
        mask=None,  # text mask
        x_mask=None,  # temporal mask
        t0=None,  # t with timestamp=0
        T=None,  # number of frames
        S=None,  # number of pixel patches
    ):
        '''
        Forward for video models, with per-module feature caching.

        Note that the Force Activation Cycle is slightly different from DiT-ToCa and PixArt-alpha-ToCa.
        This is because of a discovery: the Force Activation Cycle of different modules can be
        different for the OpenSora model. (The same setting decreases performance in DiT and PixArt.)

        For each of the three sub-modules ('attn', 'cross-attn', 'mlp') the block either recomputes the
        output for all tokens and stores it in the cache, or reuses the cached output. On the reuse
        path, cross-attention and the MLP are still recomputed for the subset of tokens chosen by
        ``cache_cutfresh`` and merged back with ``update_cache``; attention is reused as is.

        Args:
            x: tokens of shape [B, (T S), C].
            y: text condition passed to cross-attention.
            t: timestep modulation, reshaped here to [B, 6, C].
            current: mutable dict describing the current position; 'flag', 'module' and
                'is_force_fresh' are written here and 'layer' is read.
            cache_dic: mutable cache dict; 'cache' and 'cross_attn_map' are read and written.
            mask: passed to cross-attention.
            x_mask: optional [B, T] mask; frames where it is False use the ``t0`` modulation.
            t0: timestep modulation for timestep 0; required when ``x_mask`` is given.
            T: number of frames.
            S: number of patches per frame.

        Returns:
            Updated tokens of shape [B, (T S), C].
        '''

        # prepare modulate parameters
        B, N, C = x.shape
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + t.reshape(B, 6, -1)
        ).chunk(6, dim=1)
        if x_mask is not None:
            shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = (
                self.scale_shift_table[None] + t0.reshape(B, 6, -1)
            ).chunk(6, dim=1)

        # the flag selects the cache slot: -1 for temporal blocks, 0 for spatial blocks
        if self.temporal:
            current['flag'] = -1
        else:
            current['flag'] = 0
        is_force_fresh = global_force_fresh(cache_dic, current)
        current['is_force_fresh'] = is_force_fresh

        # modulate (attention)
        current['module'] = 'attn'

        if is_force_fresh[current['module']]:
            x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
            if x_mask is not None:
                x_m_zero = t2i_modulate(self.norm1(x), shift_msa_zero, scale_msa_zero)
                x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)

            # attention
            if self.temporal:
                x_m = rearrange(x_m, "B (T S) C -> (B S) T C", T=T, S=S)
                x_m = self.attn(x_m)
                x_m = rearrange(x_m, "(B S) T C -> B (T S) C", T=T, S=S)
            else:
                x_m = rearrange(x_m, "B (T S) C -> (B T) S C", T=T, S=S)
                x_m = self.attn(x_m)
                x_m = rearrange(x_m, "(B T) S C -> B (T S) C", T=T, S=S)

            cache_dic['cache'][current['flag']][current['layer']][current['module']] = x_m
            force_init(cache_dic, current, x)
        else:
            # reuse the cached attention output unchanged
            x_m = cache_dic['cache'][current['flag']][current['layer']][current['module']]

        # modulate (attention)
        x_m_s = gate_msa * x_m
        if x_mask is not None:
            x_m_s_zero = gate_msa_zero * x_m
            x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S)
        # residual
        x = x + self.drop_path(x_m_s)

        # cross attention
        current['module'] = 'cross-attn'

        if is_force_fresh[current['module']]:
            cache_dic['cache'][current['flag']][current['layer']][current['module']], cache_dic['cross_attn_map'][current['flag']][current['layer']] = self.cross_attn(x, y, mask)
            force_init(cache_dic, current, x)

        else:
            # recompute only the selected tokens and merge them into the cache
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x, current)
            fresh_tokens, fresh_cross_attn_map = self.cross_attn(fresh_tokens, y, mask)
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current, fresh_attn_map=fresh_cross_attn_map)

        x = x + cache_dic['cache'][current['flag']][current['layer']][current['module']]

        # modulate (MLP)
        current['module'] = 'mlp'

        x_m = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
        if x_mask is not None:
            x_m_zero = t2i_modulate(self.norm2(x), shift_mlp_zero, scale_mlp_zero)
            x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)

        # MLP
        if is_force_fresh[current['module']]:
            x_m = self.mlp(x_m)
            cache_dic['cache'][current['flag']][current['layer']][current['module']] = x_m
            force_init(cache_dic, current, x)

        else:
            # recompute only the selected tokens and merge them into the cache
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x_m, current)
            fresh_tokens = self.mlp(fresh_tokens)
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current)

        # modulate (MLP)
        # Both gates must scale the MLP *output*. It is read back from the cache because on the
        # partial-refresh path x_m still holds the modulated MLP input, not the MLP output.
        mlp_out = cache_dic['cache'][current['flag']][current['layer']][current['module']]
        x_m_s = gate_mlp * mlp_out

        if x_mask is not None:
            x_m_s_zero = gate_mlp_zero * mlp_out
            x_m_s = self.t_mask_select(x_mask, x_m_s, x_m_s_zero, T, S)

        # residual
        x = x + self.drop_path(x_m_s)

        return x


class STDiT3Config(PretrainedConfig):
    """Hyper-parameter container for the STDiT3 model.

    Each constructor argument is stored on the instance under its own name;
    any additional keyword arguments are handed to ``PretrainedConfig``.
    """

    model_type = "STDiT3"

    def __init__(
        self,
        input_size=(None, None, None),
        input_sq_size=512,
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        pred_sigma=True,
        drop_path=0.0,
        caption_channels=4096,
        model_max_length=300,
        qk_norm=True,
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        enable_sequence_parallelism=False,
        only_train_temporal=False,
        freeze_y_embedder=False,
        skip_y_embedder=False,
        **kwargs,
    ):
        # Model-specific fields, listed in the order they are stored.
        own_fields = {
            "input_size": input_size,
            "input_sq_size": input_sq_size,
            "in_channels": in_channels,
            "patch_size": patch_size,
            "hidden_size": hidden_size,
            "depth": depth,
            "num_heads": num_heads,
            "mlp_ratio": mlp_ratio,
            "class_dropout_prob": class_dropout_prob,
            "pred_sigma": pred_sigma,
            "drop_path": drop_path,
            "caption_channels": caption_channels,
            "model_max_length": model_max_length,
            "qk_norm": qk_norm,
            "enable_flash_attn": enable_flash_attn,
            "enable_layernorm_kernel": enable_layernorm_kernel,
            "enable_sequence_parallelism": enable_sequence_parallelism,
            "only_train_temporal": only_train_temporal,
            "freeze_y_embedder": freeze_y_embedder,
            "skip_y_embedder": skip_y_embedder,
        }
        for field_name, field_value in own_fields.items():
            setattr(self, field_name, field_value)

        # The base class consumes whatever keyword arguments remain.
        super().__init__(**kwargs)


class STDiT3(PreTrainedModel):
    config_class = STDiT3Config

    def __init__(self, config):
        """Build the STDiT3 network from an ``STDiT3Config``.

        Creates the embedders, ``config.depth`` spatial blocks and the same
        number of temporal blocks (the latter built with ``temporal=True`` and
        the shared rotary embedding), and the final projection layer. After
        weight initialization, ``config.only_train_temporal`` and
        ``config.freeze_y_embedder`` optionally restrict which parameters
        require gradients.

        Args:
            config (STDiT3Config): model hyper-parameters.
        """
        super().__init__(config)
        self.pred_sigma = config.pred_sigma
        self.in_channels = config.in_channels
        # twice the input channel count when pred_sigma is set, otherwise the same count
        self.out_channels = config.in_channels * 2 if config.pred_sigma else config.in_channels

        # model size related
        self.depth = config.depth
        self.mlp_ratio = config.mlp_ratio
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_heads

        # computation related
        self.drop_path = config.drop_path
        self.enable_flash_attn = config.enable_flash_attn
        self.enable_layernorm_kernel = config.enable_layernorm_kernel
        self.enable_sequence_parallelism = config.enable_sequence_parallelism

        # input size related
        self.patch_size = config.patch_size
        self.input_sq_size = config.input_sq_size
        self.pos_embed = PositionEmbedding2D(config.hidden_size)
        self.rope = RotaryEmbedding(dim=self.hidden_size // self.num_heads)

        # embedding
        self.x_embedder = PatchEmbed3D(config.patch_size, config.in_channels, config.hidden_size)
        self.t_embedder = TimestepEmbedder(config.hidden_size)
        self.fps_embedder = SizeEmbedder(self.hidden_size)
        # maps the conditioning vector to 6 * hidden_size modulation values
        self.t_block = nn.Sequential(
            nn.SiLU(),
            nn.Linear(config.hidden_size, 6 * config.hidden_size, bias=True),
        )
        self.y_embedder = CaptionEmbedder(
            in_channels=config.caption_channels,
            hidden_size=config.hidden_size,
            uncond_prob=config.class_dropout_prob,
            act_layer=approx_gelu,
            token_num=config.model_max_length,
        )

        # spatial blocks
        # per-block drop-path rate, rising linearly from 0 to self.drop_path
        drop_path = [x.item() for x in torch.linspace(0, self.drop_path, config.depth)]
        self.spatial_blocks = nn.ModuleList(
            [
                STDiT3Block(
                    hidden_size=config.hidden_size,
                    num_heads=config.num_heads,
                    mlp_ratio=config.mlp_ratio,
                    drop_path=drop_path[i],
                    qk_norm=config.qk_norm,
                    enable_flash_attn=config.enable_flash_attn,
                    enable_layernorm_kernel=config.enable_layernorm_kernel,
                    enable_sequence_parallelism=config.enable_sequence_parallelism,
                )
                for i in range(config.depth)
            ]
        )

        # temporal blocks
        drop_path = [x.item() for x in torch.linspace(0, self.drop_path, config.depth)]
        self.temporal_blocks = nn.ModuleList(
            [
                STDiT3Block(
                    hidden_size=config.hidden_size,
                    num_heads=config.num_heads,
                    mlp_ratio=config.mlp_ratio,
                    drop_path=drop_path[i],
                    qk_norm=config.qk_norm,
                    enable_flash_attn=config.enable_flash_attn,
                    enable_layernorm_kernel=config.enable_layernorm_kernel,
                    enable_sequence_parallelism=config.enable_sequence_parallelism,
                    # temporal
                    temporal=True,
                    rope=self.rope.rotate_queries_or_keys,
                )
                for i in range(config.depth)
            ]
        )

        # final layer
        self.final_layer = T2IFinalLayer(config.hidden_size, np.prod(self.patch_size), self.out_channels)

        self.initialize_weights()
        # freeze everything, then re-enable gradients for the temporal blocks only
        if config.only_train_temporal:
            for param in self.parameters():
                param.requires_grad = False
            for block in self.temporal_blocks:
                for param in block.parameters():
                    param.requires_grad = True

        if config.freeze_y_embedder:
            for param in self.y_embedder.parameters():
                param.requires_grad = False

    def initialize_weights(self):
        """Initialize all weights of the model.

        Every ``nn.Linear`` gets Xavier-uniform weights and a zero bias. The
        fps embedder's first layer is then redrawn from N(0, 0.02) with zero
        bias and its last layer zeroed, and the attention, cross-attention and
        MLP output weights of every temporal block are zeroed.
        """

        # Initialize transformer layers:
        def _xavier_linear(layer):
            if not isinstance(layer, nn.Linear):
                return
            torch.nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)

        self.apply(_xavier_linear)

        # Initialize fps_embedder
        fps_in = self.fps_embedder.mlp[0]
        fps_out = self.fps_embedder.mlp[2]
        nn.init.normal_(fps_in.weight, std=0.02)
        nn.init.constant_(fps_in.bias, 0)
        nn.init.constant_(fps_out.weight, 0)
        nn.init.constant_(fps_out.bias, 0)

        # Initialize temporal blocks: zero the output weight of each sub-module
        for blk in self.temporal_blocks:
            for out_layer in (blk.attn.proj, blk.cross_attn.proj, blk.mlp.fc2):
                nn.init.constant_(out_layer.weight, 0)

    def get_dynamic_size(self, x):
        """Return how many patches cover ``x`` along time, height and width.

        ``x`` must be 5-D; each of its last three extents is rounded up to a
        multiple of the corresponding ``self.patch_size`` entry before dividing.
        """

        def _patch_count(extent, patch):
            overhang = extent % patch
            padded = extent if overhang == 0 else extent + (patch - overhang)
            return padded // patch

        _, _, T, H, W = x.size()
        return (
            _patch_count(T, self.patch_size[0]),
            _patch_count(H, self.patch_size[1]),
            _patch_count(W, self.patch_size[2]),
        )

    def encode_text(self, y, mask=None):
        """Embed the prompt tokens and pack them into a single sequence.

        Args:
            y: prompt features, passed through ``self.y_embedder``.
            mask: optional token mask; non-zero entries mark tokens to keep. It
                is tiled along the batch dimension when it has fewer rows than ``y``.

        Returns:
            A tuple ``(y, y_lens)``: ``y`` has shape [1, total_tokens, hidden_size]
            and ``y_lens`` lists the number of tokens kept per sample.
        """
        y = self.y_embedder(y, self.training)  # [B, 1, N_token, C]
        if mask is None:
            # no mask: every sample keeps all of its tokens
            y_lens = [y.shape[2]] * y.shape[0]
            return y.squeeze(1).view(1, -1, self.hidden_size), y_lens

        batch = y.shape[0]
        if mask.shape[0] != batch:
            mask = mask.repeat(batch // mask.shape[0], 1)
        mask = mask.squeeze(1).squeeze(1)
        keep = mask.unsqueeze(-1) != 0
        packed = y.squeeze(1).masked_select(keep).view(1, -1, self.hidden_size)
        return packed, mask.sum(dim=1).tolist()

    def forward(self, x, timestep, y, mask=None, x_mask=None, fps=None, height=None, width=None, cache_dic=None, current=None, **kwargs):
        """Run the denoiser on a batch of video latents, using the feature cache.

        Args:
            x (torch.Tensor): latent video of shape [B, C, T, H, W].
            timestep (torch.Tensor): diffusion timesteps.
            y (torch.Tensor): prompt features; embedded by ``encode_text`` unless
                ``config.skip_y_embedder`` is set, in which case ``y`` is used as
                given and ``mask`` is taken as the per-sample token lengths.
            mask: prompt token mask (or token lengths, see above).
            x_mask: optional mask; when given, timestep-0 modulation is also
                computed and passed to the blocks and the final layer.
            fps, height, width (torch.Tensor): per-sample conditioning values. They
                default to None but are used unconditionally, so callers must supply them.
            cache_dic (dict): cache state; written to here and passed to every block.
                Must not be None despite the default.
            current (dict): current-position state; ``current['layer']`` is set per
                layer. Must not be None despite the default.
            **kwargs: accepted and ignored.

        Returns:
            torch.Tensor: float32 output of shape [B, C_out, T, H, W].
        """
        # run everything in the dtype of the patch-embedding weights
        dtype = self.x_embedder.proj.weight.dtype
        B = x.size(0)
        x = x.to(dtype)
        timestep = timestep.to(dtype)
        y = y.to(dtype)

        # === get pos embed ===
        # Tx/Hx/Wx are the original extents, used at the end to crop the padding
        _, _, Tx, Hx, Wx = x.size()
        T, H, W = self.get_dynamic_size(x)
        # NOTE(review): recorded before the sequence-parallel padding below may
        # increase H - confirm that consumers of 'dynamic_size' expect the unpadded H.
        cache_dic['dynamic_size'] = (B,T,H,W)
        # adjust for sequence parallelism
        # we need to ensure H * W is divisible by sequence parallel size
        # for simplicity, we can adjust the height to make it divisible
        if self.enable_sequence_parallelism:
            sp_size = dist.get_world_size(get_sequence_parallel_group())
            if H % sp_size != 0:
                h_pad_size = sp_size - H % sp_size
            else:
                h_pad_size = 0

            if h_pad_size > 0:
                hx_pad_size = h_pad_size * self.patch_size[1]

                # pad x along the H dimension
                H += h_pad_size
                x = F.pad(x, (0, 0, 0, hx_pad_size))

        S = H * W
        base_size = round(S**0.5)
        # resolution is taken from the first sample only and reused for the whole batch
        resolution_sq = (height[0].item() * width[0].item()) ** 0.5
        scale = resolution_sq / self.input_sq_size
        pos_emb = self.pos_embed(x, H, W, scale=scale, base_size=base_size)

        # === get timestep embed ===
        t = self.t_embedder(timestep, dtype=x.dtype)  # [B, C]
        fps = self.fps_embedder(fps.unsqueeze(1), B)
        t = t + fps
        t_mlp = self.t_block(t)
        t0 = t0_mlp = None
        if x_mask is not None:
            # same conditioning, recomputed for timestep 0
            t0_timestep = torch.zeros_like(timestep)
            t0 = self.t_embedder(t0_timestep, dtype=x.dtype)
            t0 = t0 + fps
            t0_mlp = self.t_block(t0)

        # === get y embed ===
        if self.config.skip_y_embedder:
            y_lens = mask
            if isinstance(y_lens, torch.Tensor):
                y_lens = y_lens.long().tolist()
        else:
            y, y_lens = self.encode_text(y, mask)

        # === get x embed ===
        x = self.x_embedder(x)  # [B, N, C]
        x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
        x = x + pos_emb

        # shard over the sequence dim if sp is enabled
        if self.enable_sequence_parallelism:
            x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=2, grad_scale="down")
            S = S // dist.get_world_size(get_sequence_parallel_group())

        x = rearrange(x, "B T S C -> B (T S) C", T=T, S=S)

        # === blocks ===
        # Each layer runs its spatial block followed by its temporal block. The blocks
        # are called directly; the gradient-checkpointing calls are left commented out.
        for i, (spatial_block, temporal_block) in enumerate(zip(self.spatial_blocks, self.temporal_blocks)):
            current['layer'] = i
            #x = auto_grad_checkpoint(spatial_block,  x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S)
            #x = auto_grad_checkpoint(temporal_block, x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S)
            x = spatial_block(x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S)
            x = temporal_block(x, y, t_mlp, current, cache_dic, y_lens, x_mask, t0_mlp, T, S)

        # undo the sequence-parallel sharding so S is the full patch count again
        if self.enable_sequence_parallelism:
            x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
            x = gather_forward_split_backward(x, get_sequence_parallel_group(), dim=2, grad_scale="up")
            S = S * dist.get_world_size(get_sequence_parallel_group())
            x = rearrange(x, "B T S C -> B (T S) C", T=T, S=S)

        # === final layer ===
        x = self.final_layer(x, t, x_mask, t0, T, S)
        x = self.unpatchify(x, T, H, W, Tx, Hx, Wx)

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x

    def unpatchify(self, x, N_t, N_h, N_w, R_t, R_h, R_w):
        """
        Fold patch tokens back into a video-shaped tensor, then crop it.

        Args:
            x (torch.Tensor): of shape [B, N, C]
            N_t, N_h, N_w (int): number of patches along time / height / width
            R_t, R_h, R_w (int): extents kept along time / height / width

        Return:
            x (torch.Tensor): of shape [B, C_out, T, H, W]
        """
        T_p, H_p, W_p = self.patch_size
        pattern = "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)"
        restored = rearrange(
            x, pattern, N_t=N_t, N_h=N_h, N_w=N_w, T_p=T_p, H_p=H_p, W_p=W_p, C_out=self.out_channels
        )
        # unpad: drop whatever lies beyond the requested extents
        return restored[:, :, :R_t, :R_h, :R_w]


@MODELS.register_module("STDiT3-XL/2")
def STDiT3_XL_2(from_pretrained=None, **kwargs):
    """Create the STDiT3-XL/2 model, optionally loading weights.

    ``STDiT3.from_pretrained`` is used when ``force_huggingface`` is passed as
    truthy, or when ``from_pretrained`` is given but is not an existing local
    path. Otherwise the model is built from a fresh config and, if
    ``from_pretrained`` is given, the local checkpoint is loaded into it.
    """
    load_from_hub = kwargs.pop("force_huggingface", False)
    if not load_from_hub:
        load_from_hub = from_pretrained is not None and not os.path.exists(from_pretrained)

    if load_from_hub:
        return STDiT3.from_pretrained(from_pretrained, **kwargs)

    config = STDiT3Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
    model = STDiT3(config)
    if from_pretrained is not None:
        load_checkpoint(model, from_pretrained)
    return model


@MODELS.register_module("STDiT3-3B/2")
def STDiT3_3B_2(from_pretrained=None, **kwargs):
    """Build the STDiT3-3B/2 model (depth 28, hidden size 1872, 26 heads, patch size (1, 2, 2)).

    Args:
        from_pretrained: optional checkpoint reference. When it is an existing
            local path the model is built from the config and the checkpoint is
            loaded with `load_checkpoint`; otherwise it is handed to
            `STDiT3.from_pretrained`.
        **kwargs: forwarded to `STDiT3Config` (or to `STDiT3.from_pretrained`).
            The key `force_huggingface` is popped first; when truthy it forces
            the `STDiT3.from_pretrained` route.

    Returns:
        The constructed STDiT3 model.
    """
    force_huggingface = kwargs.pop("force_huggingface", False)
    use_from_pretrained = force_huggingface or (
        from_pretrained is not None and not os.path.exists(from_pretrained)
    )
    if use_from_pretrained:
        return STDiT3.from_pretrained(from_pretrained, **kwargs)

    model = STDiT3(STDiT3Config(depth=28, hidden_size=1872, patch_size=(1, 2, 2), num_heads=26, **kwargs))
    if from_pretrained is not None:
        load_checkpoint(model, from_pretrained)
    return model


================================================
FILE: Open-Sora/opensora/models/text_encoder/__init__.py
================================================
from .classes import ClassEncoder
from .clip import ClipEncoder
from .t5 import T5Encoder


================================================
FILE: Open-Sora/opensora/models/text_encoder/classes.py
================================================
import torch

from opensora.registry import MODELS


@MODELS.register_module("classes")
class ClassEncoder:
    """Conditioning encoder for class labels: converts label strings to an integer tensor."""

    def __init__(self, num_classes, model_max_length=None, device="cuda", dtype=torch.float):
        # `dtype` is accepted for signature parity but is not used here.
        self.device = device
        self.num_classes = num_classes
        self.model_max_length = model_max_length
        self.y_embedder = None
        self.output_dim = None

    def encode(self, text):
        """Apply int() to every item of `text`; return dict(y=tensor) on `self.device`."""
        labels = list(map(int, text))
        return {"y": torch.tensor(labels).to(self.device)}

    def null(self, n):
        """Return a length-`n` tensor filled with `num_classes`, placed on `self.device`.

        Presumably `num_classes` is the index reserved for the unconditional
        class -- confirm against the label embedder.
        """
        filler = [self.num_classes for _ in range(n)]
        return torch.tensor(filler).to(self.device)


================================================
FILE: Open-Sora/opensora/models/text_encoder/clip.py
================================================
# Copyright 2024 Vchitect/Latte
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from Latte
#
# This file is adapted from the Latte project.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# Latte: https://github.com/Vchitect/Latte
# DiT:   https://github.com/facebookresearch/DiT/tree/main
# --------------------------------------------------------


import torch
import torch.nn as nn
import transformers
from transformers import CLIPTextModel, CLIPTokenizer

from opensora.registry import MODELS

transformers.logging.set_verbosity_error()


class AbstractEncoder(nn.Module):
    """Base nn.Module for encoders; subclasses are expected to override `encode`."""

    def encode(self, *args, **kwargs):
        """Abstract hook: this base implementation always raises NotImplementedError."""
        raise NotImplementedError


class FrozenCLIPEmbedder(AbstractEncoder):
    """Uses the CLIP transformer encoder for text (from Hugging Face).

    The tokenizer and text model are loaded from `path`; all parameters are
    frozen and the text model is put in eval mode right after construction.
    """

    def __init__(self, path="openai/clip-vit-huge-patch14", device="cuda", max_length=77):
        super().__init__()
        # `device` is only used to place the token ids in forward(); moving the
        # module itself is left to the caller.
        self.device = device
        self.max_length = max_length
        self.tokenizer = CLIPTokenizer.from_pretrained(path)
        self.transformer = CLIPTextModel.from_pretrained(path)
        self._freeze()

    def _freeze(self):
        """Switch the text model to eval mode and disable gradients on every parameter."""
        self.transformer = self.transformer.eval()
        for weight in self.parameters():
            weight.requires_grad = False

    def forward(self, text):
        """Tokenize `text` (padded/truncated to `max_length`) and run the text model.

        Returns:
            (last_hidden_state, pooler_output) of the CLIP text model.
        """
        tokenizer_options = dict(
            truncation=True,
            max_length=self.max_length,
            return_length=True,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        )
        encoded = self.tokenizer(text, **tokenizer_options)
        input_ids = encoded["input_ids"].to(self.device)
        result = self.transformer(input_ids=input_ids)
        return result.last_hidden_state, result.pooler_output

    def encode(self, text):
        """Alias for calling the module on `text`."""
        return self(text)


@MODELS.register_module("clip")
class ClipEncoder:
    """
    Embeds text prompt into vector representations. Also handles text dropout for classifier-free guidance.

    Wraps a `FrozenCLIPEmbedder` and exposes the pooled CLIP output as the
    conditioning tensor `y`.
    """

    def __init__(
        self,
        from_pretrained,
        model_max_length=77,
        device="cuda",
        dtype=torch.float,
    ):
        """
        Args:
            from_pretrained: path or model id passed to `FrozenCLIPEmbedder`.
            model_max_length: token length used for padding/truncation.
            device: device the text encoder is moved to; also the device the
                token ids are placed on.
            dtype: dtype the text encoder is cast to.
        """
        super().__init__()
        assert from_pretrained is not None, "Please specify the path to the CLIP model"

        # Forward `device` to the embedder as well: it places the token ids on
        # its own `device` attribute, which would otherwise stay at the default
        # "cuda" while the weights are moved to `device` below.
        self.text_encoder = FrozenCLIPEmbedder(
            path=from_pretrained,
            device=device,
            max_length=model_max_length,
        ).to(device, dtype)
        # Not set here; `null()` reads it, so it has to be assigned by the caller first.
        self.y_embedder = None

        self.model_max_length = model_max_length
        self.output_dim = self.text_encoder.transformer.config.hidden_size

    def encode(self, text):
        """Return dict(y=...) where y is the pooled CLIP output with two singleton axes inserted at dim 1."""
        _, pooled_embeddings = self.text_encoder.encode(text)
        y = pooled_embeddings.unsqueeze(1).unsqueeze(1)
        return dict(y=y)

    def null(self, n):
        """Repeat `y_embedder.y_embedding` `n` times along a new leading axis and insert a singleton axis at dim 1."""
        null_y = self.y_embedder.y_embedding[None].repeat(n, 1, 1)[:, None]
        return null_y

    def to(self, dtype):
        """Cast the wrapped text encoder to `dtype` and return self."""
        self.text_encoder = self.text_encoder.to(dtype)
        return self


================================================
FILE: Open-Sora/opensora/models/text_encoder/t5.py
================================================
# Adapted from PixArt
#
# Copyright (C) 2023  PixArt-alpha/PixArt-alpha
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# PixArt: https://github.com/PixArt-alpha/PixArt-alpha
# T5:     https://github.com/google-research/text-to-text-transfer-transformer
# --------------------------------------------------------

import html
import re

import ftfy
import torch
from transformers import AutoTokenizer, T5EncoderModel

from opensora.registry import MODELS


class T5Embedder:
    """Holds a tokenizer and a `T5EncoderModel` and turns text into encoder hidden states."""

    def __init__(
        self,
        device,
        from_pretrained=None,
        *,
        cache_dir=None,
        hf_token=None,
        use_text_preprocessing=True,
        t5_model_kwargs=None,
        torch_dtype=None,
        use_offload_folder=None,
        model_max_length=120,
        local_files_only=False,
    ):
        """
        Args:
            device: anything accepted by `torch.device`.
            from_pretrained: model id / path for both tokenizer and encoder.
            cache_dir: forwarded to both `from_pretrained` calls.
            hf_token: stored on the instance only.
            use_text_preprocessing: stored on the instance only.
            t5_model_kwargs: extra kwargs for `T5EncoderModel.from_pretrained`.
                When None, defaults (including a device map) are built here.
            torch_dtype: model dtype; falls back to `torch.bfloat16`.
            use_offload_folder: when given (and `t5_model_kwargs` is None),
                encoder blocks 12-23, the final layer norm and the dropout are
                mapped to "disk" with this offload folder.
            model_max_length: token length used for padding/truncation.
            local_files_only: forwarded to both `from_pretrained` calls.
        """
        self.device = torch.device(device)
        self.torch_dtype = torch_dtype or torch.bfloat16
        self.cache_dir = cache_dir
        self.use_text_preprocessing = use_text_preprocessing
        self.hf_token = hf_token
        self.model_max_length = model_max_length

        if t5_model_kwargs is None:
            t5_model_kwargs = {
                "low_cpu_mem_usage": True,
                "torch_dtype": self.torch_dtype,
            }
            if use_offload_folder is None:
                t5_model_kwargs["device_map"] = {
                    "shared": self.device,
                    "encoder": self.device,
                }
            else:
                t5_model_kwargs["offload_folder"] = use_offload_folder
                # First half of the encoder stays on the device, second half is offloaded.
                placement = {
                    "shared": self.device,
                    "encoder.embed_tokens": self.device,
                }
                placement.update({f"encoder.block.{idx}": self.device for idx in range(12)})
                placement.update({f"encoder.block.{idx}": "disk" for idx in range(12, 24)})
                placement["encoder.final_layer_norm"] = "disk"
                placement["encoder.dropout"] = "disk"
                t5_model_kwargs["device_map"] = placement

        shared_load_options = dict(cache_dir=cache_dir, local_files_only=local_files_only)
        self.tokenizer = AutoTokenizer.from_pretrained(from_pretrained, **shared_load_options)
        self.model = T5EncoderModel.from_pretrained(
            from_pretrained,
            **shared_load_options,
            **t5_model_kwargs,
        ).eval()

    def get_text_embeddings(self, texts):
        """Tokenize `texts` and run the encoder without gradients.

        Returns:
            (last_hidden_state, attention_mask), the mask being on `self.device`.
        """
        batch = self.tokenizer(
            texts,
            max_length=self.model_max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        input_ids = batch["input_ids"].to(self.device)
        attention_mask = batch["attention_mask"].to(self.device)

        with torch.no_grad():
            encoder_output = self.model(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = encoder_output["last_hidden_state"].detach()
        return embeddings, attention_mask


@MODELS.register_module("t5")
class T5Encoder:
    """Text encoder wrapper around `T5Embedder` that returns caption embeddings and their mask."""

    def __init__(
        self,
        from_pretrained=None,
        model_max_length=120,
        device="cuda",
        dtype=torch.float,
        cache_dir=None,
        shardformer=False,
        local_files_only=False,
    ):
        """
        Args:
            from_pretrained: model id / path of the T5 model (required).
            model_max_length: token length used for padding/truncation.
            device: device passed to `T5Embedder`.
            dtype: dtype of the T5 weights.
            cache_dir: forwarded to `T5Embedder`.
            shardformer: when truthy, run `shardformer_t5()` after loading.
            local_files_only: forwarded to `T5Embedder`.
        """
        assert from_pretrained is not None, "Please specify the path to the T5 model"

        embedder = T5Embedder(
            device=device,
            torch_dtype=dtype,
            from_pretrained=from_pretrained,
            cache_dir=cache_dir,
            model_max_length=model_max_length,
            local_files_only=local_files_only,
        )
        embedder.model.to(dtype=dtype)
        self.t5 = embedder
        # Not set here; `null()` reads it, so it has to be assigned by the caller first.
        self.y_embedder = None

        self.model_max_length = model_max_length
        self.output_dim = embedder.model.config.d_model
        self.dtype = dtype

        if shardformer:
            self.shardformer_t5()

    def shardformer_t5(self):
        """Replace the T5 model with the ShardFormer-optimized one (jit fusion on, everything else off) and freeze it."""
        from colossalai.shardformer import ShardConfig, ShardFormer

        from opensora.acceleration.shardformer.policy.t5_encoder import T5EncoderPolicy
        from opensora.utils.misc import requires_grad

        config = ShardConfig(
            tensor_parallel_process_group=None,
            pipeline_stage_manager=None,
            enable_tensor_parallelism=False,
            enable_fused_normalization=False,
            enable_flash_attention=False,
            enable_jit_fused=True,
            enable_sequence_parallelism=False,
            enable_sequence_overlap=False,
        )
        optimized, _ = ShardFormer(shard_config=config).optimize(self.t5.model, policy=T5EncoderPolicy())
        self.t5.model = optimized.to(self.dtype)

        # ensure the weights are frozen
        requires_grad(self.t5.model, False)

    def encode(self, text):
        """Return dict(y=embeddings with a singleton axis inserted at dim 1, mask=attention mask)."""
        embeddings, attention_mask = self.t5.get_text_embeddings(text)
        return {"y": embeddings[:, None], "mask": attention_mask}

    def null(self, n):
        """Repeat `y_embedder.y_embedding` `n` times along a new leading axis and insert a singleton axis at dim 1."""
        repeated = self.y_embedder.y_embedding[None].repeat(n, 1, 1)
        return repeated[:, None]


def basic_clean(text):
    """Run `ftfy.fix_text`, undo HTML entity escaping twice, and strip surrounding whitespace."""
    repaired = ftfy.fix_text(text)
    unescaped_once = html.unescape(repaired)
    unescaped_twice = html.unescape(unescaped_once)
    return unescaped_twice.strip()


# Matches one or more consecutive "noise" punctuation characters: the listed
# symbols plus ) ( ] [ } { | \ / and *.
# Written as a single raw string so every backslash reaches `re` verbatim; the
# previous concatenation of non-raw literals such as "\)" relied on Python
# passing unknown string escapes through, which emits a warning on import.
# The compiled pattern is unchanged.
BAD_PUNCT_REGEX = re.compile(r"[#®•©™&@·º½¾¿¡§~\)\(\]\[\}\{\|\\/\*]{1,}")  # noqa


def clean_caption(caption):
    """Normalize a raw caption through a fixed sequence of regex clean-up steps.

    The input is converted with str(), URL-unquoted, stripped and lower-cased.
    The steps below then remove URLs, HTML markup, @mentions, characters in
    several CJK code-point ranges, numeric/ID-like tokens, file names and a
    few marketing phrases, and normalize dashes, quotes, punctuation and
    whitespace. The order of the substitutions matters: later patterns run on
    the output of earlier ones.

    Args:
        caption: any object; it is converted with str() first.

    Returns:
        The cleaned caption as a stripped string.
    """
    import urllib.parse as ul

    from bs4 import BeautifulSoup

    caption = str(caption)
    caption = ul.unquote_plus(caption)
    caption = caption.strip().lower()
    caption = re.sub("<person>", "person", caption)
    # urls:
    caption = re.sub(
        r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    caption = re.sub(
        r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    # html: keep only the text content of any markup
    caption = BeautifulSoup(caption, features="html.parser").text

    # @<nickname>
    caption = re.sub(r"@[\w\d]+\b", "", caption)

    # Drop characters from the following Unicode blocks:
    # 31C0—31EF CJK Strokes
    # 31F0—31FF Katakana Phonetic Extensions
    # 3200—32FF Enclosed CJK Letters and Months
    # 3300—33FF CJK Compatibility
    # 3400—4DBF CJK Unified Ideographs Extension A
    # 4DC0—4DFF Yijing Hexagram Symbols
    # 4E00—9FFF CJK Unified Ideographs
    caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
    caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
    caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
    caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
    caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
    caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
    caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
    #######################################################

    # all types of dash --> "-"
    caption = re.sub(
        r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
        "-",
        caption,
    )

    # normalize quotation marks to a single standard
    caption = re.sub(r"[`´«»“”¨]", '"', caption)
    caption = re.sub(r"[‘’]", "'", caption)

    # &quot;
    caption = re.sub(r"&quot;?", "", caption)
    # &amp
    caption = re.sub(r"&amp", "", caption)

    # ip addresses:
    caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

    # article ids:
    caption = re.sub(r"\d:\d\d\s+$", "", caption)

    # literal backslash-n sequences (two characters), not real newlines
    caption = re.sub(r"\\n", " ", caption)

    # "#123"
    caption = re.sub(r"#\d{1,3}\b", "", caption)
    # "#12345.."
    caption = re.sub(r"#\d{5,}\b", "", caption)
    # "123456.."
    caption = re.sub(r"\b\d{6,}\b", "", caption)
    # filenames:
    caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

    # collapse runs of quotes into one double quote, runs of dots into a space
    caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
    caption = re.sub(r"[\.]{2,}", r" ", caption)  # e.g. "..."

    caption = re.sub(BAD_PUNCT_REGEX, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
    caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

    # this-is-my-cute-cat / this_is_my_cute_cat
    # Only treat "-" and "_" as word separators when there are more than three of them.
    regex2 = re.compile(r"(?:\-|\_)")
    if len(re.findall(regex2, caption)) > 3:
        caption = re.sub(regex2, " ", caption)

    caption = basic_clean(caption)

    caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
    caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
    caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

    # marketing / boilerplate phrases
    caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
    caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
    caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
    caption = re.sub(r"\bpage\s+\d+\b", "", caption)

    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

    # dimensions such as "1920x1080" (latin x, cyrillic х or ×)
    caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

    # spacing around ":" and after ",", "." or "/" followed by a word character
    caption = re.sub(r"\b\s+\:\s+", r": ", caption)
    caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
    caption = re.sub(r"\s+", " ", caption)

    # NOTE(review): the result of this call is discarded (str.strip returns a
    # new string), so the anchored patterns below still see any leading or
    # trailing space. Left as is because text_preprocessing documents this as
    # the exact training-time cleaning -- confirm before changing.
    caption.strip()

    # unwrap a caption that is entirely quoted, then trim leftover edge punctuation
    caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
    caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
    caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
    # a caption that is a single ".something" token is dropped entirely
    caption = re.sub(r"^\.\S+$", "", caption)

    return caption.strip()


def text_preprocessing(text, use_text_preprocessing: bool = True):
    """Prepare a prompt for the text encoder.

    Args:
        text: the raw prompt.
        use_text_preprocessing: when true, run `clean_caption` twice over the
            text; otherwise only lower-case and strip it.

    Returns:
        The processed text.
    """
    if not use_text_preprocessing:
        return text.lower().strip()

    # The exact text cleaning as was in the training stage: two passes of clean_caption.
    for _ in range(2):
        text = clean_caption(text)
    return text


================================================
FILE: Open-Sora/opensora/models/vae/__init__.py
================================================
from .discriminator import DISCRIMINATOR_3D
from .vae import VideoAutoencoderKL, VideoAutoencoderKLTemporalDecoder
from .vae_temporal import VAE_Temporal


================================================
FILE: Open-Sora/opensora/models/vae/discriminator.py
================================================
import functools
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from opensora.registry import MODELS
from opensora.utils.ckpt_utils import find_model, load_checkpoint


def cast_tuple(t, length=1):
    """Return `t` unchanged if it is a tuple, otherwise a tuple repeating `t` `length` times."""
    if isinstance(t, tuple):
        return t
    return (t,) * length


def xavier_uniform_weight_init(m):
    """Initializer for `nn.Module.apply`: Xavier-uniform weights (ReLU gain) and zero bias.

    Only `nn.Conv3d` and `nn.Linear` modules are touched; anything else is
    left as it is.
    """
    if not isinstance(m, (nn.Conv3d, nn.Linear)):
        return
    relu_gain = nn.init.calculate_gain("relu")
    nn.init.xavier_uniform_(m.weight, gain=relu_gain)
    if m.bias is not None:
        nn.init.zeros_(m.bias)


# SCH: taken from Open Sora Plan
def n_layer_disc_weights_init(m):
    """Initializer for `nn.Module.apply`, selected by the module's class name.

    * class name contains "Conv": weights ~ N(0.0, 0.02)
    * class name contains "BatchNorm": weights ~ N(1.0, 0.02), bias set to 0
    * anything else: untouched
    """
    kind = m.__class__.__name__
    if "Conv" in kind:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif "BatchNorm" in kind:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)


# SCH: own implementation modified on top of: discriminator with anti-aliased downsampling (blurpool Zhang et al.)
class BlurPool3D(nn.Module):
    """Anti-aliased 3D downsampling: pad, blur with a fixed binomial kernel, subsample.

    The blur kernel is the outer product of one row of Pascal's triangle with
    itself along the three trailing axes, normalized to sum to 1, and applied
    depthwise (one copy per channel) by a strided `F.conv3d`.

    Args:
        channels: number of input channels (one kernel copy is stored per channel).
        pad_type: "reflect"/"refl", "replicate"/"repl" or "zero".
        filt_size: kernel size per axis, any integer >= 1.
        stride: subsampling step, applied to all three trailing axes.
        pad_off: extra padding added to every side.
        device, dtype: placement of the kernel buffer.

    Raises:
        ValueError: for `filt_size < 1` or an unknown `pad_type`.
    """

    def __init__(
        self,
        channels,
        pad_type="reflect",
        filt_size=3,
        stride=2,
        pad_off=0,
        device="cpu",
        dtype=torch.bfloat16,
    ):
        super(BlurPool3D, self).__init__()
        if filt_size < 1:
            raise ValueError(f"filt_size must be >= 1, got {filt_size}")
        self.filt_size = filt_size
        self.pad_off = pad_off
        # (before, after) padding is the same for each of the three padded axes.
        pad_before = int(1.0 * (filt_size - 1) / 2)
        pad_after = int(np.ceil(1.0 * (filt_size - 1) / 2))
        self.pad_sizes = [size + pad_off for size in (pad_before, pad_after) * 3]
        self.stride = stride
        self.off = int((self.stride - 1) / 2.0)
        self.channels = channels

        # Binomial coefficients C(filt_size - 1, k): [1], [1, 1], [1, 2, 1], ...
        # Same values as a hard-coded table for sizes 1-7, without an upper limit.
        a = np.array([math.comb(filt_size - 1, k) for k in range(filt_size)], dtype=np.float64)

        filt_2d = a[:, None] * a[None, :]
        filt_3d = torch.Tensor(a[:, None, None] * filt_2d[None, :, :]).to(device, dtype)

        filt = filt_3d / torch.sum(filt_3d)  # SCH: modified to it 3D
        self.register_buffer("filt", filt[None, None, :, :, :].repeat((self.channels, 1, 1, 1, 1)))

        self.pad = self._pad_layer(pad_type)(self.pad_sizes)

    @staticmethod
    def _pad_layer(pad_type):
        """Map a padding name to a callable that builds the 3D padding module."""
        if pad_type in ("refl", "reflect"):
            return nn.ReflectionPad3d
        if pad_type in ("repl", "replicate"):
            return nn.ReplicationPad3d
        if pad_type == "zero":
            return functools.partial(nn.ConstantPad3d, value=0.0)
        raise ValueError(f"Pad type [{pad_type}] not recognized")

    def forward(self, inp):
        """Blur and subsample `inp`; the slicing and conv3d below treat it as 5-D (two leading axes, then three strided ones)."""
        if self.filt_size == 1:
            # A 1-tap kernel is the identity, so only subsample -- along all
            # three trailing axes, matching the stride of the conv3d path.
            source = inp if self.pad_off == 0 else self.pad(inp)
            return source[:, :, :: self.stride, :: self.stride, :: self.stride]
        return F.conv3d(self.pad(inp), self.filt, stride=self.stride, groups=inp.shape[1])


class ResBlockDown(nn.Module):
    """3D StyleGAN ResBlock for D.

    Main path: conv1 (3x3x3) -> norm1 -> activation -> blur -> conv3 (3x3x3)
    -> norm2 -> activation. Shortcut path: blur -> conv2 (1x1x1, no bias).
    The two paths are summed and divided by sqrt(2).
    """

    def __init__(
        self,
        in_channels,
        filters,
        activation_fn,
        num_groups=32,
        device="cpu",
        dtype=torch.bfloat16,
    ):
        """
        Args:
            in_channels: channels of the input tensor.
            filters: channels of the output tensor.
            activation_fn: callable applied after each GroupNorm.
            num_groups: number of groups for both GroupNorm layers.
            device, dtype: placement of all parameters and buffers.
        """
        super().__init__()
        placement = dict(device=device, dtype=dtype)

        self.filters = filters
        self.activation_fn = activation_fn

        # SCH: NOTE: although paper says conv (X->Y, Y->Y), original code implementation is (X->X, X->Y), we follow code
        # NOTE: all three convolutions are meant to be initialized with xavier_uniform
        self.conv1 = nn.Conv3d(in_channels, in_channels, (3, 3, 3), padding=1, **placement)
        self.norm1 = nn.GroupNorm(num_groups, in_channels, **placement)

        self.blur = BlurPool3D(in_channels, **placement)

        self.conv2 = nn.Conv3d(in_channels, self.filters, (1, 1, 1), bias=False, **placement)
        self.conv3 = nn.Conv3d(in_channels, self.filters, (3, 3, 3), padding=1, **placement)
        self.norm2 = nn.GroupNorm(num_groups, self.filters, **placement)

    def forward(self, x):
        """Return (shortcut(x) + main(x)) / sqrt(2); both paths pass through the same blur module."""
        hidden = self.activation_fn(self.norm1(self.conv1(x)))

        shortcut = self.conv2(self.blur(x))

        hidden = self.norm2(self.conv3(self.blur(hidden)))
        hidden = self.activation_fn(hidden)
        return (shortcut + hidden) / math.sqrt(2)


@MODELS.register_module()
class NLayerDiscriminator(nn.Module):
    """Defines a PatchGAN discriminator as in Pix2Pix
    --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
    """

    def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False, from_pretrained=None):
        """Construct a PatchGAN discriminator
        Parameters:
            input_nc (int)  -- the number of channels in input images
            ndf (int)       -- base number of filters (output channels of the first conv layer)
            n_layers (int)  -- the number of conv layers in the discriminator
            use_actnorm     -- accepted but not used; BatchNorm2d is always the norm layer
            from_pretrained -- optional checkpoint loaded with load_checkpoint
        """
        super(NLayerDiscriminator, self).__init__()

        norm_layer = nn.BatchNorm2d

        # no need to use bias as BatchNorm2d has affine parameters
        norm_cls = norm_layer.func if isinstance(norm_layer, functools.partial) else norm_layer
        use_bias = norm_cls != nn.BatchNorm2d

        kw, padw = 4, 1
        layers = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]

        # gradually increase the number of filters (capped at 8 * ndf)
        mult = 1
        for n in range(1, n_layers):
            prev_mult, mult = mult, min(2**n, 8)
            layers.extend(
                [
                    nn.Conv2d(ndf * prev_mult, ndf * mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias),
                    norm_layer(ndf * mult),
                    nn.LeakyReLU(0.2, True),
                ]
            )

        prev_mult, mult = mult, min(2**n_layers, 8)
        layers.extend(
            [
                nn.Conv2d(ndf * prev_mult, ndf * mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias),
                norm_layer(ndf * mult),
                nn.LeakyReLU(0.2, True),
            ]
        )

        # output 1 channel prediction map
        layers.append(nn.Conv2d(ndf * mult, 1, kernel_size=kw, stride=1, padding=padw))
        self.main = nn.Sequential(*layers)

        if from_pretrained is not None:
            load_checkpoint(self, from_pretrained)

    def forward(self, input):
        """Standard forward."""
        return self.main(input)


class NLayerDiscriminator3D(nn.Module):
    """Defines a 3D PatchGAN discriminator as in Pix2Pix but for 3D inputs."""

    def __init__(self, input_nc=1, ndf=64, n_layers=3, use_actnorm=False):
        """
        Construct a 3D PatchGAN discriminator

        Parameters:
            input_nc (int)  -- the number of channels in input volumes
            ndf (int)       -- base number of filters (output channels of the first conv layer)
            n_layers (int)  -- the number of conv layers in the discriminator
            use_actnorm (bool) -- flag to use actnorm instead of batchnorm
                                  (not implemented: raises NotImplementedError)
        """
        super(NLayerDiscriminator3D, self).__init__()
        if use_actnorm:
            raise NotImplementedError("Not implemented.")
        norm_layer = nn.BatchNorm3d

        # BatchNorm3d has affine parameters, so the convs before it need no bias.
        norm_cls = norm_layer.func if isinstance(norm_layer, functools.partial) else norm_layer
        use_bias = norm_cls != nn.BatchNorm3d

        kw, padw = 4, 1
        layers = [nn.Conv3d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]

        # gradually increase the number of filters (capped at 8 * ndf);
        # these layers stride only the last two axes
        mult = 1
        for n in range(1, n_layers):
            prev_mult, mult = mult, min(2**n, 8)
            layers.extend(
                [
                    nn.Conv3d(
                        ndf * prev_mult,
                        ndf * mult,
                        kernel_size=(kw, kw, kw),
                        stride=(1, 2, 2),
                        padding=padw,
                        bias=use_bias,
                    ),
                    norm_layer(ndf * mult),
                    nn.LeakyReLU(0.2, True),
                ]
            )

        prev_mult, mult = mult, min(2**n_layers, 8)
        layers.extend(
            [
                nn.Conv3d(
                    ndf * prev_mult, ndf * mult, kernel_size=(kw, kw, kw), stride=1, padding=padw, bias=use_bias
                ),
                norm_layer(ndf * mult),
                nn.LeakyReLU(0.2, True),
            ]
        )

        # output 1 channel prediction map
        layers.append(nn.Conv3d(ndf * mult, 1, kernel_size=kw, stride=1, padding=padw))
        self.main = nn.Sequential(*layers)

    def forward(self, input):
        """Standard forward."""
        return self.main(input)


class StyleGANDiscriminatorBlur(nn.Module):
    """StyleGAN Discriminator.

    SCH: NOTE:
        this discriminator requires the num_frames to be fixed during training;
        in case we pre-train with image then train on video, this discriminator's Linear layer would have to be re-trained!
    """

    def __init__(
        self,
        image_size=(128, 128),
        num_frames=17,
        in_channels=3,
        filters=128,
        channel_multipliers=(2, 4, 4, 4, 4),
        num_groups=32,
        dtype=torch.bfloat16,
        device="cpu",
    ):
        """
        Args:
            image_size: int or 2-tuple; each side must be divisible by
                2 ** len(channel_multipliers).
            num_frames: number of frames, used to size the first Linear layer.
            in_channels: channels of the input video.
            filters: base channel count; block i outputs
                filters * channel_multipliers[i] channels.
            channel_multipliers: one entry per ResBlockDown.
            num_groups: groups of the final GroupNorm.
            dtype, device: placement of all parameters.
        """
        super().__init__()
        placement = dict(device=device, dtype=dtype)

        self.dtype = dtype
        self.input_size = cast_tuple(image_size, 2)
        self.filters = filters
        self.activation_fn = nn.LeakyReLU(negative_slope=0.2)
        self.channel_multipliers = channel_multipliers

        # NOTE: conv1 / conv2 / linear1 / linear2 are meant to be initialized with xavier_uniform
        self.conv1 = nn.Conv3d(in_channels, self.filters, (3, 3, 3), padding=1, **placement)

        self.num_blocks = len(self.channel_multipliers)
        self.res_block_list = nn.ModuleList([])
        prev_filters = self.filters  # record in_channels
        for multiplier in self.channel_multipliers:
            block_filters = self.filters * multiplier
            block = ResBlockDown(prev_filters, block_filters, self.activation_fn, **placement)
            self.res_block_list.append(block.apply(xavier_uniform_weight_init))
            prev_filters = block_filters  # update in_channels

        self.conv2 = nn.Conv3d(prev_filters, prev_filters, (3, 3, 3), padding=1, **placement)

        self.norm1 = nn.GroupNorm(num_groups, prev_filters, dtype=dtype, device=device)

        scale_factor = 2**self.num_blocks
        # SCH: NOTE: has first frame which would be padded before usage
        time_scaled = (
            num_frames // scale_factor + 1 if num_frames % scale_factor != 0 else num_frames / scale_factor
        )

        assert (
            self.input_size[0] % scale_factor == 0
        ), f"image width {self.input_size[0]} is not divisible by scale factor {scale_factor}"
        assert (
            self.input_size[1] % scale_factor == 0
        ), f"image height {self.input_size[1]} is not divisible by scale factor {scale_factor}"
        w_scaled = self.input_size[0] / scale_factor
        h_scaled = self.input_size[1] / scale_factor
        in_features = int(prev_filters * time_scaled * w_scaled * h_scaled)  # (C*T*W*H)
        self.linear1 = nn.Linear(in_features, prev_filters, **placement)
        self.linear2 = nn.Linear(prev_filters, 1, **placement)

    def forward(self, x):
        """conv1 -> activation -> all ResBlockDown blocks -> conv2 -> norm -> activation -> flatten -> two Linear layers."""
        x = self.activation_fn(self.conv1(x))

        for block in self.res_block_list:
            x = block(x)

        x = self.activation_fn(self.norm1(self.conv2(x)))
        x = x.reshape((x.shape[0], -1))  # SCH: [B, (C * T * W * H)] ?

        x = self.activation_fn(self.linear1(x))
        return self.linear2(x)


def load_checkpoint_with_inflation(model, ckpt_path):
    """
    pre-train using image, then inflate to 3D videos

    For ".pt"/".pth" checkpoints, every checkpoint tensor whose shape equals
    one slice (along dim 2) of the corresponding 5-D model tensor is
    "centrally inflated": it is copied into the middle index of dim 2 of an
    all-zero tensor shaped like the model tensor. The resulting state dict is
    loaded non-strictly and the missing / unexpected keys are printed.
    Any other path is delegated to `load_checkpoint`.

    Args:
        model: the nn.Module to load into.
        ckpt_path: checkpoint path.
    """
    if not ckpt_path.endswith((".pt", ".pth")):
        load_checkpoint(model, ckpt_path)  # use the default function
        return

    state_dict = find_model(ckpt_path)
    # Compare against the model's own tensors; an nn.Module supports neither
    # `key in model` nor `model[key]`.
    model_state = model.state_dict()
    with torch.no_grad():
        # Iterate over a snapshot because entries are replaced in the loop.
        for key, source in list(state_dict.items()):
            target = model_state.get(key)
            if target is None or not torch.is_tensor(source):
                continue
            # central inflation
            if target.dim() == 5 and source.size() == target[:, :, 0, :, :].size():
                inflated = torch.zeros_like(target)
                # middle index of the dimension being inflated (dim 2)
                centre = target.size(2) // 2
                inflated[:, :, centre, :, :] = source
                # Store the result so that load_state_dict actually receives it.
                state_dict[key] = inflated
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    print(f"Missing keys: {missing_keys}")
    print(f"Unexpected keys: {unexpected_keys}")


@MODELS.register_module("DISCRIMINATOR_3D")
def DISCRIMINATOR_3D(from_pretrained=None, inflate_from_2d=False, use_pretrained=True, **kwargs):
    """Build a ``StyleGANDiscriminatorBlur`` and optionally load pretrained weights.

    Args:
        from_pretrained: Checkpoint path, or ``None`` to skip loading.
        inflate_from_2d: Load through ``load_checkpoint_with_inflation``
            instead of the plain ``load_checkpoint``.
        use_pretrained: When false, ``from_pretrained`` is ignored and only a
            message is printed.
        **kwargs: Forwarded to ``StyleGANDiscriminatorBlur``.

    Returns:
        The discriminator, initialised with ``xavier_uniform_weight_init``.
    """
    model = StyleGANDiscriminatorBlur(**kwargs).apply(xavier_uniform_weight_init)

    # Nothing to load: return the freshly initialised network.
    if from_pretrained is None:
        return model

    if not use_pretrained:
        print(f"discriminator use_pretrained={use_pretrained}, initializing new discriminator")
        return model

    if inflate_from_2d:
        load_checkpoint_with_inflation(model, from_pretrained)
        return model

    load_checkpoint(model, from_pretrained, model_name="discriminator")
    print("loaded discriminator")
    return model


@MODELS.register_module("N_Layer_DISCRIMINATOR_3D")
def DISCRIMINATOR_3D_N_Layer(from_pretrained=None, inflate_from_2d=False, use_pretrained=True, **kwargs):
    """Build a 3-layer ``NLayerDiscriminator3D`` and optionally load pretrained weights.

    Args:
        from_pretrained: Checkpoint path, or ``None`` to skip loading.
        inflate_from_2d: Load through ``load_checkpoint_with_inflation``
            instead of the plain ``load_checkpoint``.
        use_pretrained: When false, ``from_pretrained`` is ignored and only a
            message is printed.
        **kwargs: Accepted but not used; the network is always built with
            ``input_nc=3`` and ``n_layers=3``.

    Returns:
        The discriminator, initialised with ``n_layer_disc_weights_init``.
    """
    network = NLayerDiscriminator3D(input_nc=3, n_layers=3)
    model = network.apply(n_layer_disc_weights_init)

    wants_checkpoint = from_pretrained is not None
    if wants_checkpoint and use_pretrained:
        if inflate_from_2d:
            load_checkpoint_with_inflation(model, from_pretrained)
        else:
            load_checkpoint(model, from_pretrained, model_name="discriminator")
            print("loaded discriminator")
    elif wants_checkpoint:
        print(f"discriminator use_pretrained={use_pretrained}, initializing new discriminator")

    return model


================================================
FILE: Open-Sora/opensora/models/vae/losses.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat

from .lpips import LPIPS


def hinge_d_loss(logits_real, logits_fake):
    """Hinge discriminator loss.

    Returns half the sum of ``mean(relu(1 - logits_real))`` and
    ``mean(relu(1 + logits_fake))``.
    """
    real_term = F.relu(1.0 - logits_real).mean()
    fake_term = F.relu(1.0 + logits_fake).mean()
    return 0.5 * (real_term + fake_term)


def vanilla_d_loss(logits_real, logits_fake):
    """Softplus-based discriminator loss.

    Returns half the sum of ``mean(softplus(-logits_real))`` and
    ``mean(softplus(logits_fake))``.
    """
    softplus = torch.nn.functional.softplus
    real_term = torch.mean(softplus(-logits_real))
    fake_term = torch.mean(softplus(logits_fake))
    return 0.5 * (real_term + fake_term)


# from MAGVIT, used in place of hinge_d_loss
def sigmoid_cross_entropy_with_logits(labels, logits):
    """Element-wise sigmoid cross entropy computed from raw logits.

    With ``x = logits`` and ``z = labels`` this evaluates
    ``max(x, 0) - x * z + log(1 + exp(-abs(x)))``.
    """
    zero = torch.zeros_like(logits, dtype=logits.dtype)
    non_negative = logits >= zero
    positive_part = torch.where(non_negative, logits, zero)  # max(x, 0)
    minus_abs = torch.where(non_negative, -logits, logits)  # -abs(x)
    log_term = torch.log1p(torch.exp(minus_abs))
    return positive_part - logits * labels + log_term


def lecam_reg(real_pred, fake_pred, ema_real_pred, ema_fake_pred):
    """LeCam regularisation term.

    Sums ``relu(real_pred - ema_fake_pred) ** 2`` and
    ``relu(ema_real_pred - fake_pred) ** 2``. ``real_pred`` and
    ``ema_fake_pred`` must be 0-dim tensors.
    """
    assert real_pred.ndim == 0 and ema_fake_pred.ndim == 0
    real_gap = F.relu(real_pred - ema_fake_pred)
    fake_gap = F.relu(ema_real_pred - fake_pred)
    real_term = torch.mean(torch.pow(real_gap, 2))
    fake_term = torch.mean(torch.pow(fake_gap, 2))
    return real_term + fake_term


def gradient_penalty_fn(images, output):
    """Gradient penalty on ``output`` with respect to ``images``.

    Computes d(output)/d(images), flattens it per sample and returns the mean
    of ``(||grad||_2 - 1) ** 2`` over the batch (first) dimension.

    Args:
        images: Tensor that requires grad and that ``output`` was computed from.
        output: Tensor produced from ``images``.

    Returns:
        0-dim tensor; the autograd graph is kept (``create_graph=True``) so
        the penalty itself can be back-propagated.
    """
    gradients = torch.autograd.grad(
        outputs=output,
        inputs=images,
        # Match output's shape, dtype and device exactly instead of building a
        # default-dtype tensor on the device of ``images``.
        grad_outputs=torch.ones_like(output),
        create_graph=True,
        retain_graph=True,
        only_inputs=True,
    )[0]

    # Flatten every non-batch dimension: "b ... -> b (...)".
    gradients = gradients.reshape(gradients.shape[0], -1)
    return ((gradients.norm(2, dim=1) - 1) ** 2).mean()


class VAELoss(nn.Module):
    """Reconstruction (L1 plus optional LPIPS) and KL loss for the video VAE.

    Args:
        logvar_init: Initial value of the learnable scalar ``logvar`` that
            scales the reconstruction loss.
        perceptual_loss_weight: Weight of the LPIPS term; ``None`` or ``<= 0``
            disables it.
        kl_loss_weight: Weight of the KL term; ``None`` or ``<= 0`` disables it.
        device: Device the LPIPS network is moved to.
        dtype: ``torch.dtype`` or one of the strings ``"bf16"``, ``"fp16"``,
            ``"fp32"`` for the LPIPS network.

    Raises:
        NotImplementedError: If ``dtype`` is an unknown string.
    """

    # Accepted string aliases for ``dtype``.
    _DTYPE_ALIASES = {
        "bf16": torch.bfloat16,
        "fp16": torch.float16,
        "fp32": torch.float32,
    }

    def __init__(
        self,
        logvar_init=0.0,
        perceptual_loss_weight=0.1,
        kl_loss_weight=0.000001,
        device="cpu",
        dtype="bf16",
    ):
        super().__init__()

        if isinstance(dtype, str):
            if dtype not in self._DTYPE_ALIASES:
                raise NotImplementedError(f"dtype: {dtype}")
            dtype = self._DTYPE_ALIASES[dtype]

        # KL Loss
        self.kl_loss_weight = kl_loss_weight
        # Perceptual Loss
        self.perceptual_loss_fn = LPIPS().eval().to(device, dtype)
        self.perceptual_loss_weight = perceptual_loss_weight
        self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)

    def forward(
        self,
        video,
        recon_video,
        posterior,
        nll_weights=None,
        no_perceptual=False,
    ):
        """Compute the VAE losses.

        Args:
            video: Ground-truth video, laid out as ``b c t h w``.
            recon_video: Reconstruction with the same layout.
            posterior: Object whose ``kl()`` returns a per-sample KL tensor.
            nll_weights: Optional multiplier applied to the NLL loss.
            no_perceptual: Skip the LPIPS term even if its weight is positive.

        Returns:
            Tuple ``(nll_loss, weighted_nll_loss, weighted_kl_loss)``; the
            first two are summed and divided by the number of frames
            ``b * t``, and the last is ``0`` when the KL term is disabled.
        """
        # Frames are folded into the batch so the losses operate on 2D images.
        video = rearrange(video, "b c t h w -> (b t) c h w").contiguous()
        recon_video = rearrange(recon_video, "b c t h w -> (b t) c h w").contiguous()

        # reconstruction loss
        recon_loss = torch.abs(video - recon_video)

        # perceptual loss
        if self.perceptual_loss_weight is not None and self.perceptual_loss_weight > 0.0 and not no_perceptual:
            # handle channels: single-channel inputs are repeated to 3 channels
            channels = video.shape[1]
            assert channels in {1, 3}
            if channels == 1:
                input_vgg_input = repeat(video, "b 1 h w -> b c h w", c=3)
                recon_vgg_input = repeat(recon_video, "b 1 h w -> b c h w", c=3)
            else:
                input_vgg_input = video
                recon_vgg_input = recon_video

            perceptual_loss = self.perceptual_loss_fn(input_vgg_input, recon_vgg_input)
            recon_loss = recon_loss + self.perceptual_loss_weight * perceptual_loss

        nll_loss = recon_loss / torch.exp(self.logvar) + self.logvar

        weighted_nll_loss = nll_loss
        if nll_weights is not None:
            weighted_nll_loss = nll_weights * nll_loss
        weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
        nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]

        # KL Loss
        weighted_kl_loss = 0
        if self.kl_loss_weight is not None and self.kl_loss_weight > 0.0:
            kl_loss = posterior.kl()
            kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
            weighted_kl_loss = kl_loss * self.kl_loss_weight

        return nll_loss, weighted_nll_loss, weighted_kl_loss


def adopt_weight(weight, global_step, threshold=0, value=0.0):
    """Return ``value`` while ``global_step`` is below ``threshold``, else ``weight``."""
    return value if global_step < threshold else weight


class AdversarialLoss(nn.Module):
    """Generator-side adversarial loss with an adaptive weight.

    Args:
        discriminator_factor: Base factor; ``None`` or ``<= 0`` disables the
            adaptive weight (it becomes 0).
        discriminator_start: Step threshold passed to ``adopt_weight``; before
            it the factor is replaced by that function's default value.
        generator_factor: Multiplier applied to the adaptive weight.
        generator_loss_type: ``"hinge"`` or ``"non-saturating"``
            (``"vanilla"`` passes the assertion but is rejected in ``forward``).
    """

    def __init__(
        self,
        discriminator_factor=1.0,
        discriminator_start=50001,
        generator_factor=0.5,
        generator_loss_type="non-saturating",
    ):
        super().__init__()
        self.generator_loss_type = generator_loss_type
        self.generator_factor = generator_factor
        self.discriminator_start = discriminator_start
        self.discriminator_factor = discriminator_factor

    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer):
        """Ratio of the NLL and generator gradient norms at ``last_layer``.

        The ratio is clamped to ``[0, 1e4]``, detached and scaled by
        ``generator_factor``.
        """
        nll_grad_norm = torch.norm(torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0])
        gen_grad_norm = torch.norm(torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0])
        ratio = nll_grad_norm / (gen_grad_norm + 1e-4)
        clamped = torch.clamp(ratio, 0.0, 1e4).detach()
        return clamped * self.generator_factor

    def forward(
        self,
        fake_logits,
        nll_loss,
        last_layer,
        global_step,
        is_training=True,
    ):
        """Return the weighted generator loss for ``fake_logits``."""
        # NOTE: following MAGVIT to allow non_saturating
        assert self.generator_loss_type in ["hinge", "vanilla", "non-saturating"]

        loss_type = self.generator_loss_type
        if loss_type == "non-saturating":
            per_element = sigmoid_cross_entropy_with_logits(
                labels=torch.ones_like(fake_logits), logits=fake_logits
            )
            gen_loss = torch.mean(per_element)
        elif loss_type == "hinge":
            gen_loss = -torch.mean(fake_logits)
        else:
            raise ValueError("Generator loss {} not supported".format(self.generator_loss_type))

        d_weight = None
        if self.discriminator_factor is not None and self.discriminator_factor > 0.0:
            try:
                d_weight = self.calculate_adaptive_weight(nll_loss, gen_loss, last_layer)
            except RuntimeError:
                # Gradients may be unavailable outside training; anything else is an error.
                assert not is_training
        if d_weight is None:
            d_weight = torch.tensor(0.0)

        disc_factor = adopt_weight(self.discriminator_factor, global_step, threshold=self.discriminator_start)
        return d_weight * disc_factor * gen_loss


class LeCamEMA:
    """Exponential moving averages of the real and fake discriminator predictions."""

    def __init__(self, ema_real=0.0, ema_fake=0.0, decay=0.999, dtype=torch.bfloat16, device="cpu"):
        self.decay = decay
        self.ema_real, self.ema_fake = (
            torch.tensor(initial).to(device, dtype) for initial in (ema_real, ema_fake)
        )

    def update(self, ema_real, ema_fake):
        """Blend the new observations into the running averages."""
        keep = self.decay
        blend = 1 - self.decay
        self.ema_real = self.ema_real * keep + ema_real * blend
        self.ema_fake = self.ema_fake * keep + ema_fake * blend

    def get(self):
        """Return ``(ema_real, ema_fake)``."""
        return self.ema_real, self.ema_fake


class DiscriminatorLoss(nn.Module):
    """Discriminator loss with optional LeCam and gradient-penalty terms.

    Args:
        discriminator_factor: Base weight of the adversarial term; ``None`` or
            ``<= 0`` disables it.
        discriminator_start: Step threshold passed to ``adopt_weight``.
        discriminator_loss_type: ``"hinge"``, ``"vanilla"`` or
            ``"non-saturating"``.
        lecam_loss_weight: Weight of the LeCam term; ``None`` or ``<= 0``
            disables it.
        gradient_penalty_loss_weight: Weight of the gradient penalty; ``None``
            or ``<= 0`` disables it.
    """

    def __init__(
        self,
        discriminator_factor=1.0,
        discriminator_start=50001,
        discriminator_loss_type="non-saturating",
        lecam_loss_weight=None,
        gradient_penalty_loss_weight=None,  # SCH: following MAGVIT config.vqgan.grad_penalty_cost
    ):
        super().__init__()

        assert discriminator_loss_type in ["hinge", "vanilla", "non-saturating"]
        self.discriminator_factor = discriminator_factor
        self.discriminator_start = discriminator_start
        self.lecam_loss_weight = lecam_loss_weight
        self.gradient_penalty_loss_weight = gradient_penalty_loss_weight
        self.discriminator_loss_type = discriminator_loss_type

    def forward(
        self,
        real_logits,
        fake_logits,
        global_step,
        lecam_ema_real=None,
        lecam_ema_fake=None,
        real_video=None,
        split="train",
    ):
        """Compute the discriminator losses.

        Args:
            real_logits: Discriminator output on real data. May be ``None``
                for the ``"non-saturating"`` loss, in which case its term is 0.
            fake_logits: Discriminator output on generated data (same ``None``
                handling as ``real_logits``).
            global_step: Current step, compared with ``discriminator_start``.
            lecam_ema_real: EMA of real predictions, used by the LeCam term.
            lecam_ema_fake: EMA of fake predictions, used by the LeCam term.
            real_video: Input that produced ``real_logits``; required when the
                gradient penalty is enabled.
            split: Unused.

        Returns:
            Tuple ``(weighted_d_adversarial_loss, lecam_loss, gradient_penalty)``.
        """
        if self.discriminator_factor is not None and self.discriminator_factor > 0.0:
            disc_factor = adopt_weight(self.discriminator_factor, global_step, threshold=self.discriminator_start)

            if self.discriminator_loss_type == "hinge":
                disc_loss = hinge_d_loss(real_logits, fake_logits)
            elif self.discriminator_loss_type == "non-saturating":
                # Take the mean only on tensors: torch.mean() rejects the plain
                # float used as the "missing logits" placeholder.
                if real_logits is not None:
                    real_loss = torch.mean(
                        sigmoid_cross_entropy_with_logits(labels=torch.ones_like(real_logits), logits=real_logits)
                    )
                else:
                    real_loss = 0.0
                if fake_logits is not None:
                    fake_loss = torch.mean(
                        sigmoid_cross_entropy_with_logits(labels=torch.zeros_like(fake_logits), logits=fake_logits)
                    )
                else:
                    fake_loss = 0.0
                disc_loss = 0.5 * (real_loss + fake_loss)
            elif self.discriminator_loss_type == "vanilla":
                disc_loss = vanilla_d_loss(real_logits, fake_logits)
            else:
                raise ValueError(f"Unknown GAN loss '{self.discriminator_loss_type}'.")

            weighted_d_adversarial_loss = disc_factor * disc_loss

        else:
            weighted_d_adversarial_loss = 0

        lecam_loss = torch.tensor(0.0)
        if self.lecam_loss_weight is not None and self.lecam_loss_weight > 0.0:
            real_pred = torch.mean(real_logits)
            fake_pred = torch.mean(fake_logits)
            lecam_loss = lecam_reg(real_pred, fake_pred, lecam_ema_real, lecam_ema_fake)
            lecam_loss = lecam_loss * self.lecam_loss_weight

        gradient_penalty = torch.tensor(0.0)
        if self.gradient_penalty_loss_weight is not None and self.gradient_penalty_loss_weight > 0.0:
            assert real_video is not None
            gradient_penalty = gradient_penalty_fn(real_video, real_logits)
            gradient_penalty = gradient_penalty * self.gradient_penalty_loss_weight

        return (weighted_d_adversarial_loss, lecam_loss, gradient_penalty)


================================================
FILE: Open-Sora/opensora/models/vae/lpips.py
================================================
import hashlib
import os
from collections import namedtuple

import requests
import torch
import torch.nn as nn
from torchvision import models
from tqdm import tqdm

# Download URL of each supported checkpoint, keyed by checkpoint name.
URL_MAP = {"vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"}

# File name (relative to the chosen root directory) each checkpoint is stored under.
CKPT_MAP = {"vgg_lpips": "vgg.pth"}

# Expected MD5 hex digest of each checkpoint file, used to validate downloads.
MD5_MAP = {"vgg_lpips": "d507d7349b931f0638a25a48a722f98a"}


def md5_hash(path, chunk_size=1 << 20):
    """Return the MD5 hex digest of the file at ``path``.

    The file is hashed in ``chunk_size``-byte pieces so that large checkpoint
    files are never loaded into memory in one go.
    """
    digest = hashlib.md5()
    with open(path, "rb") as f:
        # iter() with a sentinel stops at the first empty read (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def download(url, local_path, chunk_size=1024):
    """Stream ``url`` to ``local_path`` while showing a progress bar.

    Args:
        url: Address to fetch.
        local_path: Destination file; its parent directory is created if needed.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            that an error page is never saved as if it were the checkpoint.
    """
    target_dir = os.path.split(local_path)[0]
    # A bare file name has no directory part; os.makedirs("") would fail.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        total_size = int(r.headers.get("content-length", 0))
        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
            with open(local_path, "wb") as f:
                for data in r.iter_content(chunk_size=chunk_size):
                    if data:
                        f.write(data)
                        # The last chunk is usually shorter than chunk_size.
                        pbar.update(len(data))


def get_ckpt_path(name, root, check=False):
    """Return the local path of checkpoint ``name`` under ``root``.

    The file is downloaded when it is missing, or when ``check`` is set and
    its MD5 digest does not match ``MD5_MAP``. A fresh download is always
    verified against the expected digest.
    """
    assert name in URL_MAP
    path = os.path.join(root, CKPT_MAP[name])

    must_download = not os.path.exists(path)
    if not must_download and check:
        must_download = md5_hash(path) != MD5_MAP[name]

    if must_download:
        print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
        download(URL_MAP[name], path)
        md5 = md5_hash(path)
        assert md5 == MD5_MAP[name], md5
    return path


class LPIPS(nn.Module):
    """Learned perceptual metric built on VGG16 features.

    The pretrained linear-layer weights are loaded on construction and every
    parameter of the module is frozen.
    """

    # Directory the pretrained checkpoint is stored in / downloaded to.
    _CKPT_ROOT = "pretrained_models/taming/modules/autoencoder/lpips"

    def __init__(self, use_dropout=True):
        super().__init__()
        self.scaling_layer = ScalingLayer()
        self.chns = [64, 128, 256, 512, 512]  # vgg16 features
        self.net = vgg16(pretrained=True, requires_grad=False)
        self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
        self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
        self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
        self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
        self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
        self.load_from_pretrained()
        for param in self.parameters():
            param.requires_grad = False

    def load_from_pretrained(self, name="vgg_lpips"):
        """Load checkpoint ``name`` (non-strictly), downloading it if missing."""
        ckpt = get_ckpt_path(name, self._CKPT_ROOT)
        self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)
        # print("loaded pretrained LPIPS loss from {}".format(ckpt))

    @classmethod
    def from_pretrained(cls, name="vgg_lpips"):
        """Build a model and load checkpoint ``name`` into it.

        Raises:
            NotImplementedError: For any name other than ``"vgg_lpips"``.
        """
        if name != "vgg_lpips":
            raise NotImplementedError
        model = cls()
        # get_ckpt_path() requires a root directory; go through the instance
        # loader so both entry points resolve the same location.
        model.load_from_pretrained(name)
        return model

    def forward(self, input, target):
        """Return the perceptual distance between ``input`` and ``target``.

        For each of the five VGG feature maps the channel-normalised features
        are differenced and squared, passed through the matching linear layer
        and spatially averaged (``keepdim=True``); the five results are summed.
        """
        in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target))
        outs0, outs1 = self.net(in0_input), self.net(in1_input)
        lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]

        val = None
        for kk, lin in enumerate(lins):
            diff = (normalize_tensor(outs0[kk]) - normalize_tensor(outs1[kk])) ** 2
            layer_score = spatial_average(lin.model(diff), keepdim=True)
            val = layer_score if val is None else val + layer_score
        return val


class ScalingLayer(nn.Module):
    """Per-channel affine normalisation ``(inp - shift) / scale`` for 3-channel inputs."""

    def __init__(self):
        super().__init__()
        shift_values = torch.Tensor([-0.030, -0.088, -0.188])
        scale_values = torch.Tensor([0.458, 0.448, 0.450])
        # Stored with shape [1, 3, 1, 1] so they broadcast over batch and space.
        self.register_buffer("shift", shift_values[None, :, None, None])
        self.register_buffer("scale", scale_values[None, :, None, None])

    def forward(self, inp):
        centred = inp - self.shift
        return centred / self.scale


class NetLinLayer(nn.Module):
    """A single linear layer which does a 1x1 conv, optionally preceded by dropout."""

    def __init__(self, chn_in, chn_out=1, use_dropout=False):
        super().__init__()
        stages = []
        if use_dropout:
            stages.append(nn.Dropout())
        stages.append(nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False))
        self.model = nn.Sequential(*stages)


class vgg16(torch.nn.Module):
    """torchvision VGG16 feature extractor split into five consecutive slices.

    ``forward`` returns the output of every slice as a ``VggOutputs`` named
    tuple (``relu1_2``, ``relu2_2``, ``relu3_3``, ``relu4_3``, ``relu5_3``).
    """

    def __init__(self, requires_grad=False, pretrained=True):
        super().__init__()
        features = models.vgg16(pretrained=pretrained).features

        # [start, stop) ranges of ``features`` layers assigned to slice1..slice5.
        slice_bounds = ((0, 4), (4, 9), (9, 16), (16, 23), (23, 30))
        for slice_no, (start, stop) in enumerate(slice_bounds, start=1):
            block = torch.nn.Sequential()
            for layer_idx in range(start, stop):
                # Layers keep their original index as sub-module name.
                block.add_module(str(layer_idx), features[layer_idx])
            setattr(self, f"slice{slice_no}", block)
        self.N_slices = 5

        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        h_relu1_2 = self.slice1(X)
        h_relu2_2 = self.slice2(h_relu1_2)
        h_relu3_3 = self.slice3(h_relu2_2)
        h_relu4_3 = self.slice4(h_relu3_3)
        h_relu5_3 = self.slice5(h_relu4_3)
        vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"])
        return vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)


def normalize_tensor(x, eps=1e-10):
    """Divide ``x`` by its L2 norm along dim 1 (``eps`` guards against division by zero)."""
    squared_sum = torch.sum(x**2, dim=1, keepdim=True)
    norm_factor = squared_sum.sqrt()
    return x / (norm_factor + eps)


def spatial_average(x, keepdim=True):
    """Average ``x`` over dims 2 and 3."""
    spatial_dims = [2, 3]
    return x.mean(spatial_dims, keepdim=keepdim)


================================================
FILE: Open-Sora/opensora/models/vae/utils.py
================================================
import numpy as np
import torch

"""Stripped version of https://github.com/richzhang/PerceptualSimilarity/tree/master/models"""


class DiagonalGaussianDistribution(object):
    """Diagonal Gaussian parameterised by a tensor holding mean and log-variance.

    ``parameters`` is split in two halves along dim 1: the first half is the
    mean, the second the log-variance (clamped to ``[-30, 20]``). With
    ``deterministic=True`` the standard deviation and variance are zero.
    """

    def __init__(
        self,
        parameters,
        deterministic=False,
    ):
        self.parameters = parameters
        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device, dtype=self.mean.dtype)

    def sample(self):
        """Draw ``mean + std * eps`` with ``eps`` standard normal."""
        # torch.randn: standard normal distribution
        noise = torch.randn(self.mean.shape).to(device=self.parameters.device, dtype=self.mean.dtype)
        return self.mean + self.std * noise

    def kl(self, other=None):
        """KL divergence to ``other`` (standard normal when ``None``).

        The result is summed over dims 1-4. A deterministic distribution
        returns a single zero placed on the same device as the mean.
        """
        if self.deterministic:
            return torch.zeros(1, device=self.mean.device)
        if other is None:  # SCH: assumes other is a standard normal distribution
            return 0.5 * torch.sum(torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, dim=[1, 2, 3, 4])
        return 0.5 * torch.sum(
            torch.pow(self.mean - other.mean, 2) / other.var
            + self.var / other.var
            - 1.0
            - self.logvar
            + other.logvar,
            dim=[1, 2, 3, 4],
        )

    def nll(self, sample, dims=(1, 2, 3, 4)):
        """Negative log-likelihood of ``sample``, summed over ``dims``.

        A deterministic distribution returns a single zero placed on the same
        device as the mean.
        """
        if self.deterministic:
            return torch.zeros(1, device=self.mean.device)
        logtwopi = np.log(2.0 * np.pi)
        return 0.5 * torch.sum(logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, dim=tuple(dims))

    def mode(self):
        """Return the mean, which is the mode of a Gaussian."""
        return self.mean


================================================
FILE: Open-Sora/opensora/models/vae/vae.py
================================================
import os

import torch
import torch.nn as nn
from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder
from einops import rearrange
from transformers import PretrainedConfig, PreTrainedModel

from opensora.registry import MODELS, build_module
from opensora.utils.ckpt_utils import load_checkpoint


@MODELS.register_module()
class VideoAutoencoderKL(nn.Module):
    """Frame-wise wrapper around a diffusers ``AutoencoderKL`` for video tensors.

    Videos of layout ``(B, C, T, H, W)`` are folded to ``(B * T, C, H, W)``,
    processed by the image autoencoder (optionally in micro-batches of
    ``micro_batch_size`` frames) and unfolded again.
    """

    def __init__(
        self,
        from_pretrained=None,
        micro_batch_size=None,
        cache_dir=None,
        local_files_only=False,
        subfolder=None,
        scaling_factor=0.18215,
    ):
        super().__init__()
        self.module = AutoencoderKL.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
            subfolder=subfolder,
        )
        self.out_channels = self.module.config.latent_channels
        self.patch_size = (1, 8, 8)
        self.micro_batch_size = micro_batch_size
        self.scaling_factor = scaling_factor

    def _encode_frames(self, frames):
        # Sample the latent and scale it in place.
        return self.module.encode(frames).latent_dist.sample().mul_(self.scaling_factor)

    def _decode_frames(self, latents):
        return self.module.decode(latents / self.scaling_factor).sample

    def encode(self, x):
        """Encode a video ``x`` of layout (B, C, T, H, W) into scaled latents."""
        batch = x.shape[0]
        frames = rearrange(x, "B C T H W -> (B T) C H W")

        step = self.micro_batch_size
        if step is None:
            latents = self._encode_frames(frames)
        else:
            # NOTE: cannot be used for training
            chunks = [self._encode_frames(frames[start : start + step]) for start in range(0, frames.shape[0], step)]
            latents = torch.cat(chunks, dim=0)
        return rearrange(latents, "(B T) C H W -> B C T H W", B=batch)

    def decode(self, x, **kwargs):
        """Decode latents ``x`` of layout (B, C, T, H, W); extra kwargs are ignored."""
        batch = x.shape[0]
        latents = rearrange(x, "B C T H W -> (B T) C H W")

        step = self.micro_batch_size
        if step is None:
            frames = self._decode_frames(latents)
        else:
            # NOTE: cannot be used for training
            chunks = [self._decode_frames(latents[start : start + step]) for start in range(0, latents.shape[0], step)]
            frames = torch.cat(chunks, dim=0)
        return rearrange(frames, "(B T) C H W -> B C T H W", B=batch)

    def get_latent_size(self, input_size):
        """Floor-divide each of the three input dims by ``patch_size``; ``None`` stays ``None``."""
        return [
            None if input_size[axis] is None else input_size[axis] // self.patch_size[axis]
            for axis in range(3)
        ]

    @property
    def device(self):
        first_param = next(self.parameters())
        return first_param.device

    @property
    def dtype(self):
        first_param = next(self.parameters())
        return first_param.dtype


@MODELS.register_module()
class VideoAutoencoderKLTemporalDecoder(nn.Module):
    """Decode-only wrapper around a diffusers ``AutoencoderKLTemporalDecoder``."""

    def __init__(self, from_pretrained=None, cache_dir=None, local_files_only=False):
        super().__init__()
        self.module = AutoencoderKLTemporalDecoder.from_pretrained(
            from_pretrained,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self.out_channels = self.module.config.latent_channels
        self.patch_size = (1, 8, 8)

    def encode(self, x):
        """Encoding is not supported by this wrapper."""
        raise NotImplementedError

    def decode(self, x, **kwargs):
        """Decode latents of layout (B, C, T, H, W); extra kwargs are ignored."""
        batch, _, frame_count = x.shape[:3]
        latents = rearrange(x, "B C T H W -> (B T) C H W")
        frames = self.module.decode(latents / 0.18215, num_frames=frame_count).sample
        return rearrange(frames, "(B T) C H W -> B C T H W", B=batch)

    def get_latent_size(self, input_size):
        """Floor-divide each of the three input dims by ``patch_size``; ``None`` stays ``None``."""
        return [
            None if input_size[axis] is None else input_size[axis] // self.patch_size[axis]
            for axis in range(3)
        ]

    @property
    def device(self):
        first_param = next(self.parameters())
        return first_param.device

    @property
    def dtype(self):
        first_param = next(self.parameters())
        return first_param.dtype


class VideoAutoencoderPipelineConfig(PretrainedConfig):
    """Configuration of ``VideoAutoencoderPipeline``.

    Stores the spatial/temporal VAE build configs together with the freezing,
    loss, micro-frame and latent normalisation (``shift`` / ``scale``)
    settings; remaining keyword arguments go to ``PretrainedConfig``.
    """

    model_type = "VideoAutoencoderPipeline"

    def __init__(
        self,
        vae_2d=None,
        vae_temporal=None,
        from_pretrained=None,
        freeze_vae_2d=False,
        cal_loss=False,
        micro_frame_size=None,
        shift=0.0,
        scale=1.0,
        **kwargs,
    ):
        own_fields = {
            "vae_2d": vae_2d,
            "vae_temporal": vae_temporal,
            "from_pretrained": from_pretrained,
            "freeze_vae_2d": freeze_vae_2d,
            "cal_loss": cal_loss,
            "micro_frame_size": micro_frame_size,
            "shift": shift,
            "scale": scale,
        }
        # Own fields are set before the base initialiser runs, as before.
        for field_name, field_value in own_fields.items():
            setattr(self, field_name, field_value)
        super().__init__(**kwargs)


class VideoAutoencoderPipeline(PreTrainedModel):
    """Two-stage video autoencoder: a spatial VAE followed by a temporal VAE.

    Both sub-models are built from the config through the ``MODELS`` registry.
    When ``cal_loss`` is false, ``encode`` returns latents normalised with the
    ``shift`` / ``scale`` buffers and ``decode`` undoes that normalisation.
    When ``cal_loss`` is true, the raw intermediates needed for loss
    computation are returned instead.
    """

    config_class = VideoAutoencoderPipelineConfig

    def __init__(self, config: VideoAutoencoderPipelineConfig):
        super().__init__(config=config)
        self.spatial_vae = build_module(config.vae_2d, MODELS)
        self.temporal_vae = build_module(config.vae_temporal, MODELS)
        self.cal_loss = config.cal_loss
        self.micro_frame_size = config.micro_frame_size
        # Temporal latent length that corresponds to one micro chunk of frames.
        self.micro_z_frame_size = self.temporal_vae.get_latent_size([config.micro_frame_size, None, None])[0]

        if config.freeze_vae_2d:
            for param in self.spatial_vae.parameters():
                param.requires_grad = False

        self.out_channels = self.temporal_vae.out_channels

        # normalization parameters
        scale = torch.tensor(config.scale)
        shift = torch.tensor(config.shift)
        # Non-scalar values are reshaped to [1, C, 1, 1, 1] so they broadcast
        # over a 5-D latent.
        if len(scale.shape) > 0:
            scale = scale[None, :, None, None, None]
        if len(shift.shape) > 0:
            shift = shift[None, :, None, None, None]
        self.register_buffer("scale", scale)
        self.register_buffer("shift", shift)

    def encode(self, x):
        """Encode ``x`` with the spatial VAE, then the temporal VAE.

        With ``micro_frame_size`` set, the spatial latent is split along dim 2
        into chunks of that many frames, each chunk is encoded separately and
        the samples are concatenated along dim 2.

        Returns:
            ``(z, posterior, x_z)`` when ``cal_loss`` is true, otherwise the
            normalised latent ``(z - shift) / scale``.
        """
        x_z = self.spatial_vae.encode(x)

        if self.micro_frame_size is None:
            posterior = self.temporal_vae.encode(x_z)
            z = posterior.sample()
        else:
            z_list = []
            for i in range(0, x_z.shape[2], self.micro_frame_size):
                x_z_bs = x_z[:, :, i : i + self.micro_frame_size]
                posterior = self.temporal_vae.encode(x_z_bs)
                z_list.append(posterior.sample())
            z = torch.cat(z_list, dim=2)

        if self.cal_loss:
            # NOTE(review): in the chunked branch ``posterior`` is the one of
            # the last chunk only -- confirm this is what the loss expects.
            return z, posterior, x_z
        else:
            return (z - self.shift) / self.scale

    def decode(self, z, num_frames=None):
        """Decode latent ``z`` with the temporal VAE, then the spatial VAE.

        Args:
            z: Latent; de-normalised with ``scale`` / ``shift`` first unless
                ``cal_loss`` is true.
            num_frames: Number of frames to reconstruct, forwarded to the
                temporal decoder.

        Returns:
            ``(x, x_z)`` when ``cal_loss`` is true, otherwise ``x``.
        """
        if not self.cal_loss:
            z = z * self.scale.to(z.dtype) + self.shift.to(z.dtype)

        if self.micro_frame_size is None:
            x_z = self.temporal_vae.decode(z, num_frames=num_frames)
            x = self.spatial_vae.decode(x_z)
        else:
            x_z_list = []
            # NOTE(review): this branch needs an integer ``num_frames``; the
            # default ``None`` would fail in ``min()`` below -- confirm callers
            # always pass it.
            for i in range(0, z.size(2), self.micro_z_frame_size):
                z_bs = z[:, :, i : i + self.micro_z_frame_size]
                x_z_bs = self.temporal_vae.decode(z_bs, num_frames=min(self.micro_frame_size, num_frames))
                x_z_list.append(x_z_bs)
                # Frames still to be produced by the remaining chunks.
                num_frames -= self.micro_frame_size
            x_z = torch.cat(x_z_list, dim=2)
            x = self.spatial_vae.decode(x_z)

        if self.cal_loss:
            return x, x_z
        else:
            return x

    def forward(self, x):
        """Encode then decode ``x``; only available when ``cal_loss`` is true.

        Returns:
            ``(x_rec, x_z_rec, z, posterior, x_z)``.
        """
        assert self.cal_loss, "This method is only available when cal_loss is True"
        z, posterior, x_z = self.encode(x)
        x_rec, x_z_rec = self.decode(z, num_frames=x_z.shape[2])
        return x_rec, x_z_rec, z, posterior, x_z

    def get_latent_size(self, input_size):
        """Return the latent size for ``input_size`` (three entries, the first temporal).

        Without micro-frame chunking (or with an unknown temporal size) the
        spatial and temporal latent-size computations are simply chained.
        Otherwise the temporal size is the per-chunk latent length times the
        number of full chunks, plus the latent length of a trailing partial
        chunk if there is one.
        """
        if self.micro_frame_size is None or input_size[0] is None:
            return self.temporal_vae.get_latent_size(self.spatial_vae.get_latent_size(input_size))
        else:
            sub_input_size = [self.micro_frame_size, input_size[1], input_size[2]]
            sub_latent_size = self.temporal_vae.get_latent_size(self.spatial_vae.get_latent_size(sub_input_size))
            sub_latent_size[0] = sub_latent_size[0] * (input_size[0] // self.micro_frame_size)
            remain_temporal_size = [input_size[0] % self.micro_frame_size, None, None]
            if remain_temporal_size[0] > 0:
                remain_size = self.temporal_vae.get_latent_size(remain_temporal_size)
                sub_latent_size[0] += remain_size[0]
            return sub_latent_size

    def get_temporal_last_layer(self):
        """Return the weight of the temporal decoder's output convolution."""
        return self.temporal_vae.decoder.conv_out.conv.weight

    @property
    def device(self):
        """Device of the first parameter."""
        return next(self.parameters()).device

    @property
    def dtype(self):
        """Dtype of the first parameter."""
        return next(self.parameters()).dtype


@MODELS.register_module()
def OpenSoraVAE_V1_2(
    micro_batch_size=4,
    micro_frame_size=17,
    from_pretrained=None,
    local_files_only=False,
    freeze_vae_2d=False,
    cal_loss=False,
    force_huggingface=False,
):
    """Build the OpenSora v1.2 ``VideoAutoencoderPipeline``.

    The pipeline is loaded through ``VideoAutoencoderPipeline.from_pretrained``
    when ``force_huggingface`` is set or ``from_pretrained`` is not an existing
    local path. Otherwise it is built from a fresh config and, if
    ``from_pretrained`` is given, filled with ``load_checkpoint``.
    """
    spatial_cfg = {
        "type": "VideoAutoencoderKL",
        "from_pretrained": "/root/autodl-tmp/pretrained_models/PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
        "subfolder": "vae",
        "micro_batch_size": micro_batch_size,
        "local_files_only": local_files_only,
    }
    temporal_cfg = {
        "type": "VAE_Temporal_SD",
        "from_pretrained": None,
    }
    pipeline_kwargs = {
        "vae_2d": spatial_cfg,
        "vae_temporal": temporal_cfg,
        "freeze_vae_2d": freeze_vae_2d,
        "cal_loss": cal_loss,
        "micro_frame_size": micro_frame_size,
        # Per-channel latent normalisation constants.
        "shift": (-0.10, 0.34, 0.27, 0.98),
        "scale": (3.85, 2.32, 2.33, 3.06),
    }

    if force_huggingface or (from_pretrained is not None and not os.path.exists(from_pretrained)):
        return VideoAutoencoderPipeline.from_pretrained(from_pretrained, **pipeline_kwargs)

    model = VideoAutoencoderPipeline(VideoAutoencoderPipelineConfig(**pipeline_kwargs))
    if from_pretrained:
        load_checkpoint(model, from_pretrained)
    return model


================================================
FILE: Open-Sora/opensora/models/vae/vae_temporal.py
================================================
from typing import Tuple, Union

import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange

from opensora.registry import MODELS
from opensora.utils.ckpt_utils import load_checkpoint

from .utils import DiagonalGaussianDistribution


def cast_tuple(t, length=1):
    """Return ``t`` unchanged if it is a tuple, else a tuple repeating it ``length`` times."""
    if isinstance(t, tuple):
        return t
    return (t,) * length


def divisible_by(num, den):
    """Return whether ``num`` is an exact multiple of ``den``."""
    remainder = num % den
    return remainder == 0


def is_odd(n):
    """Return whether ``n`` is not divisible by 2."""
    is_even = divisible_by(n, 2)
    return not is_even


def pad_at_dim(t, pad, dim=-1):
    """Constant-pad ``t`` along a single dimension.

    ``pad`` is the ``(before, after)`` pair for ``dim``; every dimension to
    the right of ``dim`` receives ``(0, 0)`` because ``F.pad`` lists padding
    starting from the last dimension.
    """
    if dim < 0:
        trailing_dims = -dim - 1
    else:
        trailing_dims = t.ndim - dim - 1
    untouched = (0, 0) * trailing_dims
    return F.pad(t, (*untouched, *pad), mode="constant")


def exists(v):
    """Return ``True`` for anything except ``None``."""
    return not (v is None)


class CausalConv3d(nn.Module):
    """3D convolution that pads the temporal axis only at the front.

    Height and width are padded on both sides by half the kernel size, while
    the time axis receives ``dilation * (kt - 1) + (1 - stride)`` entries
    before the first frame and none after the last.
    """

    def __init__(
        self,
        chan_in,
        chan_out,
        kernel_size: Union[int, Tuple[int, int, int]],
        pad_mode="constant",
        strides=None,  # allow custom stride
        **kwargs,
    ):
        super().__init__()
        kernel_size = cast_tuple(kernel_size, 3)
        kernel_t, kernel_h, kernel_w = kernel_size

        assert is_odd(kernel_h) and is_odd(kernel_w)

        dilation = kwargs.pop("dilation", 1)
        if strides is None:
            # A scalar ``stride`` keyword only applies to the temporal axis.
            time_stride = kwargs.pop("stride", 1)
            conv_stride = (time_stride, 1, 1)
        else:
            time_stride = strides[0]
            conv_stride = strides

        self.pad_mode = pad_mode
        self.time_pad = dilation * (kernel_t - 1) + (1 - time_stride)
        pad_h = kernel_h // 2
        pad_w = kernel_w // 2
        # F.pad order: (w_left, w_right, h_top, h_bottom, t_front, t_back).
        self.time_causal_padding = (pad_w, pad_w, pad_h, pad_h, self.time_pad, 0)

        self.conv = nn.Conv3d(
            chan_in,
            chan_out,
            kernel_size,
            stride=conv_stride,
            dilation=(dilation, 1, 1),
            **kwargs,
        )

    def forward(self, x):
        padded = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
        return self.conv(padded)


class ResBlock(nn.Module):
    """Residual block made of two (GroupNorm -> activation -> conv) stages.

    When ``in_channels`` differs from ``filters`` the skip path is projected by
    an extra convolution (3x3x3 if ``use_conv_shortcut`` else 1x1x1); otherwise
    the input is added back unchanged.

    Args:
        in_channels: channels of the input tensor.
        filters: channels produced by the block.
        conv_fn: callable building a conv layer; invoked as
            ``conv_fn(in, out, kernel_size=..., bias=False)``.
        activation_fn: activation module class, instantiated once per block.
        use_conv_shortcut: selects the kernel size of the skip projection.
        num_groups: number of groups for both ``nn.GroupNorm`` layers.
    """

    def __init__(
        self,
        in_channels,
        filters,
        conv_fn,
        activation_fn=nn.SiLU,
        use_conv_shortcut=False,
        num_groups=32,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.filters = filters
        self.activate = activation_fn()
        self.use_conv_shortcut = use_conv_shortcut

        full_kernel = (3, 3, 3)
        self.norm1 = nn.GroupNorm(num_groups, in_channels)
        self.conv1 = conv_fn(in_channels, self.filters, kernel_size=full_kernel, bias=False)
        self.norm2 = nn.GroupNorm(num_groups, self.filters)
        self.conv2 = conv_fn(self.filters, self.filters, kernel_size=full_kernel, bias=False)
        if in_channels != filters:
            # Skip-path projection is only created when the channel count changes.
            shortcut_kernel = full_kernel if self.use_conv_shortcut else (1, 1, 1)
            self.conv3 = conv_fn(in_channels, self.filters, kernel_size=shortcut_kernel, bias=False)

    def forward(self, x):
        """Return ``main_branch(x) + skip(x)``."""
        hidden = self.conv1(self.activate(self.norm1(x)))
        hidden = self.conv2(self.activate(self.norm2(hidden)))
        if self.in_channels == self.filters:
            skip = x
        else:
            skip = self.conv3(x)
        return hidden + skip


def get_activation_fn(activation):
    """Translate an activation name into the matching ``torch.nn`` module class.

    ``"relu"`` maps to ``nn.ReLU`` and ``"swish"`` to ``nn.SiLU``; any other
    value raises ``NotImplementedError``.
    """
    if activation == "relu":
        return nn.ReLU
    if activation == "swish":
        return nn.SiLU
    raise NotImplementedError


class Encoder(nn.Module):
    """Encoder built from CausalConv3d layers and ResBlock stages.

    Layout: ``conv_in`` -> one stage of ``num_res_blocks`` ResBlocks per entry
    of ``channel_multipliers`` (every stage but the last is followed by a
    time-stride-2 CausalConv3d or an ``nn.Identity``, as selected by
    ``temporal_downsample``) -> ``num_res_blocks`` more ResBlocks ->
    GroupNorm -> activation -> 1x1x1 conv to ``latent_embed_dim`` channels.
    """

    def __init__(
        self,
        in_out_channels=4,
        latent_embed_dim=512,  # num channels for latent vector
        filters=128,
        num_res_blocks=4,
        channel_multipliers=(1, 2, 2, 4),
        temporal_downsample=(False, True, True),
        num_groups=32,  # for nn.GroupNorm
        activation_fn="swish",
    ):
        super().__init__()
        self.filters = filters
        self.num_res_blocks = num_res_blocks
        self.num_blocks = len(channel_multipliers)
        self.channel_multipliers = channel_multipliers
        self.temporal_downsample = temporal_downsample
        self.num_groups = num_groups
        self.embedding_dim = latent_embed_dim

        self.activation_fn = get_activation_fn(activation_fn)
        self.activate = self.activation_fn()
        self.conv_fn = CausalConv3d
        # Keyword arguments shared by every ResBlock of this encoder.
        self.block_args = {
            "conv_fn": self.conv_fn,
            "activation_fn": self.activation_fn,
            "use_conv_shortcut": False,
            "num_groups": self.num_groups,
        }

        # Stem convolution, created without bias.
        self.conv_in = self.conv_fn(in_out_channels, filters, kernel_size=(3, 3, 3), bias=False)

        # Per-stage ResBlocks and the transition layer that follows each stage.
        self.block_res_blocks = nn.ModuleList()
        self.conv_blocks = nn.ModuleList()

        final_stage = self.num_blocks - 1
        width = self.filters
        in_width = width  # channel count entering the next layer
        for stage_idx, multiplier in enumerate(self.channel_multipliers):
            width = self.filters * multiplier
            stage = nn.ModuleList()
            for _ in range(self.num_res_blocks):
                stage.append(ResBlock(in_width, width, **self.block_args))
                in_width = width
            self.block_res_blocks.append(stage)

            if stage_idx == final_stage:
                continue  # no transition layer after the last stage
            if self.temporal_downsample[stage_idx]:
                # Halve the time axis; the spatial stride stays 1.
                transition = self.conv_fn(in_width, width, kernel_size=(3, 3, 3), strides=(2, 1, 1))
            else:
                # Placeholder so conv_blocks stays index-aligned with the stages.
                transition = nn.Identity(in_width)
            self.conv_blocks.append(transition)
            in_width = width

        # Final ResBlocks at the last stage's width.
        self.res_blocks = nn.ModuleList()
        for _ in range(self.num_res_blocks):
            self.res_blocks.append(ResBlock(in_width, width, **self.block_args))
            in_width = width

        # Output head: GroupNorm -> activation -> 1x1x1 conv.
        self.norm1 = nn.GroupNorm(self.num_groups, in_width)
        self.conv2 = self.conv_fn(in_width, self.embedding_dim, kernel_size=(1, 1, 1), padding="same")

    def forward(self, x):
        """Run the stem, all stages with their transitions, the final ResBlocks and the output head."""
        x = self.conv_in(x)

        for stage_idx, stage in enumerate(self.block_res_blocks):
            for block in stage:
                x = block(x)
            if stage_idx < self.num_blocks - 1:
                x = self.conv_blocks[stage_idx](x)
        for block in self.res_blocks:
            x = block(x)

        return self.conv2(self.activate(self.norm1(x)))


class Decoder(nn.Module):
    """Decoder built from CausalConv3d layers and ResBlock stages.

    Layout: ``conv1`` -> ``num_res_blocks`` ResBlocks -> stages visited from the
    last ``channel_multipliers`` entry to the first (each stage except index 0
    is followed by its transition layer and a channel-to-time rearrange) ->
    GroupNorm -> activation -> ``conv_out``.

    ``block_res_blocks`` and ``conv_blocks`` are filled by inserting at the
    front while iterating in reverse, so their indices follow
    ``channel_multipliers`` order.
    """

    def __init__(
        self,
        in_out_channels=4,
        latent_embed_dim=512,
        filters=128,
        num_res_blocks=4,
        channel_multipliers=(1, 2, 2, 4),
        temporal_downsample=(False, True, True),
        num_groups=32,  # for nn.GroupNorm
        activation_fn="swish",
    ):
        super().__init__()
        self.filters = filters
        self.num_res_blocks = num_res_blocks
        self.num_blocks = len(channel_multipliers)
        self.channel_multipliers = channel_multipliers
        self.temporal_downsample = temporal_downsample
        self.num_groups = num_groups
        self.embedding_dim = latent_embed_dim
        self.s_stride = 1  # spatial expansion factor used by the rearrange in forward

        self.activation_fn = get_activation_fn(activation_fn)
        self.activate = self.activation_fn()
        self.conv_fn = CausalConv3d
        # Keyword arguments shared by every ResBlock of this decoder.
        self.block_args = {
            "conv_fn": self.conv_fn,
            "activation_fn": self.activation_fn,
            "use_conv_shortcut": False,
            "num_groups": self.num_groups,
        }

        width = self.filters * self.channel_multipliers[-1]
        in_width = width  # channel count entering the next layer

        # Input convolution from the latent channels to the widest stage.
        self.conv1 = self.conv_fn(self.embedding_dim, width, kernel_size=(3, 3, 3), bias=True)

        # ResBlocks applied right after conv1.
        self.res_blocks = nn.ModuleList()
        for _ in range(self.num_res_blocks):
            self.res_blocks.append(ResBlock(width, width, **self.block_args))

        # Per-stage ResBlocks and the transition layers between stages.
        self.block_res_blocks = nn.ModuleList()
        self.num_blocks = len(self.channel_multipliers)
        self.conv_blocks = nn.ModuleList()
        for stage_idx in reversed(range(self.num_blocks)):
            width = self.filters * self.channel_multipliers[stage_idx]
            stage = nn.ModuleList()
            for _ in range(self.num_res_blocks):
                stage.append(ResBlock(in_width, width, **self.block_args))
                in_width = width
            self.block_res_blocks.insert(0, stage)

            if stage_idx == 0:
                continue  # stage 0 has no transition layer
            if self.temporal_downsample[stage_idx - 1]:
                # Multiply the channels by (2 * s_stride * s_stride); forward
                # folds that factor back into the time/height/width axes.
                expansion = 2 * self.s_stride * self.s_stride
                transition = self.conv_fn(in_width, in_width * expansion, kernel_size=(3, 3, 3))
            else:
                transition = nn.Identity(in_width)
            self.conv_blocks.insert(0, transition)

        self.norm1 = nn.GroupNorm(self.num_groups, in_width)

        self.conv_out = self.conv_fn(width, in_out_channels, 3)

    def forward(self, x):
        """Decode ``x`` by walking the stages from the last index down to 0."""
        x = self.conv1(x)
        for block in self.res_blocks:
            x = block(x)
        for stage_idx in reversed(range(self.num_blocks)):
            for block in self.block_res_blocks[stage_idx]:
                x = block(x)
            if stage_idx == 0:
                continue
            t_stride = 2 if self.temporal_downsample[stage_idx - 1] else 1
            x = self.conv_blocks[stage_idx - 1](x)
            # Move the extra channel factor into the time/height/width axes.
            x = rearrange(
                x,
                "B (C ts hs ws) T H W -> B C (T ts) (H hs) (W ws)",
                ts=t_stride,
                hs=self.s_stride,
                ws=self.s_stride,
            )

        return self.conv_out(self.activate(self.norm1(x)))


@MODELS.register_module()
class VAE_Temporal(nn.Module):
    """Variational autoencoder that compresses the time axis of its input.

    The temporal downsample factor is ``2 ** sum(temporal_downsample)``. Inputs
    whose frame count is not a multiple of that factor are padded at the front
    of the time axis before encoding, and the same number of leading frames is
    dropped again after decoding.

    Args:
        in_out_channels: channels of the input and of the reconstruction.
        latent_embed_dim: decoder input channels; the encoder emits twice as
            many before ``quant_conv``.
        embed_dim: channels of the latent ``z``; ``quant_conv`` emits
            ``2 * embed_dim`` channels that parameterise the posterior.
        filters, num_res_blocks, channel_multipliers, temporal_downsample,
        num_groups, activation_fn: forwarded to ``Encoder`` and ``Decoder``.
    """

    def __init__(
        self,
        in_out_channels=4,
        latent_embed_dim=4,
        embed_dim=4,
        filters=128,
        num_res_blocks=4,
        channel_multipliers=(1, 2, 2, 4),
        temporal_downsample=(True, True, False),
        num_groups=32,  # for nn.GroupNorm
        activation_fn="swish",
    ):
        super().__init__()

        self.time_downsample_factor = 2 ** sum(temporal_downsample)
        # self.time_padding = self.time_downsample_factor - 1
        self.patch_size = (self.time_downsample_factor, 1, 1)
        self.out_channels = in_out_channels

        # NOTE: following MAGVIT, conv in bias=False in encoder first conv
        self.encoder = Encoder(
            in_out_channels=in_out_channels,
            latent_embed_dim=latent_embed_dim * 2,
            filters=filters,
            num_res_blocks=num_res_blocks,
            channel_multipliers=channel_multipliers,
            temporal_downsample=temporal_downsample,
            num_groups=num_groups,  # for nn.GroupNorm
            activation_fn=activation_fn,
        )
        self.quant_conv = CausalConv3d(2 * latent_embed_dim, 2 * embed_dim, 1)

        self.post_quant_conv = CausalConv3d(embed_dim, latent_embed_dim, 1)
        self.decoder = Decoder(
            in_out_channels=in_out_channels,
            latent_embed_dim=latent_embed_dim,
            filters=filters,
            num_res_blocks=num_res_blocks,
            channel_multipliers=channel_multipliers,
            temporal_downsample=temporal_downsample,
            num_groups=num_groups,  # for nn.GroupNorm
            activation_fn=activation_fn,
        )

    def _time_padding(self, num_frames):
        """Return how many frames must be added so ``num_frames`` becomes a multiple of the downsample factor."""
        # Equals 0 for exact multiples, otherwise factor - (num_frames % factor).
        return (-num_frames) % self.time_downsample_factor

    def get_latent_size(self, input_size):
        """Return the latent ``[T, H, W]`` size for an input ``(T, H, W)`` size.

        ``None`` entries are passed through. The time entry is first rounded up
        to a multiple of the temporal downsample factor.
        """
        latent_size = []
        for i in range(3):
            size = input_size[i]
            if size is None:
                lsize = None
            elif i == 0:
                lsize = (size + self._time_padding(size)) // self.patch_size[i]
            else:
                lsize = size // self.patch_size[i]
            latent_size.append(lsize)
        return latent_size

    def encode(self, x):
        """Encode ``x`` (time on dim 2) and return the ``DiagonalGaussianDistribution`` posterior."""
        time_padding = self._time_padding(x.shape[2])
        # Pad at the front of the time axis only.
        x = pad_at_dim(x, (time_padding, 0), dim=2)
        encoded_feature = self.encoder(x)
        moments = self.quant_conv(encoded_feature).to(x.dtype)
        posterior = DiagonalGaussianDistribution(moments)
        return posterior

    def decode(self, z, num_frames=None):
        """Decode latent ``z`` and drop the leading frames that correspond to encode-time padding.

        Args:
            z: latent tensor.
            num_frames: frame count of the original (unpadded) input. When
                ``None`` no frames are dropped and the full decoder output is
                returned.
        """
        z = self.post_quant_conv(z)
        x = self.decoder(z)
        if num_frames is None:
            # The default used to crash on ``None % int``; without a frame
            # count there is nothing to trim.
            return x
        time_padding = self._time_padding(num_frames)
        x = x[:, :, time_padding:]
        return x

    def forward(self, x, sample_posterior=True):
        """Encode then decode ``x``.

        Returns:
            ``(recon_video, posterior, z)`` where ``z`` is a sample from the
            posterior when ``sample_posterior`` is true, else its mode.
        """
        posterior = self.encode(x)
        if sample_posterior:
            z = posterior.sample()
        else:
            z = posterior.mode()
        recon_video = self.decode(z, num_frames=x.shape[2])
        return recon_video, posterior, z


@MODELS.register_module("VAE_Temporal_SD")
def VAE_Temporal_SD(from_pretrained=None, **kwargs):
    """Build a ``VAE_Temporal`` with a fixed configuration and optionally load a checkpoint.

    Args:
        from_pretrained: passed to ``load_checkpoint`` together with the model
            when not ``None``.
        **kwargs: extra keyword arguments for ``VAE_Temporal``; they must not
            repeat one of the fixed settings below.
    """
    fixed_config = {
        "in_out_channels": 4,
        "latent_embed_dim": 4,
        "embed_dim": 4,
        "filters": 128,
        "num_res_blocks": 4,
        "channel_multipliers": (1, 2, 2, 4),
        "temporal_downsample": (False, True, True),
    }
    model = VAE_Temporal(**fixed_config, **kwargs)
    if from_pretrained is None:
        return model
    load_checkpoint(model, from_pretrained)
    return model


================================================
FILE: Open-Sora/opensora/models/vae/video_sdxl/blocks.py
================================================
"""
Adapted from SDXL VAE (https://huggingface.co/stabilityai/sdxl-vae/blob/main/config.json)
All default values of kwargs are the same as SDXL
"""

from typing import Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from diffusers.models.attention_processor import Attention
from einops import rearrange


def video_to_image(func):
    """Decorator letting a per-image method ``func(self, x, ...)`` accept video tensors.

    For a 5D input ``[B, C, T, H, W]`` the time axis is folded into the batch
    axis, ``func`` is applied (in chunks of ``self.micro_batch_size`` along the
    folded batch axis when that attribute is set), and the result is unfolded
    back to 5D. Any other input is handed to ``func`` unchanged.

    NOTE(review): inputs that were not 5D used to be returned untouched, i.e.
    ``func`` was silently skipped. Outputs of modules fed with 4D tensors
    therefore change with this fix; re-validate checkpoints that were trained
    with the old behaviour.
    """
    from functools import wraps

    @wraps(func)
    def wrapper(self, x, *args, **kwargs):
        if x.ndim != 5:
            # Already image-shaped: just run the wrapped method.
            return func(self, x, *args, **kwargs)

        B = x.shape[0]
        x = rearrange(x, 'B C T H W -> (B T) C H W')

        # A module without the attribute behaves like micro_batch_size=None.
        bs = getattr(self, 'micro_batch_size', None)
        if bs is None:
            x = func(self, x, *args, **kwargs)
        else:
            chunks = [func(self, x[i:i + bs], *args, **kwargs) for i in range(0, x.shape[0], bs)]
            x = torch.cat(chunks, dim=0)

        x = rearrange(x, '(B T) C H W -> B C T H W', B=B)
        return x
    return wrapper


class VideoConv2d(nn.Conv2d):
    """
        nn.Conv2d whose forward pass is wrapped by the `video_to_image` decorator.

        `micro_batch_size` is stored on the instance, where the decorator reads it;
        every other positional/keyword argument is passed to nn.Conv2d unchanged.
    """
    def __init__(self, *args, micro_batch_size=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.micro_batch_size = micro_batch_size

    @video_to_image
    def forward(self, x):
        # Plain nn.Conv2d forward; the input handling is done by the decorator.
        return super().forward(x)


class ResnetBlock2D(nn.Module):
    """
        Residual block built from nn.Conv2d layers:
        two (GroupNorm -> SiLU -> 3x3 conv) stages plus a skip connection.
        The skip path uses a 1x1 conv only when in_channels != out_channels.
        forward is wrapped by `video_to_image`, which decides how the input
        tensor is reshaped and micro-batched (via `micro_batch_size`).
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: Optional[int] = None,
        norm_groups: int = 32,
        norm_eps: float = 1e-6,
        micro_batch_size=None,
    ):
        super().__init__()
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.micro_batch_size = micro_batch_size

        # first stage
        self.norm1 = nn.GroupNorm(norm_groups, in_channels, eps=norm_eps, affine=True)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)

        # second stage
        self.norm2 = nn.GroupNorm(norm_groups, out_channels, eps=norm_eps, affine=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)

        self.act = nn.SiLU()

        # 1x1 projection on the skip path, needed only when the width changes
        self.use_in_shortcut = in_channels != out_channels
        if self.use_in_shortcut:
            self.conv_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        else:
            self.conv_shortcut = None

    @video_to_image
    def forward(self, x):
        hidden = self.conv1(self.act(self.norm1(x)))
        hidden = self.conv2(self.act(self.norm2(hidden)))

        skip = x if self.conv_shortcut is None else self.conv_shortcut(x)
        return skip + hidden


class ResnetBlock3D(nn.Module):
    """
        Residual block built from nn.Conv3d layers:
        two (GroupNorm -> SiLU -> 3x3x3 conv) stages plus a skip connection.
        The skip path uses a 1x1x1 conv only when in_channels != out_channels.
        Make sure input tensor is of shape [B, C, T, H, W]
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: Optional[int] = None,
        norm_groups: int = 32,
        norm_eps: float = 1e-6,
    ):
        super().__init__()
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels

        def group_norm(num_channels):
            return nn.GroupNorm(num_groups=norm_groups, num_channels=num_channels, eps=norm_eps, affine=True)

        def conv3x3x3(cin, cout):
            return nn.Conv3d(cin, cout, kernel_size=3, stride=1, padding=1)

        self.norm1 = group_norm(in_channels)
        self.conv1 = conv3x3x3(in_channels, out_channels)

        self.norm2 = group_norm(out_channels)
        self.conv2 = conv3x3x3(out_channels, out_channels)

        self.act = nn.SiLU()

        # 1x1x1 projection on the skip path, needed only when the width changes
        self.use_in_shortcut = in_channels != out_channels
        if self.use_in_shortcut:
            self.conv_shortcut = nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        else:
            self.conv_shortcut = None

    def forward(self, x):
        hidden = self.conv1(self.act(self.norm1(x)))
        hidden = self.conv2(self.act(self.norm2(hidden)))

        skip = x if self.conv_shortcut is None else self.conv_shortcut(x)
        return skip + hidden


class SpatialDownsample2x(nn.Module):
    """
        Spatial downsampling layer.
        use_conv=True: nn.Conv2d(kernel_size=3, stride=2, padding=0);
        use_conv=False: nn.AvgPool2d(kernel_size=2, stride=2).
        In both cases the input is first zero-padded by one pixel on the right and bottom.
        forward is wrapped by `video_to_image`, which reads `micro_batch_size`.
    """
    def __init__(
        self,
        channels: int,
        use_conv: bool = True,
        micro_batch_size=None,
    ):
        super().__init__()
        self.channels = channels
        self.use_conv = use_conv
        self.micro_batch_size = micro_batch_size

        if not use_conv:
            self.downsample = nn.AvgPool2d(kernel_size=2, stride=2)
        else:
            self.downsample = nn.Conv2d(channels, channels, kernel_size=3, stride=2, padding=0)

    @video_to_image
    def forward(self, x):
        # implementation from SDXL: asymmetric padding (right and bottom only)
        right_bottom_pad = (0, 1, 0, 1)
        padded = F.pad(x, right_bottom_pad, mode="constant", value=0)
        return self.downsample(padded)


class SpatialUpsample2x(nn.Module):
    """
        Spatial 2x upsampling: F.interpolate(scale_factor=2, mode="nearest") + Conv2d(kernel_size=3, stride=1)
        Make sure input tensor is of shape [B, C, T, H, W]
        Support micro_batch_size (chunks along the folded B*T axis)

        Only use_interpolate=True is implemented; use_interpolate=False raises
        NotImplementedError at construction time.
    """
    def __init__(
        self,
        channels: int,
        use_interpolate=True,
        micro_batch_size=None,
    ):
        super().__init__()
        self.channels = channels
        self.use_interpolate = use_interpolate
        self.micro_batch_size = micro_batch_size

        if not use_interpolate:
            # The transposed-conv variant was never reachable (it sat behind
            # this raise), so the dead assignment has been dropped.
            raise NotImplementedError("SpatialUpsample2x only supports use_interpolate=True")
        self.conv = nn.Conv2d(self.channels, self.channels, kernel_size=3, padding=1)

    def forward(self, x):
        """Fold T into the batch axis, upsample every frame, and unfold back to [B, C, T, H, W]."""
        B = x.shape[0]
        x = rearrange(x, 'B C T H W -> (B T) C H W')

        if self.micro_batch_size is None:
            x = self.forward_BCHW(x)
        else:
            bs = self.micro_batch_size
            chunks = [self.forward_BCHW(x[i:i + bs]) for i in range(0, x.shape[0], bs)]
            x = torch.cat(chunks, dim=0)

        x = rearrange(x, '(B T) C H W -> B C T H W', B=B)
        return x

    def forward_BCHW(self, x):
        """Upsample a 4D [N, C, H, W] tensor by 2x (nearest) and apply the 3x3 conv."""
        if not self.use_interpolate:
            # No alternative upsampling layer exists on this module.
            raise NotImplementedError("SpatialUpsample2x only supports use_interpolate=True")

        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
        if x.shape[0] >= 64:
            x = x.contiguous()

        # interpolate tensor of bfloat16 is fixed in pytorch 2.1. see https://github.com/pytorch/pytorch/issues/86679
        x = F.interpolate(x, scale_factor=2.0, mode="nearest")
        x = self.conv(x)
        return x


class TemporalDownsample2x(nn.Module):
    """
        Temporal downsampling layer with stride (2, 1, 1).
        use_conv=True: nn.Conv3d(kernel_size=(3, 3, 3), padding=(1, 1, 1));
        use_conv=False: nn.AvgPool3d(kernel_size=(3, 1, 1)) without padding.
        Make sure input tensor is of shape [B, C, T, H, W]

        NOTE(review): the pooling path has no temporal padding while the conv path
        pads by 1, so the two modes return different numbers of frames for the same input.
    """
    def __init__(
        self,
        channels: int,
        use_conv: bool = True,
    ):
        super().__init__()
        self.channels = channels
        self.use_conv = use_conv

        time_stride = (2, 1, 1)
        if not use_conv:
            self.downsample = nn.AvgPool3d(kernel_size=(3, 1, 1), stride=time_stride)
        else:
            self.downsample = nn.Conv3d(
                channels, channels, kernel_size=(3, 3, 3), stride=time_stride, padding=(1, 1, 1),
            )

    def forward(self, x):
        return self.downsample(x)


class TemporalUpsample2x(nn.Module):
    """
        Temporal 2x upsampling: F.interpolate(scale_factor=(2, 1, 1), mode="trilinear") + Conv3d(kernel_size=3, padding=1)
        Make sure input tensor is of shape [B, C, T, H, W]
        The whole batch is processed in one call (no micro-batching here).
    """
    def __init__(
        self,
        channels,
    ):
        super().__init__()
        self.channels = channels
        self.conv = nn.Conv3d(channels, channels, kernel_size=3, padding=1)

    def forward(self, x):
        # large batches are made contiguous before interpolation
        needs_contiguous = x.shape[0] >= 64
        if needs_contiguous:
            x = x.contiguous()
        stretched = F.interpolate(x, scale_factor=(2, 1, 1), mode="trilinear")
        return self.conv(stretched)


class UNetMidBlock2D(nn.Module):
    """
        ResnetBlock2D followed by `num_layers` x ([Attention] + ResnetBlock2D).
        The Attention layers are only created and applied when add_attention=True.
        A 5D input [B, C, T, H, W] is folded to [(B T), C, H, W] for the block and
        unfolded afterwards; any other input is passed to the sub-blocks as-is.

        Args:
            in_channels: channel count of input and output.
            num_layers: number of ([Attention] + ResnetBlock2D) pairs after the first res block.
            norm_groups / norm_eps: GroupNorm settings for the res blocks.
            attn_groups: `norm_num_groups` for Attention; defaults to norm_groups.
            add_attention: whether to insert an Attention layer before each later res block.
            attention_head_dim: `dim_head` for Attention (None means in_channels);
                the head count is in_channels // attention_head_dim.
    """
    def __init__(
        self,
        in_channels: int,
        num_layers: int = 1,
        norm_groups: int = 32,
        norm_eps: float = 1e-6,
        attn_groups: Optional[int] = None,
        add_attention: bool = True,
        attention_head_dim: int = 512,
    ):
        super().__init__()
        self.add_attention = add_attention

        if attn_groups is None:
            attn_groups = norm_groups

        if attention_head_dim is None:
            attention_head_dim = in_channels

        res_blocks = [
            ResnetBlock2D(
                in_channels=in_channels,
                out_channels=in_channels,
                norm_eps=norm_eps,
                norm_groups=norm_groups,
            )
        ]
        attn_blocks = []

        for _ in range(num_layers):
            if self.add_attention:
                attn_blocks.append(
                    Attention(
                        in_channels,
                        heads=in_channels // attention_head_dim,
                        dim_head=attention_head_dim,
                        # rescale_output_factor=output_scale_factor,
                        rescale_output_factor=1.0,
                        eps=norm_eps,
                        norm_num_groups=attn_groups,
                        # spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
                        spatial_norm_dim=None,
                        residual_connection=True,
                        bias=True,
                        upcast_softmax=True,
                        _from_deprecated_attn_block=True,
                    )
                )

            res_blocks.append(
                ResnetBlock2D(
                    in_channels=in_channels,
                    out_channels=in_channels,
                    norm_eps=norm_eps,
                    norm_groups=norm_groups,
                )
            )

        self.attn_blocks = nn.ModuleList(attn_blocks)
        self.res_blocks = nn.ModuleList(res_blocks)

    def forward(self, x):
        has_T = x.ndim == 5
        if has_T:
            B = x.shape[0]
            x = rearrange(x, 'B C T H W -> (B T) C H W')

        x = self.res_blocks[0](x)
        # Drive the loop by the res blocks: attn_blocks is empty when
        # add_attention=False, and zipping against it used to skip every
        # remaining res block.
        for idx, res_block in enumerate(self.res_blocks[1:]):
            if self.add_attention:
                x = self.attn_blocks[idx](x)
            x = res_block(x)

        if has_T:
            x = rearrange(x, '(B T) C H W -> B C T H W', B=B)
        return x


class Encoder(nn.Module):
    """
        Architecture: conv_in + 4 blocks + mid_block + out_block, with fixed widths [128, 256, 512, 512].
        block 1: 2 x ResnetBlock2D + SpatialDownsample2x
        block 2: 2 x ResnetBlock2D + SpatialDownsample2x + TemporalDownsample2x
        block 3: 2 x ResnetBlock3D + SpatialDownsample2x + TemporalDownsample2x
        block 4: 2 x ResnetBlock3D
        out_block emits 2 * out_channels channels when double_z is True, else out_channels.
        Make sure input tensor is of shape [B, C, T, H, W]
    """
    def __init__(
        self,
        in_channels=3,
        out_channels=4,
        norm_groups=32,
        norm_eps=1e-6,
        double_z=True,
        micro_batch_size=None,
    ):
        super().__init__()
        widths = [128, 256, 512, 512]

        def res2d(cin, cout):
            # 2D residual block sharing this encoder's norm and micro-batch settings
            return ResnetBlock2D(
                in_channels=cin,
                out_channels=cout,
                norm_groups=norm_groups,
                norm_eps=norm_eps,
                micro_batch_size=micro_batch_size,
            )

        def res3d(cin, cout):
            # 3D residual block sharing this encoder's norm settings
            return ResnetBlock3D(
                in_channels=cin,
                out_channels=cout,
                norm_groups=norm_groups,
                norm_eps=norm_eps,
            )

        def spatial_down(channels):
            return SpatialDownsample2x(
                channels=channels,
                use_conv=True,
                micro_batch_size=micro_batch_size,
            )

        def temporal_down(channels):
            return TemporalDownsample2x(channels=channels, use_conv=True)

        # conv_in
        self.conv_in = VideoConv2d(
            in_channels,
            widths[0],
            kernel_size=3,
            stride=1,
            padding=1,
            micro_batch_size=micro_batch_size,
        )

        # the first block: ResnetBlock2D, spatial downsample only
        first = nn.Sequential(
            res2d(widths[0], widths[0]),
            res2d(widths[0], widths[0]),
            spatial_down(widths[0]),
        )

        # the second block: ResnetBlock2D, spatial + temporal downsample
        second = nn.Sequential(
            res2d(widths[0], widths[1]),
            res2d(widths[1], widths[1]),
            spatial_down(widths[1]),
            temporal_down(widths[1]),
        )

        # the third block: ResnetBlock3D, spatial + temporal downsample
        # NOTE(review): this spatial downsample is built without micro_batch_size,
        # unlike the first two blocks - confirm whether that is intentional.
        third = nn.Sequential(
            res3d(widths[1], widths[2]),
            res3d(widths[2], widths[2]),
            SpatialDownsample2x(channels=widths[2], use_conv=True),
            temporal_down(widths[2]),
        )

        # the fourth block: ResnetBlock3D, no downsampling
        fourth = nn.Sequential(
            res3d(widths[2], widths[3]),
            res3d(widths[3], widths[3]),
        )

        self.blocks = nn.ModuleList([first, second, third, fourth])

        # mid_block: a single attention head spanning all channels
        final_width = widths[-1]
        self.mid_block = UNetMidBlock2D(
            in_channels=final_width,
            num_layers=1,
            norm_groups=norm_groups,
            norm_eps=norm_eps,
            add_attention=True,
            attention_head_dim=final_width,
        )

        # out_block
        latent_channels = 2 * out_channels if double_z else out_channels
        self.out_block = nn.Sequential(
            nn.GroupNorm(num_channels=final_width, num_groups=norm_groups, eps=norm_eps),
            nn.SiLU(),
            nn.Conv3d(final_width, latent_channels, kernel_size=3, padding=1),
        )

    def forward(self, x):
        hidden = self.conv_in(x)

        for block in self.blocks:
            hidden = block(hidden)

        hidden = self.mid_block(hidden)
        return self.out_block(hidden)


class Decoder(nn.Module):
    """
    Decoder with the default architecture conv_in + mid_block + blocks + out_block.

    Make sure the input tensor is of shape [B, C, T, H, W].

    The four up blocks each stack three resnet layers:
        1. ResnetBlock3D 512 -> 512, then spatial and temporal 2x upsampling
        2. ResnetBlock3D 512 -> 512, then spatial and temporal 2x upsampling
        3. ResnetBlock3D 512 -> 256, then spatial 2x upsampling only
        4. ResnetBlock2D 256 -> 128, no upsampling

    NOTE(review): the last up block and ``out_block`` use 2D layers
    (``ResnetBlock2D`` / ``nn.Conv2d``); how the 5D activation is adapted for
    them depends on ``ResnetBlock2D``, which is defined elsewhere -- confirm.

    Args:
        in_channels: number of channels of the tensor fed to ``conv_in``.
        out_channels: number of channels produced by the final convolution.
        norm_groups: forwarded as ``norm_groups`` to every sub-block and used
            as ``num_groups`` of the final ``nn.GroupNorm``.
        norm_eps: epsilon forwarded to every normalization layer built here.
    """

    def __init__(
        self,
        in_channels=4,
        out_channels=3,
        norm_groups=32,
        norm_eps=1e-6,
    ):
        super().__init__()
        in_channels_decoder = in_channels
        out_channels_decoder = out_channels
        block_out_channels = [512, 512, 256, 128]
        layer_per_block = 3

        # conv_in
        self.conv_in = nn.Conv3d(
            in_channels_decoder,
            block_out_channels[0],
            kernel_size=3,
            stride=1,
            padding=1,
        )

        # mid_block
        mid_channels = block_out_channels[0]
        self.mid_block = UNetMidBlock2D(
            in_channels=mid_channels,
            num_layers=1,
            norm_groups=norm_groups,
            norm_eps=norm_eps,
            add_attention=True,
            attention_head_dim=mid_channels,
        )

        # Up blocks, described as
        # (resnet class, in channels, out channels, spatial 2x?, temporal 2x?).
        # Modules are instantiated in exactly this order and at the same
        # indices as before, so state_dict keys and parameter initialisation
        # order are unchanged.
        stage_specs = [
            (ResnetBlock3D, block_out_channels[0], block_out_channels[0], True, True),
            (ResnetBlock3D, block_out_channels[0], block_out_channels[1], True, True),
            (ResnetBlock3D, block_out_channels[1], block_out_channels[2], True, False),
            (ResnetBlock2D, block_out_channels[2], block_out_channels[3], False, False),
        ]

        blocks = []
        for resnet_cls, stage_in, stage_out, spatial_up, temporal_up in stage_specs:
            # Only the first resnet layer of a stage changes the channel count.
            seq = [
                resnet_cls(
                    in_channels=stage_in if idx == 0 else stage_out,
                    out_channels=stage_out,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                )
                for idx in range(layer_per_block)
            ]
            if spatial_up:
                seq.append(
                    SpatialUpsample2x(
                        channels=stage_out,
                        use_interpolate=True,
                    )
                )
            if temporal_up:
                seq.append(
                    TemporalUpsample2x(
                        channels=stage_out,
                    )
                )
            blocks.append(nn.Sequential(*seq))

        self.blocks = nn.ModuleList(blocks)

        # out_block
        final_channels = block_out_channels[-1]
        self.out_block = nn.Sequential(
            nn.GroupNorm(num_channels=final_channels, num_groups=norm_groups, eps=norm_eps),
            nn.SiLU(),
            nn.Conv2d(final_channels, out_channels_decoder, kernel_size=3, padding=1),
        )

    def forward(self, x):
        """
        Decode ``x`` by applying conv_in, mid_block, every up block in order
        and finally out_block.

        The per-stage ``print(torch.cuda.memory_allocated())`` debug output
        that used to be emitted here on every call has been removed; profile
        memory from the caller when needed.
        """
        x = self.conv_in(x)
        x = self.mid_block(x)

        for block in self.blocks:
            x = block(x)

        x = self.out_block(x)
        return x

# Manual smoke test: builds an Encoder/Decoder pair on the GPU, reports their
# parameter counts, runs one encode/decode round trip and then drops into the
# debugger so the results can be inspected interactively.
if __name__ == '__main__':
    from opensora.utils.misc import count_params
    device = 'cuda'
    dtype = torch.bfloat16

    # NOTE(review): `micro_batch_size` is passed to Encoder here; its meaning
    # is defined by Encoder.__init__, which is not visible from this script.
    # `dtype` is already torch.bfloat16, so the first `.to(torch.bfloat16)`
    # is redundant with the `.to(device, dtype)` that follows.
    encoder = Encoder(
        in_channels=3,
        out_channels=4,
        double_z=False,
        micro_batch_size=4,
    ).to(torch.bfloat16).to(device, dtype).eval()

    decoder = Decoder(
        in_channels=4,
        out_channels=3,
    ).to(torch.bfloat16).to(device, dtype).eval()
    num_params_enc = count_params(encoder)
    num_params_dec = count_params(decoder)
    print(f'Encoder #params: {num_params_enc}')
    print(f'Decoder #params: {num_params_dec}')

    # inference
    # Random input of shape (1, 3, 51, 720, 1080), moved to the GPU in bfloat16.
    x = torch.rand(1, 3, 51, 720, 1080).to(device, dtype)
    with torch.inference_mode():
        x_enc = encoder(x)
        x_dec = decoder(x_enc)
    # CUDA memory currently allocated, in GiB.
    print(torch.cuda.memory_allocated() /  1024 ** 3)
    # Stop here so x, x_enc and x_dec can be examined by hand.
    breakpoint()


================================================
FILE: Open-Sora/opensora/registry.py
================================================
from copy import deepcopy

import torch.nn as nn
from mmengine.registry import Registry


def build_module(module, builder, **kwargs):
    """Build module from config or return the module itself.

    Args:
        module (Union[dict, nn.Module, None]): The module to build. A dict is
            treated as a config; an ``nn.Module`` is returned unchanged;
            ``None`` yields ``None``.
        builder (Registry): The registry to build module. Only its ``build``
            method is used, and only when ``module`` is a dict.
        **kwargs: Extra config entries. They are written into a deep copy of
            the config (overriding keys of the same name) before building, so
            the caller's dict is never modified.

    Returns:
        Any: The built module, the module itself, or ``None``.

    Raises:
        TypeError: If ``module`` is neither ``None``, a dict nor an
            ``nn.Module``.
    """
    if module is None:
        return None
    if isinstance(module, dict):
        # Copy first so overrides never leak back into the caller's config.
        cfg = deepcopy(module)
        cfg.update(kwargs)
        return builder.build(cfg)
    if isinstance(module, nn.Module):
        return module
    raise TypeError(f"Only support dict and nn.Module, but got {type(module)}.")


# Project-wide registries. Each one is an mmengine `Registry` created with a
# name and a `locations` list naming the opensora package associated with it.
# NOTE(review): `locations` is presumably what mmengine imports to discover
# registered classes -- confirm against the mmengine Registry documentation.

# Registry named "model", tied to `opensora.models`.
MODELS = Registry(
    "model",
    locations=["opensora.models"],
)

# Registry named "scheduler", tied to `opensora.schedulers`.
SCHEDULERS = Registry(
    "scheduler",
    locations=["opensora.schedulers"],
)

# Registry named "dataset", tied to `opensora.datasets`.
DATASETS = Registry(
    "dataset",
    locations=["opensora.datasets"],
)


================================================
FILE: Open-Sora/opensora/schedulers/__init__.py
================================================
from .dpms import DPMS
from .iddpm import IDDPM
from .rf import RFLOW


================================================
FILE: Open-Sora/opensora/schedulers/dpms/__init__.py
================================================
from functools import partial

import torch

from opensora.registry import SCHEDULERS

from .dpm_solver import DPMS


@SCHEDULERS.register_module("dpm-solver")
class DPM_SOLVER:
    """
    Scheduler registered under the name "dpm-solver".

    It encodes the prompts, builds a ``DPMS`` sampler around the model with
    classifier-free-guidance inputs, and runs it with a second-order multistep
    configuration on a uniform time grid.
    """

    def __init__(self, num_sampling_steps=None, cfg_scale=4.0):
        self.cfg_scale = cfg_scale
        self.num_sampling_steps = num_sampling_steps

    def sample(
        self,
        model,
        text_encoder,
        z,
        prompts,
        device,
        additional_args=None,
        mask=None,
        progress=True,
    ):
        """
        Sample starting from ``z`` conditioned on ``prompts``.

        ``device`` is accepted but not used by this method. ``mask`` is not
        supported: when given, a warning is printed and it is ignored.
        ``additional_args``, when given, is merged into the keyword arguments
        passed to the model. Returns whatever the ``DPMS`` sampler's
        ``sample`` call returns.
        """
        if mask is not None:
            print("[WARNING] mask is not supported in dpm-solver, it will be ignored")

        batch = len(prompts)
        extra_kwargs = text_encoder.encode(prompts)
        # "y" is handed to the solver as the condition, not as a model kwarg.
        cond = extra_kwargs.pop("y")
        uncond = text_encoder.null(batch)
        if additional_args is not None:
            extra_kwargs.update(additional_args)

        denoiser = partial(forward_with_dpmsolver, model)
        solver = DPMS(
            denoiser,
            condition=cond,
            uncondition=uncond,
            cfg_scale=self.cfg_scale,
            model_kwargs=extra_kwargs,
        )
        return solver.sample(
            z,
            steps=self.num_sampling_steps,
            order=2,
            skip_type="time_uniform",
            method="multistep",
            progress=progress,
        )


def forward_with_dpmsolver(self, x, timestep, y, **kwargs):
    """
    Run the model and keep only the first half of its output along dim 1.

    DPM-Solver does not need the variance prediction, so the output is split
    in two along dim 1 and only the first chunk is returned.
    """
    # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
    prediction = self.forward(x, timestep, y, **kwargs)
    halves = prediction.chunk(2, dim=1)
    return halves[0]


================================================
FILE: Open-Sora/opensora/schedulers/dpms/dpm_solver.py
================================================
# MIT License
#
# Copyright (c) 2022 Cheng Lu
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
#
# This file is adapted from the dpm-solver project
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# PixArt:       https://github.com/PixArt-alpha/PixArt-alpha
# dpm-solver:   https://github.com/LuChengTHU/dpm-solver
# --------------------------------------------------------

import math

import numpy as np
import torch
from tqdm import tqdm


def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
    betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
    warmup_time = int(num_diffusion_timesteps * warmup_frac)
    betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
    return betas


def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
    """
    Deprecated API for creating beta schedules; see get_named_beta_schedule()
    for the new library of schedules.

    Supported names: "quad", "linear", "warmup10", "warmup50", "const" and
    "jsd". Any other name raises NotImplementedError. The result is a float64
    array with one beta per diffusion timestep.
    """
    steps = num_diffusion_timesteps
    if beta_schedule == "quad":
        # Linear in sqrt(beta), then squared.
        roots = np.linspace(beta_start**0.5, beta_end**0.5, steps, dtype=np.float64)
        betas = roots**2
    elif beta_schedule == "linear":
        betas = np.linspace(beta_start, beta_end, steps, dtype=np.float64)
    elif beta_schedule in ("warmup10", "warmup50"):
        # Linear warm-up over the first 10% / 50% of the steps.
        warmup_frac = 0.1 if beta_schedule == "warmup10" else 0.5
        betas = _warmup_beta(beta_start, beta_end, steps, warmup_frac)
    elif beta_schedule == "const":
        betas = np.full(steps, beta_end, dtype=np.float64)
    elif beta_schedule == "jsd":  # 1/T, 1/(T-1), 1/(T-2), ..., 1
        denominators = np.linspace(steps, 1, steps, dtype=np.float64)
        betas = 1.0 / denominators
    else:
        raise NotImplementedError(beta_schedule)

    assert betas.shape == (steps,)
    return betas


def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Return the pre-defined beta schedule called ``schedule_name``.

    The library holds schedules that remain similar in the limit of
    ``num_diffusion_timesteps``. Schedules may be added, but existing ones
    should not be removed or changed once committed, to maintain backwards
    compatibility.

    Supported names are "linear" and "squaredcos_cap_v2"; anything else
    raises NotImplementedError.
    """
    if schedule_name == "squaredcos_cap_v2":

        def cosine_alpha_bar(t):
            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

        return betas_for_alpha_bar(num_diffusion_timesteps, cosine_alpha_bar)

    if schedule_name != "linear":
        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")

    # Linear schedule from Ho et al, extended to work for any number of
    # diffusion steps.
    scale = 1000 / num_diffusion_timesteps
    return get_beta_schedule(
        "linear",
        beta_start=scale * 0.0001,
        beta_end=scale * 0.02,
        num_diffusion_timesteps=num_diffusion_timesteps,
    )


def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Discretize an alpha_t_bar function into a beta schedule.

    ``alpha_bar`` defines the cumulative product of (1-beta) over time from
    t = [0,1]; step ``i`` gets beta ``1 - alpha_bar((i+1)/N) / alpha_bar(i/N)``
    capped at ``max_beta``.

    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a callable that takes an argument t from 0 to 1 and
                      produces the cumulative product of (1-beta) up to that
                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    """
    total = num_diffusion_timesteps
    return np.array(
        [
            min(1 - alpha_bar((step + 1) / total) / alpha_bar(step / total), max_beta)
            for step in range(total)
        ]
    )


class NoiseScheduleVP:
    def __init__(
        self,
        schedule="discrete",
        betas=None,
        alphas_cumprod=None,
        continuous_beta_0=0.1,
        continuous_beta_1=20.0,
        dtype=torch.float32,
    ):
        r"""Create a wrapper class for the forward SDE (VP type).

        ***
        Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t.
                We recommend to use schedule='discrete' for the discrete-time diffusion models, especially for high-resolution images.
        ***

        The forward SDE ensures that the condition distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
        We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
        Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:

            log_alpha_t = self.marginal_log_mean_coeff(t)
            sigma_t = self.marginal_std(t)
            lambda_t = self.marginal_lambda(t)

        Moreover, as lambda(t) is an invertible function, we also support its inverse function:

            t = self.inverse_lambda(lambda_t)

        ===============================================================

        We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).

        1. For discrete-time DPMs:

            For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
                t_i = (i + 1) / N
            e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
            We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.

            Args:
                betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
                alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)
                dtype: The dtype the stored log-alpha and time arrays are cast to.

            Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
            If both are given, `betas` takes precedence.

            **Important**:  Please pay special attention for the args for `alphas_cumprod`:
                The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that
                    q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
                Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
                    alpha_{t_n} = \sqrt{\hat{alpha_n}},
                and
                    log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).


        2. For continuous-time DPMs:

            We support the linear VPSDE for the continuous time setting. The hyperparameters for the noise
            schedule are the default settings in Yang Song's ScoreSDE:

            Args:
                continuous_beta_0: A `float` number. The smallest beta for the linear schedule.
                continuous_beta_1: A `float` number. The largest beta for the linear schedule.

            The ending time of the forward process is fixed at T = 1.

        ===============================================================

        Args:
            schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
                    'linear' for continuous-time DPMs.
        Returns:
            A wrapper object of the forward SDE (VP type).
        Raises:
            ValueError: If `schedule` is neither 'discrete' nor 'linear'.

        ===============================================================

        Example:

        # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
        >>> ns = NoiseScheduleVP('discrete', betas=betas)

        # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
        >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)

        # For continuous-time DPMs (VPSDE), linear schedule:
        >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)

        """

        if schedule not in ["discrete", "linear"]:
            raise ValueError(f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear'")

        self.schedule = schedule
        # Both schedule types end the forward process at T = 1.
        self.T = 1.0
        if schedule == "discrete":
            if betas is not None:
                log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
            else:
                assert alphas_cumprod is not None
                log_alphas = 0.5 * torch.log(alphas_cumprod)
            # Stored as a (1, N) row so it can be used as interpolation keypoints.
            self.log_alpha_array = self.numerical_clip_alpha(log_alphas).reshape((1, -1)).to(dtype=dtype)
            # N is taken after clipping, so it may be smaller than len(betas).
            self.total_N = self.log_alpha_array.shape[1]
            # t_i = (i + 1) / N for i = 0, ..., N - 1.
            self.t_array = torch.linspace(0.0, 1.0, self.total_N + 1)[1:].reshape((1, -1)).to(dtype=dtype)
        else:
            self.total_N = 1000
            self.beta_0 = continuous_beta_0
            self.beta_1 = continuous_beta_1

    def numerical_clip_alpha(self, log_alphas, clipped_lambda=-5.1):
        """
        For some beta schedules such as cosine schedule, the log-SNR has numerical issues.
        We clip the log-SNR near t=T within -5.1 to ensure the stability.
        Such a trick is very useful for diffusion models with the cosine schedule, such as i-DDPM, guided-diffusion and GLIDE.

        Trailing entries of `log_alphas` whose half-logSNR is below
        `clipped_lambda` are dropped; the (possibly shortened) array is returned.
        """
        log_sigmas = 0.5 * torch.log(1.0 - torch.exp(2.0 * log_alphas))
        lambs = log_alphas - log_sigmas
        # lambs is searched in reversed order, so idx counts the trailing
        # entries that fall below the clipping threshold.
        idx = torch.searchsorted(torch.flip(lambs, [0]), clipped_lambda)
        if idx > 0:
            log_alphas = log_alphas[:-idx]
        return log_alphas

    def marginal_log_mean_coeff(self, t):
        """
        Compute log(alpha_t) of a given continuous-time label t in [0, T].
        """
        if self.schedule == "discrete":
            # Interpolate the stored (t_i, log_alpha_i) keypoints at t.
            return interpolate_fn(
                t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device)
            ).reshape((-1))
        elif self.schedule == "linear":
            return -0.25 * t**2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0

    def marginal_alpha(self, t):
        """
        Compute alpha_t of a given continuous-time label t in [0, T].
        """
        return torch.exp(self.marginal_log_mean_coeff(t))

    def marginal_std(self, t):
        """
        Compute sigma_t of a given continuous-time label t in [0, T].
        """
        return torch.sqrt(1.0 - torch.exp(2.0 * self.marginal_log_mean_coeff(t)))

    def marginal_lambda(self, t):
        """
        Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
        """
        log_mean_coeff = self.marginal_log_mean_coeff(t)
        log_std = 0.5 * torch.log(1.0 - torch.exp(2.0 * log_mean_coeff))
        return log_mean_coeff - log_std

    def inverse_lambda(self, lamb):
        """
        Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
        """
        if self.schedule == "linear":
            # Closed-form inverse of the quadratic log_alpha(t) of the linear schedule.
            tmp = 2.0 * (self.beta_1 - self.beta_0) * torch.logaddexp(-2.0 * lamb, torch.zeros((1,)).to(lamb))
            Delta = self.beta_0**2 + tmp
            return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
        elif self.schedule == "discrete":
            # Recover log_alpha from lambda, then interpolate the flipped
            # (log_alpha_i, t_i) keypoints to get t.
            log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2.0 * lamb)
            t = interpolate_fn(
                log_alpha.reshape((-1, 1)),
                torch.flip(self.log_alpha_array.to(lamb.device), [1]),
                torch.flip(self.t_array.to(lamb.device), [1]),
            )
            return t.reshape((-1,))


def model_wrapper(
    model,
    noise_schedule,
    model_type="noise",
    model_kwargs=None,
    guidance_type="uncond",
    condition=None,
    unconditional_condition=None,
    guidance_scale=1.0,
    classifier_fn=None,
    classifier_kwargs=None,
):
    """Create a wrapper function for the noise prediction model.

    DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to
    firstly wrap the model function to a noise prediction model that accepts the continuous time as the input.

    We support four types of the diffusion model by setting `model_type`:

        1. "noise": noise prediction model. (Trained by predicting noise).

        2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).

        3. "v": velocity prediction model. (Trained by predicting the velocity).
            The "v" prediction is derivation detailed in Appendix D of [1], and is used in Imagen-Video [2].

            [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
                arXiv preprint arXiv:2202.00512 (2022).
            [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
                arXiv preprint arXiv:2210.02303 (2022).

        4. "score": marginal score function. (Trained by denoising score matching).
            Note that the score function and the noise prediction model follows a simple relationship:
            ```
                noise(x_t, t) = -sigma_t * score(x_t, t)
            ```

    We support three types of guided sampling by DPMs by setting `guidance_type`:
        1. "uncond": unconditional sampling by DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
            ``

        2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
            The input `model` has the following format:
            ``
                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
            ``

            The input `classifier_fn` has the following format:
            ``
                classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
            ``

            [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
                in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.

        3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
            ``
            And if cond == `unconditional_condition`, the model output is the unconditional DPM output.

            [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
                arXiv preprint arXiv:2207.12598 (2022).


    The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
    or continuous-time labels (i.e. epsilon to T).

    We wrap the model function to accept only `x` and `t_continuous` as inputs, and outputs the predicted noise:
    ``
        def model_fn(x, t_continuous) -> noise:
            t_input = get_model_input_time(t_continuous)
            return noise_pred(model, x, t_input, **model_kwargs)
    ``
    where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for DPM-Solver.

    ===============================================================

    Args:
        model: A diffusion model with the corresponding format described above.
        noise_schedule: A noise schedule object, such as NoiseScheduleVP.
        model_type: A `str`. The parameterization type of the diffusion model.
                    "noise" or "x_start" or "v" or "score".
        model_kwargs: A `dict`. A dict for the other inputs of the model function.
                    `None` (the default) means no extra inputs.
        guidance_type: A `str`. The type of the guidance for sampling.
                    "uncond" or "classifier" or "classifier-free".
        condition: A pytorch tensor. The condition for the guided sampling.
                    Only used for "classifier" or "classifier-free" guidance type.
        unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
                    Only used for "classifier-free" guidance type.
        guidance_scale: A `float`. The scale for the guided sampling.
        classifier_fn: A classifier function. Only used for the classifier guidance.
        classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function.
                    `None` (the default) means no extra inputs.
    Returns:
        A noise prediction model that accepts the noised data and the continuous time as the inputs.
    """
    # Use None sentinels rather than shared mutable `{}` defaults.
    if model_kwargs is None:
        model_kwargs = {}
    if classifier_kwargs is None:
        classifier_kwargs = {}

    def get_model_input_time(t_continuous):
        """
        Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
        For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
        For continuous-time DPMs, we just use `t_continuous`.
        """
        if noise_schedule.schedule == "discrete":
            return (t_continuous - 1.0 / noise_schedule.total_N) * 1000.0
        else:
            return t_continuous

    def noise_pred_fn(x, t_continuous, cond=None):
        """
        Call the model and convert its output to a noise prediction according to `model_type`.
        """
        t_input = get_model_input_time(t_continuous)
        if cond is None:
            output = model(x, t_input, **model_kwargs)
        else:
            output = model(x, t_input, cond, **model_kwargs)
        if model_type == "noise":
            return output
        elif model_type == "x_start":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return (x - expand_dims(alpha_t, x.dim()) * output) / expand_dims(sigma_t, x.dim())
        elif model_type == "v":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return expand_dims(alpha_t, x.dim()) * output + expand_dims(sigma_t, x.dim()) * x
        elif model_type == "score":
            sigma_t = noise_schedule.marginal_std(t_continuous)
            return -expand_dims(sigma_t, x.dim()) * output

    def cond_grad_fn(x, t_input):
        """
        Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
        """
        # Gradients are enabled locally so this works inside no-grad sampling.
        with torch.enable_grad():
            x_in = x.detach().requires_grad_(True)
            log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
            return torch.autograd.grad(log_prob.sum(), x_in)[0]

    def model_fn(x, t_continuous):
        """
        The noise prediction model function that is used for DPM-Solver.
        """
        if guidance_type == "uncond":
            return noise_pred_fn(x, t_continuous)
        elif guidance_type == "classifier":
            assert classifier_fn is not None
            t_input = get_model_input_time(t_continuous)
            cond_grad = cond_grad_fn(x, t_input)
            sigma_t = noise_schedule.marginal_std(t_continuous)
            noise = noise_pred_fn(x, t_continuous)
            return noise - guidance_scale * expand_dims(sigma_t, x.dim()) * cond_grad
        elif guidance_type == "classifier-free":
            # Without guidance (scale 1) or without an unconditional input,
            # a single conditional pass is enough.
            if guidance_scale == 1.0 or unconditional_condition is None:
                return noise_pred_fn(x, t_continuous, cond=condition)
            # One batched pass: first half unconditional, second half conditional.
            x_in = torch.cat([x] * 2)
            t_in = torch.cat([t_continuous] * 2)
            c_in = torch.cat([unconditional_condition, condition])
            noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
            return noise_uncond + guidance_scale * (noise - noise_uncond)

    assert model_type in ["noise", "x_start", "v", "score"]
    assert guidance_type in ["uncond", "classifier", "classifier-free"]
    return model_fn


class DPM_Solver:
    def __init__(
        self,
        model_fn,
        noise_schedule,
        algorithm_type="dpmsolver++",
        correcting_x0_fn=None,
        correcting_xt_fn=None,
        thresholding_max_val=1.0,
        dynamic_thresholding_ratio=0.995,
    ):
        """Construct a DPM-Solver.

        We support both DPM-Solver (`algorithm_type="dpmsolver"`) and DPM-Solver++ (`algorithm_type="dpmsolver++"`).

        We also support the "dynamic thresholding" method in Imagen[1]. For pixel-space diffusion models, you
        can set both `algorithm_type="dpmsolver++"` and `correcting_x0_fn="dynamic_thresholding"` to use the
        dynamic thresholding. The "dynamic thresholding" can greatly improve the sample quality for pixel-space
        DPMs with large guidance scales. Note that the thresholding method is **unsuitable** for latent-space
        DPMs (such as stable-diffusion).

        To support advanced algorithms in image-to-image applications, we also support corrector functions for
        both x0 and xt.

        Args:
            model_fn: A noise prediction model function which accepts the continuous-time input (t in [epsilon, T]):
                ``
                def model_fn(x, t_continuous):
                    return noise
                ``
                The shape of `x` is `(batch_size, **shape)`, and the shape of `t_continuous` is `(batch_size,)`.
            noise_schedule: A noise schedule object, such as NoiseScheduleVP.
            algorithm_type: A `str`. Either "dpmsolver" or "dpmsolver++".
            correcting_x0_fn: A `str` or a function with the following format:
                ```
                def correcting_x0_fn(x0, t):
                    x0_new = ...
                    return x0_new
                ```
                This function is to correct the outputs of the data prediction model at each sampling step. e.g.,
                ```
                x0_pred = data_pred_model(xt, t)
                if correcting_x0_fn is not None:
                    x0_pred = correcting_x0_fn(x0_pred, t)
                xt_1 = update(x0_pred, xt, t)
                ```
                If `correcting_x0_fn="dynamic_thresholding"`, we use the dynamic thresholding proposed in Imagen[1].
            correcting_xt_fn: A function with the following format:
                ```
                def correcting_xt_fn(xt, t, step):
                    x_new = ...
                    return x_new
                ```
                This function is to correct the intermediate samples xt at each sampling step. e.g.,
                ```
                xt = ...
                xt = correcting_xt_fn(xt, t, step)
                ```
            thresholding_max_val: A `float`. The max value for thresholding.
                Valid only when use `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`.
            dynamic_thresholding_ratio: A `float`. The ratio for dynamic thresholding (see Imagen[1] for details).
                Valid only when use `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`.

        [1] Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour,
            Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models
            with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b.
        """
        self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
        self.noise_schedule = noise_schedule
        assert algorithm_type in ["dpmsolver", "dpmsolver++"]
        self.algorithm_type = algorithm_type
        if correcting_x0_fn == "dynamic_thresholding":
            self.correcting_x0_fn = self.dynamic_thresholding_fn
        else:
            self.correcting_x0_fn = correcting_x0_fn
        self.correcting_xt_fn = correcting_xt_fn
        self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
        self.thresholding_max_val = thresholding_max_val

    def dynamic_thresholding_fn(self, x0, t):
        """
        Apply dynamic thresholding to a data prediction.

        For every sample, the `dynamic_thresholding_ratio` quantile of |x0| is taken over
        all non-batch elements and raised to at least `thresholding_max_val`. `x0` is then
        clamped to [-s, s] and divided by s. The time argument `t` is not used.
        """
        n_dims = x0.dim()
        ratio = self.dynamic_thresholding_ratio
        flat_abs = torch.abs(x0).reshape((x0.shape[0], -1))
        per_sample_q = torch.quantile(flat_abs, ratio, dim=1)
        floor = self.thresholding_max_val * torch.ones_like(per_sample_q).to(per_sample_q.device)
        scale = expand_dims(torch.maximum(per_sample_q, floor), n_dims)
        return torch.clamp(x0, -scale, scale) / scale

    def noise_prediction_fn(self, x, t):
        """
        Return the noise prediction model.
        """
        return self.model(x, t)

    def data_prediction_fn(self, x, t):
        """
        Return the data prediction model (with corrector).
        """
        noise = self.noise_prediction_fn(x, t)
        alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
        x0 = (x - sigma_t * noise) / alpha_t
        if self.correcting_x0_fn is not None:
            x0 = self.correcting_x0_fn(x0, t)
        return x0

    def model_fn(self, x, t):
        """
        Convert the model to the noise prediction model or the data prediction model.
        """
        if self.algorithm_type == "dpmsolver++":
            return self.data_prediction_fn(x, t)
        else:
            return self.noise_prediction_fn(x, t)

    def get_time_steps(self, skip_type, t_T, t_0, N, device):
        """Compute the intermediate time steps for sampling.

        Args:
            skip_type: A `str`. The type for the spacing of the time steps. We support three types:
                - 'logSNR': uniform logSNR for the time steps.
                - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.)
                - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
            t_T: A `float`. The starting time of the sampling (default is T).
            t_0: A `float`. The ending time of the sampling (default is epsilon).
            N: A `int`. The total number of the spacing of the time steps.
            device: A torch device.
        Returns:
            A pytorch tensor of the time steps, with the shape (N + 1,).
        """
        if skip_type == "logSNR":
            lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
            lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
            logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
            return self.noise_schedule.inverse_lambda(logSNR_steps)
        elif skip_type == "time_uniform":
            return torch.linspace(t_T, t_0, N + 1).to(device)
        elif skip_type == "time_quadratic":
            t_order = 2
            return torch.linspace(t_T ** (1.0 / t_order), t_0 ** (1.0 / t_order), N + 1).pow(t_order).to(device)
        else:
            raise ValueError(
                f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'"
            )

    def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
        """
        Get the order of each step for sampling by the singlestep DPM-Solver.

        We combine both DPM-Solver-1,2,3 to use all the function evaluations, which is named as "DPM-Solver-fast".
        Given a fixed number of function evaluations by `steps`, the sampling procedure by DPM-Solver-fast is:
            - If order == 1:
                We take `steps` of DPM-Solver-1 (i.e. DDIM).
            - If order == 2:
                - Denote K = (steps // 2). We take K or (K + 1) intermediate time steps for sampling.
                - If steps % 2 == 0, we use K steps of DPM-Solver-2.
                - If steps % 2 == 1, we use K steps of DPM-Solver-2 and 1 step of DPM-Solver-1.
            - If order == 3:
                - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
                - If steps % 3 == 0, we use (K - 2) steps of DPM-Solver-3, and 1 step of DPM-Solver-2 and 1 step of DPM-Solver-1.
                - If steps % 3 == 1, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-1.
                - If steps % 3 == 2, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-2.

        ============================================
        Args:
            order: A `int`. The max order for the solver (2 or 3).
            steps: A `int`. The total number of function evaluations (NFE).
            skip_type: A `str`. The type for the spacing of the time steps. We support three types:
                - 'logSNR': uniform logSNR for the time steps.
                - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.)
                - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
            t_T: A `float`. The starting time of the sampling (default is T).
            t_0: A `float`. The ending time of the sampling (default is epsilon).
            device: A torch device.
        Returns:
            orders: A list of the solver order of each step.
        """
        if order == 3:
            K = steps // 3 + 1
            if steps % 3 == 0:
                orders = [
                    3,
                ] * (
                    K - 2
                ) + [2, 1]
            elif steps % 3 == 1:
                orders = [
                    3,
                ] * (
                    K - 1
                ) + [1]
            else:
                orders = [
                    3,
                ] * (
                    K - 1
                ) + [2]
        elif order == 2:
            if steps % 2 == 0:
                K = steps // 2
                orders = [
                    2,
                ] * K
            else:
                K = steps // 2 + 1
                orders = [
                    2,
                ] * (
                    K - 1
                ) + [1]
        elif order == 1:
            K = 1
            orders = [
                1,
            ] * steps
        else:
            raise ValueError("'order' must be '1' or '2' or '3'.")
        if skip_type == "logSNR":
            # To reproduce the results in DPM-Solver paper
            timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
        else:
            timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[
                torch.cumsum(
                    torch.tensor(
                        [
                            0,
                        ]
                        + orders
                    ),
                    0,
                ).to(device)
            ]
        return timesteps_outer, orders

    def denoise_to_zero_fn(self, x, s):
        """
        Denoise at the final step, which is equivalent to solve the ODE from lambda_s to infty by first-order discretization.
        """
        return self.data_prediction_fn(x, s)

    def dpm_solver_first_update(self, x, s, t, model_s=None, return_intermediate=False):
        """
        DPM-Solver-1 (equivalent to DDIM) from time `s` to time `t`.

        Args:
            x: A pytorch tensor. The initial value at time `s`.
            s: A pytorch tensor. The starting time, with the shape (1,).
            t: A pytorch tensor. The ending time, with the shape (1,).
            model_s: A pytorch tensor. The model function evaluated at time `s`.
                If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
            return_intermediate: A `bool`. If true, also return the model value at time `s`.
        Returns:
            x_t: A pytorch tensor. The approximated solution at time `t`.
        """
        ns = self.noise_schedule
        x.dim()
        lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
        h = lambda_t - lambda_s
        log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t)
        sigma_s, sigma_t = ns.marginal_std(s), ns.marginal_std(t)
        alpha_t = torch.exp(log_alpha_t)

        if self.algorithm_type == "dpmsolver++":
            phi_1 = torch.expm1(-h)
            if model_s is None:
                model_s = self.model_fn(x, s)
            x_t = sigma_t / sigma_s * x - alpha_t * phi_1 * model_s
        else:
            phi_1 = torch.expm1(h)
            if model_s is None:
                model_s = self.model_fn(x, s)
            x_t = torch.exp(log_alpha_t - log_alpha_s) * x - (sigma_t * phi_1) * model_s
        return (x_t, {"model_s": model_s}) if return_intermediate else x_t

    def singlestep_dpm_solver_second_update(
        self, x, s, t, r1=0.5, model_s=None, return_intermediate=False, solver_type="dpmsolver"
    ):
        """
        Singlestep solver DPM-Solver-2 from time `s` to time `t`.

        Args:
            x: A pytorch tensor. The initial value at time `s`.
            s: A pytorch tensor. The starting time, with the shape (1,).
            t: A pytorch tensor. The ending time, with the shape (1,).
            r1: A `float`. The hyperparameter of the second-order solver.
            model_s: A pytorch tensor. The model function evaluated at time `s`.
                If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
            return_intermediate: A `bool`. If true, also return the model value at time `s` and `s1` (the intermediate time).
            solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
                The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
        Returns:
            x_t: A pytorch tensor. The approximated solution at time `t`.
        """
        if solver_type not in ["dpmsolver", "taylor"]:
            raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}")
        if r1 is None:
            r1 = 0.5
        ns = self.noise_schedule
        lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
        h = lambda_t - lambda_s
        lambda_s1 = lambda_s + r1 * h
        s1 = ns.inverse_lambda(lambda_s1)
        log_alpha_s, log_alpha_s1, log_alpha_t = (
            ns.marginal_log_mean_coeff(s),
            ns.marginal_log_mean_coeff(s1),
            ns.marginal_log_mean_coeff(t),
        )
        sigma_s, sigma_s1, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(t)
        alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t)

        if self.algorithm_type == "dpmsolver++":
            phi_11 = torch.expm1(-r1 * h)
            phi_1 = torch.expm1(-h)

            if model_s is None:
                model_s = self.model_fn(x, s)
            x_s1 = (sigma_s1 / sigma_s) * x - (alpha_s1 * phi_11) * model_s
            model_s1 = self.model_fn(x_s1, s1)
            if solver_type == "dpmsolver":
                x_t = (
                    (sigma_t / sigma_s) * x
                    - (alpha_t * phi_1) * model_s
                    - (0.5 / r1) * (alpha_t * phi_1) * (model_s1 - model_s)
                )
            elif solver_type == "taylor":
                x_t = (
                    (sigma_t / sigma_s) * x
                    - (alpha_t * phi_1) * model_s
                    + (1.0 / r1) * (alpha_t * (phi_1 / h + 1.0)) * (model_s1 - model_s)
                )
        else:
            phi_11 = torch.expm1(r1 * h)
            phi_1 = torch.expm1(h)

            if model_s is None:
                model_s = self.model_fn(x, s)
            x_s1 = torch.exp(log_alpha_s1 - log_alpha_s) * x - (sigma_s1 * phi_11) * model_s
            model_s1 = self.model_fn(x_s1, s1)
            if solver_type == "dpmsolver":
                x_t = (
                    torch.exp(log_alpha_t - log_alpha_s) * x
                    - (sigma_t * phi_1) * model_s
                    - (0.5 / r1) * (sigma_t * phi_1) * (model_s1 - model_s)
                )
            elif solver_type == "taylor":
                x_t = (
                    torch.exp(log_alpha_t - log_alpha_s) * x
                    - (sigma_t * phi_1) * model_s
                    - (1.0 / r1) * (sigma_t * (phi_1 / h - 1.0)) * (model_s1 - model_s)
                )
        if return_intermediate:
            return x_t, {"model_s": model_s, "model_s1": model_s1}
        else:
            return x_t

    def singlestep_dpm_solver_third_update(
        self,
        x,
        s,
        t,
        r1=1.0 / 3.0,
        r2=2.0 / 3.0,
        model_s=None,
        model_s1=None,
        return_intermediate=False,
        solver_type="dpmsolver",
    ):
        """
        Singlestep solver DPM-Solver-3 from time `s` to time `t`.

        Args:
            x: A pytorch tensor. The initial value at time `s`.
            s: A pytorch tensor. The starting time, with the shape (1,).
            t: A pytorch tensor. The ending time, with the shape (1,).
            r1: A `float`. The hyperparameter of the third-order solver.
            r2: A `float`. The hyperparameter of the third-order solver.
            model_s: A pytorch tensor. The model function evaluated at time `s`.
                If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
            model_s1: A pytorch tensor. The model function evaluated at time `s1` (the intermediate time given by `r1`).
                If `model_s1` is None, we evaluate the model at `s1`; otherwise we directly use it.
            return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
            solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
                The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
        Returns:
            x_t: A pytorch tensor. The approximated solution at time `t`.
        """
        if solver_type not in ["dpmsolver", "taylor"]:
            raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}")
        if r1 is None:
            r1 = 1.0 / 3.0
        if r2 is None:
            r2 = 2.0 / 3.0
        ns = self.noise_schedule
        lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
        h = lambda_t - lambda_s
        lambda_s1 = lambda_s + r1 * h
        lambda_s2 = lambda_s + r2 * h
        s1 = ns.inverse_lambda(lambda_s1)
        s2 = ns.inverse_lambda(lambda_s2)
        log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = (
            ns.marginal_log_mean_coeff(s),
            ns.marginal_log_mean_coeff(s1),
            ns.marginal_log_mean_coeff(s2),
            ns.marginal_log_mean_coeff(t),
        )
        sigma_s, sigma_s1, sigma_s2, sigma_t = (
            ns.marginal_std(s),
            ns.marginal_std(s1),
            ns.marginal_std(s2),
            ns.marginal_std(t),
        )
        alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t)

        if self.algorithm_type == "dpmsolver++":
            phi_11 = torch.expm1(-r1 * h)
            phi_12 = torch.expm1(-r2 * h)
            phi_1 = torch.expm1(-h)
            phi_22 = torch.expm1(-r2 * h) / (r2 * h) + 1.0
            phi_2 = phi_1 / h + 1.0
            phi_3 = phi_2 / h - 0.5

            if model_s is None:
                model_s = self.model_fn(x, s)
            if model_s1 is None:
                x_s1 = (sigma_s1 / sigma_s) * x - (alpha_s1 * phi_11) * model_s
                model_s1 = self.model_fn(x_s1, s1)
            x_s2 = (
                (sigma_s2 / sigma_s) * x
                - (alpha_s2 * phi_12) * model_s
                + r2 / r1 * (alpha_s2 * phi_22) * (model_s1 - model_s)
            )
            model_s2 = self.model_fn(x_s2, s2)
            if solver_type == "dpmsolver":
                x_t = (
                    (sigma_t / sigma_s) * x
                    - (alpha_t * phi_1) * model_s
                    + (1.0 / r2) * (alpha_t * phi_2) * (model_s2 - model_s)
                )
            elif solver_type == "taylor":
                D1_0 = (1.0 / r1) * (model_s1 - model_s)
                D1_1 = (1.0 / r2) * (model_s2 - model_s)
                D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
                D2 = 2.0 * (D1_1 - D1_0) / (r2 - r1)
                x_t = (
                    (sigma_t / sigma_s) * x
                    - (alpha_t * phi_1) * model_s
                    + (alpha_t * phi_2) * D1
                    - (alpha_t * phi_3) * D2
                )
        else:
            phi_11 = torch.expm1(r1 * h)
            phi_12 = torch.expm1(r2 * h)
            phi_1 = torch.expm1(h)
            phi_22 = torch.expm1(r2 * h) / (r2 * h) - 1.0
            phi_2 = phi_1 / h - 1.0
            phi_3 = phi_2 / h - 0.5

            if model_s is None:
                model_s = self.model_fn(x, s)
            if model_s1 is None:
                x_s1 = (torch.exp(log_alpha_s1 - log_alpha_s)) * x - (sigma_s1 * phi_11) * model_s
                model_s1 = self.model_fn(x_s1, s1)
            x_s2 = (
                (torch.exp(log_alpha_s2 - log_alpha_s)) * x
                - (sigma_s2 * phi_12) * model_s
                - r2 / r1 * (sigma_s2 * phi_22) * (model_s1 - model_s)
            )
            model_s2 = self.model_fn(x_s2, s2)
            if solver_type == "dpmsolver":
                x_t = (
                    (torch.exp(log_alpha_t - log_alpha_s)) * x
                    - (sigma_t * phi_1) * model_s
                    - (1.0 / r2) * (sigma_t * phi_2) * (model_s2 - model_s)
                )
            elif solver_type == "taylor":
                D1_0 = (1.0 / r1) * (model_s1 - model_s)
                D1_1 = (1.0 / r2) * (model_s2 - model_s)
                D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
                D2 = 2.0 * (D1_1 - D1_0) / (r2 - r1)
                x_t = (
                    (torch.exp(log_alpha_t - log_alpha_s)) * x
                    - (sigma_t * phi_1) * model_s
                    - (sigma_t * phi_2) * D1
                    - (sigma_t * phi_3) * D2
                )

        if return_intermediate:
            return x_t, {"model_s": model_s, "model_s1": model_s1, "model_s2": model_s2}
        else:
            return x_t

    def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpmsolver"):
        """
        Multistep solver DPM-Solver-2 from time `t_prev_list[-1]` to time `t`.

        Args:
            x: A pytorch tensor. The initial value at time `s`.
            model_prev_list: A list of pytorch tensor. The previous computed model values.
            t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
            t: A pytorch tensor. The ending time, with the shape (1,).
            solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
                The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
        Returns:
            x_t: A pytorch tensor. The approximated solution at time `t`.
        """
        if solver_type not in ["dpmsolver", "taylor"]:
            raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}")
        ns = self.noise_schedule
        model_prev_1, model_prev_0 = model_prev_list[-2], model_prev_list[-1]
        t_prev_1, t_prev_0 = t_prev_list[-2], t_prev_list[-1]
        lambda_prev_1, lambda_prev_0, lambda_t = (
            ns.marginal_lambda(t_prev_1),
            ns.marginal_lambda(t_prev_0),
            ns.marginal_lambda(t),
        )
        log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
        sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
        alpha_t = torch.exp(log_alpha_t)

        h_0 = lambda_prev_0 - lambda_prev_1
        h = lambda_t - lambda_prev_0
        r0 = h_0 / h
        D1_0 = (1.0 / r0) * (model_prev_0 - model_prev_1)
        if self.algorithm_type == "dpmsolver++":
            phi_1 = torch.expm1(-h)
            if solver_type == "dpmsolver":
                x_t = (sigma_t / sigma_prev_0) * x - (alpha_t * phi_1) * model_prev_0 - 0.5 * (alpha_t * phi_1) * D1_0
            elif solver_type == "taylor":
                x_t = (
                    (sigma_t / sigma_prev_0) * x
                    - (alpha_t * phi_1) * model_prev_0
                    + (alpha_t * (phi_1 / h + 1.0)) * D1_0
                )
        else:
            phi_1 = torch.expm1(h)
            if solver_type == "dpmsolver":
                x_t = (
                    (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
                    - (sigma_t * phi_1) * model_prev_0
                    - 0.5 * (sigma_t * phi_1) * D1_0
                )
            elif solver_type == "taylor":
                x_t = (
                    (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
                    - (sigma_t * phi_1) * model_prev_0
                    - (sigma_t * (phi_1 / h - 1.0)) * D1_0
                )
        return x_t

    def multistep_dpm_solver_third_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpmsolver"):
        """
        Multistep solver DPM-Solver-3 from time `t_prev_list[-1]` to time `t`.

        Args:
            x: A pytorch tensor. The initial value at time `s`.
            model_prev_list: A list of pytorch tensor. The previous computed model values.
            t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
            t: A pytorch tensor. The ending time, with the shape (1,).
            solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
                The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
        Returns:
            x_t: A pytorch tensor. The approximated solution at time `t`.
        """
        ns = self.noise_schedule
        model_prev_2, model_prev_1, model_prev_0 = model_prev_list
        t_prev_2, t_prev_1, t_prev_0 = t_prev_list
        lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = (
            ns.marginal_lambda(t_prev_2),
            ns.marginal_lambda(t_prev_1),
            ns.marginal_lambda(t_prev_0),
            ns.marginal_lambda(t),
        )
        log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
        sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
        alpha_t = torch.exp(log_alpha_t)

        h_1 = lambda_prev_1 - lambda_prev_2
        h_0 = lambda_prev_0 - lambda_prev_1
        h = lambda_t - lambda_prev_0
        r0, r1 = h_0 / h, h_1 / h
        D1_0 = (1.0 / r0) * (model_prev_0 - model_prev_1)
        D1_1 = (1.0 / r1) * (model_prev_1 - model_prev_2)
        D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
        D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
        if self.algorithm_type == "dpmsolver++":
            phi_1 = torch.expm1(-h)
            phi_2 = phi_1 / h + 1.0
            phi_3 = phi_2 / h - 0.5
            return (
                (sigma_t / sigma_prev_0) * x
                - (alpha_t * phi_1) * model_prev_0
                + (alpha_t * phi_2) * D1
                - (alpha_t * phi_3) * D2
            )
        else:
            phi_1 = torch.expm1(h)
            phi_2 = phi_1 / h - 1.0
            phi_3 = phi_2 / h - 0.5
            return (
                (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
                - (sigma_t * phi_1) * model_prev_0
                - (sigma_t * phi_2) * D1
                - (sigma_t * phi_3) * D2
            )

    def singlestep_dpm_solver_update(
        self, x, s, t, order, return_intermediate=False, solver_type="dpmsolver", r1=None, r2=None
    ):
        """
        Singlestep DPM-Solver with the order `order` from time `s` to time `t`.

        Args:
            x: A pytorch tensor. The initial value at time `s`.
            s: A pytorch tensor. The starting time, with the shape (1,).
            t: A pytorch tensor. The ending time, with the shape (1,).
            order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
            return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
            solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
                The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
            r1: A `float`. The hyperparameter of the second-order or third-order solver.
            r2: A `float`. The hyperparameter of the third-order solver.
        Returns:
            x_t: A pytorch tensor. The approximated solution at time `t`.
        """
        if order == 1:
            return self.dpm_solver_first_update(x, s, t, return_intermediate=return_intermediate)
        elif order == 2:
            return self.singlestep_dpm_solver_second_update(
                x, s, t, return_intermediate=return_intermediate, solver_type=solver_type, r1=r1
            )
        elif order == 3:
            return self.singlestep_dpm_solver_third_update(
                x, s, t, return_intermediate=return_intermediate, solver_type=solver_type, r1=r1, r2=r2
            )
        else:
            raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}")

    def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type="dpmsolver"):
        """
        Multistep DPM-Solver with the order `order` from time `t_prev_list[-1]` to time `t`.

        Args:
            x: A pytorch tensor. The initial value at time `s`.
            model_prev_list: A list of pytorch tensor. The previous computed model values.
            t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
            t: A pytorch tensor. The ending time, with the shape (1,).
            order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
            solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
                The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
        Returns:
            x_t: A pytorch tensor. The approximated solution at time `t`.
        """
        if order == 1:
            return self.dpm_solver_first_update(x, t_prev_list[-1], t, model_s=model_prev_list[-1])
        elif order == 2:
            return self.multistep_dpm_solver_second_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
        elif order == 3:
            return self.multistep_dpm_solver_third_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
        else:
            raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}")

    def dpm_solver_adaptive(
        self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5, solver_type="dpmsolver"
    ):
        """
        The adaptive step size solver based on singlestep DPM-Solver.

        Each iteration proposes a step of size `h` in logSNR, computes a lower-order and a
        higher-order estimate for it, and accepts the step only when the scaled difference
        between the two estimates is at most 1. The step size is then rescaled from that
        difference, whether or not the step was accepted.

        Args:
            x: A pytorch tensor. The initial value at time `t_T`.
            order: A `int`. The (higher) order of the solver. We only support order == 2 or 3.
            t_T: A `float`. The starting time of the sampling (default is T).
            t_0: A `float`. The ending time of the sampling (default is epsilon).
            h_init: A `float`. The initial step size (for logSNR).
            atol: A `float`. The absolute tolerance of the solver. For image data, the default setting is 0.0078, followed [1].
            rtol: A `float`. The relative tolerance of the solver. The default setting is 0.05.
            theta: A `float`. The safety hyperparameter for adapting the step size. The default setting is 0.9, followed [1].
            t_err: A `float`. The tolerance for the time. We solve the diffusion ODE until the absolute error between the
                current time and `t_0` is less than `t_err`. The default setting is 1e-5.
            solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
                The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
        Returns:
            x_0: A pytorch tensor. The approximated solution at time `t_0`.
        Raises:
            ValueError: if `order` is neither 2 nor 3.

        Note: the accumulated `nfe` counter is written to stdout with `print` before returning.

        [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021.
        """
        ns = self.noise_schedule
        # `s` is the current time; it starts at `t_T` and is moved towards `t_0` by accepted steps.
        s = t_T * torch.ones((1,)).to(x)
        lambda_s = ns.marginal_lambda(s)
        lambda_0 = ns.marginal_lambda(t_0 * torch.ones_like(s).to(x))
        # `h` is the proposed step size, expressed in the `marginal_lambda` domain.
        h = h_init * torch.ones_like(s).to(x)
        x_prev = x
        nfe = 0
        # Pick the (lower, higher) solver pair. The lower-order update is called with
        # `return_intermediate=True` and its second return value is forwarded as keyword
        # arguments to the higher-order update (see the loop below).
        if order == 2:
            r1 = 0.5
            lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True)
            higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(
                x, s, t, r1=r1, solver_type=solver_type, **kwargs
            )
        elif order == 3:
            r1, r2 = 1.0 / 3.0, 2.0 / 3.0
            lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(
                x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type
            )
            higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(
                x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs
            )
        else:
            raise ValueError(f"For adaptive step size solver, order must be 2 or 3, got {order}")
        # Iterate until the current time `s` is within `t_err` of the target time `t_0`.
        while torch.abs((s - t_0)).mean() > t_err:
            # Candidate end time of this step: advance lambda by `h` and map back to time.
            t = ns.inverse_lambda(lambda_s + h)
            x_lower, lower_noise_kwargs = lower_update(x, s, t)
            x_higher = higher_update(x, s, t, **lower_noise_kwargs)
            # Element-wise tolerance: max(atol, rtol * max(|x_lower|, |x_prev|)).
            delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
            # Root-mean-square over all non-batch elements, one value per batch item.
            norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
            # Scaled error estimate: the largest per-item RMS of the tolerance-scaled difference.
            E = norm_fn((x_higher - x_lower) / delta).max()
            if torch.all(E <= 1.0):
                # Step accepted: keep the higher-order estimate and advance the current time.
                x = x_higher
                s = t
                x_prev = x_lower
                lambda_s = ns.marginal_lambda(s)
            # Rescale the step size from the error estimate (also after a rejected step),
            # and cap it by the remaining distance `lambda_0 - lambda_s`.
            h = torch.min(theta * h * torch.float_power(E, -1.0 / order).float(), lambda_0 - lambda_s)
            # The counter grows by `order` for every attempted step, accepted or not.
            nfe += order
        print("adaptive solver nfe", nfe)
        return x

    def add_noise(self, x, t, noise=None):
        """
        Diffuse `x` to the time(s) `t`: xt = alpha_t * x + sigma_t * noise.

        Args:
            x: A `torch.Tensor` with shape `(batch_size, *shape)`.
            t: A `torch.Tensor` with shape `(t_size,)`.
            noise: Optional noise tensor. When omitted, standard normal noise of shape
                `(t_size, batch_size, *shape)` is drawn on `x.device`.
        Returns:
            xt with shape `(t_size, batch_size, *shape)`; the leading axis is squeezed
            away when `t_size` == 1.
        """
        schedule = self.noise_schedule
        alpha_t = schedule.marginal_alpha(t)
        sigma_t = schedule.marginal_std(t)
        num_times = t.shape[0]
        if noise is None:
            noise = torch.randn((num_times, *x.shape), device=x.device)
        # Prepend an axis so that `x` broadcasts against the per-time coefficients.
        x = x.reshape((-1, *x.shape))
        rank = x.dim()
        signal_part = expand_dims(alpha_t, rank) * x
        noise_part = expand_dims(sigma_t, rank) * noise
        xt = signal_part + noise_part
        if num_times == 1:
            return xt.squeeze(0)
        return xt

    def inverse(
        self,
        x,
        steps=20,
        t_start=None,
        t_end=None,
        order=2,
        skip_type="time_uniform",
        method="multistep",
        lower_order_final=True,
        denoise_to_zero=False,
        solver_type="dpmsolver",
        atol=0.0078,
        rtol=0.05,
        return_intermediate=False,
        progress=True,
    ):
        """
        Inverse the sample `x` from time `t_start` to `t_end` by DPM-Solver.
        For discrete-time DPMs, we use `t_start=1/N`, where `N` is the total time steps during training.

        This is a thin wrapper around `sample` that only changes the default time range:
        the solver runs from the small time `t_start` up to the large time `t_end`.

        Args:
            x: The value at time `t_start`.
            t_start: Starting time. If None, `1.0 / self.noise_schedule.total_N` is used.
            t_end: Ending time. If None, `self.noise_schedule.T` is used.
            progress: Forwarded to `sample`. Defaults to `True`, which matches the value
                `sample` used when this method could not pass it.
            steps, order, skip_type, method, lower_order_final, denoise_to_zero, solver_type,
            atol, rtol, return_intermediate: forwarded unchanged to `sample`.
        Returns:
            Whatever `sample` returns for the resolved time range.
        """
        t_0 = 1.0 / self.noise_schedule.total_N if t_start is None else t_start
        t_T = self.noise_schedule.T if t_end is None else t_end
        assert (
            t_0 > 0 and t_T > 0
        ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
        return self.sample(
            x,
            steps=steps,
            t_start=t_0,
            t_end=t_T,
            order=order,
            skip_type=skip_type,
            method=method,
            lower_order_final=lower_order_final,
            denoise_to_zero=denoise_to_zero,
            solver_type=solver_type,
            atol=atol,
            rtol=rtol,
            return_intermediate=return_intermediate,
            progress=progress,
        )

    def sample(
        self,
        x,
        steps=20,
        t_start=None,
        t_end=None,
        order=2,
        skip_type="time_uniform",
        method="multistep",
        lower_order_final=True,
        denoise_to_zero=False,
        solver_type="dpmsolver",
        atol=0.0078,
        rtol=0.05,
        return_intermediate=False,
        progress=True,
    ):
        """
        Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`.

        =====================================================

        We support the following algorithms for both noise prediction model and data prediction model:
            - 'singlestep':
                Singlestep DPM-Solver (i.e. "DPM-Solver-fast" in the paper), which combines different orders of singlestep DPM-Solver.
                We combine all the singlestep solvers with order <= `order` to use up all the function evaluations (steps).
                The total number of function evaluations (NFE) == `steps`.
                Given a fixed NFE == `steps`, the sampling procedure is:
                    - If `order` == 1:
                        - Denote K = steps. We use K steps of DPM-Solver-1 (i.e. DDIM).
                    - If `order` == 2:
                        - Denote K = (steps // 2) + (steps % 2). We take K intermediate time steps for sampling.
                        - If steps % 2 == 0, we use K steps of singlestep DPM-Solver-2.
                        - If steps % 2 == 1, we use (K - 1) steps of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
                    - If `order` == 3:
                        - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
                        - If steps % 3 == 0, we use (K - 2) steps of singlestep DPM-Solver-3, and 1 step of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
                        - If steps % 3 == 1, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of DPM-Solver-1.
                        - If steps % 3 == 2, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of singlestep DPM-Solver-2.
            - 'multistep':
                Multistep DPM-Solver with the order of `order`. The total number of function evaluations (NFE) == `steps`.
                We initialize the first `order` values by lower order multistep solvers.
                Given a fixed NFE == `steps`, the sampling procedure is:
                    Denote K = steps.
                    - If `order` == 1:
                        - We use K steps of DPM-Solver-1 (i.e. DDIM).
                    - If `order` == 2:
                        - We first use 1 step of DPM-Solver-1, then use (K - 1) steps of multistep DPM-Solver-2.
                    - If `order` == 3:
                        - We first use 1 step of DPM-Solver-1, then 1 step of multistep DPM-Solver-2, then (K - 2) steps of multistep DPM-Solver-3.
            - 'singlestep_fixed':
                Fixed order singlestep DPM-Solver (i.e. DPM-Solver-1 or singlestep DPM-Solver-2 or singlestep DPM-Solver-3).
                We use singlestep DPM-Solver-`order` for `order`=1 or 2 or 3, with total [`steps` // `order`] * `order` NFE.
            - 'adaptive':
                Adaptive step size DPM-Solver (i.e. "DPM-Solver-12" and "DPM-Solver-23" in the paper).
                We ignore `steps` and use adaptive step size DPM-Solver with a higher order of `order`.
                You can adjust the absolute tolerance `atol` and the relative tolerance `rtol` to balance the computation costs
                (NFE) and the sample quality.
                    - If `order` == 2, we use DPM-Solver-12 which combines DPM-Solver-1 and singlestep DPM-Solver-2.
                    - If `order` == 3, we use DPM-Solver-23 which combines singlestep DPM-Solver-2 and singlestep DPM-Solver-3.

        =====================================================

        Some advice for choosing the algorithm:
            - For **unconditional sampling** or **guided sampling with small guidance scale** by DPMs:
                Use singlestep DPM-Solver or DPM-Solver++ ("DPM-Solver-fast" in the paper) with `order = 3`.
                e.g., DPM-Solver:
                    >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver")
                    >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
                            skip_type='time_uniform', method='singlestep')
                e.g., DPM-Solver++:
                    >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
                    >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
                            skip_type='time_uniform', method='singlestep')
            - For **guided sampling with large guidance scale** by DPMs:
                Use multistep DPM-Solver with `algorithm_type="dpmsolver++"` and `order = 2`.
                e.g.
                    >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
                    >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=2,
                            skip_type='time_uniform', method='multistep')

        We support three types of `skip_type`:
            - 'logSNR': uniform logSNR for the time steps. **Recommended for low-resolution images**
            - 'time_uniform': uniform time for the time steps. **Recommended for high-resolution images**.
            - 'time_quadratic': quadratic time for the time steps.

        =====================================================
        Args:
            x: A pytorch tensor. The initial value at time `t_start`
                e.g. if `t_start` == T, then `x` is a sample from the standard normal distribution.
            steps: A `int`. The total number of function evaluations (NFE).
            t_start: A `float`. The starting time of the sampling.
                If `t_start` is None, we use self.noise_schedule.T (default is 1.0).
            t_end: A `float`. The ending time of the sampling.
                If `t_end` is None, we use 1. / self.noise_schedule.total_N.
                e.g. if total_N == 1000, we have `t_end` == 1e-3.
                For discrete-time DPMs:
                    - We recommend `t_end` == 1. / self.noise_schedule.total_N.
                For continuous-time DPMs:
                    - We recommend `t_end` == 1e-3 when `steps` <= 15; and `t_end` == 1e-4 when `steps` > 15.
            order: A `int`. The order of DPM-Solver.
            skip_type: A `str`. The type for the spacing of the time steps. 'time_uniform' or 'logSNR' or 'time_quadratic'.
            method: A `str`. The method for sampling. 'singlestep' or 'multistep' or 'singlestep_fixed' or 'adaptive'.
            denoise_to_zero: A `bool`. Whether to denoise to time 0 at the final step.
                Default is `False`. If `denoise_to_zero` is `True`, the total NFE is (`steps` + 1).

                This trick was first proposed by DDPM (https://arxiv.org/abs/2006.11239) and
                score_sde (https://arxiv.org/abs/2011.13456). Such a trick can improve the FID
                for diffusion models sampling by diffusion SDEs for low-resolution images
                (such as CIFAR-10). However, we observed that such a trick does not matter for
                high-resolution images. As it needs an additional NFE, we do not recommend
                it for high-resolution images.
            lower_order_final: A `bool`. Whether to use lower order solvers at the final steps.
                Only used when `method` == 'multistep'. When `True`, each step of the main loop
                uses order `min(order, steps + 1 - step)`, whatever the value of `steps` is.
                We empirically find that this trick is a key to stabilizing the sampling by
                DPM-Solver with very few steps (especially for steps <= 10). So we recommend
                to set it to be `True`.
            solver_type: A `str`. The taylor expansion type for the solver. `dpmsolver` or `taylor`. We recommend `dpmsolver`.
            atol: A `float`. The absolute tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
            rtol: A `float`. The relative tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
            return_intermediate: A `bool`. Whether to save the xt at each step.
                When set to `True`, method returns a tuple (x0, intermediates); when set to False, method returns only x0.
                Not supported together with `method` == 'adaptive'.
            progress: A `bool`. Whether to wrap the main loop of the 'multistep' method in `tqdm`.
                It is not consulted by the other methods.
        Returns:
            x_end: A pytorch tensor. The approximated solution at time `t_end`.
                If `return_intermediate` is `True`, the tuple `(x_end, intermediates)` is returned instead.
        Raises:
            ValueError: if `method` is not one of the four supported names.

        """
        # NOTE: here `t_0` is the (small) ending time and `t_T` the (large) starting time.
        t_0 = 1.0 / self.noise_schedule.total_N if t_end is None else t_end
        t_T = self.noise_schedule.T if t_start is None else t_start
        assert (
            t_0 > 0 and t_T > 0
        ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
        # The adaptive solver neither records intermediates nor applies `correcting_xt_fn`.
        if return_intermediate:
            assert method in [
                "multistep",
                "singlestep",
                "singlestep_fixed",
            ], "Cannot use adaptive solver when saving intermediate values"
        if self.correcting_xt_fn is not None:
            assert method in [
                "multistep",
                "singlestep",
                "singlestep_fixed",
            ], "Cannot use adaptive solver when correcting_xt_fn is not None"
        device = x.device
        intermediates = []
        with torch.no_grad():
            if method == "adaptive":
                x = self.dpm_solver_adaptive(
                    x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol, solver_type=solver_type
                )
            elif method == "multistep":
                assert steps >= order
                timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
                assert timesteps.shape[0] - 1 == steps
                # Init the initial values.
                step = 0
                t = timesteps[step]
                t_prev_list = [t]
                model_prev_list = [self.model_fn(x, t)]
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)
                # Init the first `order` values by lower order multistep DPM-Solver.
                for step in range(1, order):
                    t = timesteps[step]
                    x = self.multistep_dpm_solver_update(
                        x, model_prev_list, t_prev_list, t, step, solver_type=solver_type
                    )
                    if self.correcting_xt_fn is not None:
                        x = self.correcting_xt_fn(x, t, step)
                    if return_intermediate:
                        intermediates.append(x)
                    t_prev_list.append(t)
                    model_prev_list.append(self.model_fn(x, t))
                # Compute the remaining values by `order`-th order multistep DPM-Solver.
                progress_fn = tqdm if progress else lambda x: x
                for step in progress_fn(range(order, steps + 1)):
                    t = timesteps[step]
                    # With `lower_order_final`, the order is reduced near the end of the schedule
                    # (the last step uses order 1); this applies for any number of `steps`.
                    if lower_order_final:  # recommended by Shuchen Xue
                        step_order = min(order, steps + 1 - step)
                    else:
                        step_order = order
                    x = self.multistep_dpm_solver_update(
                        x, model_prev_list, t_prev_list, t, step_order, solver_type=solver_type
                    )
                    if self.correcting_xt_fn is not None:
                        x = self.correcting_xt_fn(x, t, step)
                    if return_intermediate:
                        intermediates.append(x)
                    # Shift the history buffers left by one slot; the newest entry goes last.
                    for i in range(order - 1):
                        t_prev_list[i] = t_prev_list[i + 1]
                        model_prev_list[i] = model_prev_list[i + 1]
                    t_prev_list[-1] = t
                    # We do not need to evaluate the final model value.
                    if step < steps:
                        model_prev_list[-1] = self.model_fn(x, t)
            elif method in ["singlestep", "singlestep_fixed"]:
                if method == "singlestep":
                    timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(
                        steps=steps, order=order, skip_type=skip_type, t_T=t_T, t_0=t_0, device=device
                    )
                elif method == "singlestep_fixed":
                    K = steps // order
                    orders = [
                        order,
                    ] * K
                    timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device)
                # NOTE: the loop variable rebinds `order` to the per-step order from `orders`.
                for step, order in enumerate(orders):
                    s, t = timesteps_outer[step], timesteps_outer[step + 1]
                    # Inner time grid between `s` and `t`, used to derive the ratios r1 and r2.
                    timesteps_inner = self.get_time_steps(
                        skip_type=skip_type, t_T=s.item(), t_0=t.item(), N=order, device=device
                    )
                    lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner)
                    h = lambda_inner[-1] - lambda_inner[0]
                    r1 = None if order <= 1 else (lambda_inner[1] - lambda_inner[0]) / h
                    r2 = None if order <= 2 else (lambda_inner[2] - lambda_inner[0]) / h
                    x = self.singlestep_dpm_solver_update(x, s, t, order, solver_type=solver_type, r1=r1, r2=r2)
                    if self.correcting_xt_fn is not None:
                        x = self.correcting_xt_fn(x, t, step)
                    if return_intermediate:
                        intermediates.append(x)
            else:
                raise ValueError(f"Got wrong method {method}")
            if denoise_to_zero:
                t = torch.ones((1,)).to(device) * t_0
                x = self.denoise_to_zero_fn(x, t)
                # `step` still holds the last index used by the loop of the chosen method.
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step + 1)
                if return_intermediate:
                    intermediates.append(x)
        return (x, intermediates) if return_intermediate else x


#############################################################
# other utility functions
#############################################################


def interpolate_fn(x, xp, yp):
    """
    Evaluate the piecewise linear function through the keypoints (xp, yp) at `x`.

    The computation only uses differentiable tensor operations, so it can be used with autograd.
    Queries outside the range of `xp` are extrapolated along the outermost segment on that side.

    Args:
        x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver).
        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
        yp: PyTorch tensor with shape [C, K].
    Returns:
        The function values f(x), with shape [N, C].
    """
    batch, num_keys = x.shape[0], xp.shape[1]
    dev = x.device

    # Put each query in front of its channel's keypoints and sort them together; the
    # position at which original index 0 ends up is the rank of the query among the keypoints.
    merged = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((batch, 1, 1))], dim=2)
    merged_sorted, sort_order = torch.sort(merged, dim=2)
    rank = torch.argmin(sort_order, dim=2)

    below_all = torch.eq(rank, 0)
    above_all = torch.eq(rank, num_keys)
    interior_left = rank - 1

    # Indices of the two bracketing x-values inside the sorted (query + keypoints) array.
    left_in_sorted = torch.where(
        below_all,
        torch.tensor(1, device=dev),
        torch.where(above_all, torch.tensor(num_keys - 2, device=dev), interior_left),
    )
    # Skip over the slot occupied by the query itself when it sits between the two keypoints.
    right_in_sorted = torch.where(torch.eq(left_in_sorted, interior_left), left_in_sorted + 2, left_in_sorted + 1)
    x_left = torch.gather(merged_sorted, dim=2, index=left_in_sorted.unsqueeze(2)).squeeze(2)
    x_right = torch.gather(merged_sorted, dim=2, index=right_in_sorted.unsqueeze(2)).squeeze(2)

    # Index of the left keypoint inside `yp` (which does not contain the query).
    left_in_keys = torch.where(
        below_all,
        torch.tensor(0, device=dev),
        torch.where(above_all, torch.tensor(num_keys - 2, device=dev), interior_left),
    )
    yp_batched = yp.unsqueeze(0).expand(batch, -1, -1)
    y_left = torch.gather(yp_batched, dim=2, index=left_in_keys.unsqueeze(2)).squeeze(2)
    y_right = torch.gather(yp_batched, dim=2, index=(left_in_keys + 1).unsqueeze(2)).squeeze(2)

    return y_left + (x - x_left) * (y_right - y_left) / (x_right - x_left)


def expand_dims(v, dims):
    """
    Append trailing singleton axes to `v` so that it has `dims` dimensions in total.

    Args:
        `v`: a PyTorch tensor with shape [N].
        `dims`: a `int`. The total number of dimensions of the result.
    Returns:
        a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
    """
    trailing_axes = (None,) * (dims - 1)
    index = (Ellipsis, *trailing_axes)
    return v[index]


def DPMS(
    model,
    condition,
    uncondition,
    cfg_scale,
    model_type="noise",
    noise_schedule="linear",
    guidance_type="classifier-free",
    model_kwargs=None,
    diffusion_steps=1000,
):
    """
    Build a `DPM_Solver` (algorithm type "dpmsolver++") around `model`.

    Args:
        model: The model handed to `model_wrapper`.
        condition: Passed to `model_wrapper` as `condition`.
        uncondition: Passed to `model_wrapper` as `unconditional_condition`.
        cfg_scale: Passed to `model_wrapper` as `guidance_scale`.
        model_type: Passed to `model_wrapper` as `model_type`.
        noise_schedule: Name of the beta schedule given to `get_named_beta_schedule`.
        guidance_type: Passed to `model_wrapper` as `guidance_type`.
        model_kwargs: Extra keyword arguments for the model; an empty dict when None.
        diffusion_steps: Number of diffusion steps given to `get_named_beta_schedule`.
    Returns:
        The constructed `DPM_Solver`.
    """
    wrapper_kwargs = {} if model_kwargs is None else model_kwargs

    # Step 1: a discrete VP noise schedule built from the named beta schedule.
    discrete_betas = torch.tensor(get_named_beta_schedule(noise_schedule, diffusion_steps))
    vp_schedule = NoiseScheduleVP(schedule="discrete", betas=discrete_betas)

    # Step 2: wrap the model together with its guidance configuration into the
    # function form that the solver takes.
    guided_model_fn = model_wrapper(
        model,
        vp_schedule,
        model_type=model_type,
        model_kwargs=wrapper_kwargs,
        guidance_type=guidance_type,
        condition=condition,
        unconditional_condition=uncondition,
        guidance_scale=cfg_scale,
    )

    # Step 3: the solver itself; sampling is started by the caller.
    solver = DPM_Solver(guided_model_fn, vp_schedule, algorithm_type="dpmsolver++")
    return solver


================================================
FILE: Open-Sora/opensora/schedulers/iddpm/__init__.py
================================================
from functools import partial

import torch

from opensora.registry import SCHEDULERS

from . import gaussian_diffusion as gd
from .respace import SpacedDiffusion, space_timesteps
from .speed import SpeeDiffusion


@SCHEDULERS.register_module("iddpm")
class IDDPM(SpacedDiffusion):
    """
    `SpacedDiffusion` scheduler registered under the name "iddpm".

    The constructor translates a handful of flags into the enum-based configuration of
    the base class; `sample` runs the sampling loop with classifier-free guidance.
    """

    def __init__(
        self,
        num_sampling_steps=None,
        timestep_respacing=None,
        noise_schedule="linear",
        use_kl=False,
        sigma_small=False,
        predict_xstart=False,
        learn_sigma=True,
        rescale_learned_sigmas=False,
        diffusion_steps=1000,
        cfg_scale=4.0,
        cfg_channel=None,
    ):
        """
        Args:
            num_sampling_steps: If given, used (as a string) for the timestep respacing;
                `timestep_respacing` must then be None.
            timestep_respacing: Respacing spec for `space_timesteps`; None or "" selects
                `[diffusion_steps]`.
            noise_schedule: Name of the beta schedule.
            use_kl: Select the RESCALED_KL loss (takes precedence over `rescale_learned_sigmas`).
            sigma_small: With `learn_sigma` False, choose FIXED_SMALL instead of FIXED_LARGE.
            predict_xstart: Select START_X instead of EPSILON as the model mean type.
            learn_sigma: Select LEARNED_RANGE as the model variance type.
            rescale_learned_sigmas: Select the RESCALED_MSE loss (when `use_kl` is False).
            diffusion_steps: Number of diffusion steps of the underlying schedule.
            cfg_scale: Guidance scale stored on the instance and used by `sample`.
            cfg_channel: Channel split stored on the instance and used by `sample`.
        """
        betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)

        loss_type = gd.LossType.MSE
        if use_kl:
            loss_type = gd.LossType.RESCALED_KL
        elif rescale_learned_sigmas:
            loss_type = gd.LossType.RESCALED_MSE

        respacing = timestep_respacing
        if num_sampling_steps is not None:
            assert respacing is None
            respacing = str(num_sampling_steps)
        if respacing is None or respacing == "":
            respacing = [diffusion_steps]

        mean_type = gd.ModelMeanType.START_X if predict_xstart else gd.ModelMeanType.EPSILON

        if learn_sigma:
            var_type = gd.ModelVarType.LEARNED_RANGE
        elif sigma_small:
            var_type = gd.ModelVarType.FIXED_SMALL
        else:
            var_type = gd.ModelVarType.FIXED_LARGE

        super().__init__(
            use_timesteps=space_timesteps(diffusion_steps, respacing),
            betas=betas,
            model_mean_type=mean_type,
            model_var_type=var_type,
            loss_type=loss_type,
        )

        self.cfg_scale = cfg_scale
        self.cfg_channel = cfg_channel

    def sample(
        self,
        model,
        text_encoder,
        z,
        prompts,
        device,
        additional_args=None,
        mask=None,
        progress=True,
    ):
        """
        Run `p_sample_loop` with classifier-free guidance and return the first half of the batch.

        The latent `z` is concatenated with itself along dim 0, and the encoded prompt
        entry "y" is concatenated with `text_encoder.null(len(prompts))`, so the model sees
        conditional inputs followed by null-conditioned ones.

        Args:
            model: The model passed on to `forward_with_cfg`.
            text_encoder: Object providing `encode(prompts)` and `null(n)`.
            z: Initial latent tensor.
            prompts: Sequence of prompts; its length sets the number of null embeddings.
            device: Forwarded to `p_sample_loop`.
            additional_args: Optional dict merged into the model keyword arguments.
            mask: Forwarded to `p_sample_loop`.
            progress: Forwarded to `p_sample_loop`.
        Returns:
            The first of the two dim-0 chunks of the sampling result.
        """
        num_prompts = len(prompts)
        doubled_latent = torch.cat([z, z], 0)

        model_kwargs = text_encoder.encode(prompts)
        null_embedding = text_encoder.null(num_prompts)
        model_kwargs["y"] = torch.cat([model_kwargs["y"], null_embedding], 0)
        if additional_args is not None:
            model_kwargs.update(additional_args)

        guided_forward = partial(forward_with_cfg, model, cfg_scale=self.cfg_scale, cfg_channel=self.cfg_channel)
        result = self.p_sample_loop(
            guided_forward,
            doubled_latent.shape,
            doubled_latent,
            clip_denoised=False,
            model_kwargs=model_kwargs,
            progress=progress,
            device=device,
            mask=mask,
        )
        first_half, _ = result.chunk(2, dim=0)
        return first_half


def forward_with_cfg(model, x, timestep, y, cfg_scale, cfg_channel=None, **kwargs):
    """
    Run `model.forward` on a doubled batch and blend the two halves of its output.

    The first half of `x` is duplicated to form the model input. The first `cfg_channel`
    output channels are combined as `second + cfg_scale * (first - second)`, where `first`
    and `second` are the two batch halves, and the blend is written to both halves; the
    remaining channels are returned untouched.

    Adapted from https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb

    Args:
        model: Object whose `forward(x, timestep, y, **kwargs)` returns a tensor or a dict
            holding the tensor under the key "x".
        x: Input batch; only its first half is used.
        timestep: Passed through to the model.
        y: Passed through to the model.
        cfg_scale: Weight of the difference between the two halves.
        cfg_channel: Number of leading channels to blend; half of the output channels when None.
        **kwargs: Extra model arguments. An `x_mask` whose length differs from `len(x)` is
            concatenated with itself before the call.
    Returns:
        Tensor with the same shape as the model output.
    """
    batch = len(x)
    first_half = x[: batch // 2]
    doubled = torch.cat([first_half, first_half], dim=0)

    x_mask = kwargs.get("x_mask")
    if x_mask is not None and len(x_mask) != batch:
        kwargs["x_mask"] = torch.cat([x_mask, x_mask], dim=0)

    output = model.forward(doubled, timestep, y, **kwargs)
    if isinstance(output, dict):
        output = output["x"]

    split_at = output.shape[1] // 2 if cfg_channel is None else cfg_channel
    blended_part = output[:, :split_at]
    passthrough = output[:, split_at:]

    cond_eps, uncond_eps = torch.split(blended_part, len(blended_part) // 2, dim=0)
    guided = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
    guided_full = torch.cat([guided, guided], dim=0)
    return torch.cat([guided_full, passthrough], dim=1)


================================================
FILE: Open-Sora/opensora/schedulers/iddpm/diffusion_utils.py
================================================
# Adapted from DiT

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DiT:   https://github.com/facebookresearch/DiT/tree/main
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
# ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
# --------------------------------------------------------


import numpy as np
import torch


def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    Return the element-wise KL divergence KL(N(mean1, exp(logvar1)) || N(mean2, exp(logvar2))).

    Arguments broadcast against each other, so batches can be compared with scalars.
    At least one of the four arguments has to be a Tensor.
    """
    candidates = (mean1, logvar1, mean2, logvar2)
    reference = next((arg for arg in candidates if isinstance(arg, torch.Tensor)), None)
    assert reference is not None, "at least one argument must be a Tensor"

    # torch.exp() needs Tensor inputs, so scalar log-variances are converted explicitly
    # to the dtype and device of the first Tensor argument.
    if not isinstance(logvar1, torch.Tensor):
        logvar1 = torch.tensor(logvar1).to(reference)
    if not isinstance(logvar2, torch.Tensor):
        logvar2 = torch.tensor(logvar2).to(reference)

    variance_ratio = torch.exp(logvar1 - logvar2)
    scaled_sq_distance = ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
    return 0.5 * (-1.0 + logvar2 - logvar1 + variance_ratio + scaled_sq_distance)


def approx_standard_normal_cdf(x):
    """
    Approximate the standard normal CDF at `x` with a tanh-based closed form:
    0.5 * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))).
    """
    scale = np.sqrt(2.0 / torch.pi)
    cubic_term = x + 0.044715 * torch.pow(x, 3)
    return 0.5 * (1.0 + torch.tanh(scale * cubic_term))


def continuous_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Evaluate the standard normal log-density at the standardized targets.

    The targets are standardized as (x - means) * exp(-log_scales) and the result is the
    log-probability of those values under N(0, 1); no `log_scales` term is subtracted.

    :param x: the targets
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    standardized = (x - means) * torch.exp(-log_scales)
    unit_normal = torch.distributions.Normal(torch.zeros_like(x), torch.ones_like(x))
    return unit_normal.log_prob(standardized)


def discretized_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Log-likelihood of a Gaussian discretized into bins of half-width 1/255 around `x`.

    Targets below -0.999 use the CDF mass up to the upper bin edge, targets above 0.999
    use the mass beyond the lower bin edge, and all other targets use the mass between
    the two edges. Every probability is clamped to at least 1e-12 before the logarithm.

    :param x: the target images. It is assumed that this was uint8 values,
              rescaled to the range [-1, 1].
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    assert x.shape == means.shape == log_scales.shape
    residual = x - means
    inv_std = torch.exp(-log_scales)

    upper_cdf = approx_standard_normal_cdf(inv_std * (residual + 1.0 / 255.0))
    lower_cdf = approx_standard_normal_cdf(inv_std * (residual - 1.0 / 255.0))

    log_left_tail = torch.log(upper_cdf.clamp(min=1e-12))
    log_right_tail = torch.log((1.0 - lower_cdf).clamp(min=1e-12))
    log_bin_mass = torch.log((upper_cdf - lower_cdf).clamp(min=1e-12))

    interior_or_right = torch.where(x > 0.999, log_right_tail, log_bin_mass)
    result = torch.where(x < -0.999, log_left_tail, interior_or_right)
    assert result.shape == x.shape
    return result


================================================
FILE: Open-Sora/opensora/schedulers/iddpm/gaussian_diffusion.py
================================================
# Adapted from DiT

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DiT:   https://github.com/facebookresearch/DiT/tree/main
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
# ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
# --------------------------------------------------------

import enum
from typing import Callable, List

import numpy as np
import torch
from einops import rearrange

from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl


def mean_flat(tensor: torch.Tensor, mask=None):
    """
    Average `tensor` over every dimension except the first.

    With a `mask`, the tensor must be 5-D and is treated as "b c t h w": the values are
    weighted by `mask` along the `t` axis and the sum is divided by the mask total times
    the number of (c, h, w) elements.
    """
    if mask is not None:
        assert tensor.dim() == 5
        assert tensor.shape[2] == mask.shape[1]
        per_frame = rearrange(tensor, "b c t h w -> b t (c h w)")
        normalizer = mask.sum(dim=1) * per_frame.shape[-1]
        weighted_total = (per_frame * mask.unsqueeze(2)).sum(dim=1).sum(dim=1)
        return weighted_total / normalizer

    non_batch_dims = list(range(1, len(tensor.shape)))
    return tensor.mean(dim=non_batch_dims)


class ModelMeanType(enum.Enum):
    """
    The quantity that the model output represents.

    PREVIOUS_X: the model predicts x_{t-1}.
    START_X: the model predicts x_0.
    EPSILON: the model predicts epsilon.
    """

    # Values are the ones `enum.auto()` assigns in declaration order.
    PREVIOUS_X = 1
    START_X = 2
    EPSILON = 3


class ModelVarType(enum.Enum):
    """
    How the variance of the model's output distribution is obtained.
    LEARNED_RANGE lets the model predict a value between FIXED_SMALL and
    FIXED_LARGE, which makes its job easier.
    """

    # Values are spelled out; they match what enum.auto() assigns (1..4).
    LEARNED = 1
    FIXED_SMALL = 2
    FIXED_LARGE = 3
    LEARNED_RANGE = 4


class LossType(enum.Enum):
    """
    Which training objective is used.
    """

    # Values are spelled out; they match what enum.auto() assigns (1..4).
    MSE = 1  # use raw MSE loss (and KL when learning variances)
    RESCALED_MSE = 2  # use raw MSE loss (with RESCALED_KL when learning variances)
    KL = 3  # use the variational lower-bound
    RESCALED_KL = 4  # like KL, but rescale to estimate the full VLB

    def is_vb(self):
        """Return True for the two variational-bound loss types."""
        return self in (LossType.KL, LossType.RESCALED_KL)


def _warmup_beta(beta_start: float, beta_end: float, num_diffusion_timesteps: int, warmup_frac: float) -> torch.Tensor:
    betas = beta_end * torch.ones(num_diffusion_timesteps, dtype=torch.float64)
    warmup_time = int(num_diffusion_timesteps * warmup_frac)
    betas[:warmup_time] = torch.linspace(beta_start, beta_end, warmup_time, dtype=torch.float64)
    return betas


def get_beta_schedule(
    beta_schedule: str, *, beta_start: float, beta_end: float, num_diffusion_timesteps: int
) -> torch.Tensor:
    """
    This is the deprecated API for creating beta schedules.
    See get_named_beta_schedule() for the new library of schedules.

    :param beta_schedule: one of "quad", "linear", "warmup10", "warmup50",
                          "const" or "jsd".
    :param beta_start: first beta of the schedule (unused by "const"/"jsd").
    :param beta_end: last beta of the schedule (unused by "jsd").
    :param num_diffusion_timesteps: number of betas to produce.
    :return: a float64 tensor of shape (num_diffusion_timesteps,).
    :raises NotImplementedError: for any other schedule name.
    """
    steps = num_diffusion_timesteps

    def ramp(first, last):
        # Evenly spaced float64 values from ``first`` to ``last`` inclusive.
        return torch.linspace(first, last, steps, dtype=torch.float64)

    if beta_schedule == "linear":
        betas = ramp(beta_start, beta_end)
    elif beta_schedule == "quad":
        # Linear in sqrt(beta), then squared.
        betas = ramp(beta_start**0.5, beta_end**0.5) ** 2
    elif beta_schedule in ("warmup10", "warmup50"):
        warmup_frac = 0.1 if beta_schedule == "warmup10" else 0.5
        betas = _warmup_beta(beta_start, beta_end, steps, warmup_frac)
    elif beta_schedule == "const":
        betas = torch.full((steps,), beta_end, dtype=torch.float64)
    elif beta_schedule == "jsd":  # 1/T, 1/(T-1), 1/(T-2), ..., 1
        betas = 1.0 / ramp(steps, 1)
    else:
        raise NotImplementedError(beta_schedule)

    assert betas.shape == (steps,)
    return betas


def betas_for_alpha_bar(num_diffusion_timesteps: int, alpha_bar: Callable, max_beta: float = 0.999):
    """
    Discretize a continuous alpha_bar(t) function into a beta schedule.
    alpha_bar defines the cumulative product of (1 - beta) over t in [0, 1].
    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a callable that takes t from 0 to 1 and returns the
                      cumulative product of (1 - beta) up to that part of
                      the diffusion process.
    :param max_beta: upper bound applied to every beta; values lower than 1
                     prevent singularities.
    :return: a torch.DoubleTensor holding one beta per timestep.
    """
    steps = num_diffusion_timesteps
    # beta_k = 1 - alpha_bar(t_{k+1}) / alpha_bar(t_k), clipped at max_beta.
    clipped = [
        min(1 - alpha_bar((k + 1) / steps) / alpha_bar(k / steps), max_beta)
        for k in range(steps)
    ]
    return torch.DoubleTensor(clipped)


def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Get a pre-defined beta schedule for the given name.
    The beta schedule library consists of beta schedules which remain similar
    in the limit of num_diffusion_timesteps.
    Beta schedules may be added, but should not be removed or changed once
    they are committed to maintain backwards compatibility.

    :param schedule_name: "linear" or "squaredcos_cap_v2".
    :param num_diffusion_timesteps: the number of betas to produce.
    :return: the tensor of betas built by get_beta_schedule() or
             betas_for_alpha_bar().
    :raises NotImplementedError: if the schedule name is not recognised.
    """
    if schedule_name == "linear":
        # Linear schedule from Ho et al, extended to work for any number of
        # diffusion steps.
        scale = 1000 / num_diffusion_timesteps
        return get_beta_schedule(
            "linear",
            beta_start=scale * 0.0001,
            beta_end=scale * 0.02,
            num_diffusion_timesteps=num_diffusion_timesteps,
        )
    elif schedule_name == "squaredcos_cap_v2":
        # The previous code referenced an undefined name ``matorch`` here and
        # raised NameError. alpha_bar receives plain Python floats, so the
        # standard ``math`` module is the right tool. It is imported locally
        # because the module does not import it at file level.
        import math

        return betas_for_alpha_bar(
            num_diffusion_timesteps,
            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
        )
    else:
        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")


class GaussianDiffusion:
    """
    Utilities for training and sampling diffusion models.
    Original ported from this codebase:
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
    :param betas: a 1-D torch.Tensor of betas for each diffusion timestep,
                  starting at T and going to 1.
    """

    def __init__(
        self,
        *,
        betas: torch.Tensor,
        model_mean_type: str,
        model_var_type: str,
        loss_type: str,
        device: str = "cuda",
    ):
        if device == "cuda":
            device = torch.device(f"cuda:{torch.cuda.current_device()}")
        elif device == "cpu":
            device = torch.device("cpu")
        else:
            raise ValueError(f"Unknown device: {device}")
        self.device = device
        self.model_mean_type = model_mean_type
        self.model_var_type = model_var_type
        self.loss_type = loss_type

        # Use float64 for accuracy.
        self.betas = betas.to(self.device)
        assert len(self.betas.shape) == 1, "betas must be 1-D"
        assert (self.betas > 0).all() and (self.betas <= 1).all()

        self.num_timesteps = int(betas.shape[0])

        alphas = 1.0 - self.betas
        self.alphas_cumprod = torch.cumprod(alphas, axis=0)
        self.alphas_cumprod_prev = torch.cat([torch.tensor([1.0], device=self.device), self.alphas_cumprod[:-1]])
        self.alphas_cumprod_next = torch.cat([self.alphas_cumprod[1:], torch.tensor([0.0], device=self.device)])
        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - self.alphas_cumprod)
        self.log_one_minus_alphas_cumprod = torch.log(1.0 - self.alphas_cumprod)
        self.sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod)
        self.sqrt_recipm1_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod - 1)

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        self.posterior_variance = self.betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
        self.posterior_log_variance_clipped = (
            torch.log(torch.cat([self.posterior_variance[1].unsqueeze(0), self.posterior_variance[1:]]))
            if len(self.posterior_variance) > 1
            else torch.DoubleTensor([])
        )

        self.posterior_mean_coef1 = self.betas * torch.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        self.posterior_mean_coef2 = (1.0 - self.alphas_cumprod_prev) * torch.sqrt(alphas) / (1.0 - self.alphas_cumprod)

    def q_mean_variance(self, x_start, t):
        """
        Return the parameters of the forward distribution q(x_t | x_0).
        :param x_start: the [N x C x ...] tensor of noiseless inputs.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
        """
        target_shape = x_start.shape
        signal_scale = _extract_into_tensor(self.sqrt_alphas_cumprod, t, target_shape)
        mean = signal_scale * x_start
        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, target_shape)
        log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, target_shape)
        return mean, variance, log_variance

    def q_sample(self, x_start, t, noise=None):
        """
        Draw x_t from q(x_t | x_0), i.e. diffuse the data for t + 1 steps.
        :param x_start: the initial data batch.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :param noise: if specified, the normal noise to mix in; otherwise it
                      is drawn with torch.randn_like.
        :return: A noisy version of x_start.
        """
        if noise is None:
            noise = torch.randn_like(x_start)
        assert noise.shape == x_start.shape

        target_shape = x_start.shape
        signal_scale = _extract_into_tensor(self.sqrt_alphas_cumprod, t, target_shape)
        noise_scale = _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, target_shape)
        return signal_scale * x_start + noise_scale * noise

    def q_posterior_mean_variance(self, x_start, x_t, t):
        """
        Compute the mean and variance of the diffusion posterior:
            q(x_{t-1} | x_t, x_0)
        :return: a tuple (posterior_mean, posterior_variance,
                 posterior_log_variance_clipped).
        """
        assert x_start.shape == x_t.shape
        target_shape = x_t.shape

        coef_x0 = _extract_into_tensor(self.posterior_mean_coef1, t, target_shape)
        coef_xt = _extract_into_tensor(self.posterior_mean_coef2, t, target_shape)
        posterior_mean = coef_x0 * x_start + coef_xt * x_t

        posterior_variance = _extract_into_tensor(self.posterior_variance, t, target_shape)
        posterior_log_variance_clipped = _extract_into_tensor(
            self.posterior_log_variance_clipped, t, target_shape
        )

        # All three outputs must share the batch size of the input.
        batch = x_start.shape[0]
        assert (
            posterior_mean.shape[0] == batch
            and posterior_variance.shape[0] == batch
            and posterior_log_variance_clipped.shape[0] == batch
        )
        return posterior_mean, posterior_variance, posterior_log_variance_clipped

    def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
        """
        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
        the initial x, x_0.
        :param model: the model, which takes a signal and a batch of timesteps
                      as input. It may return a tensor or a (tensor, extra)
                      tuple.
        :param x: the [N x C x ...] tensor at time t.
        :param t: a 1-D Tensor of timesteps.
        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample. Applies before
            clip_denoised.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict with the following keys:
                 - 'mean': the model mean output.
                 - 'variance': the model variance output.
                 - 'log_variance': the log of 'variance'.
                 - 'pred_xstart': the prediction for x_0.
                 - 'extra': the second element of the model's tuple output,
                   or None.
        :raises NotImplementedError: if model_var_type is not a ModelVarType.
        """
        if model_kwargs is None:
            model_kwargs = {}

        B, C = x.shape[:2]
        assert t.shape == (B,)
        model_output = model(x, t, **model_kwargs)
        if isinstance(model_output, tuple):
            model_output, extra = model_output
        else:
            extra = None

        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
            assert model_output.shape == (B, C * 2, *x.shape[2:])
            model_output, model_var_values = torch.split(model_output, C, dim=1)
            min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
            max_log = _extract_into_tensor(torch.log(self.betas), t, x.shape)
            # The model_var_values is [-1, 1] for [min_var, max_var].
            frac = (model_var_values + 1) / 2
            model_log_variance = frac * max_log + (1 - frac) * min_log
            model_variance = torch.exp(model_log_variance)
        else:
            # Only the selected table is built. torch.cat needs a sequence of
            # tensors; passing the two tensors as separate positional
            # arguments raises TypeError.
            if self.model_var_type == ModelVarType.FIXED_LARGE:
                # for fixedlarge, we set the initial (log-)variance like so
                # to get a better decoder log likelihood.
                model_variance = torch.cat([self.posterior_variance[1].unsqueeze(0), self.betas[1:]])
                model_log_variance = torch.log(model_variance)
            elif self.model_var_type == ModelVarType.FIXED_SMALL:
                model_variance = self.posterior_variance
                model_log_variance = self.posterior_log_variance_clipped
            else:
                raise NotImplementedError(self.model_var_type)
            model_variance = _extract_into_tensor(model_variance, t, x.shape)
            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)

        def process_xstart(x):
            if denoised_fn is not None:
                x = denoised_fn(x)
            if clip_denoised:
                return x.clamp(-1, 1)
            return x

        if self.model_mean_type == ModelMeanType.START_X:
            pred_xstart = process_xstart(model_output)
        else:
            # Any other mean type is treated as an epsilon prediction.
            pred_xstart = process_xstart(self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output))
        model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)

        assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
        return {
            "mean": model_mean,
            "variance": model_variance,
            "log_variance": model_log_variance,
            "pred_xstart": pred_xstart,
            "extra": extra,
        }

    def _predict_xstart_from_eps(self, x_t, t, eps):
        """
        Recover the x_0 estimate implied by a noise prediction ``eps`` at
        timestep ``t``: sqrt(1/alpha_bar) * x_t - sqrt(1/alpha_bar - 1) * eps.
        """
        assert x_t.shape == eps.shape
        recip = _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape)
        recip_m1 = _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
        return recip * x_t - recip_m1 * eps

    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
        """
        Recover the noise estimate implied by an x_0 prediction at timestep
        ``t``; the inverse of _predict_xstart_from_eps().
        """
        recip = _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape)
        recip_m1 = _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
        return (recip * x_t - pred_xstart) / recip_m1

    def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute the mean for the previous step, given a function cond_fn that
        computes the gradient of a conditional log probability with respect to
        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
        condition on y.
        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
        """
        gradient = cond_fn(x, t, **model_kwargs)
        new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
        return new_mean

    def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute what the p_mean_variance output would have been, should the
        model's score function be conditioned by cond_fn.
        See condition_mean() for details on cond_fn.
        Unlike condition_mean(), this instead uses the conditioning strategy
        from Song et al (2020).
        :param model_kwargs: extra keyword arguments for cond_fn; None means
                             no extra arguments.
        :return: a shallow copy of p_mean_var with 'pred_xstart' and 'mean'
                 replaced by their conditioned values.
        """
        # The default of None cannot be unpacked with **; treat it as empty.
        if model_kwargs is None:
            model_kwargs = {}

        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)

        eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
        eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)

        out = p_mean_var.copy()
        out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
        out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
        return out

    def p_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        mask=None,
    ):
        """
        Sample x_{t-1} from the model at the given timestep.
        :param model: the model to sample from.
        :param x: the current tensor x_t.
        :param t: the value of t, starting at 0 for the first diffusion step.
        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning. The dict
            passed in is never modified.
        :param mask: if not None, a 2-D tensor that is scaled by the number
            of timesteps to give a per-position timestep threshold along
            dimension 2 of ``x``. Positions whose threshold is below ``t``
            keep their input value, positions whose threshold equals ``t``
            are noised before the model call, and positions whose threshold
            is above ``t`` are flagged in the ``x_mask`` model argument.
        :return: a dict containing the following keys:
                 - 'sample': a random sample from the model.
                 - 'pred_xstart': a prediction of x_0.
        """
        if model_kwargs is None:
            model_kwargs = {}

        if mask is not None:
            if mask.shape[0] != x.shape[0]:
                # HACK: presumably x holds a doubled batch (e.g. for
                # classifier-free guidance) - TODO confirm against callers.
                mask = mask.repeat(2, 1)
            mask_t = (mask * len(self.betas)).to(torch.int)

            # x0: copy unchanged x values
            # x_noise: add noise to x values
            x0 = x.clone()
            x_noise = x0 * _extract_into_tensor(self.sqrt_alphas_cumprod, t, x.shape) + torch.randn_like(
                x
            ) * _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x.shape)

            # active noise addition
            # WARNING: this is a hacky implementation
            mask_t_equall = (mask_t == t.unsqueeze(1))[:, None, :, None, None]
            x = torch.where(mask_t_equall, x_noise, x0)

            # create x_mask
            mask_t_upper = (mask_t > t.unsqueeze(1))[:, None, :, None, None]
            batch_size = x.shape[0]
            # Build a new dict so the caller's model_kwargs is left untouched.
            model_kwargs = {
                **model_kwargs,
                "x_mask": mask_t_upper.reshape(batch_size, -1).to(torch.bool),
            }

        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        noise = torch.randn_like(x)
        nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))  # no noise when t == 0
        if cond_fn is not None:
            out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
        sample = out["mean"] + nonzero_mask * torch.exp(0.5 * out["log_variance"]) * noise

        if mask is not None:
            # Positions whose threshold is below t are restored to their input value.
            mask_t_lower = (mask_t < t.unsqueeze(1))[:, None, :, None, None]
            sample = torch.where(mask_t_lower, x0, sample)

        return {"sample": sample, "pred_xstart": out["pred_xstart"]}

    def p_sample_loop(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        mask=None,
    ):
        """
        Run the full reverse process and return the final sample.
        :param model: the model module.
        :param shape: the shape of the samples, (N, C, H, W).
        :param noise: if specified, the noise from the encoder to sample.
                      Should be of the same shape as `shape`.
        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param device: if specified, the device to create the samples on.
                       If not specified, use a model parameter's device.
        :param progress: if True, show a tqdm progress bar.
        :param mask: forwarded unchanged to p_sample_loop_progressive().
        :return: a non-differentiable batch of samples.
        """
        last_step = None
        stepper = self.p_sample_loop_progressive(
            model,
            shape,
            noise=noise,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            cond_fn=cond_fn,
            model_kwargs=model_kwargs,
            device=device,
            progress=progress,
            mask=mask,
        )
        # Drain the generator; only the output of the last step is kept.
        for last_step in stepper:
            pass
        return last_step["sample"]

    def p_sample_loop_progressive(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        mask=None,
    ):
        """
        Generate samples from the model and yield intermediate samples from
        each timestep of diffusion, from the last timestep down to 0.
        Arguments are the same as p_sample_loop().
        Returns a generator over dicts, where each dict is the return value of
        p_sample().
        """
        if device is None:
            device = next(model.parameters()).device
        assert isinstance(shape, (tuple, list))

        # Start from the supplied noise, or from fresh Gaussian noise.
        img = noise if noise is not None else torch.randn(*shape, device=device)

        indices = list(reversed(range(self.num_timesteps)))
        if progress:
            # Lazy import so that we don't depend on tqdm.
            from tqdm.auto import tqdm

            indices = tqdm(indices)

        batch = shape[0]
        for step in indices:
            t = torch.tensor([step] * batch, device=device)
            with torch.no_grad():
                out = self.p_sample(
                    model,
                    img,
                    t,
                    clip_denoised=clip_denoised,
                    denoised_fn=denoised_fn,
                    cond_fn=cond_fn,
                    model_kwargs=model_kwargs,
                    mask=mask,
                )
                yield out
                img = out["sample"]

    def ddim_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        eta=0.0,
    ):
        """
        Sample x_{t-1} from the model using DDIM.
        Same usage as p_sample(); ``eta`` scales the noise added at each step.
        """
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        if cond_fn is not None:
            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
        pred_xstart = out["pred_xstart"]

        # Usually our model outputs epsilon, but we re-derive it
        # in case we used x_start or x_prev prediction.
        eps = self._predict_eps_from_xstart(x, t, pred_xstart)

        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
        sigma = eta * torch.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * torch.sqrt(1 - alpha_bar / alpha_bar_prev)

        # Equation 12.
        noise = torch.randn_like(x)
        mean_pred = pred_xstart * torch.sqrt(alpha_bar_prev) + torch.sqrt(1 - alpha_bar_prev - sigma**2) * eps

        # no noise when t == 0
        trailing_ones = [1] * (len(x.shape) - 1)
        nonzero_mask = (t != 0).float().view(-1, *trailing_ones)
        sample = mean_pred + nonzero_mask * sigma * noise
        return {"sample": sample, "pred_xstart": pred_xstart}

    def ddim_reverse_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        eta=0.0,
    ):
        """
        Sample x_{t+1} from the model using DDIM reverse ODE.
        Only the deterministic path (eta == 0.0) is supported.
        """
        assert eta == 0.0, "Reverse ODE only for deterministic path"
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        if cond_fn is not None:
            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
        pred_xstart = out["pred_xstart"]

        # Usually our model outputs epsilon, but we re-derive it
        # in case we used x_start or x_prev prediction.
        eps = self._predict_eps_from_xstart(x, t, pred_xstart)
        alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)

        # Equation 12. reversed
        mean_pred = pred_xstart * torch.sqrt(alpha_bar_next) + torch.sqrt(1 - alpha_bar_next) * eps

        return {"sample": mean_pred, "pred_xstart": pred_xstart}

    def ddim_sample_loop(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        eta=0.0,
    ):
        """
        Run the full DDIM reverse process and return the final sample.
        Same usage as p_sample_loop().
        """
        last_step = None
        stepper = self.ddim_sample_loop_progressive(
            model,
            shape,
            noise=noise,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            cond_fn=cond_fn,
            model_kwargs=model_kwargs,
            device=device,
            progress=progress,
            eta=eta,
        )
        # Drain the generator; only the output of the last step is kept.
        for last_step in stepper:
            pass
        return last_step["sample"]

    def ddim_sample_loop_progressive(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        eta=0.0,
    ):
        """
        Use DDIM to sample from the model and yield intermediate samples from
        each timestep of DDIM, from the last timestep down to 0.
        Same usage as p_sample_loop_progressive().
        """
        if device is None:
            device = next(model.parameters()).device
        assert isinstance(shape, (tuple, list))

        # Start from the supplied noise, or from fresh Gaussian noise.
        img = noise if noise is not None else torch.randn(*shape, device=device)

        indices = list(reversed(range(self.num_timesteps)))
        if progress:
            # Lazy import so that we don't depend on tqdm.
            from tqdm.auto import tqdm

            indices = tqdm(indices)

        batch = shape[0]
        for step in indices:
            t = torch.tensor([step] * batch, device=device)
            with torch.no_grad():
                out = self.ddim_sample(
                    model,
                    img,
                    t,
                    clip_denoised=clip_denoised,
                    denoised_fn=denoised_fn,
                    cond_fn=cond_fn,
                    model_kwargs=model_kwargs,
                    eta=eta,
                )
                yield out
                img = out["sample"]

    def _vb_terms_bpd(self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None, mask=None):
        """
        Get a term for the variational lower-bound.
        The resulting units are bits (rather than nats, as one might expect).
        This allows for comparison to other papers.
        :param mask: forwarded to mean_flat() when reducing both terms.
        :return: a dict with the following keys:
                 - 'output': a shape [N] tensor of NLLs or KLs.
                 - 'pred_xstart': the x_0 predictions.
        """
        nats_per_bit = np.log(2.0)

        true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)
        out = self.p_mean_variance(model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs)
        model_mean = out["mean"]
        model_log_variance = out["log_variance"]

        kl = normal_kl(true_mean, true_log_variance_clipped, model_mean, model_log_variance)
        kl = mean_flat(kl, mask=mask) / nats_per_bit

        decoder_nll = -discretized_gaussian_log_likelihood(
            x_start, means=model_mean, log_scales=0.5 * model_log_variance
        )
        assert decoder_nll.shape == x_start.shape
        decoder_nll = mean_flat(decoder_nll, mask=mask) / nats_per_bit

        # At the first timestep return the decoder NLL,
        # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
        at_first_step = t == 0
        output = torch.where(at_first_step, decoder_nll, kl)
        return {"output": output, "pred_xstart": out["pred_xstart"]}

    def training_losses(self, model, x_start, model_kwargs=None, noise=None, mask=None, weights=None, t=None):
        """
        Compute training losses for a single timestep.
        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param noise: if specified, the specific Gaussian noise to try to remove.
        :param mask: if not None, a boolean tensor indexed as
            mask[:, None, :, None, None]; positions where it is False receive
            x_start diffused to timestep 0 instead of timestep t. It is also
            passed to mean_flat(). Not supported for the KL loss types.
        :param weights: if not None, a per-timestep tensor that multiplies
            the squared error before it is averaged.
        :param t: if not None, the batch of timesteps to train on; otherwise
            timesteps are drawn uniformly from [0, num_timesteps).
        :return: a dict with the key "loss" containing a tensor of shape [N].
                 Some mean or variance settings may also have other keys.
        :raises NotImplementedError: if loss_type is not a known LossType.
        """
        # Sample timesteps only when the caller did not supply them; the
        # argument used to be overwritten unconditionally.
        if t is None:
            t = torch.randint(0, self.num_timesteps, (x_start.shape[0],), device=x_start.device)

        if model_kwargs is None:
            model_kwargs = {}
        if noise is None:
            noise = torch.randn_like(x_start)
        x_t = self.q_sample(x_start, t, noise=noise)
        if mask is not None:
            t0 = torch.zeros_like(t)
            x_t0 = self.q_sample(x_start, t0, noise=noise)
            x_t = torch.where(mask[:, None, :, None, None], x_t, x_t0)

        terms = {}

        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
            assert mask is None, "mask not supported for KL loss"
            terms["loss"] = self._vb_terms_bpd(
                model=model,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
                model_kwargs=model_kwargs,
            )["output"]
            if self.loss_type == LossType.RESCALED_KL:
                terms["loss"] *= self.num_timesteps
        elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
            model_output = model(x_t, t, **model_kwargs)

            if self.model_var_type in [
                ModelVarType.LEARNED,
                ModelVarType.LEARNED_RANGE,
            ]:
                B, C = x_t.shape[:2]
                assert model_output.shape == (B, C * 2, *x_t.shape[2:])
                model_output, model_var_values = torch.split(model_output, C, dim=1)
                # Learn the variance using the variational bound, but don't let
                # it affect our mean prediction.
                frozen_out = torch.cat([model_output.detach(), model_var_values], dim=1)
                terms["vb"] = self._vb_terms_bpd(
                    # Stand-in "model" that returns the already computed output.
                    model=lambda *args, r=frozen_out: r,
                    x_start=x_start,
                    x_t=x_t,
                    t=t,
                    clip_denoised=False,
                    mask=mask,
                )["output"]
                if self.loss_type == LossType.RESCALED_MSE:
                    # Divide by 1000 for equivalence with initial implementation.
                    # Without a factor of 1/1000, the VB term hurts the MSE term.
                    terms["vb"] *= self.num_timesteps / 1000.0

            target = {
                ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0],
                ModelMeanType.START_X: x_start,
                ModelMeanType.EPSILON: noise,
            }[self.model_mean_type]
            assert model_output.shape == target.shape == x_start.shape
            if weights is None:
                terms["mse"] = mean_flat((target - model_output) ** 2, mask=mask)
            else:
                weight = _extract_into_tensor(weights, t, target.shape)
                terms["mse"] = mean_flat(weight * (target - model_output) ** 2, mask=mask)
            if "vb" in terms:
                terms["loss"] = terms["mse"] + terms["vb"]
            else:
                terms["loss"] = terms["mse"]
        else:
            raise NotImplementedError(self.loss_type)

        return terms

    def _prior_bpd(self, x_start):
        """
        Get the prior KL term for the variational lower-bound, measured in
        bits-per-dim.
        This term can't be optimized, as it only depends on the encoder.
        :param x_start: the [N x C x ...] tensor of inputs.
        :return: a batch of [N] KL values (in bits), one per batch element.
        """
        n = x_start.shape[0]
        # Every batch element is evaluated at the last timestep.
        last_t = torch.tensor([self.num_timesteps - 1] * n, device=x_start.device)
        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, last_t)
        # KL against a standard normal (zero mean, zero log-variance).
        kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0)
        return mean_flat(kl_prior) / np.log(2.0)

    def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
        """
        Compute the entire variational lower-bound, measured in bits-per-dim,
        as well as other related quantities.
        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param clip_denoised: if True, clip denoised samples.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict containing the following keys:
                 - total_bpd: the total variational lower-bound, per batch element.
                 - prior_bpd: the prior term in the lower-bound.
                 - vb: an [N x T] tensor of terms in the lower-bound.
                 - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
                 - mse: an [N x T] tensor of epsilon MSEs for each timestep.
        """
        device = x_start.device
        batch_size = x_start.shape[0]

        vb_terms, x0_errors, eps_errors = [], [], []
        # Walk the chain from the last timestep down to 0.
        for step in reversed(range(self.num_timesteps)):
            t_batch = torch.tensor([step] * batch_size, device=device)
            noise = torch.randn_like(x_start)
            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
            # Calculate VLB term at the current timestep
            with torch.no_grad():
                out = self._vb_terms_bpd(
                    model,
                    x_start=x_start,
                    x_t=x_t,
                    t=t_batch,
                    clip_denoised=clip_denoised,
                    model_kwargs=model_kwargs,
                )
            pred_xstart = out["pred_xstart"]
            vb_terms.append(out["output"])
            x0_errors.append(mean_flat((pred_xstart - x_start) ** 2))
            eps = self._predict_eps_from_xstart(x_t, t_batch, pred_xstart)
            eps_errors.append(mean_flat((eps - noise) ** 2))

        vb = torch.stack(vb_terms, dim=1)
        xstart_mse = torch.stack(x0_errors, dim=1)
        mse = torch.stack(eps_errors, dim=1)

        prior_bpd = self._prior_bpd(x_start)
        total_bpd = vb.sum(dim=1) + prior_bpd
        return {
            "total_bpd": total_bpd,
            "prior_bpd": prior_bpd,
            "vb": vb,
            "xstart_mse": xstart_mse,
            "mse": mse,
        }


def _extract_into_tensor(arr: torch.Tensor, timesteps: torch.Tensor, broadcast_shape: List[int]):
    """
    Extract values from a 1-D numpy array for a batch of indices.
    :param arr: the 1-D numpy array.
    :param timesteps: a tensor of indices into the array to extract.
    :param broadcast_shape: a larger shape of K dimensions with the batch
                            dimension equal to the length of timesteps.
    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
    """
    res = arr.to(timesteps.device)[timesteps].float()
    while len(res.shape) < len(broadcast_shape):
        res = res[..., None]
    return res + torch.zeros(broadcast_shape, device=timesteps.device)


================================================
FILE: Open-Sora/opensora/schedulers/iddpm/respace.py
================================================
# Adapted from DiT

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DiT:   https://github.com/facebookresearch/DiT/tree/main
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
# ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
# --------------------------------------------------------


import torch
from colossalai.utils import get_current_device

from .gaussian_diffusion import GaussianDiffusion


def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.
    For example, if there's 300 timesteps and the section counts are [10,15,20]
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.
    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.
    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    :raises ValueError: if no integer stride yields exactly N steps in "ddimN"
                        mode, or if a section is asked for more steps than it
                        contains.
    """
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim") :])
            # Search for an integer stride that produces exactly the requested count.
            for stride in range(1, num_timesteps):
                candidate = range(0, num_timesteps, stride)
                if len(candidate) == desired_count:
                    return set(candidate)
            # Report the count that was requested, not the length of the base process.
            raise ValueError(f"cannot create exactly {desired_count} steps with an integer stride")
        section_counts = [int(x) for x in section_counts.split(",")]

    # Split the base process into len(section_counts) contiguous sections;
    # the first `extra` sections receive one additional step each.
    size_per = num_timesteps // len(section_counts)
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(f"cannot divide section of {size} steps into {section_count}")
        # Fractional stride so that the first and last step of the section are both used.
        if section_count <= 1:
            frac_stride = 1
        else:
            frac_stride = (size - 1) / (section_count - 1)
        cur_idx = 0.0
        taken_steps = []
        for _ in range(section_count):
            taken_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        all_steps += taken_steps
        start_idx += size
    return set(all_steps)


class SpacedDiffusion(GaussianDiffusion):
    """
    A diffusion process that only visits a subset of a base process's steps.
    :param use_timesteps: a collection (sequence or set) of timesteps from the
                          original diffusion process to retain.
    :param kwargs: the kwargs to create the base diffusion process; must
                   contain "betas".
    """

    def __init__(self, use_timesteps, **kwargs):
        self.use_timesteps = set(use_timesteps)
        self.timestep_map = []
        self.original_num_steps = len(kwargs["betas"])

        # Build the full process once, only to read its cumulative alphas.
        full_process = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
        prev_cumprod = 1.0
        respaced_betas = []
        for step, cumprod in enumerate(full_process.alphas_cumprod):
            if step not in self.use_timesteps:
                continue
            # Beta that carries the cumulative alpha from the previously kept
            # step directly to this one.
            respaced_betas.append(1 - cumprod / prev_cumprod)
            prev_cumprod = cumprod
            self.timestep_map.append(step)
        kwargs["betas"] = torch.FloatTensor(respaced_betas)
        super().__init__(**kwargs)
        self.map_tensor = torch.tensor(self.timestep_map, device=get_current_device())

    def p_mean_variance(self, model, *args, **kwargs):  # pylint: disable=signature-differs
        """Delegate to the base class with the model wrapped for timestep remapping."""
        wrapped = self._wrap_model(model)
        return super().p_mean_variance(wrapped, *args, **kwargs)

    def training_losses(self, model, *args, **kwargs):  # pylint: disable=signature-differs
        """Delegate to the base class with the model wrapped for timestep remapping."""
        wrapped = self._wrap_model(model)
        return super().training_losses(wrapped, *args, **kwargs)

    def condition_mean(self, cond_fn, *args, **kwargs):
        """Delegate to the base class with ``cond_fn`` wrapped for timestep remapping."""
        wrapped = self._wrap_model(cond_fn)
        return super().condition_mean(wrapped, *args, **kwargs)

    def condition_score(self, cond_fn, *args, **kwargs):
        """Delegate to the base class with ``cond_fn`` wrapped for timestep remapping."""
        wrapped = self._wrap_model(cond_fn)
        return super().condition_score(wrapped, *args, **kwargs)

    def _wrap_model(self, model):
        """Return ``model`` wrapped in ``_WrappedModel`` unless it already is one."""
        already_wrapped = isinstance(model, _WrappedModel)
        if already_wrapped:
            return model
        return _WrappedModel(model, self.map_tensor, self.original_num_steps)

    def _scale_timesteps(self, t):
        """Return ``t`` unchanged."""
        # Scaling is done by the wrapped model.
        return t


class _WrappedModel:
    def __init__(self, model, map_tensor, original_num_steps):
        self.model = model
        self.map_tensor = map_tensor
        # self.rescale_timesteps = rescale_timesteps
        self.original_num_steps = original_num_steps

    def __call__(self, x, ts, **kwargs):
        new_ts = self.map_tensor[ts].to(device=ts.device, dtype=ts.dtype)
        # if self.rescale_timesteps:
        #     new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
        return self.model(x, new_ts, **kwargs)


================================================
FILE: Open-Sora/opensora/schedulers/iddpm/speed.py
================================================
import numpy as np
import torch
import torch.nn.functional as F

from opensora.registry import SCHEDULERS

from . import gaussian_diffusion as gd
from .respace import SpacedDiffusion, space_timesteps


@SCHEDULERS.register_module("iddpm-speed")
class SpeeDiffusion(SpacedDiffusion):
    """
    Training-only spaced diffusion with non-uniform timestep sampling and
    SNR-based loss weights. Registered as "iddpm-speed"; ``sample`` is not
    implemented.
    """

    def __init__(
        self,
        num_sampling_steps=None,
        timestep_respacing=None,
        noise_schedule="linear",
        use_kl=False,
        sigma_small=False,
        predict_xstart=False,
        learn_sigma=True,
        rescale_learned_sigmas=False,
        diffusion_steps=1000,
        cfg_scale=4.0,
    ):
        betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)

        # use_kl takes precedence over rescale_learned_sigmas.
        loss_type = gd.LossType.MSE
        if use_kl:
            loss_type = gd.LossType.RESCALED_KL
        elif rescale_learned_sigmas:
            loss_type = gd.LossType.RESCALED_MSE

        # num_sampling_steps is shorthand for a single-section respacing.
        if num_sampling_steps is not None:
            assert timestep_respacing is None
            timestep_respacing = str(num_sampling_steps)
        if timestep_respacing is None or timestep_respacing == "":
            timestep_respacing = [diffusion_steps]

        if predict_xstart:
            mean_type = gd.ModelMeanType.START_X
        else:
            mean_type = gd.ModelMeanType.EPSILON

        if learn_sigma:
            var_type = gd.ModelVarType.LEARNED_RANGE
        elif sigma_small:
            var_type = gd.ModelVarType.FIXED_SMALL
        else:
            var_type = gd.ModelVarType.FIXED_LARGE

        super().__init__(
            use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
            betas=betas,
            model_mean_type=mean_type,
            model_var_type=var_type,
            loss_type=loss_type,
        )

        self.cfg_scale = cfg_scale
        # we fallback to numpy here as argmax_cuda is not implemented for Bool
        noise_scale_grad = np.gradient(self.sqrt_one_minus_alphas_cumprod.cpu())
        # One past the first index whose gradient falls below 5e-5.
        self.meaningful_steps = np.argmax(noise_scale_grad < 5e-5) + 1

        # p2 weighting from: Perception Prioritized Training of Diffusion Models
        self.p2_gamma = 1
        self.p2_k = 1
        self.snr = 1.0 / (1 - self.alphas_cumprod) - 1
        torch_grad = torch.gradient(self.sqrt_one_minus_alphas_cumprod)[0]
        # Steep tanh around a gradient of 1e-4, shifted by 1.5, then L1-normalised
        # into the sampling distribution used by t_sample.
        unnormalized_p = torch.tanh(1e6 * (torch_grad - 1e-4)) + 1.5
        self.p = F.normalize(unnormalized_p, p=1, dim=0)
        self.weights = 1 / (self.p2_k + self.snr) ** self.p2_gamma

    def t_sample(self, n, device):
        """
        Draw ``n`` timesteps: ``n // 2 + 1`` are sampled from ``self.p``, and
        the rest are their absolute distances from ``self.meaningful_steps``.
        :param n: number of timesteps to return.
        :param device: device the sampled indices are moved to.
        :return: a tensor of ``n`` timestep indices.
        """
        drawn = torch.multinomial(self.p, n // 2 + 1, replacement=True).to(device)
        pivot = self.meaningful_steps
        mirrored = torch.where(drawn < pivot, pivot - drawn, drawn - pivot)
        return torch.cat([drawn, mirrored], dim=0)[:n]

    def training_losses(self, model, x, *args, **kwargs):  # pylint: disable=signature-differs
        """Sample timesteps with ``t_sample`` and compute the base losses using ``self.weights``."""
        t = self.t_sample(x.shape[0], x.device)
        return super().training_losses(model, x, t, *args, weights=self.weights, **kwargs)

    def sample(self, *args, **kwargs):
        """Always raises: this scheduler does not support sampling."""
        raise NotImplementedError("SpeeDiffusion is only for training")


================================================
FILE: Open-Sora/opensora/schedulers/iddpm/timestep_sampler.py
================================================
# Adapted from DiT

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DiT:   https://github.com/facebookresearch/DiT/tree/main
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
# ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
# --------------------------------------------------------

from abc import ABC, abstractmethod

import numpy as np
import torch as th
import torch.distributed as dist


def create_named_schedule_sampler(name, diffusion):
    """
    Build one of the pre-defined ScheduleSampler implementations by name.
    :param name: the name of the sampler: "uniform" or "loss-second-moment".
    :param diffusion: the diffusion object to sample for.
    :return: the sampler instance bound to ``diffusion``.
    :raises NotImplementedError: if ``name`` is not a known sampler.
    """
    if name == "uniform":
        return UniformSampler(diffusion)
    if name == "loss-second-moment":
        return LossSecondMomentResampler(diffusion)
    raise NotImplementedError(f"unknown schedule sampler: {name}")


class ScheduleSampler(ABC):
    """
    A distribution over timesteps in the diffusion process, intended to reduce
    variance of the objective.
    The default ``sample`` performs importance sampling: each drawn timestep
    comes with a weight inversely proportional to its sampling probability,
    which leaves the objective's mean unchanged.
    Subclasses may override sample() to change how the resampled terms are
    reweighted, allowing for actual changes in the objective.
    """

    @abstractmethod
    def weights(self):
        """
        Get a numpy array of weights, one per diffusion step.
        The weights needn't be normalized, but must be positive.
        """

    def sample(self, batch_size, device):
        """
        Importance-sample timesteps for a batch.
        :param batch_size: the number of timesteps.
        :param device: the torch device to save to.
        :return: a tuple (timesteps, weights):
                 - timesteps: a long tensor of timestep indices.
                 - weights: a float tensor of weights to scale the resulting losses.
        """
        raw = self.weights()
        probs = raw / np.sum(raw)
        num_steps = len(probs)
        drawn = np.random.choice(num_steps, size=(batch_size,), p=probs)
        timesteps = th.from_numpy(drawn).long().to(device)
        # 1 / (N * p_t): equals 1 everywhere when the distribution is uniform.
        importance = 1 / (num_steps * probs[drawn])
        return timesteps, th.from_numpy(importance).float().to(device)


class UniformSampler(ScheduleSampler):
    """Sampler that assigns the same weight to every diffusion timestep."""

    def __init__(self, diffusion):
        self.diffusion = diffusion
        # One unit weight per timestep of the diffusion process.
        self._weights = np.ones([diffusion.num_timesteps])

    def weights(self):
        """Return the constant all-ones weight array built at construction."""
        return self._weights


class LossAwareSampler(ScheduleSampler):
    """Base class for samplers whose weights are updated from observed training losses."""

    def update_with_local_losses(self, local_ts, local_losses):
        """
        Update the reweighting using losses from a model.
        Call this method from each rank with a batch of timesteps and the
        corresponding losses for each of those timesteps.
        This method will perform synchronization to make sure all of the ranks
        maintain the exact same reweighting.
        :param local_ts: an integer Tensor of timesteps.
        :param local_losses: a 1D Tensor of losses.
        """
        device = local_ts.device
        # First exchange how many samples every rank contributes.
        gathered_sizes = [th.tensor([0], dtype=th.int32, device=device) for _ in range(dist.get_world_size())]
        local_size = th.tensor([len(local_ts)], dtype=th.int32, device=device)
        dist.all_gather(gathered_sizes, local_size)

        # Receive buffers are sized for the largest batch on any rank.
        batch_sizes = [size.item() for size in gathered_sizes]
        max_bs = max(batch_sizes)

        # NOTE(review): the local tensors are sent unpadded; presumably this
        # relies on every rank having the same batch size -- confirm.
        timestep_batches = [th.zeros(max_bs).to(local_ts) for _ in batch_sizes]
        loss_batches = [th.zeros(max_bs).to(local_losses) for _ in batch_sizes]
        dist.all_gather(timestep_batches, local_ts)
        dist.all_gather(loss_batches, local_losses)

        # Flatten to Python scalars, keeping only each rank's valid prefix.
        timesteps = []
        for batch, bs in zip(timestep_batches, batch_sizes):
            timesteps.extend(entry.item() for entry in batch[:bs])
        losses = []
        for batch, bs in zip(loss_batches, batch_sizes):
            losses.extend(entry.item() for entry in batch[:bs])
        self.update_with_all_losses(timesteps, losses)

    @abstractmethod
    def update_with_all_losses(self, ts, losses):
        """
        Update the reweighting using losses from a model.
        Sub-classes should override this method to update the reweighting
        using losses from the model.
        This method directly updates the reweighting without synchronizing
        between workers. It is called by update_with_local_losses from all
        ranks with identical arguments. Thus, it should have deterministic
        behavior to maintain state across workers.
        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """


class LossSecondMomentResampler(LossAwareSampler):
    """
    Sampler that weights each timestep by the root-mean-square of its most
    recent losses, mixed with a small uniform component.
    :param diffusion: the diffusion object; only ``num_timesteps`` is read.
    :param history_per_term: number of most recent losses kept per timestep.
    :param uniform_prob: share of probability mass spread uniformly over all
                         timesteps once the history is full.
    """

    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
        self.diffusion = diffusion
        self.history_per_term = history_per_term
        self.uniform_prob = uniform_prob
        self._loss_history = np.zeros([diffusion.num_timesteps, history_per_term], dtype=np.float64)
        # `np.int` was removed in NumPy 1.24; the builtin `int` is what it aliased.
        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=int)

    def weights(self):
        """
        Return one weight per timestep.
        Until every timestep has a full loss history the weights are uniform;
        afterwards they are the normalised RMS of the stored losses, scaled by
        ``1 - uniform_prob`` and offset by ``uniform_prob / num_timesteps``.
        """
        if not self._warmed_up():
            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
        weights = np.sqrt(np.mean(self._loss_history**2, axis=-1))
        weights /= np.sum(weights)
        weights *= 1 - self.uniform_prob
        weights += self.uniform_prob / len(weights)
        return weights

    def update_with_all_losses(self, ts, losses):
        """
        Record new losses in the per-timestep history.
        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """
        for t, loss in zip(ts, losses):
            if self._loss_counts[t] == self.history_per_term:
                # Shift out the oldest loss term.
                self._loss_history[t, :-1] = self._loss_history[t, 1:]
                self._loss_history[t, -1] = loss
            else:
                # History not yet full: fill the next free slot.
                self._loss_history[t, self._loss_counts[t]] = loss
                self._loss_counts[t] += 1

    def _warmed_up(self):
        """Return True once every timestep has ``history_per_term`` recorded losses."""
        return (self._loss_counts == self.history_per_term).all()


================================================
FILE: Open-Sora/opensora/schedulers/rf/__init__.py
================================================
import torch
from tqdm import tqdm

from opensora.registry import SCHEDULERS

from .rectified_flow import RFlowScheduler, timestep_transform
from ...models.cache_functions import cache_init
import re

@SCHEDULERS.register_module("rflow")
class RFLOW:
    """
    Rectified-flow scheduler registered under the name "rflow".

    Stores the sampling configuration, delegates training losses to an
    internal ``RFlowScheduler``, and implements a sampling loop with
    classifier-free guidance that passes the ``cache_dic`` / ``current``
    objects returned by ``cache_init`` to every model call.
    """

    def __init__(
        self,
        num_sampling_steps=10,
        num_timesteps=1000,
        cfg_scale=4.0,
        use_discrete_timesteps=False,
        use_timestep_transform=False,
        **kwargs,
    ):
        """
        :param num_sampling_steps: number of iterations of the sampling loop.
        :param num_timesteps: scale of the timestep axis; sampling timesteps
                              lie in (0, num_timesteps].
        :param cfg_scale: default guidance scale used by ``sample`` when none is given.
        :param use_discrete_timesteps: if True, sampling timesteps are rounded to ints.
        :param use_timestep_transform: if True, sampling timesteps are passed
                                       through ``timestep_transform``.
        :param kwargs: extra keyword arguments forwarded to ``RFlowScheduler``.
        """
        self.num_sampling_steps = num_sampling_steps
        self.num_timesteps = num_timesteps
        self.cfg_scale = cfg_scale
        self.use_discrete_timesteps = use_discrete_timesteps
        self.use_timestep_transform = use_timestep_transform
        
        self.scheduler = RFlowScheduler(
            num_timesteps=num_timesteps,
            num_sampling_steps=num_sampling_steps,
            use_discrete_timesteps=use_discrete_timesteps,
            use_timestep_transform=use_timestep_transform,
            **kwargs,
        )

    def sample(
        self,
        model,
        text_encoder,
        z,
        prompts,
        device,
        additional_args=None,
        mask=None,
        guidance_scale=None,
        progress=True,
        #flops_cal=True,
    ):  
        """
        Run the sampling loop starting from ``z`` and return the final latent.

        :param model: callable invoked as
                      ``model(z_in, t, cache_dic=..., current=..., **model_args)``.
        :param text_encoder: object providing ``encode(prompts)`` (returns a dict
                             containing "y") and ``null(n)``.
        :param z: the starting latent; indexed as a 5-D tensor below
                  (assumes (batch, channels, frames, h, w) -- TODO confirm).
        :param prompts: sequence of prompts; its length sets the batch size for
                        the null embedding.
        :param device: device on which timestep tensors are created.
        :param additional_args: optional dict merged into the model arguments;
                                also passed to ``timestep_transform`` when the
                                transform is enabled, so it must not be None then.
        :param mask: optional tensor; entries are compared against timesteps
                     after scaling by ``num_timesteps``
                     (assumes shape (batch, frames) -- TODO confirm).
        :param guidance_scale: classifier-free guidance scale; defaults to ``self.cfg_scale``.
        :param progress: if True, wrap the loop in a tqdm progress bar.
        :return: the latent ``z`` after the last step.
        """
        # if no specific guidance scale is provided, use the default scale when initializing the scheduler
        if guidance_scale is None:
            guidance_scale = self.cfg_scale

        n = len(prompts)
        # text encoding
        model_args = text_encoder.encode(prompts)
        y_null = text_encoder.null(n)
        # Conditional embeddings first, null embeddings second: this matches the
        # (cond, uncond) order used when the prediction is split below.
        model_args["y"] = torch.cat([model_args["y"], y_null], 0)
        if additional_args is not None:
            model_args.update(additional_args)
        # prepare timesteps
        # Evenly spaced from num_timesteps (i = 0) down to num_timesteps / num_sampling_steps.
        timesteps = [(1.0 - i / self.num_sampling_steps) * self.num_timesteps for i in range(self.num_sampling_steps)]
        if self.use_discrete_timesteps:
            timesteps = [int(round(t)) for t in timesteps]
        # One timestep tensor per step, repeated for every batch element of z.
        timesteps = [torch.tensor([t] * z.shape[0], device=device) for t in timesteps]
        if self.use_timestep_transform:
            timesteps = [timestep_transform(t, additional_args, num_timesteps=self.num_timesteps) for t in timesteps]

        if mask is not None:
            # Positions whose mask value is exactly 1 start out flagged as already noised.
            noise_added = torch.zeros_like(mask, dtype=torch.bool)
            noise_added = noise_added | (mask == 1)
        
        # Two independent cache states: one for the (optional) FLOPs measurement, one for sampling.
        cache_dic_cal_flops, current_cal_flops = cache_init(model_kwargs=model_args, num_steps=self.num_sampling_steps)
        cache_dic, current = cache_init(model_kwargs=model_args, num_steps=self.num_sampling_steps)
        flops_sum = 0
        # Hard-coded switch: when True the loop only measures FLOPs and never updates z.
        cal_flops = False
        if cal_flops:
            from calflops import calculate_flops
        progress_wrap = tqdm if progress else (lambda x: x)
        for i, t in progress_wrap(enumerate(timesteps)):
            # Tell the cache which step is being computed.
            current['step'] = i
            current_cal_flops['step'] = i
            # mask for adding noise
            if mask is not None:
                mask_t = mask * self.num_timesteps
                x0 = z.clone()
                x_noise = self.scheduler.add_noise(x0, torch.randn_like(x0), t)

                # True where the scaled mask value is at or above the current timestep.
                mask_t_upper = mask_t >= t.unsqueeze(1)
                # Repeated twice along dim 0 to match the doubled batch used for guidance.
                model_args["x_mask"] = mask_t_upper.repeat(2, 1)
                # Positions that cross the threshold at this step and were not noised before.
                mask_add_noise = mask_t_upper & ~noise_added

                z = torch.where(mask_add_noise[:, None, :, None, None], x_noise, x0)
                noise_added = mask_t_upper

            # classifier-free guidance
            # The batch is duplicated so one forward pass yields both cond and uncond outputs.
            z_in = torch.cat([z, z], 0)
            t = torch.cat([t, t], 0)
            if cal_flops:
                flop_kwargs = model_args.copy()
                flop_kwargs['x'] = z_in.clone()
                flop_kwargs['timestep'] = t.clone()
                flop_kwargs['cache_dic'] = cache_dic_cal_flops
                flop_kwargs['current'] = current_cal_flops
                flops, macs, params = calculate_flops(model=model,
                                          kwargs = flop_kwargs,
                                          print_results=False)
                # Convert the string to a float
                #flops = float(re.findall(r"[-+]?\d*\.\d+|\d+", flops)[0])
                match = re.findall(r"([-+]?\d*\.\d+|\d+)\s*([GMTP]?)FLOPS", flops)
                flops_value = float(match[0][0])  # extract the numeric part
                unit = match[0][1]  # extract the magnitude prefix, e.g. G or T
                # G values are scaled by 0.001 so the running sum is in TFLOPs.
                # NOTE(review): any other prefix matched by the regex (M, P or
                # none) is added unconverted -- confirm this is intended.
                if unit == 'G':
                    flops = flops_value * 0.001
                else:
                    flops = flops_value
                flops_sum += flops
                
            else:
                # Keep the first half of the output along dim 1
                # (presumably the second half is a variance prediction -- confirm).
                pred = model(z_in, t, cache_dic=cache_dic, current=current, **model_args).chunk(2, dim=1)[0]
                pred_cond, pred_uncond = pred.chunk(2, dim=0)
                v_pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond)

                # update z
                # Step size: gap to the next timestep, or the current timestep itself on
                # the last step; `timesteps[i]` is used because `t` now has a doubled batch.
                dt = timesteps[i] - timesteps[i + 1] if i < len(timesteps) - 1 else timesteps[i]
                dt = dt / self.num_timesteps
                z = z + v_pred * dt[:, None, None, None, None]

                if mask is not None:
                    # Positions below the threshold are reset to their pre-step values.
                    z = torch.where(mask_t_upper[:, None, :, None, None], z, x0)
        if cal_flops:
            print("FLOPs:", flops_sum, "TFLOPs")
        return z

    def training_losses(self, model, x_start, model_kwargs=None, noise=None, mask=None, weights=None, t=None):
        """Forward all arguments, in order, to ``self.scheduler.training_losses`` and return its result."""
        return self.scheduler.training_losses(model, x_start, model_kwargs, noise, mask, weights, t)


================================================
FILE: Open-Sora/opensora/schedulers/rf/rectified_flow.py
================================================
import torch
from torch.distributions import LogisticNormal

from ..iddpm.gaussian_diffusion import _extract_into_tensor, mean_flat

# Some of this code is inspired by https://github.com/magic-research/piecewise-rectified-flow/blob/main/scripts/train_perflow.py
# and https://github.com/magic-research/piecewise-rectified-flow/blob/main/src/scheduler_perflow.py


def timestep_transform(
    t,
    model_kwargs,
    base_resolution=512 * 512,
    base_num_frames=1,
    scale=1.0,
    num_timesteps=1,
):
    """
    Warp timesteps according to the spatial and temporal size of the sample.

    With ``ratio = sqrt(height * width / base_resolution)
    * sqrt(num_frames / base_num_frames) * scale`` and ``t`` normalised to
    ``t / num_timesteps``, returns ``ratio * t / (1 + (ratio - 1) * t)``
    scaled back by ``num_timesteps``.

    :param t: tensor of timesteps on the ``num_timesteps`` scale.
    :param model_kwargs: dict with tensor entries "height", "width" and
                         "num_frames"; fp16 entries are replaced in place by
                         fp32 copies.
    :param base_resolution: reference ``height * width``.
    :param base_num_frames: reference frame count.
    :param scale: extra multiplier applied to the ratio.
    :param num_timesteps: scale of the timestep axis.
    :return: the transformed timesteps, on the ``num_timesteps`` scale.
    """
    # Force fp16 input to fp32 to avoid nan output
    for key in ("height", "width", "num_frames"):
        value = model_kwargs[key]
        if value.dtype == torch.float16:
            model_kwargs[key] = value.float()

    normalized = t / num_timesteps
    height, width, frames = (model_kwargs[key] for key in ("height", "width", "num_frames"))
    spatial_ratio = torch.sqrt(height * width / base_resolution)

    # NOTE: currently, we do not take fps into account
    # NOTE: temporal_reduction is hardcoded, this should be equal to the temporal reduction factor of the vae
    if frames[0] == 1:
        latent_frames = torch.ones_like(frames)
    else:
        latent_frames = frames // 17 * 5
    temporal_ratio = torch.sqrt(latent_frames / base_num_frames)

    ratio = spatial_ratio * temporal_ratio * scale
    warped = ratio * normalized / (1 + (ratio - 1) * normalized)
    return warped * num_timesteps


class RFlowScheduler:
    """
    Rectified-flow training scheduler: samples timesteps, linearly mixes data
    with noise, and computes the velocity-matching loss.
    """

    def __init__(
        self,
        num_timesteps=1000,
        num_sampling_steps=10,
        use_discrete_timesteps=False,
        sample_method="uniform",
        loc=0.0,
        scale=1.0,
        use_timestep_transform=False,
        transform_scale=1.0,
    ):
        """
        :param num_timesteps: scale of the timestep axis.
        :param num_sampling_steps: stored for reference; not used by this class.
        :param use_discrete_timesteps: if True, training timesteps are integers.
        :param sample_method: "uniform" or "logit-normal".
        :param loc: location of the logit-normal distribution.
        :param scale: scale of the logit-normal distribution.
        :param use_timestep_transform: if True, sampled timesteps are passed
                                       through ``timestep_transform``.
        :param transform_scale: ``scale`` argument given to ``timestep_transform``.
        """
        self.num_timesteps = num_timesteps
        self.num_sampling_steps = num_sampling_steps
        self.use_discrete_timesteps = use_discrete_timesteps

        # sample method
        assert sample_method in ["uniform", "logit-normal"]
        assert not (
            use_discrete_timesteps and sample_method != "uniform"
        ), "Only uniform sampling is supported for discrete timesteps"
        self.sample_method = sample_method
        if sample_method == "logit-normal":
            self.distribution = LogisticNormal(torch.tensor([loc]), torch.tensor([scale]))
            # One draw per batch element of x, keeping the first component of each sample.
            self.sample_t = lambda x: self.distribution.sample((x.shape[0],))[:, 0].to(x.device)

        # timestep transform
        self.use_timestep_transform = use_timestep_transform
        self.transform_scale = transform_scale

    def training_losses(self, model, x_start, model_kwargs=None, noise=None, mask=None, weights=None, t=None):
        """
        Compute training losses for a single timestep.
        Arguments format copied from opensora/schedulers/iddpm/gaussian_diffusion.py/training_losses
        :param model: callable invoked as ``model(x_t, t, **model_kwargs)``.
        :param x_start: the clean samples.
        :param model_kwargs: optional dict of extra model arguments.
        :param noise: optional noise with the same shape as ``x_start``; drawn
                      from a standard normal when omitted.
        :param mask: optional boolean mask; where it is False the noised input
                     is replaced by the t = 0 mix.
        :param weights: optional 1-D tensor of per-timestep loss weights.
        :param t: optional timesteps; sampled when omitted.
        :return: a dict with the single key "loss".
        """
        if t is None:
            batch = x_start.shape[0]
            if self.use_discrete_timesteps:
                t = torch.randint(0, self.num_timesteps, (batch,), device=x_start.device)
            elif self.sample_method == "uniform":
                t = torch.rand((batch,), device=x_start.device) * self.num_timesteps
            elif self.sample_method == "logit-normal":
                t = self.sample_t(x_start) * self.num_timesteps

            if self.use_timestep_transform:
                t = timestep_transform(t, model_kwargs, scale=self.transform_scale, num_timesteps=self.num_timesteps)

        if model_kwargs is None:
            model_kwargs = {}
        if noise is None:
            noise = torch.randn_like(x_start)
        assert noise.shape == x_start.shape

        x_t = self.add_noise(x_start, noise, t)
        if mask is not None:
            # Where the mask is False, use the mix computed at t = 0 instead.
            x_at_zero = self.add_noise(x_start, noise, torch.zeros_like(t))
            x_t = torch.where(mask[:, None, :, None, None], x_t, x_at_zero)

        # Only the first half of the output along dim 1 is used as the velocity.
        velocity_pred = model(x_t, t, **model_kwargs).chunk(2, dim=1)[0]
        squared_error = (velocity_pred - (x_start - noise)).pow(2)
        if weights is not None:
            squared_error = _extract_into_tensor(weights, t, x_start.shape) * squared_error
        return {"loss": mean_flat(squared_error, mask=mask)}

    def add_noise(
        self,
        original_samples: torch.FloatTensor,
        noise: torch.FloatTensor,
        timesteps: torch.IntTensor,
    ) -> torch.FloatTensor:
        """
        compatible with diffusers add_noise()
        Returns ``(1 - t / num_timesteps) * original_samples + (t / num_timesteps) * noise``
        with the per-sample coefficient broadcast over the remaining four axes.
        """
        data_fraction = 1 - timesteps.float() / self.num_timesteps  # [1,1/1000]

        # timepoint  (bsz) noise: (bsz, 4, frame, w ,h)
        # expand timepoint to noise shape
        data_fraction = data_fraction[:, None, None, None, None].expand(-1, *noise.shape[1:5])

        return data_fraction * original_samples + (1 - data_fraction) * noise


================================================
FILE: Open-Sora/opensora/utils/__init__.py
================================================


================================================
FILE: Open-Sora/opensora/utils/ckpt_utils.py
================================================
import functools
import json
import operator
import os
from typing import Tuple

import torch
import torch.distributed as dist
import torch.nn as nn
from colossalai.booster import Booster
from colossalai.checkpoint_io import GeneralCheckpointIO
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torchvision.datasets.utils import download_url

from .misc import get_logger

# Base URL for Hugging Face downloads; the HF_ENDPOINT environment variable overrides it.
hf_endpoint = os.environ.get("HF_ENDPOINT", "https://huggingface.co")

# Checkpoint file name -> download URL for every pre-trained model known to this module.
pretrained_models = {
    "DiT-XL-2-512x512.pt": "https://dl.fbaipublicfiles.com/DiT/models/DiT-XL-2-512x512.pt",
    "DiT-XL-2-256x256.pt": "https://dl.fbaipublicfiles.com/DiT/models/DiT-XL-2-256x256.pt",
    "Latte-XL-2-256x256-ucf101.pt": f"{hf_endpoint}/maxin-cn/Latte/resolve/main/ucf101.pt",
    "PixArt-XL-2-256x256.pth": f"{hf_endpoint}/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-256x256.pth",
    "PixArt-XL-2-SAM-256x256.pth": f"{hf_endpoint}/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-SAM-256x256.pth",
    "PixArt-XL-2-512x512.pth": f"{hf_endpoint}/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-512x512.pth",
    "PixArt-XL-2-1024-MS.pth": f"{hf_endpoint}/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-1024-MS.pth",
    "OpenSora-v1-16x256x256.pth": f"{hf_endpoint}/hpcai-tech/Open-Sora/resolve/main/OpenSora-v1-16x256x256.pth",
    "OpenSora-v1-HQ-16x256x256.pth": f"{hf_endpoint}/hpcai-tech/Open-Sora/resolve/main/OpenSora-v1-HQ-16x256x256.pth",
    "OpenSora-v1-HQ-16x512x512.pth": f"{hf_endpoint}/hpcai-tech/Open-Sora/resolve/main/OpenSora-v1-HQ-16x512x512.pth",
    "PixArt-Sigma-XL-2-256x256.pth": f"{hf_endpoint}/PixArt-alpha/PixArt-Sigma/resolve/main/PixArt-Sigma-XL-2-256x256.pth",
    "PixArt-Sigma-XL-2-512-MS.pth": f"{hf_endpoint}/PixArt-alpha/PixArt-Sigma/resolve/main/PixArt-Sigma-XL-2-512-MS.pth",
    "PixArt-Sigma-XL-2-1024-MS.pth": f"{hf_endpoint}/PixArt-alpha/PixArt-Sigma/resolve/main/PixArt-Sigma-XL-2-1024-MS.pth",
    "PixArt-Sigma-XL-2-2K-MS.pth": f"{hf_endpoint}/PixArt-alpha/PixArt-Sigma/resolve/main/PixArt-Sigma-XL-2-2K-MS.pth",
}


def reparameter(ckpt, name=None, model=None):
    """
    Adapt a loaded checkpoint so its keys and shapes fit the target model.

    Depending on the checkpoint's file name this unwraps the nested state dict
    ("ema" / "state_dict"), adds an extra axis at dim 2 to the patch-embedding
    weight, and drops positional embeddings. Independently of the name it then
    removes any remaining positional embeddings, resizes
    ``y_embedder.y_embedding`` to the model's length, and renames "blocks." keys
    to "spatial_blocks." when loading a PixArt-Sigma checkpoint into STDiT3.

    :param ckpt: the loaded checkpoint dict; modified in place where possible.
    :param name: checkpoint name or path; only its basename is matched. May be
                 None, in which case no name-specific handling is applied.
    :param model: the target model. May be None, in which case the
                  ``y_embedding`` resize is skipped because there is no target
                  length to compare against.
    :return: the adapted state dict.
    """
    model_name = name
    # os.path.basename(None) raises TypeError; a missing name matches no known checkpoint.
    name = os.path.basename(name) if name is not None else ""
    if not dist.is_initialized() or dist.get_rank() == 0:
        get_logger().info("loading pretrained model: %s", model_name)
    if name in ["DiT-XL-2-512x512.pt", "DiT-XL-2-256x256.pt"]:
        ckpt["x_embedder.proj.weight"] = ckpt["x_embedder.proj.weight"].unsqueeze(2)
        del ckpt["pos_embed"]
    if name in ["Latte-XL-2-256x256-ucf101.pt"]:
        ckpt = ckpt["ema"]
        ckpt["x_embedder.proj.weight"] = ckpt["x_embedder.proj.weight"].unsqueeze(2)
        del ckpt["pos_embed"]
        del ckpt["temp_embed"]
    if name in [
        "PixArt-XL-2-256x256.pth",
        "PixArt-XL-2-SAM-256x256.pth",
        "PixArt-XL-2-512x512.pth",
        "PixArt-XL-2-1024-MS.pth",
        "PixArt-Sigma-XL-2-256x256.pth",
        "PixArt-Sigma-XL-2-512-MS.pth",
        "PixArt-Sigma-XL-2-1024-MS.pth",
        "PixArt-Sigma-XL-2-2K-MS.pth",
    ]:
        ckpt = ckpt["state_dict"]
        ckpt["x_embedder.proj.weight"] = ckpt["x_embedder.proj.weight"].unsqueeze(2)
        if "pos_embed" in ckpt:
            del ckpt["pos_embed"]

    if name in [
        "PixArt-1B-2.pth",
    ]:
        ckpt = ckpt["state_dict"]
        if "pos_embed" in ckpt:
            del ckpt["pos_embed"]

    # no need pos_embed
    if "pos_embed_temporal" in ckpt:
        del ckpt["pos_embed_temporal"]
    if "pos_embed" in ckpt:
        del ckpt["pos_embed"]
    # different text length
    # Without a model there is no target length, so the embedding is left as-is
    # instead of failing on `None.y_embedder`.
    if model is not None and "y_embedder.y_embedding" in ckpt:
        ckpt_len = ckpt["y_embedder.y_embedding"].shape[0]
        model_len = model.y_embedder.y_embedding.shape[0]
        if ckpt_len < model_len:
            get_logger().info(
                "Extend y_embedding from %s to %s",
                ckpt_len,
                model_len,
            )
            # Pad by repeating the checkpoint's last embedding row.
            new_y_embedding = torch.zeros(model_len - ckpt_len, model.y_embedder.y_embedding.shape[1])
            new_y_embedding[:] = ckpt["y_embedder.y_embedding"][-1]
            ckpt["y_embedder.y_embedding"] = torch.cat([ckpt["y_embedder.y_embedding"], new_y_embedding], dim=0)
        elif ckpt_len > model_len:
            get_logger().info(
                "Shrink y_embedding from %s to %s",
                ckpt_len,
                model_len,
            )
            ckpt["y_embedder.y_embedding"] = ckpt["y_embedder.y_embedding"][:model_len]
    # stdit3 special case
    if type(model).__name__ == "STDiT3" and "PixArt-Sigma" in name:
        # Iterate over a snapshot of the keys because the dict is modified in the loop.
        for key in list(ckpt.keys()):
            if "blocks." in key:
                ckpt[key.replace("blocks.", "spatial_blocks.")] = ckpt[key]
                del ckpt[key]

    return ckpt


def find_model(model_name, model=None):
    """
    Return a checkpoint for ``model_name`` after running it through ``reparameter``.

    A name listed in ``pretrained_models`` is obtained via ``download_model``;
    any other value must be the path of an existing local checkpoint file,
    which is read with ``torch.load``.
    """
    if model_name not in pretrained_models:
        # Custom checkpoint: the name is interpreted as a filesystem path.
        assert os.path.isfile(model_name), f"Could not find DiT checkpoint at {model_name}"
        raw_ckpt = torch.load(model_name, map_location=lambda storage, loc: storage)
    else:
        # Known pre-trained checkpoint: fetch (or reuse) the local copy.
        raw_ckpt = download_model(model_name)
    return reparameter(raw_ckpt, model_name, model=model)


def download_model(model_name=None, local_path=None, url=None):
    """
    Load a checkpoint from disk, downloading it first when the file is missing.

    Either pass ``model_name`` (must be a key of ``pretrained_models``; the file
    lives under ``pretrained_models/``) or pass both ``local_path`` and ``url``.
    Returns whatever ``torch.load`` yields for the local file.
    """
    if model_name is None:
        assert local_path is not None
        assert url is not None
        web_path = url
    else:
        assert model_name in pretrained_models
        local_path = f"pretrained_models/{model_name}"
        web_path = pretrained_models[model_name]

    already_present = os.path.isfile(local_path)
    if not already_present:
        os.makedirs("pretrained_models", exist_ok=True)
        target_dir, target_file = os.path.split(local_path)
        download_url(web_path, target_dir, target_file)

    return torch.load(local_path, map_location=lambda storage, loc: storage)


def load_from_sharded_state_dict(model, ckpt_path, model_name="model.safetensors", strict=False):
    """Load ``ckpt_path/model_name`` into ``model`` through a ``GeneralCheckpointIO`` instance."""
    checkpoint_file = os.path.join(ckpt_path, model_name)
    GeneralCheckpointIO().load_model(model, checkpoint_file, strict=strict)


def model_sharding(model: torch.nn.Module):
    """
    Replace every parameter's data with this rank's slice of the flattened parameter.

    Each parameter is flattened, zero-padded so its length is a multiple of the
    world size, split into equally sized pieces, and the piece indexed by the
    current rank is kept.
    """
    rank = dist.get_rank()
    num_ranks = dist.get_world_size()
    for _, param in model.named_parameters():
        flat = param.data.view(-1)
        # Number of trailing zeros needed to make the length divisible by num_ranks.
        pad = (num_ranks - param.numel() % num_ranks) % num_ranks
        if pad > 0:
            flat = torch.nn.functional.pad(flat, [0, pad])
        shards = flat.split(flat.numel() // num_ranks)
        param.data = shards[rank]


def model_gathering(model: torch.nn.Module, model_shape_dict: dict):
    """
    All-gather every parameter's data and rebuild the full tensors on rank 0.

    ``model_shape_dict`` maps parameter names to their original shapes; on
    rank 0 the gathered pieces are concatenated, trimmed with
    ``remove_padding`` and viewed back to that shape. Other ranks take part in
    the collective but keep their own data. Ends with a barrier.
    """
    rank = dist.get_rank()
    num_ranks = dist.get_world_size()
    on_rank_zero = int(rank) == 0
    for name, param in model.named_parameters():
        pieces = [torch.empty_like(param.data) for _ in range(num_ranks)]
        dist.all_gather(pieces, param.data, group=dist.group.WORLD)
        if not on_rank_zero:
            continue
        full_shape = model_shape_dict[name]
        param.data = remove_padding(torch.cat(pieces), full_shape).view(full_shape)
    dist.barrier()


def remove_padding(tensor: torch.Tensor, original_shape: Tuple) -> torch.Tensor:
    """
    Return the leading elements of ``tensor`` that belong to ``original_shape``.

    Args:
        tensor: Tensor whose first dimension may carry trailing padding.
        original_shape: Shape of the unpadded data; its element count decides
            how many leading entries are kept.

    Returns:
        ``tensor`` sliced to the product of ``original_shape``.
    """
    # The initial value 1 makes an empty shape (0-dim tensor) count as one element
    # instead of raising TypeError from reduce() on an empty sequence.
    numel = functools.reduce(operator.mul, original_shape, 1)
    return tensor[:numel]


def record_model_param_shape(model: torch.nn.Module) -> dict:
    """Map each name from ``model.named_parameters()`` to that parameter's shape."""
    return {name: param.shape for name, param in model.named_parameters()}


def load_checkpoint(model, ckpt_path, save_as_pt=False, model_name="model.safetensors", strict=False):
    """
    Load weights from ``ckpt_path`` into ``model``.

    Supported inputs:
      * a ``.pt`` / ``.pth`` file, resolved through ``find_model``;
      * a single ``.safetensors`` file, read with ``safetensors.torch.load_file``;
      * a directory holding a sharded checkpoint, loaded through
        ``load_from_sharded_state_dict`` (optionally re-saved as one ``.pt``
        file when ``save_as_pt`` is true).

    Args:
        model: Module that receives the weights.
        ckpt_path: File or directory path of the checkpoint.
        save_as_pt: For directory checkpoints, also write
            ``<ckpt_path>/<model_name>_ckpt.pt`` with the loaded state dict.
        model_name: File name inside a sharded checkpoint directory.
        strict: Forwarded to the state-dict loading call of every branch.

    Raises:
        ValueError: If ``ckpt_path`` matches none of the supported forms.
    """
    if ckpt_path.endswith((".pt", ".pth", ".safetensors")):
        if ckpt_path.endswith(".safetensors"):
            # Imported lazily so safetensors is only needed for this format.
            from safetensors.torch import load_file

            state_dict = load_file(ckpt_path)
        else:
            state_dict = find_model(ckpt_path, model=model)
        # Honor ``strict`` for both single-file formats and report through the logger.
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=strict)
        get_logger().info("Missing keys: %s", missing_keys)
        get_logger().info("Unexpected keys: %s", unexpected_keys)
    elif os.path.isdir(ckpt_path):
        load_from_sharded_state_dict(model, ckpt_path, model_name, strict=strict)
        get_logger().info("Model checkpoint loaded from %s", ckpt_path)
        if save_as_pt:
            save_path = os.path.join(ckpt_path, model_name + "_ckpt.pt")
            torch.save(model.state_dict(), save_path)
            get_logger().info("Model checkpoint saved to %s", save_path)
    else:
        raise ValueError(f"Invalid checkpoint path: {ckpt_path}")


def load_json(file_path: str):
    """Read ``file_path`` as text and return the decoded JSON value."""
    with open(file_path, "r") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def save_json(data, file_path: str):
    """Serialize ``data`` as JSON (4-space indent) into ``file_path``, overwriting it."""
    with open(file_path, "w") as out_file:
        json.dump(obj=data, fp=out_file, indent=4)


# save and load for training


def save(
    booster: Booster,
    save_dir: str,
    model: nn.Module = None,
    ema: nn.Module = None,
    optimizer: Optimizer = None,
    lr_scheduler: _LRScheduler = None,
    sampler=None,
    epoch: int = None,
    step: int = None,
    global_step: int = None,
    batch_size: int = None,
):
    """
    Write a training checkpoint under ``save_dir/epoch{epoch}-global_step{global_step}``.

    Model, optimizer and LR scheduler are saved through ``booster`` when given.
    Rank 0 additionally writes ``running_states.json`` and, when provided, the
    EMA state dict (``ema.pt``) and the sampler state (``sampler``). All ranks
    meet at a barrier before the checkpoint directory path is returned.
    """
    save_dir = os.path.join(save_dir, f"epoch{epoch}-global_step{global_step}")
    model_dir = os.path.join(save_dir, "model")
    os.makedirs(model_dir, exist_ok=True)

    # Booster-managed objects.
    if model is not None:
        booster.save_model(model, model_dir, shard=True)
    if optimizer is not None:
        booster.save_optimizer(optimizer, os.path.join(save_dir, "optimizer"), shard=True, size_per_shard=4096)
    if lr_scheduler is not None:
        booster.save_lr_scheduler(lr_scheduler, os.path.join(save_dir, "lr_scheduler"))

    # Plain files are written by rank 0 only.
    if dist.get_rank() == 0:
        save_json(
            {
                "epoch": epoch,
                "step": step,
                "global_step": global_step,
                "batch_size": batch_size,
            },
            os.path.join(save_dir, "running_states.json"),
        )

        if ema is not None:
            torch.save(ema.state_dict(), os.path.join(save_dir, "ema.pt"))

        if sampler is not None:
            # only for VariableVideoBatchSampler
            torch.save(sampler.state_dict(step), os.path.join(save_dir, "sampler"))

    dist.barrier()
    return save_dir


def load(
    booster: Booster,
    load_dir: str,
    model: nn.Module = None,
    ema: nn.Module = None,
    optimizer: Optimizer = None,
    lr_scheduler: _LRScheduler = None,
    sampler=None,
) -> Tuple[int, int]:
    """
    Restore training state from the checkpoint directory ``load_dir``.

    Each of ``model``, ``ema``, ``optimizer``, ``lr_scheduler`` and ``sampler``
    is restored only when it is passed. All ranks synchronize on a barrier
    before returning.

    Returns:
        ``(epoch, step)`` as stored in ``running_states.json``.

    Raises:
        AssertionError: If ``load_dir`` or its ``running_states.json`` is missing.
    """
    assert os.path.exists(load_dir), f"Checkpoint directory {load_dir} does not exist"
    assert os.path.exists(os.path.join(load_dir, "running_states.json")), "running_states.json does not exist"
    running_states = load_json(os.path.join(load_dir, "running_states.json"))
    if model is not None:
        booster.load_model(model, os.path.join(load_dir, "model"))
    if ema is not None:
        # ema is not boosted, so we don't use booster.load_model
        ema.load_state_dict(
            torch.load(os.path.join(load_dir, "ema.pt"), map_location=torch.device("cpu")),
            strict=False,
        )
    if optimizer is not None:
        booster.load_optimizer(optimizer, os.path.join(load_dir, "optimizer"))
    if lr_scheduler is not None:
        booster.load_lr_scheduler(lr_scheduler, os.path.join(load_dir, "lr_scheduler"))
    if sampler is not None:
        sampler.load_state_dict(torch.load(os.path.join(load_dir, "sampler")))
    dist.barrier()

    # Two values are returned; the annotation above matches this tuple.
    return (
        running_states["epoch"],
        running_states["step"],
    )


================================================
FILE: Open-Sora/opensora/utils/config_utils.py
================================================
import argparse
import json
import os
from glob import glob

from mmengine.config import Config


def parse_args(training=False):
    """
    Build the command-line parser and parse ``sys.argv``.

    Args:
        training: When false, the inference-only options are registered;
            when true, the training-only options are registered instead.

    Returns:
        The ``argparse.Namespace`` produced by ``parser.parse_args()``. Most
        options default to ``None`` so that unset flags can be told apart from
        explicit values.
    """
    parser = argparse.ArgumentParser()

    # model config
    parser.add_argument("config", help="model config file path")

    # ======================================================
    # General
    # ======================================================
    parser.add_argument("--seed", default=None, type=int, help="seed for reproducibility")
    parser.add_argument(
        "--ckpt-path",
        default=None,
        type=str,
        help="path to model ckpt; will overwrite cfg.model.from_pretrained if specified",
    )
    parser.add_argument("--batch-size", default=None, type=int, help="batch size")
    parser.add_argument("--outputs", default=None, type=str, help="the dir to save model weights")
    parser.add_argument("--flash-attn", default=None, type=str2bool, help="enable flash attention")
    parser.add_argument("--layernorm-kernel", default=None, type=str2bool, help="enable layernorm kernel")
    parser.add_argument("--resolution", default=None, type=str, help="multi resolution")
    parser.add_argument("--data-path", default=None, type=str, help="path to data csv")
    parser.add_argument("--dtype", default=None, type=str, help="data type")

    # ======================================================
    # Inference
    # ======================================================
    if not training:
        # output
        parser.add_argument("--save-dir", default=None, type=str, help="path to save generated samples")
        parser.add_argument("--sample-name", default=None, type=str, help="sample name, default is sample_idx")
        parser.add_argument("--start-index", default=None, type=int, help="start index for sample name")
        parser.add_argument("--end-index", default=None, type=int, help="end index for sample name")
        parser.add_argument("--num-sample", default=None, type=int, help="number of samples to generate for one prompt")
        parser.add_argument("--prompt-as-path", action="store_true", help="use prompt as path to save samples")
        parser.add_argument("--verbose", default=None, type=int, help="verbose level")

        # prompt
        parser.add_argument("--prompt-path", default=None, type=str, help="path to prompt txt file")
        parser.add_argument("--prompt", default=None, type=str, nargs="+", help="prompt list")
        parser.add_argument("--llm-refine", default=None, type=str2bool, help="enable LLM refine")
        parser.add_argument("--prompt-generator", default=None, type=str, help="prompt generator")

        # image/video
        parser.add_argument("--num-frames", default=None, type=str, help="number of frames")
        parser.add_argument("--fps", default=None, type=int, help="fps")
        parser.add_argument("--save-fps", default=None, type=int, help="save fps")
        parser.add_argument("--image-size", default=None, type=int, nargs=2, help="image size")
        parser.add_argument("--frame-interval", default=None, type=int, help="frame interval")
        parser.add_argument("--aspect-ratio", default=None, type=str, help="aspect ratio (h:w)")
        parser.add_argument("--watermark", default=None, type=str2bool, help="watermark video")

        # hyperparameters
        parser.add_argument("--num-sampling-steps", default=None, type=int, help="sampling steps")
        parser.add_argument("--cfg-scale", default=None, type=float, help="balance between cond & uncond")

        # reference
        parser.add_argument("--loop", default=None, type=int, help="loop")
        parser.add_argument("--condition-frame-length", default=None, type=int, help="condition frame length")
        parser.add_argument("--reference-path", default=None, type=str, nargs="+", help="reference path")
        parser.add_argument("--mask-strategy", default=None, type=str, nargs="+", help="mask strategy")
        parser.add_argument("--aes", default=None, type=float, help="aesthetic score")
        parser.add_argument("--flow", default=None, type=float, help="flow score")
        parser.add_argument("--camera-motion", default=None, type=str, help="camera motion")
    # ======================================================
    # Training
    # ======================================================
    else:
        parser.add_argument("--lr", default=None, type=float, help="learning rate")
        # type=bool would turn any non-empty string (even "False") into True;
        # str2bool parses the textual value like the other boolean flags do.
        parser.add_argument("--wandb", default=None, type=str2bool, help="enable wandb")
        parser.add_argument("--load", default=None, type=str, help="path to continue training")
        parser.add_argument("--start-from-scratch", action="store_true", help="start training from scratch")
        parser.add_argument("--warmup-steps", default=None, type=int, help="warmup steps")
        parser.add_argument("--record-time", default=False, action="store_true", help="record time of each part")

    return parser.parse_args()


def merge_args(cfg, args, training=False):
    """
    Overlay parsed command-line values onto the loaded config.

    Options that belong to a nested section (``cfg.model``, ``cfg.dataset``,
    ``cfg.scheduler``, ``cfg.discriminator``) are written there and then
    cleared on ``args`` so the final generic pass does not copy them to the
    top level as well. Every remaining non-``None`` attribute of ``args`` is
    copied to ``cfg[<attribute name>]``.

    Args:
        cfg: Config object supporting attribute access, item assignment,
            ``get`` and ``in``.
        args: Namespace returned by ``parse_args``; it is mutated.
        training: Whether the arguments were parsed in training mode.

    Returns:
        The same ``cfg`` object, updated in place.
    """
    if args.ckpt_path is not None:
        cfg.model["from_pretrained"] = args.ckpt_path
        if cfg.get("discriminator") is not None:
            cfg.discriminator["from_pretrained"] = args.ckpt_path
        args.ckpt_path = None
    if args.flash_attn is not None:
        cfg.model["enable_flash_attn"] = args.flash_attn
        # Clear the attribute argparse actually created (dest is "flash_attn").
        args.flash_attn = None
    if args.layernorm_kernel is not None:
        cfg.model["enable_layernorm_kernel"] = args.layernorm_kernel
        # Same as above: the parsed attribute is "layernorm_kernel".
        args.layernorm_kernel = None
    if args.data_path is not None:
        cfg.dataset["data_path"] = args.data_path
        args.data_path = None
    # NOTE: for vae inference (reconstruction)
    if not training and "dataset" in cfg:
        if args.image_size is not None:
            cfg.dataset["image_size"] = args.image_size
        if args.num_frames is not None:
            cfg.dataset["num_frames"] = args.num_frames
    if not training:
        if args.cfg_scale is not None:
            cfg.scheduler["cfg_scale"] = args.cfg_scale
            args.cfg_scale = None
        if args.num_sampling_steps is not None:
            cfg.scheduler["num_sampling_steps"] = args.num_sampling_steps
            args.num_sampling_steps = None

    # Everything still set on args becomes a top-level config entry.
    for k, v in vars(args).items():
        if v is not None:
            cfg[k] = v

    return cfg


def read_config(config_path):
    """Return the result of ``Config.fromfile`` for ``config_path``."""
    return Config.fromfile(config_path)


def parse_configs(training=False):
    """Parse the command line, read the config file it names and merge the overrides into it."""
    cli_args = parse_args(training)
    base_cfg = read_config(cli_args.config)
    return merge_args(base_cfg, cli_args, training)


def define_experiment_workspace(cfg, get_last_workspace=False):
    """
    Compute the name and path of an experiment folder inside ``cfg.outputs``.

    The outputs directory is created if needed. The experiment index is the
    number of entries currently in it (one less when ``get_last_workspace`` is
    set); the experiment folder itself is not created here.

    Args:
        cfg: Config exposing ``outputs`` and ``model["type"]``.
        get_last_workspace: Use the index just below the current entry count.

    Returns:
        A tuple ``(exp_name, exp_dir)``.
    """
    outputs_root = cfg.outputs
    os.makedirs(outputs_root, exist_ok=True)

    existing_entries = glob(f"{outputs_root}/*")
    if get_last_workspace:
        experiment_index = len(existing_entries) - 1
    else:
        experiment_index = len(existing_entries)

    # Slashes in the model type would otherwise introduce extra path levels.
    model_tag = cfg.model["type"].replace("/", "-")
    exp_name = f"{experiment_index:03d}-{model_tag}"
    return exp_name, f"{outputs_root}/{exp_name}"


def save_training_config(cfg, experiment_dir):
    """Dump ``cfg`` as indented JSON into ``<experiment_dir>/config.txt``."""
    config_path = f"{experiment_dir}/config.txt"
    with open(config_path, "w") as out_file:
        json.dump(obj=cfg, fp=out_file, indent=4)


def str2bool(v):
    """
    Convert a command-line token to ``bool``.

    Booleans pass through unchanged. Strings are compared case-insensitively
    against the accepted true/false spellings; anything else raises
    ``argparse.ArgumentTypeError``.
    """
    if isinstance(v, bool):
        return v
    token = v.lower()
    if token in ("yes", "true", "t", "y", "1"):
        return True
    if token in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")


================================================
FILE: Open-Sora/opensora/utils/inference_utils.py
================================================
import json
import os
import re

import torch

from opensora.datasets import IMG_FPS
from opensora.datasets.utils import read_from_path


def prepare_multi_resolution_info(info_type, batch_size, image_size, num_frames, fps, device, dtype):
    """
    Build the per-batch conditioning tensors required by ``info_type``.

    * ``None``: an empty dict.
    * ``"PixArtMS"``: ``ar`` (height / width) and ``hw`` (the image size), each
      repeated ``batch_size`` times along the first dimension.
    * ``"STDiT2"`` / ``"OpenSora"``: 1-D tensors ``height``, ``width``,
      ``num_frames``, ``ar`` and ``fps`` of length ``batch_size``; ``IMG_FPS``
      replaces ``fps`` when ``num_frames`` is not greater than 1.

    Raises:
        NotImplementedError: For any other ``info_type``.
    """
    if info_type is None:
        return dict()

    aspect = image_size[0] / image_size[1] if info_type in ["PixArtMS", "STDiT2", "OpenSora"] else None

    if info_type == "PixArtMS":
        hw = torch.tensor([image_size], device=device, dtype=dtype).repeat(batch_size, 1)
        ar = torch.tensor([[aspect]], device=device, dtype=dtype).repeat(batch_size, 1)
        return dict(ar=ar, hw=hw)

    if info_type in ["STDiT2", "OpenSora"]:

        def _per_sample(value):
            # One scalar copied for every sample of the batch.
            return torch.tensor([value], device=device, dtype=dtype).repeat(batch_size)

        effective_fps = fps if num_frames > 1 else IMG_FPS
        return dict(
            height=_per_sample(image_size[0]),
            width=_per_sample(image_size[1]),
            num_frames=_per_sample(num_frames),
            ar=_per_sample(aspect),
            fps=_per_sample(effective_fps),
        )

    raise NotImplementedError


def load_prompts(prompt_path, start_idx=None, end_idx=None):
    """Read one whitespace-stripped prompt per line and return the ``[start_idx:end_idx]`` slice."""
    with open(prompt_path, "r") as prompt_file:
        stripped_lines = [raw_line.strip() for raw_line in prompt_file]
    return stripped_lines[start_idx:end_idx]


def get_save_path_name(
    save_dir,
    sample_name=None,  # prefix
    sample_idx=None,  # sample index
    prompt=None,  # used prompt
    prompt_as_path=False,  # use prompt as path
    num_sample=1,  # number of samples to generate for one prompt
    k=None,  # kth sample
):
    """
    Compose the extension-less output path for one generated sample.

    The file stem is ``<prefix><suffix>``: the suffix is the prompt itself when
    ``prompt_as_path`` is set, otherwise ``_<sample_idx>`` zero-padded to four
    digits. A missing ``sample_name`` defaults to ``""`` (prompt mode) or
    ``"sample"`` (index mode). ``-<k>`` is appended when ``num_sample`` is not 1.
    """
    if prompt_as_path:
        prefix = "" if sample_name is None else sample_name
        suffix = prompt
    else:
        prefix = "sample" if sample_name is None else sample_name
        suffix = f"_{sample_idx:04d}"

    save_path = os.path.join(save_dir, f"{prefix}{suffix}")
    if num_sample == 1:
        return save_path
    return f"{save_path}-{k}"


def append_score_to_prompts(prompts, aes=None, flow=None, camera_motion=None):
    """
    Append score/camera annotations to every prompt that lacks them.

    For each given value, the matching sentence is added unless the original
    prompt already contains its marker text (``"aesthetic score:"``,
    ``"motion score:"``, ``"camera motion:"``). Returns a new list.
    """

    def _annotate(prompt):
        pieces = [f"{prompt}"]
        # Markers are looked up in the untouched prompt, not in the growing result.
        if aes is not None and "aesthetic score:" not in prompt:
            pieces.append(f" aesthetic score: {aes:.1f}.")
        if flow is not None and "motion score:" not in prompt:
            pieces.append(f" motion score: {flow:.1f}.")
        if camera_motion is not None and "camera motion:" not in prompt:
            pieces.append(f" camera motion: {camera_motion}.")
        return "".join(pieces)

    return [_annotate(prompt) for prompt in prompts]


def extract_json_from_prompts(prompts, reference, mask_strategy):
    """
    Strip a trailing JSON object from each prompt and apply its settings.

    A prompt may end with one ``{...}`` object holding ``reference_path``
    and/or ``mask_strategy``; those values overwrite the entries of
    ``reference`` / ``mask_strategy`` at the prompt's index (both lists are
    modified in place). Returns ``(texts, reference, mask_strategy)`` where
    ``texts`` holds the part of each prompt that precedes the brace.
    """
    ret_prompts = []
    for idx, prompt in enumerate(prompts):
        # Zero-width split: the "{" stays attached to the JSON part.
        pieces = re.split(r"(?=[{])", prompt)
        assert len(pieces) <= 2, f"Invalid prompt: {prompt}"
        ret_prompts.append(pieces[0])
        if len(pieces) <= 1:
            continue
        for key, value in json.loads(pieces[1]).items():
            assert key in ["reference_path", "mask_strategy"], f"Invalid key: {key}"
            if key == "reference_path":
                reference[idx] = value
            elif key == "mask_strategy":
                mask_strategy[idx] = value
    return ret_prompts, reference, mask_strategy


def collect_references_batch(reference_paths, vae, image_size):
    """
    Encode the reference media of every batch entry with ``vae``.

    Each entry of ``reference_paths`` is a ``;``-separated list of paths; an
    empty string yields an empty list. Every path is read with
    ``read_from_path`` (``resize_crop`` transform), encoded as a batch of one
    and stored with that batch dimension squeezed away.
    """

    def _encode_one(path):
        frames = read_from_path(path, image_size, transform_name="resize_crop")
        latent = vae.encode(frames.unsqueeze(0).to(vae.device, vae.dtype))
        return latent.squeeze(0)

    refs_x = []  # refs_x: [batch, ref_num, C, T, H, W]
    for entry in reference_paths:
        if entry == "":
            refs_x.append([])
        else:
            refs_x.append([_encode_one(path) for path in entry.split(";")])
    return refs_x


def extract_prompts_loop(prompts, num_loop):
    """
    Pick, for each prompt, the text that applies to loop iteration ``num_loop``.

    Prompts of the form ``|0|text a|2|text b`` switch text at the given loop
    indices; the last segment is treated as running through ``num_loop``.
    Prompts that do not start with ``|0|`` are returned unchanged.
    """

    def _text_for_loop(prompt):
        if not prompt.startswith("|0|"):
            return prompt
        fields = prompt.split("|")[1:]
        per_loop_text = []
        for pos in range(0, len(fields), 2):
            begin = int(fields[pos])
            segment = fields[pos + 1]
            following = pos + 2
            finish = int(fields[following]) if following < len(fields) else num_loop + 1
            # One copy of the segment for every loop index it covers.
            per_loop_text += [segment] * (finish - begin)
        return per_loop_text[num_loop]

    return [_text_for_loop(prompt) for prompt in prompts]


def split_prompt(prompt_text):
    """
    Split a looped prompt into its texts and loop indices.

    A prompt such as ``|0| a beautiful day |1| a sunny day`` becomes
    ``(["a beautiful day", "a sunny day"], [0, 1])``. Any prompt that does not
    start with ``|0|`` is returned as ``([prompt_text], None)``.
    """
    if not prompt_text.startswith("|0|"):
        return [prompt_text], None

    # Dropping the empty piece before the first "|" leaves index/text pairs.
    fields = prompt_text.split("|")[1:]
    texts = []
    loop_starts = []
    for pos in range(0, len(fields), 2):
        loop_starts.append(int(fields[pos]))
        texts.append(fields[pos + 1].strip())
    return texts, loop_starts


def merge_prompt(text_list, loop_idx_list=None):
    """
    Rebuild a prompt string from texts and their loop indices.

    Without ``loop_idx_list`` the first text is returned as is; otherwise each
    text is emitted as ``|<loop index>|<text>`` and the pieces are concatenated.
    """
    if loop_idx_list is None:
        return text_list[0]
    return "".join(f"|{loop_idx_list[pos]}|{text}" for pos, text in enumerate(text_list))


# Default string values for the six fields of a mask entry; missing trailing
# fields are filled in from here.
MASK_DEFAULT = ["0", "0", "0", "0", "1", "0"]


def parse_mask_strategy(mask_strategy):
    """
    Parse a mask-strategy string into a list of six-field entries.

    Entries are separated by ``;`` and fields by ``,``. Each entry may give
    one to six fields; omitted trailing fields come from ``MASK_DEFAULT``.
    The first five fields are converted to ``int`` and the sixth to ``float``.
    ``None`` or an empty string yields an empty list.
    """
    if mask_strategy is None or mask_strategy == "":
        return []

    parsed = []
    for mask in mask_strategy.split(";"):
        fields = mask.split(",")
        num_group = len(fields)
        assert num_group >= 1 and num_group <= 6, f"Invalid mask strategy: {mask}"
        fields.extend(MASK_DEFAULT[num_group:])
        fields[:5] = [int(field) for field in fields[:5]]
        fields[5] = float(fields[5])
        parsed.append(fields)
    return parsed


def find_nearest_point(value, point, max_value):
    """
    Snap ``value`` to a multiple of ``point``.

    The result is rounded down, except that it is rounded up when the
    remainder exceeds half of ``point`` and the rounded-down multiple index is
    still below ``max_value // point - 1``.
    """
    multiple = value // point
    past_midpoint = value % point > point / 2
    has_room_above = multiple < max_value // point - 1
    if past_midpoint and has_room_above:
        multiple += 1
    return multiple * point


def apply_mask_strategy(z, refs_x, mask_strategys, loop_i, align=None):
    """
    Copy reference frames into ``z`` and build the matching per-frame masks.

    For every batch item, the entries of its mask strategy that belong to
    loop ``loop_i`` select a reference from ``refs_x``, a start position in
    that reference and in ``z`` (negative positions count from the end), a
    length and an edit ratio. The chosen reference frames overwrite ``z`` in
    place and the mask at those positions is set to the edit ratio; all other
    mask positions stay 1. With ``align`` set, both start positions are
    snapped via ``find_nearest_point``.

    Returns:
        The stacked masks, one row per entry of ``mask_strategys``, or ``None``
        when ``mask_strategys`` is empty.
    """
    masks = []
    for batch_idx, strategy_text in enumerate(mask_strategys):
        # z: [B, C, T, H, W]
        target_frames = z.shape[2]
        mask = torch.ones(target_frames, dtype=torch.float, device=z.device)
        for entry in parse_mask_strategy(strategy_text):
            loop_id, m_id, m_ref_start, m_target_start, m_length, edit_ratio = entry
            if loop_id != loop_i:
                continue
            # ref: [C, T, H, W]
            ref = refs_x[batch_idx][m_id]
            ref_frames = ref.shape[1]

            # Negative starts are offsets from the end.
            if m_ref_start < 0:
                m_ref_start = ref_frames + m_ref_start
            if m_target_start < 0:
                m_target_start = target_frames + m_target_start
            if align is not None:
                m_ref_start = find_nearest_point(m_ref_start, align, ref_frames)
                m_target_start = find_nearest_point(m_target_start, align, target_frames)

            # Never run past the end of either tensor.
            m_length = min(m_length, target_frames - m_target_start, ref_frames - m_ref_start)
            target_span = slice(m_target_start, m_target_start + m_length)
            z[batch_idx, :, target_span] = ref[:, m_ref_start : m_ref_start + m_length]
            mask[target_span] = edit_ratio
        masks.append(mask)

    if not masks:
        return None
    return torch.stack(masks)


def append_generated(vae, generated_video, refs_x, mask_strategy, loop_i, condition_frame_length, condition_frame_edit):
    """
    Register the freshly generated video as a new reference for the next loop.

    ``generated_video`` is encoded with ``vae.encode``; item ``j`` of the
    result is appended to ``refs_x[j]`` (a new list is created when that entry
    is ``None``). A mask entry pointing at the newly added reference is then
    appended to ``mask_strategy[j]``, separated by ``;`` when a strategy is
    already present. Both lists are modified in place.

    Returns:
        ``(refs_x, mask_strategy)``.
    """
    ref_x = vae.encode(generated_video)
    for j, refs in enumerate(refs_x):
        if refs is None:
            # Rebind the local as well, so the index computed below sees the new list.
            refs = refs_x[j] = [ref_x[j]]
        else:
            refs.append(ref_x[j])

        new_entry = f"{loop_i},{len(refs)-1},-{condition_frame_length},0,{condition_frame_length},{condition_frame_edit}"
        if mask_strategy[j] is None or mask_strategy[j] == "":
            mask_strategy[j] = new_entry
        else:
            mask_strategy[j] = f"{mask_strategy[j]};{new_entry}"
    return refs_x, mask_strategy


def dframe_to_frame(num):
    """Convert a count that must be a multiple of 5 into ``17`` frames per group of 5."""
    groups, leftover = divmod(num, 5)
    assert leftover == 0, f"Invalid num: {num}"
    return groups * 17


# Shared OpenAI client; created on first use by get_openai_response.
OPENAI_CLIENT = None
# Cached system prompt for prompt refinement; filled in by refine_prompt_by_openai.
REFINE_PROMPTS = None
# Text file with example prompts, read via load_prompts to fill both templates below.
REFINE_PROMPTS_PATH = "assets/texts/t2v_pllava.txt"
# System-prompt template for refining a user prompt; "{}" receives the example prompts.
REFINE_PROMPTS_TEMPLATE = """
You need to refine user's input prompt. The user's input prompt is used for video generation task. You need to refine the user's prompt to make it more suitable for the task. Here are some examples of refined prompts:
{}

The refined prompt should pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. The refined prompt should be in English.
"""
# Cached system prompt for random prompt generation; filled in by get_random_prompt_by_openai.
RANDOM_PROMPTS = None
# System-prompt template for generating a new prompt; "{}" receives the example prompts.
RANDOM_PROMPTS_TEMPLATE = """
You need to generate one input prompt for video generation task. The prompt should be suitable for the task. Here are some examples of refined prompts:
{}

The prompt should pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. The prompt should be in English.
"""


def get_openai_response(sys_prompt, usr_prompt, model="gpt-4o"):
    """
    Send a system + user message pair to the chat completions API.

    The module-level ``OPENAI_CLIENT`` is created on the first call, using the
    ``OPENAI_API_KEY`` environment variable. Returns the text content of the
    first returned choice.
    """
    global OPENAI_CLIENT
    if OPENAI_CLIENT is None:
        # Imported here so the openai package is only needed when this is called.
        from openai import OpenAI

        OPENAI_CLIENT = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    # The system message provides context; the user message is what gets answered.
    system_message = {"role": "system", "content": sys_prompt}
    user_message = {"role": "user", "content": usr_prompt}
    completion = OPENAI_CLIENT.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def get_random_prompt_by_openai():
    """
    Ask the model for one new video-generation prompt.

    The system prompt is built once from ``RANDOM_PROMPTS_TEMPLATE`` and the
    example prompts in ``REFINE_PROMPTS_PATH``, then cached in ``RANDOM_PROMPTS``.
    """
    global RANDOM_PROMPTS
    if RANDOM_PROMPTS is None:
        example_block = "\n".join(load_prompts(REFINE_PROMPTS_PATH))
        RANDOM_PROMPTS = RANDOM_PROMPTS_TEMPLATE.format(example_block)

    return get_openai_response(RANDOM_PROMPTS, "Generate one example.")


def refine_prompt_by_openai(prompt):
    """
    Ask the model to refine ``prompt`` for video generation.

    The system prompt is built once from ``REFINE_PROMPTS_TEMPLATE`` and the
    example prompts in ``REFINE_PROMPTS_PATH``, then cached in ``REFINE_PROMPTS``.
    """
    global REFINE_PROMPTS
    if REFINE_PROMPTS is None:
        example_block = "\n".join(load_prompts(REFINE_PROMPTS_PATH))
        REFINE_PROMPTS = REFINE_PROMPTS_TEMPLATE.format(example_block)

    return get_openai_response(REFINE_PROMPTS, prompt)


def has_openai_key():
    """Tell whether the ``OPENAI_API_KEY`` environment variable is set."""
    return os.getenv("OPENAI_API_KEY") is not None


def refine_prompts_by_openai(prompts):
    """
    Refine every prompt through the OpenAI helpers, best effort.

    Blank prompts are replaced by a randomly generated one; all others are
    refined. If anything fails for a prompt, a warning is printed and the
    original prompt is kept. Returns a new list of the same length.
    """
    results = []
    for prompt in prompts:
        try:
            if not prompt.strip():
                outcome = get_random_prompt_by_openai()
                print(f"[Info] Empty prompt detected, generate random prompt: {outcome}")
            else:
                outcome = refine_prompt_by_openai(prompt)
                print(f"[Info] Refine prompt: {prompt} -> {outcome}")
        except Exception as e:
            # Deliberate fallback: generation must not stop because refinement failed.
            print(f"[Warning] Failed to refine prompt: {prompt} due to {e}")
            outcome = prompt
        results.append(outcome)
    return results


def add_watermark(
    input_video_path, watermark_image_path="./assets/images/watermark/watermark.png", output_video_path=None
):
    """
    Overlay a watermark image on a video by running ``ffmpeg``.

    Args:
        input_video_path: Path of the source video.
        watermark_image_path: Path of the image to overlay.
        output_video_path: Destination path; defaults to the input path with
            ``.mp4`` replaced by ``_watermark.mp4``.

    Returns:
        ``True`` when ffmpeg exits with status 0, ``False`` otherwise
        (including when ffmpeg cannot be started).
    """
    import subprocess

    if output_video_path is None:
        output_video_path = input_video_path.replace(".mp4", "_watermark.mp4")

    # Arguments are passed as a list, without a shell, so paths containing
    # spaces or shell metacharacters (e.g. prompt-derived file names) are
    # handed to ffmpeg verbatim instead of being re-parsed.
    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        input_video_path,
        "-i",
        watermark_image_path,
        "-filter_complex",
        "[1][0]scale2ref=oh*mdar:ih*0.1[logo][video];[video][logo]overlay",
        output_video_path,
    ]
    try:
        completed = subprocess.run(cmd, check=False)
    except OSError:
        # ffmpeg missing or not executable: report failure like a non-zero exit.
        return False
    return completed.returncode == 0


================================================
FILE: Open-Sora/opensora/utils/lr_scheduler.py
================================================
from torch.optim.lr_scheduler import _LRScheduler


class LinearWarmupLR(_LRScheduler):
    """Ramp the learning rate up linearly during warmup, then keep the base learning rate.

    While ``last_epoch < warmup_steps`` each base learning rate is scaled by
    ``(last_epoch + 1) / (warmup_steps + 1)``; afterwards the base learning
    rates are returned unchanged.

    Args:
        optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
        warmup_steps (int, optional): Number of warmup steps, defaults to 0
        last_epoch (int, optional): Index of the last step, forwarded to the
            base scheduler; defaults to -1.
    """

    def __init__(self, optimizer, warmup_steps: int = 0, last_epoch: int = -1):
        # Must be set before the base constructor runs: it already calls get_lr().
        self.warmup_steps = warmup_steps
        super().__init__(optimizer, last_epoch=last_epoch)

    def get_lr(self):
        if self.last_epoch >= self.warmup_steps:
            return self.base_lrs
        warmup_scale = (self.last_epoch + 1) / (self.warmup_steps + 1)
        return [warmup_scale * base_lr for base_lr in self.base_lrs]


================================================
FILE: Open-Sora/opensora/utils/misc.py
================================================
import collections
import importlib
import logging
import os
import time
from collections import OrderedDict
from collections.abc import Sequence
from itertools import repeat
from typing import Optional, Tuple

import numpy as np
import torch
import torch.distributed as dist
from colossalai.cluster.dist_coordinator import DistCoordinator

# ======================================================
# Logging
# ======================================================


def is_distributed():
    """Tell whether the ``WORLD_SIZE`` environment variable is set."""
    return "WORLD_SIZE" in os.environ


def is_main_process():
    """Return True when not running distributed, or when this process has rank 0."""
    if not is_distributed():
        return True
    return dist.get_rank() == 0


def get_world_size():
    """Return the distributed world size, or 1 when not running distributed."""
    return dist.get_world_size() if is_distributed() else 1


def create_logger(logging_dir=None):
    """
    Create a logger that writes to a log file and stdout.

    On the main process, ``logging.basicConfig`` is called at INFO level; when
    ``logging_dir`` is given, a stream handler and a file handler for
    ``<logging_dir>/log.txt`` are installed. On other processes the module
    logger only receives a ``NullHandler``.
    """
    if not is_main_process():
        # dummy logger (does nothing)
        logger = logging.getLogger(__name__)
        logger.addHandler(logging.NullHandler())
        return logger

    # real logger
    config_kwargs = dict(
        level=logging.INFO,
        format="[\033[34m%(asctime)s\033[0m] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    if logging_dir is not None:
        config_kwargs["handlers"] = [
            logging.StreamHandler(),
            logging.FileHandler(f"{logging_dir}/log.txt"),
        ]
    logging.basicConfig(**config_kwargs)
    return logging.getLogger(__name__)


def get_logger():
    """Return the logger registered under this module's name."""
    module_logger = logging.getLogger(__name__)
    return module_logger


def print_rank(var_name, var_value, rank=0):
    """Print ``var_name: var_value`` tagged with ``rank``, but only on the process whose rank matches."""
    if dist.get_rank() != rank:
        return
    print(f"[Rank {rank}] {var_name}: {var_value}")


def print_0(*args, **kwargs):
    """Forward the arguments to ``print`` on rank 0; do nothing on other ranks."""
    if dist.get_rank() != 0:
        return
    print(*args, **kwargs)


def create_tensorboard_writer(exp_dir):
    """Create ``<exp_dir>/tensorboard`` if needed and return a ``SummaryWriter`` for it."""
    # Imported lazily so tensorboard is only required when a writer is requested.
    from torch.utils.tensorboard import SummaryWriter

    log_dir = f"{exp_dir}/tensorboard"
    os.makedirs(log_dir, exist_ok=True)
    return SummaryWriter(log_dir)


# ======================================================
# String
# ======================================================


def format_numel_str(numel: int) -> str:
    """
    Format an element count with a ``B`` / ``M`` / ``K`` suffix.

    The units are powers of 1024 (``1024**3``, ``1024**2``, ``1024``); the
    largest unit not exceeding ``numel`` is used with two decimals. Smaller
    counts are returned as the plain number.
    """
    for exponent, suffix in ((3, "B"), (2, "M"), (1, "K")):
        unit = 1024**exponent
        if numel >= unit:
            return f"{numel / unit:.2f} {suffix}"
    return f"{numel}"


def get_timestamp():
    """Return the current local time formatted as ``YYYYMMDD-HHMMSS``."""
    now_local = time.localtime(time.time())
    return time.strftime("%Y%m%d-%H%M%S", now_local)


def format_time(seconds):
    """
    Render a duration in seconds as a compact string.

    The duration is broken into days, hours, minutes, seconds and
    milliseconds; only the first two non-zero units (largest first) are
    emitted, e.g. ``"1h1m"`` or ``"1s500ms"``. A duration with no non-zero
    unit yields ``"0ms"``.
    """
    days = int(seconds / 3600 / 24)
    seconds = seconds - days * 3600 * 24
    hours = int(seconds / 3600)
    seconds = seconds - hours * 3600
    minutes = int(seconds / 60)
    seconds = seconds - minutes * 60
    whole_seconds = int(seconds)
    seconds = seconds - whole_seconds
    millis = int(seconds * 1000)

    units = (
        (days, "D"),
        (hours, "h"),
        (minutes, "m"),
        (whole_seconds, "s"),
        (millis, "ms"),
    )
    pieces = []
    for amount, suffix in units:
        # Stop adding units once two have been emitted.
        if amount > 0 and len(pieces) < 2:
            pieces.append(str(amount) + suffix)
    return "".join(pieces) if pieces else "0ms"


class BColors:
    """ANSI escape sequences for colouring / styling terminal output.

    Prefix text with one of the codes and append ``ENDC`` to reset.
    """

    # Foreground colours (bright variants).
    HEADER = "\033[95m"
    OKBLUE = "\033[94m"
    OKCYAN = "\033[96m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"
    # Reset all attributes.
    ENDC = "\033[0m"
    # Text attributes.
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"


# ======================================================
# PyTorch
# ======================================================


def requires_grad(model: torch.nn.Module, flag: bool = True) -> None:
    """Switch gradient tracking on or off for every parameter of ``model``.

    Args:
        model: Module whose parameters are updated in place.
        flag: Value assigned to each parameter's ``requires_grad``.
    """
    for param in model.parameters():
        param.requires_grad = flag


def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
    """Average ``tensor`` across all ranks, in place.

    The tensor is sum-reduced over the default process group and then divided
    by the world size; the same (modified) tensor object is returned.
    """
    world_sum_op = dist.ReduceOp.SUM
    dist.all_reduce(tensor=tensor, op=world_sum_op)
    return tensor.div_(dist.get_world_size())


def get_model_numel(model: torch.nn.Module) -> Tuple[int, int]:
    """Count the parameters of ``model``.

    Returns:
        A ``(total, trainable)`` pair, where ``trainable`` only includes
        parameters with ``requires_grad`` set.
    """
    sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    return total, trainable


def count_params(model):
    """Return the number of elements in the trainable parameters of ``model``."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable


def to_tensor(data):
    """Convert a supported Python / numpy object into a :obj:`torch.Tensor`.

    Tensors are returned unchanged, arrays go through ``torch.from_numpy``,
    non-string sequences through ``torch.tensor``, and a bare ``int`` or
    ``float`` becomes a one-element ``LongTensor`` / ``FloatTensor``.

    Args:
        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
            be converted.

    Raises:
        TypeError: If ``data`` is of any other type (including ``str``).
    """
    if isinstance(data, torch.Tensor):
        return data
    if isinstance(data, np.ndarray):
        return torch.from_numpy(data)

    is_plain_sequence = isinstance(data, Sequence) and not isinstance(data, str)
    if is_plain_sequence:
        return torch.tensor(data)
    if isinstance(data, int):
        return torch.LongTensor([data])
    if isinstance(data, float):
        return torch.FloatTensor([data])

    raise TypeError(f"type {type(data)} cannot be converted to tensor.")


def to_ndarray(data):
    """Convert a supported Python / torch object into a :obj:`numpy.ndarray`.

    Args:
        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
            be converted.

    Returns:
        numpy.ndarray: Arrays are returned unchanged; tensors are detached
        and moved to CPU before conversion; sequences go through
        ``np.array``; a bare ``int`` / ``float`` becomes a one-element array.

    Raises:
        TypeError: If ``data`` is of any other type.
    """
    if isinstance(data, torch.Tensor):
        # detach()/cpu() are no-ops for plain CPU tensors, but make the
        # conversion work for tensors that require grad or live on a GPU.
        return data.detach().cpu().numpy()
    elif isinstance(data, np.ndarray):
        return data
    elif isinstance(data, Sequence):
        return np.array(data)
    elif isinstance(data, int):
        # np.array wraps the value; np.ndarray([data]) would instead allocate
        # an uninitialized array of *shape* (data,).
        return np.array([data], dtype=int)
    elif isinstance(data, float):
        return np.array([data], dtype=float)
    else:
        raise TypeError(f"type {type(data)} cannot be converted to ndarray.")


def to_torch_dtype(dtype):
    """Resolve a dtype given as a string alias or ``torch.dtype``.

    Args:
        dtype (torch.dtype | str): Either an actual ``torch.dtype`` (returned
            unchanged) or one of the aliases ``"float64"``, ``"float32"``,
            ``"float16"``, ``"bfloat16"``, ``"fp32"``, ``"fp16"``, ``"half"``
            or ``"bf16"``.

    Returns:
        torch.dtype: The resolved dtype.

    Raises:
        ValueError: If ``dtype`` is an unknown alias or neither a string nor
            a ``torch.dtype``.
    """
    if isinstance(dtype, torch.dtype):
        return dtype
    elif isinstance(dtype, str):
        dtype_mapping = {
            "float64": torch.float64,
            "float32": torch.float32,
            "float16": torch.float16,
            "bfloat16": torch.bfloat16,
            "fp32": torch.float32,
            "fp16": torch.float16,
            "half": torch.float16,
            "bf16": torch.bfloat16,
        }
        if dtype not in dtype_mapping:
            raise ValueError(f"Unsupported dtype string {dtype!r}; expected one of {sorted(dtype_mapping)}")
        return dtype_mapping[dtype]
    else:
        raise ValueError(f"dtype must be a str or torch.dtype, got {type(dtype)}")


def _ntuple(n):
    """Build a converter that repeats a scalar into an ``n``-tuple.

    The returned function passes non-string iterables through unchanged
    (their length is not checked) and turns anything else, including
    strings, into a tuple holding ``n`` references to the value.
    """

    def parse(x):
        already_iterable = isinstance(x, collections.abc.Iterable)
        if already_iterable and not isinstance(x, str):
            return x
        return (x,) * n

    return parse


# Ready-made converters for the common arities, plus the factory itself.
to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)
to_3tuple = _ntuple(3)
to_4tuple = _ntuple(4)
to_ntuple = _ntuple


def convert_SyncBN_to_BN2d(model_cfg):
    """Recursively rewrite ``norm_cfg`` entries of type ``SyncBN`` to ``BN2d``.

    The config is modified in place; every nested ``dict`` value is visited.
    """
    for key in model_cfg:
        value = model_cfg[key]
        is_sync_bn_cfg = key == "norm_cfg" and value["type"] == "SyncBN"
        if is_sync_bn_cfg:
            value["type"] = "BN2d"
            continue
        if isinstance(value, dict):
            convert_SyncBN_to_BN2d(value)


def get_topk(x, dim=4, k=5):
    """Select the entries of ``x`` with the ``k`` largest values in one column.

    ``x`` is first converted with ``to_tensor``. ``dim`` is used as an index
    into the last axis (``x[..., dim]``); the top-k indices along that slice
    are then used to index ``x`` itself.
    """
    x = to_tensor(x)
    _, top_indices = x[..., dim].topk(k)
    return x[top_indices]


def param_sigmoid(x, alpha):
    """Sigmoid with slope parameter: ``1 / (1 + exp(-alpha * x))``."""
    scaled = -alpha * x
    return 1 / (1 + scaled.exp())


def inverse_param_sigmoid(x, alpha, eps=1e-5):
    """Inverse of the ``alpha``-scaled sigmoid: ``log(x / (1 - x)) / alpha``.

    ``x`` is clamped to ``[0, 1]`` and both the numerator and denominator are
    floored at ``eps`` so the logarithm stays finite.
    """
    clamped = x.clamp(min=0, max=1)
    numerator = clamped.clamp(min=eps)
    denominator = (1 - clamped).clamp(min=eps)
    return torch.log(numerator / denominator) / alpha


def inverse_sigmoid(x, eps=1e-5):
    """Compute the logit (inverse sigmoid) of ``x``.

    Args:
        x (Tensor): Input tensor; values are clamped to ``[0, 1]`` first.
        eps (float): Lower bound applied to both ``x`` and ``1 - x`` so the
            logarithm stays finite. Defaults to 1e-5.

    Returns:
        Tensor: ``log(x / (1 - x))`` with the same shape as the input.
    """
    clamped = x.clamp(min=0, max=1)
    numerator = clamped.clamp(min=eps)
    denominator = (1 - clamped).clamp(min=eps)
    return torch.log(numerator / denominator)


# ======================================================
# Python
# ======================================================


def count_columns(df, columns):
    """Tabulate value frequencies for the given columns of a DataFrame.

    Returns:
        OrderedDict: Maps each column name to a dict of
        ``value -> (count, count / len(df))``.
    """
    total_rows = len(df)
    summary = OrderedDict()

    for column in columns:
        counts = df[column].value_counts().to_dict()
        summary[column] = {value: (count, count / total_rows) for value, count in counts.items()}

    return summary


def try_import(name):
    """Import a module by name, returning ``None`` when it cannot be imported.

    Args:
        name (str): Module to import, in absolute or relative terms
            (e.g. either pkg.mod or ..mod).

    Returns:
        ModuleType or None: The imported module, or ``None`` if the import
        raised ``ImportError``.
    """
    try:
        module = importlib.import_module(name)
    except ImportError:
        module = None
    return module


def transpose(x):
    """
    Swap rows and columns of a list of lists.

    Rows are zipped together, so the result is truncated to the shortest row.

    Args:
        x (list[list]): Row-major nested list.
    """
    return [list(column) for column in zip(*x)]


def all_exists(paths):
    """Return ``True`` only if every path in ``paths`` exists (vacuously ``True`` when empty)."""
    for candidate in paths:
        if not os.path.exists(candidate):
            return False
    return True


# ======================================================
# Profile
# ======================================================


class Timer:
    """Context manager that measures the wall-clock duration of a code region.

    When CUDA is available the device is synchronized on entry and exit, so
    asynchronously launched GPU work is included in the measurement. On
    machines without CUDA the synchronization is skipped instead of raising.

    Args:
        name: Label used in the log line.
        log (bool): If True, print the elapsed time on exit.
        coordinator: Optional coordinator; when given, ``block_all()`` is
            called on exit before the end time is taken.
    """

    def __init__(self, name, log=False, coordinator: Optional[DistCoordinator] = None):
        self.name = name
        self.start_time = None
        self.end_time = None
        self.log = log
        self.coordinator = coordinator

    @property
    def elapsed_time(self):
        """Seconds between entering and leaving the ``with`` block."""
        return self.end_time - self.start_time

    @staticmethod
    def _sync_device():
        """Wait for pending CUDA work, if a CUDA device is usable at all."""
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    def __enter__(self):
        self._sync_device()
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.coordinator is not None:
            self.coordinator.block_all()
        self._sync_device()
        self.end_time = time.time()
        if self.log:
            print(f"Elapsed time for {self.name}: {self.elapsed_time:.2f} s")


def get_tensor_memory(tensor, human_readable=True):
    """Return the size in bytes of the elements of ``tensor``.

    The size is ``element_size() * nelement()``. With ``human_readable`` the
    byte count is passed through ``format_numel_str``; otherwise the raw
    integer is returned.
    """
    num_bytes = tensor.element_size() * tensor.nelement()
    return format_numel_str(num_bytes) if human_readable else num_bytes


class FeatureSaver:
    """Buffer items and write them to numbered ``.bin`` files in batches.

    Every ``bin_size`` calls to :meth:`update` the buffered items are written
    with ``torch.save`` to ``<save_dir>/<bin index, 8 digits>.bin`` and the
    buffer is cleared. Items still buffered are only written when
    :meth:`save` is called.

    Args:
        save_dir: Directory the bin files are written to.
        bin_size (int): Number of items per file.
        start_bin (int): Index used for the first file.
    """

    def __init__(self, save_dir, bin_size=10, start_bin=0):
        self.save_dir, self.bin_size, self.bin_cnt = save_dir, bin_size, start_bin

        self.data_list = []
        self.cnt = 0

    def update(self, data):
        """Buffer ``data`` and flush to disk once a bin is full."""
        self.cnt += 1
        self.data_list.append(data)

        bin_is_full = self.cnt % self.bin_size == 0
        if bin_is_full:
            self.save()

    def save(self):
        """Write the current buffer to the next bin file and reset the buffer."""
        file_name = f"{self.bin_cnt:08}.bin"
        target = os.path.join(self.save_dir, file_name)
        torch.save(self.data_list, target)
        get_logger().info("Saved to %s", target)
        self.bin_cnt += 1
        self.data_list = []


================================================
FILE: Open-Sora/opensora/utils/train_utils.py
================================================
import math
import random
from collections import OrderedDict

import torch
import torch.distributed as dist
from colossalai.booster.plugin import LowLevelZeroPlugin

from opensora.acceleration.parallel_states import set_data_parallel_group, set_sequence_parallel_group
from opensora.acceleration.plugin import ZeroSeqParallelPlugin

from .misc import get_logger


def create_colossalai_plugin(plugin, dtype, grad_clip, sp_size, reduce_bucket_size_in_m: int = 20):
    """Build the ColossalAI booster plugin selected by ``plugin``.

    Args:
        plugin (str): ``"zero2"`` or ``"zero2-seq"``.
        dtype: Passed to the plugin as ``precision``.
        grad_clip: Passed to the plugin as ``max_norm``.
        sp_size (int): Sequence-parallel size; must be 1 for ``"zero2"`` and
            greater than 1 for ``"zero2-seq"``.
        reduce_bucket_size_in_m (int): Forwarded to the plugin unchanged.

    Returns:
        The constructed plugin. As a side effect the data-parallel group
        (and, for ``"zero2-seq"``, the sequence-parallel group) is registered.

    Raises:
        ValueError: If ``plugin`` is not one of the supported names.
    """
    if plugin not in ("zero2", "zero2-seq"):
        raise ValueError(f"Unknown plugin {plugin}")

    # Settings shared by both ZeRO stage-2 variants.
    shared_kwargs = dict(
        stage=2,
        precision=dtype,
        initial_scale=2**16,
        max_norm=grad_clip,
        reduce_bucket_size_in_m=reduce_bucket_size_in_m,
    )

    if plugin == "zero2":
        assert sp_size == 1, "Zero2 plugin does not support sequence parallelism"
        zero_plugin = LowLevelZeroPlugin(**shared_kwargs)
        set_data_parallel_group(dist.group.WORLD)
        return zero_plugin

    assert sp_size > 1, "Zero2-seq plugin requires sequence parallelism"
    seq_plugin = ZeroSeqParallelPlugin(sp_size=sp_size, **shared_kwargs)
    set_sequence_parallel_group(seq_plugin.sp_group)
    set_data_parallel_group(seq_plugin.dp_group)
    return seq_plugin


@torch.no_grad()
def update_ema(
    ema_model: torch.nn.Module, model: torch.nn.Module, optimizer=None, decay: float = 0.9999, sharded: bool = True
) -> None:
    """
    Move the EMA weights one step towards the current model weights.

    Each EMA parameter becomes ``decay * ema + (1 - decay) * param``. The
    parameter named ``pos_embed`` and parameters that do not require grad are
    skipped. With ``sharded=True``, parameters that are not float32 are read
    from the optimizer's master copy
    (``optimizer._param_store.working_to_master_param``) rather than from the
    model itself.
    """
    ema_params = dict(ema_model.named_parameters())

    for name, param in model.named_parameters():
        if name == "pos_embed" or not param.requires_grad:
            continue

        use_master_copy = sharded and param.data.dtype != torch.float32
        if use_master_copy:
            source = optimizer._param_store.working_to_master_param[id(param)].data
        else:
            source = param.data
        ema_params[name].mul_(decay).add_(source, alpha=1 - decay)


class MaskGenerator:
    """Randomly sample per-frame boolean masks according to configured ratios.

    Args:
        mask_ratios (dict): Maps a mask name to the probability of drawing it.
            Every ratio must lie in ``[0, 1]``. If ``"identity"`` is absent it
            is added with the remaining probability mass, and the ratios must
            then sum to 1. The caller's dict is copied, not modified.

    A mask is a 1-D bool tensor with one entry per frame. It starts as all
    ``True`` and the frames picked by the drawn strategy are set to ``False``
    (presumably the frames kept as conditioning -- confirm against the caller).
    """

    def __init__(self, mask_ratios):
        valid_mask_names = [
            "identity",
            "quarter_random",
            "quarter_head",
            "quarter_tail",
            "quarter_head_tail",
            "image_random",
            "image_head",
            "image_tail",
            "image_head_tail",
            "random",
            "intepolate",
        ]
        # Work on a copy so that adding the implicit "identity" entry below
        # does not leak into the caller's configuration object.
        mask_ratios = dict(mask_ratios)
        assert all(
            mask_name in valid_mask_names for mask_name in mask_ratios.keys()
        ), f"mask_name should be one of {valid_mask_names}, got {mask_ratios.keys()}"
        assert all(
            mask_ratio >= 0 for mask_ratio in mask_ratios.values()
        ), f"mask_ratio should be greater than or equal to 0, got {mask_ratios.values()}"
        assert all(
            mask_ratio <= 1 for mask_ratio in mask_ratios.values()
        ), f"mask_ratio should be less than or equal to 1, got {mask_ratios.values()}"
        # sum of mask_ratios should be 1
        if "identity" not in mask_ratios:
            mask_ratios["identity"] = 1.0 - sum(mask_ratios.values())
        assert math.isclose(
            sum(mask_ratios.values()), 1.0, abs_tol=1e-6
        ), f"sum of mask_ratios should be 1, got {sum(mask_ratios.values())}"
        get_logger().info("mask ratios: %s", mask_ratios)
        self.mask_ratios = mask_ratios

    def get_mask(self, x):
        """Draw one mask for ``x``.

        Args:
            x (Tensor): Input whose size along dim 2 is the number of frames.

        Returns:
            Tensor: Bool tensor of shape ``(num_frames,)`` on ``x.device``.
            Inputs with at most one frame always get an all-``True`` mask.
        """
        # Pick a strategy by walking the cumulative distribution.
        mask_type = random.random()
        mask_name = None
        prob_acc = 0.0
        for mask, mask_ratio in self.mask_ratios.items():
            prob_acc += mask_ratio
            if mask_type < prob_acc:
                mask_name = mask
                break

        num_frames = x.shape[2]
        # Hardcoded condition_frames: at most a quarter of the clip, but never
        # less than one frame -- otherwise random.randint(1, 0) raises for
        # clips with 2 or 3 frames.
        condition_frames_max = max(1, num_frames // 4)

        mask = torch.ones(num_frames, dtype=torch.bool, device=x.device)
        if num_frames <= 1:
            return mask

        if mask_name == "quarter_random":
            random_size = random.randint(1, condition_frames_max)
            random_pos = random.randint(0, num_frames - random_size)
            mask[random_pos : random_pos + random_size] = 0
        elif mask_name == "image_random":
            random_size = 1
            random_pos = random.randint(0, num_frames - random_size)
            mask[random_pos : random_pos + random_size] = 0
        elif mask_name == "quarter_head":
            random_size = random.randint(1, condition_frames_max)
            mask[:random_size] = 0
        elif mask_name == "image_head":
            random_size = 1
            mask[:random_size] = 0
        elif mask_name == "quarter_tail":
            random_size = random.randint(1, condition_frames_max)
            mask[-random_size:] = 0
        elif mask_name == "image_tail":
            random_size = 1
            mask[-random_size:] = 0
        elif mask_name == "quarter_head_tail":
            random_size = random.randint(1, condition_frames_max)
            mask[:random_size] = 0
            mask[-random_size:] = 0
        elif mask_name == "image_head_tail":
            random_size = 1
            mask[:random_size] = 0
            mask[-random_size:] = 0
        elif mask_name == "intepolate":
            # Clear every other frame, starting at frame 0 or 1.
            random_start = random.randint(0, 1)
            mask[random_start::2] = 0
        elif mask_name == "random":
            mask_ratio = random.uniform(0.1, 0.9)
            mask = torch.rand(num_frames, device=x.device) > mask_ratio
            # if mask is all False, set the last frame to True
            if not mask.any():
                mask[-1] = 1

        # "identity" (or no match due to float rounding) leaves the mask all True.
        return mask

    def get_masks(self, x):
        """Draw an independent mask for each of the ``len(x)`` samples.

        Returns:
            Tensor: Bool tensor of shape ``(len(x), num_frames)``.
        """
        masks = [self.get_mask(x) for _ in range(len(x))]
        return torch.stack(masks, dim=0)


================================================
FILE: Open-Sora/opensora.egg-info/PKG-INFO
================================================
Metadata-Version: 2.1
Name: opensora
Version: 1.2.0
Summary: Democratizing Efficient Video Production for All
Home-page: https://github.com/hpcaitech/Open-Sora
License: Apache Software License 2.0
Project-URL: Bug Tracker, https://github.com/hpcaitech/Open-Sora/issues
Project-URL: Examples, https://hpcaitech.github.io/Open-Sora/
Project-URL: Documentation, https://github.com/hpcaitech/Open-Sora?tab=readme-ov-file
Project-URL: Github, https://github.com/hpcaitech/Open-Sora
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Environment :: GPU :: NVIDIA CUDA
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: System :: Distributed Computing
Requires-Python: >=3.6
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: colossalai>=0.4.0
Requires-Dist: mmengine>=0.10.3
Requires-Dist: pandas>=2.0.3
Requires-Dist: timm==0.9.16
Requires-Dist: rotary_embedding_torch==0.5.3
Requires-Dist: ftfy>=6.2.0
Requires-Dist: diffusers==0.27.2
Requires-Dist: accelerate==0.29.2
Requires-Dist: av>=12.0.0
Requires-Dist: numpy<2.0.0
Requires-Dist: gradio>=4.26.0
Requires-Dist: spaces>=0.28.3
Requires-Dist: ipykernel>=6.29.4
Requires-Dist: ipywidgets>=8.1.2
Requires-Dist: wandb>=0.17.0
Requires-Dist: tensorboard>=2.14.0
Requires-Dist: pandarallel>=1.6.5
Requires-Dist: pyarrow>=16.1.0
Requires-Dist: pre-commit>=3.5.0
Requires-Dist: openai
Provides-Extra: data
Requires-Dist: gdown>=5.2.0; extra == "data"
Requires-Dist: ninja>=1.11.1.1; extra == "data"
Requires-Dist: shortuuid>=1.0.13; extra == "data"
Requires-Dist: markdown2[all]; extra == "data"
Requires-Dist: scikit-learn>=1.4.2; extra == "data"
Requires-Dist: einops-exts>=0.0.4; extra == "data"
Requires-Dist: decord==0.6.0; extra == "data"
Requires-Dist: ptvsd==4.3.2; extra == "data"
Requires-Dist: imageio-ffmpeg>=0.4.9; extra == "data"
Requires-Dist: ffmpeg-python==0.2.0; extra == "data"
Requires-Dist: lingua-language-detector==2.0.2; extra == "data"
Requires-Dist: imageio>=2.34.1; extra == "data"
Requires-Dist: setuptools==68.2.2; extra == "data"
Requires-Dist: clip@ git+https://github.com/openai/CLIP.git ; extra == "data"
Requires-Dist: mmcv==2.1.0; extra == "data"
Requires-Dist: mmdet==3.1.0; extra == "data"
Requires-Dist: mmocr==1.0.1; extra == "data"
Requires-Dist: detectron2@ git+https://github.com/facebookresearch/detectron2.git@ff53992 ; extra == "data"
Provides-Extra: eval
Requires-Dist: detectron2@ git+https://github.com/facebookresearch/detectron2.git@ff53992 ; extra == "eval"
Requires-Dist: imageio>=2.34.1; extra == "eval"
Requires-Dist: pyiqa==0.1.10; extra == "eval"
Requires-Dist: scikit-learn>=1.4.2; extra == "eval"
Requires-Dist: scikit-image>=0.20.0; extra == "eval"
Requires-Dist: lvis==0.5.3; extra == "eval"
Requires-Dist: boto3>=1.34.113; extra == "eval"
Requires-Dist: easydict>=1.9; extra == "eval"
Requires-Dist: fairscale>=0.4.13; extra == "eval"
Requires-Dist: decord==0.6.0; extra == "eval"
Requires-Dist: pytorchvideo==0.1.5; extra == "eval"
Requires-Dist: lpips==0.1.4; extra == "eval"
Provides-Extra: vae
Requires-Dist: beartype==0.18.5; extra == "vae"
Requires-Dist: einops==0.8.0; extra == "vae"
Requires-Dist: einops-exts==0.0.4; extra == "vae"
Requires-Dist: opencv-python==4.9.0.80; extra == "vae"
Requires-Dist: pillow==10.3.0; extra == "vae"
Provides-Extra: full
Requires-Dist: gdown>=5.2.0; extra == "full"
Requires-Dist: ninja>=1.11.1.1; extra == "full"
Requires-Dist: shortuuid>=1.0.13; extra == "full"
Requires-Dist: markdown2[all]; extra == "full"
Requires-Dist: scikit-learn>=1.4.2; extra == "full"
Requires-Dist: einops-exts>=0.0.4; extra == "full"
Requires-Dist: decord==0.6.0; extra == "full"
Requires-Dist: ptvsd==4.3.2; extra == "full"
Requires-Dist: imageio-ffmpeg>=0.4.9; extra == "full"
Requires-Dist: ffmpeg-python==0.2.0; extra == "full"
Requires-Dist: lingua-language-detector==2.0.2; extra == "full"
Requires-Dist: imageio>=2.34.1; extra == "full"
Requires-Dist: setuptools==68.2.2; extra == "full"
Requires-Dist: clip@ git+https://github.com/openai/CLIP.git ; extra == "full"
Requires-Dist: mmcv==2.1.0; extra == "full"
Requires-Dist: mmdet==3.1.0; extra == "full"
Requires-Dist: mmocr==1.0.1; extra == "full"
Requires-Dist: detectron2@ git+https://github.com/facebookresearch/detectron2.git@ff53992 ; extra == "full"
Requires-Dist: detectron2@ git+https://github.com/facebookresearch/detectron2.git@ff53992 ; extra == "full"
Requires-Dist: imageio>=2.34.1; extra == "full"
Requires-Dist: pyiqa==0.1.10; extra == "full"
Requires-Dist: scikit-learn>=1.4.2; extra == "full"
Requires-Dist: scikit-image>=0.20.0; extra == "full"
Requires-Dist: lvis==0.5.3; extra == "full"
Requires-Dist: boto3>=1.34.113; extra == "full"
Requires-Dist: easydict>=1.9; extra == "full"
Requires-Dist: fairscale>=0.4.13; extra == "full"
Requires-Dist: decord==0.6.0; extra == "full"
Requires-Dist: pytorchvideo==0.1.5; extra == "full"
Requires-Dist: lpips==0.1.4; extra == "full"

<p align="center">
    <img src="./assets/readme/icon.png" width="250"/>
</p>
<div align="center">
    <a href="https://github.com/hpcaitech/Open-Sora/stargazers"><img src="https://img.shields.io/github/stars/hpcaitech/Open-Sora?style=social"></a>
    <a href="https://hpcaitech.github.io/Open-Sora/"><img src="https://img.shields.io/badge/Gallery-View-orange?logo=&amp"></a>
    <a href="https://discord.gg/kZakZzrSUT"><img src="https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&amp"></a>
    <a href="https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-247ipg9fk-KRRYmUl~u2ll2637WRURVA"><img src="https://img.shields.io/badge/Slack-ColossalAI-blueviolet?logo=slack&amp"></a>
    <a href="https://twitter.com/yangyou1991/status/1769411544083996787?s=61&t=jT0Dsx2d-MS5vS9rNM5e5g"><img src="https://img.shields.io/badge/Twitter-Discuss-blue?logo=twitter&amp"></a>
    <a href="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png"><img src="https://img.shields.io/badge/微信-小助手加群-green?logo=wechat&amp"></a>
    <a href="https://hpc-ai.com/blog/open-sora-v1.0"><img src="https://img.shields.io/badge/Open_Sora-Blog-blue"></a>
    <a href="https://huggingface.co/spaces/hpcai-tech/open-sora"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Gradio Demo-blue"></a>
</div>

## Open-Sora: Democratizing Efficient Video Production for All

We design and implement **Open-Sora**, an initiative dedicated to **efficiently** producing high-quality video. We hope to make the model,
tools and all details accessible to all. By embracing **open-source** principles,
Open-Sora not only democratizes access to advanced video generation techniques, but also offers a
streamlined and user-friendly platform that simplifies the complexities of video generation.
With Open-Sora, our goal is to foster innovation, creativity, and inclusivity within the field of content creation.

[[中文文档](/docs/zh_CN/README.md)] [[潞晨云](https://cloud.luchentech.com/)|[OpenSora镜像](https://cloud.luchentech.com/doc/docs/image/open-sora/)|[视频教程](https://www.bilibili.com/video/BV1ow4m1e7PX/?vd_source=c6b752764cd36ff0e535a768e35d98d2)]

## 📰 News

- **[2024.06.17]** 🔥 We released **Open-Sora 1.2**, which includes **3D-VAE**, **rectified flow**, and **score condition**. The video quality is greatly improved. [[checkpoints]](#open-sora-10-model-weights) [[report]](/docs/report_03.md)   [[blog]](https://hpc-ai.com/blog/open-sora-from-hpc-ai-tech-team-continues-open-source-generate-any-16-second-720p-hd-video-with-one-click-model-weights-ready-to-use)
- **[2024.04.25]** 🤗 We released the [Gradio demo for Open-Sora](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face Spaces.
- **[2024.04.25]** We released **Open-Sora 1.1**, which supports **2s~15s, 144p to 720p, any aspect ratio** text-to-image, **text-to-video, image-to-video, video-to-video, infinite time** generation. In addition, a full video processing pipeline is released. [[checkpoints]](#model-weights) [[report]](/docs/report_02.md)
- **[2024.03.18]** We released **Open-Sora 1.0**, a fully open-source project for video generation.
  Open-Sora 1.0 supports a full pipeline of video data preprocessing, training with
  <a href="https://github.com/hpcaitech/ColossalAI"><img src="assets/readme/colossal_ai.png" width="8%" ></a>
  acceleration,
  inference, and more. Our model can produce 2s 512x512 videos with only 3 days training. [[checkpoints]](#open-sora-10-model-weights)
  [[blog]](https://hpc-ai.com/blog/open-sora-v1.0) [[report]](/docs/report_01.md)
- **[2024.03.04]** Open-Sora provides training with 46% cost reduction.
  [[blog]](https://hpc-ai.com/blog/open-sora)

## 🎥 Latest Demo

🔥 You can experience Open-Sora on our [🤗 Gradio application on Hugging Face](https://huggingface.co/spaces/hpcai-tech/open-sora). More samples and corresponding prompts are available in our [Gallery](https://hpcaitech.github.io/Open-Sora/).

| **4s 720×1280**                                                                                                                                      | **4s 720×1280**                                                                                                                                      | **4s 720×1280**                                                                                                                                      |
| ---------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="assets/demo/v1.2/sample_0013.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/7895aab6-ed23-488c-8486-091480c26327) | [<img src="assets/demo/v1.2/sample_1718.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/20f07c7b-182b-4562-bbee-f1df74c86c9a) | [<img src="assets/demo/v1.2/sample_0087.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/3d897e0d-dc21-453a-b911-b3bda838acc2) |
| [<img src="assets/demo/v1.2/sample_0052.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/644bf938-96ce-44aa-b797-b3c0b513d64c) | [<img src="assets/demo/v1.2/sample_1719.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/272d88ac-4b4a-484d-a665-8d07431671d0) | [<img src="assets/demo/v1.2/sample_0002.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/ebbac621-c34e-4bb4-9543-1c34f8989764) |
| [<img src="assets/demo/v1.2/sample_0011.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/a1e3a1a3-4abd-45f5-8df2-6cced69da4ca) | [<img src="assets/demo/v1.2/sample_0004.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/d6ce9c13-28e1-4dff-9644-cc01f5f11926) | [<img src="assets/demo/v1.2/sample_0061.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/561978f8-f1b0-4f4d-ae7b-45bec9001b4a) |

<details>
<summary>OpenSora 1.1 Demo</summary>

| **2s 240×426**                                                                                                                                              | **2s 240×426**                                                                                                                                             |
| ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="assets/demo/sample_16x240x426_9.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [<img src="assets/demo/sora_16x240x426_26.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) |
| [<img src="assets/demo/sora_16x240x426_27.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/f7ce4aaa-528f-40a8-be7a-72e61eaacbbd)  | [<img src="assets/demo/sora_16x240x426_40.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/5d58d71e-1fda-4d90-9ad3-5f2f7b75c6a9) |

| **2s 426×240**                                                                                                                                             | **4s 480×854**                                                                                                                                              |
| ---------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="assets/demo/sora_16x426x240_24.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [<img src="assets/demo/sample_32x480x854_9.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) |

| **16s 320×320**                                                                                                                                        | **16s 224×448**                                                                                                                                        | **2s 426×240**                                                                                                                                            |
| ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="assets/demo/sample_16s_320x320.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/3cab536e-9b43-4b33-8da8-a0f9cf842ff2) | [<img src="assets/demo/sample_16s_224x448.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/9fb0b9e0-c6f4-4935-b29e-4cac10b373c4) | [<img src="assets/demo/sora_16x426x240_3.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) |

</details>

<details>
<summary>OpenSora 1.0 Demo</summary>

| **2s 512×512**                                                                                                                                                                 | **2s 512×512**                                                                                                                                                              | **2s 512×512**                                                                                                                                    |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="assets/readme/sample_0.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/de1963d3-b43b-4e68-a670-bb821ebb6f80)                                 | [<img src="assets/readme/sample_1.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/13f8338f-3d42-4b71-8142-d234fbd746cc)                              | [<img src="assets/readme/sample_2.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/fa6a65a6-e32a-4d64-9a9e-eabb0ebb8c16)    |
| A serene night scene in a forested area. [...] The video is a time-lapse, capturing the transition from day to night, with the lake and forest serving as a constant backdrop. | A soaring drone footage captures the majestic beauty of a coastal cliff, [...] The water gently laps at the rock base and the greenery that clings to the top of the cliff. | The majestic beauty of a waterfall cascading down a cliff into a serene lake. [...] The camera angle provides a bird's eye view of the waterfall. |
| [<img src="assets/readme/sample_3.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/64232f84-1b36-4750-a6c0-3e610fa9aa94)                                 | [<img src="assets/readme/sample_4.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/983a1965-a374-41a7-a76b-c07941a6c1e9)                              | [<img src="assets/readme/sample_5.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/ec10c879-9767-4c31-865f-2e8d6cf11e65)    |
| A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. [...]                                                           | The vibrant beauty of a sunflower field. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. [...]                                            | A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell [...]                   |

Videos are downsampled to `.gif` for display. Click for original videos. Prompts are trimmed for display,
see [here](/assets/texts/t2v_samples.txt) for full prompts.

</details>

## 🔆 New Features/Updates

- 📍 **Open-Sora 1.2** released. Model weights are available [here](#model-weights). See our **[report 1.2](/docs/report_03.md)** for more details.
- ✅ Support rectified flow scheduling.
- ✅ Support more conditioning including fps, aesthetic score, motion strength and camera motion.
- ✅ Trained our 3D-VAE for temporal dimension compression.
- 📍 **Open-Sora 1.1** released. Model weights are available [here](#model-weights). It is trained on **0s~15s, 144p to 720p, various aspect ratios** videos. See our **[report 1.1](/docs/report_02.md)** for more discussions.
- 🔧 **Data processing pipeline v1.1** is released. An automatic [processing pipeline](#data-processing) from raw videos to (text, video clip) pairs is provided, including scene cutting $\rightarrow$ filtering(aesthetic, optical flow, OCR, etc.) $\rightarrow$ captioning $\rightarrow$ managing. With this tool, you can easily build your video dataset.

<details>
<summary>View more</summary>

- ✅ Improved ST-DiT architecture includes rope positional encoding, qk norm, longer text length, etc.
- ✅ Support training with any resolution, aspect ratio, and duration (including images).
- ✅ Support image and video conditioning and video editing, and thus support animating images, connecting videos, etc.
- 📍 **Open-Sora 1.0** released. Model weights are available [here](#model-weights). With only 400K video clips and 200 H800
  days (compared with 152M samples in Stable Video Diffusion), we are able to generate 2s 512×512 videos. See our **[report 1.0](docs/report_01.md)** for more discussions.
- ✅ Three-stage training from an image diffusion model to a video diffusion model. We provide the weights for each
  stage.
- ✅ Support training acceleration including accelerated transformer, faster T5 and VAE, and sequence parallelism.
  Open-Sora improves training speed by **55%** when training on 64x512x512 videos. Details are located
  at [acceleration.md](docs/acceleration.md).
- 🔧 **Data preprocessing pipeline v1.0**,
  including [downloading](tools/datasets/README.md), [video cutting](tools/scene_cut/README.md),
  and [captioning](tools/caption/README.md) tools. Our data collection plan can be found
  at [datasets.md](docs/datasets.md).
- ✅ We find VQ-VAE from [VideoGPT](https://wilson1yan.github.io/videogpt/index.html) has a low quality and thus adopt a
  better VAE from [Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original). We also find patching in
  the time dimension deteriorates the quality. See our **[report](docs/report_01.md)** for more discussions.
- ✅ We investigate different architectures including DiT, Latte, and our proposed STDiT. Our **STDiT** achieves a better
  trade-off between quality and speed. See our **[report](docs/report_01.md)** for more discussions.
- ✅ Support clip and T5 text conditioning.
- ✅ By viewing images as one-frame videos, our project supports training DiT on both images and videos (e.g., ImageNet &
  UCF101). See [commands.md](docs/commands.md) for more instructions.
- ✅ Support inference with official weights
  from [DiT](https://github.com/facebookresearch/DiT), [Latte](https://github.com/Vchitect/Latte),
  and [PixArt](https://pixart-alpha.github.io/).
- ✅ Refactor the codebase. See [structure.md](docs/structure.md) to learn the project structure and how to use the
  config files.

</details>

### TODO list sorted by priority

<details>
<summary>View more</summary>

- [x] Training Video-VAE and adapt our model to new VAE.
- [x] Scaling model parameters and dataset size.
- [x] Incorporate a better scheduler (rectified flow).
- [x] Evaluation pipeline.
- [x] Complete the data processing pipeline (including dense optical flow, aesthetics scores, text-image similarity, etc.). See [the dataset](/docs/datasets.md) for more information
- [x] Support image and video conditioning.
- [x] Support variable aspect ratios, resolutions, durations.

</details>

## Contents

- [Installation](#installation)
- [Model Weights](#model-weights)
- [Gradio Demo](#gradio-demo)
- [Inference](#inference)
- [Data Processing](#data-processing)
- [Training](#training)
- [Evaluation](#evaluation)
- [VAE Training & Evaluation](#vae-training--evaluation)
- [Contribution](#contribution)
- [Citation](#citation)
- [Acknowledgement](#acknowledgement)

Other useful documents and links are listed below.

- Report: each version is trained from an image base separately (not continuously trained), while a newer version will incorporate the techniques from the previous version.
  - [report 1.2](docs/report_03.md): rectified flow, 3d-VAE, score condition, evaluation, etc.
  - [report 1.1](docs/report_02.md): multi-resolution/length/aspect-ratio, image/video conditioning/editing, data preprocessing, etc.
  - [report 1.0](docs/report_01.md): architecture, captioning, etc.
  - [acceleration.md](docs/acceleration.md)
- Repo structure: [structure.md](docs/structure.md)
- Config file explanation: [config.md](docs/config.md)
- Useful commands: [commands.md](docs/commands.md)
- Data processing pipeline and dataset: [datasets.md](docs/datasets.md)
- Each data processing tool's README: [dataset conventions and management](/tools/datasets/README.md), [scene cutting](/tools/scene_cut/README.md), [scoring](/tools/scoring/README.md), [caption](/tools/caption/README.md)
- Evaluation: [eval/README.md](/eval/README.md)
- Gallery: [gallery](https://hpcaitech.github.io/Open-Sora/)

## Installation

### Install from Source

For CUDA 12.1, you can install the dependencies with the following commands. Otherwise, please refer to [Installation Documentation](docs/installation.md) for more instructions on different CUDA versions and additional dependencies for data preprocessing, VAE, and model evaluation.

```bash
# create a virtual env and activate (conda as an example)
conda create -n opensora python=3.9
conda activate opensora

# download the repo
git clone https://github.com/hpcaitech/Open-Sora
cd Open-Sora

# install torch, torchvision and xformers
pip install -r requirements/requirements-cu121.txt

# the default installation is for inference only
pip install -v . # for development mode, `pip install -v -e .`
```

(Optional, recommended for fast speed, especially for training) To enable `layernorm_kernel` and `flash_attn`, you need to install `apex` and `flash-attn` with the following commands.

```bash
# install flash attention
# set enable_flash_attn=False in config to disable flash attention
pip install packaging ninja
pip install flash-attn --no-build-isolation

# install apex
# set enable_layernorm_kernel=False in config to disable apex
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
```

### Use Docker

Run the following command to build a docker image from the provided Dockerfile.

```bash
docker build -t opensora .
```

Run the following command to start the docker container in interactive mode.

```bash
docker run -ti --gpus all -v .:/workspace/Open-Sora opensora
```

## Model Weights

### Open-Sora 1.2 Model Weights

| Model     | Model Size | Data | #iterations | Batch Size | URL                                                           |
| --------- | ---------- | ---- | ----------- | ---------- | ------------------------------------------------------------- |
| Diffusion | 1.1B       | 30M  | 70k         | Dynamic    | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v3) |
| VAE       | 384M       | 3M   | 1M          | 8          | [:link:](https://huggingface.co/hpcai-tech/OpenSora-VAE-v1.2) |

See our **[report 1.2](docs/report_03.md)** for more information. Weights will be automatically downloaded when you run the inference script.

> For users from mainland China, try `export HF_ENDPOINT=https://hf-mirror.com` to successfully download the weights.

### Open-Sora 1.1 Model Weights

<details>
<summary>View more</summary>

| Resolution         | Model Size | Data                       | #iterations | Batch Size                                        | URL                                                                  |
| ------------------ | ---------- | -------------------------- | ----------- | ------------------------------------------------- | -------------------------------------------------------------------- |
| mainly 144p & 240p | 700M       | 10M videos + 2M images     | 100k        | [dynamic](/configs/opensora-v1-1/train/stage2.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage2) |
| 144p to 720p       | 700M       | 500K HQ videos + 1M images | 4k          | [dynamic](/configs/opensora-v1-1/train/stage3.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage3) |

See our **[report 1.1](docs/report_02.md)** for more information.

:warning: **LIMITATION**: This version contains known issues which we are going to fix in the next version (as we save computation resource for the next release). In addition, the video generation may fail for long duration, and high resolution will have noisy results due to this problem.

</details>

### Open-Sora 1.0 Model Weights

<details>
<summary>View more</summary>

| Resolution | Model Size | Data   | #iterations | Batch Size | GPU days (H800) | URL                                                                                           |
| ---------- | ---------- | ------ | ----------- | ---------- | --------------- | --------------------------------------------------------------------------------------------- |
| 16×512×512 | 700M       | 20K HQ | 20k         | 2×64       | 35              | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x512x512.pth) |
| 16×256×256 | 700M       | 20K HQ | 24k         | 8×64       | 45              | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x256x256.pth) |
| 16×256×256 | 700M       | 366K   | 80k         | 8×64       | 117             | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-16x256x256.pth)    |

Training orders: 16x256x256 $\rightarrow$ 16x256x256 HQ $\rightarrow$ 16x512x512 HQ.

Our model's weight is partially initialized from [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha). The number of
parameters is 724M. More information about training can be found in our **[report](/docs/report_01.md)**. More about
the dataset can be found in [datasets.md](/docs/datasets.md). HQ means high quality.

:warning: **LIMITATION**: Our model is trained on a limited budget. The quality and text alignment is relatively poor.
The model performs badly, especially on generating human beings and cannot follow detailed instructions. We are working
on improving the quality and text alignment.

</details>

## Gradio Demo

🔥 You can experience Open-Sora on our [🤗 Gradio application](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face online.

### Local Deployment

If you want to deploy Gradio locally, we also provide a [Gradio application](./gradio) in this repository. You can use the following command to start an interactive web application to experience video generation with Open-Sora.

```bash
pip install gradio spaces
python gradio/app.py
```

This will launch a Gradio application on your localhost. If you want to know more about the Gradio application, you can refer to the [Gradio README](./gradio/README.md).

To enable prompt enhancement and other language input (e.g., 中文输入), you need to set the `OPENAI_API_KEY` in the environment. Check [OpenAI's documentation](https://platform.openai.com/docs/quickstart) to get your API key.

```bash
export OPENAI_API_KEY=YOUR_API_KEY
```

### Getting Started

In the Gradio application, the basic options are as follows:

![Gradio Demo](assets/readme/gradio_basic.png)

The easiest way to generate a video is to input a text prompt and click the "**Generate video**" button (scroll down if you cannot find it). The generated video will be displayed in the right panel. Checking the "**Enhance prompt with GPT4o**" option will use GPT-4o to refine the prompt, while the "**Random Prompt**" button will generate a random prompt by GPT-4o for you. Due to OpenAI's API limit, the prompt refinement result has some randomness.

Then, you can choose the **resolution**, **duration**, and **aspect ratio** of the generated video. Different resolutions and video lengths will affect the video generation speed. On an 80G H100 GPU, the generation speed (with `num_sampling_step=30`) and peak memory usage are:

|      | Image   | 2s       | 4s        | 8s        | 16s       |
| ---- | ------- | -------- | --------- | --------- | --------- |
| 360p | 3s, 24G | 18s, 27G | 31s, 27G  | 62s, 28G  | 121s, 33G |
| 480p | 2s, 24G | 29s, 31G | 55s, 30G  | 108s, 32G | 219s, 36G |
| 720p | 6s, 27G | 68s, 41G | 130s, 39G | 260s, 45G | 547s, 67G |

Note that besides text to video, you can also use **image to video generation**. You can upload an image and then click the "**Generate video**" button to generate a video with the image as the first frame. Or you can fill in the text prompt and click the "**Generate image**" button to generate an image with the text prompt, and then click the "**Generate video**" button to generate a video with the image generated with the same model.

![Gradio Demo](assets/readme/gradio_option.png)

Then you can specify more options, including "**Motion Strength**", "**Aesthetic**" and "**Camera Motion**". If "Enable" is not checked or the choice is "none", the information is not passed to the model. Otherwise, the model will generate videos with the specified motion strength, aesthetic score, and camera motion.

For the **aesthetic score**, we recommend using values higher than 6. For **motion strength**, a smaller value will lead to a smoother but less dynamic video, while a larger value will lead to a more dynamic but likely more blurry video. Thus, you can try without it and then adjust it according to the generated video. For the **camera motion**, sometimes the model cannot follow the instruction well, and we are working on improving it.

You can also adjust the "**Sampling steps**"; this is directly related to the generation speed as it is the number of denoising steps. A number smaller than 30 usually leads to poor generation results, while a number larger than 100 usually brings no significant improvement. The "**Seed**" is used for reproducibility; you can set it to a fixed number to generate the same video. The "**CFG Scale**" controls how much the model follows the text prompt: a smaller value will lead to a more random video, while a larger value will lead to a more text-following video (7 is recommended).

For more advanced usage, you can refer to [Gradio README](./gradio/README.md#advanced-usage).

## Inference

### Open-Sora 1.2 Command Line Inference

The basic command line inference is as follows:

```bash
# text to video
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
  --num-frames 4s --resolution 720p --aspect-ratio 9:16 \
  --prompt "a beautiful waterfall"
```

You can add more options to the command line to customize the generation.

```bash
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
  --num-frames 4s --resolution 720p --aspect-ratio 9:16 \
  --num-sampling-steps 30 --flow 5 --aes 6.5 \
  --prompt "a beautiful waterfall"
```

For image to video generation and other functionalities, the API is compatible with Open-Sora 1.1. See [here](docs/commands.md) for more instructions.

If your installation does not contain `apex` and `flash-attn`, you need to disable them in the config file, or via the following command.

```bash
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
  --num-frames 4s --resolution 720p \
  --layernorm-kernel False --flash-attn False \
  --prompt "a beautiful waterfall"
```

### Sequence Parallelism Inference

To enable sequence parallelism, you need to use `torchrun` to run the inference script. The following command will run the inference with 2 GPUs.

```bash
# text to video
CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node 2 scripts/inference.py configs/opensora-v1-2/inference/sample.py \
  --num-frames 4s --resolution 720p --aspect-ratio 9:16 \
  --prompt "a beautiful waterfall"
```

:warning: **LIMITATION**: The sequence parallelism is not supported for gradio deployment. For now, the sequence parallelism is only supported when the dimension can be divided by the number of GPUs. Thus, it may fail for some cases. We tested 4 GPUs for 720p and 2 GPUs for 480p.

### GPT-4o Prompt Refinement

We find that GPT-4o can refine the prompt and improve the quality of the generated video. With this feature, you can also use other languages (e.g., Chinese) as the prompt. To enable this feature, you need to prepare your OpenAI API key in the environment:

```bash
export OPENAI_API_KEY=YOUR_API_KEY
```

Then you can run inference with `--llm-refine True` to enable the GPT-4o prompt refinement, or leave the prompt empty to get a random prompt generated by GPT-4o.

```bash
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
  --num-frames 4s --resolution 720p --llm-refine True
```

### Open-Sora 1.1 Command Line Inference

<details>
<summary>View more</summary>

Since Open-Sora 1.1 supports inference with dynamic input size, you can pass the input size as an argument.

```bash
# text to video
python scripts/inference.py configs/opensora-v1-1/inference/sample.py --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854
```

If your installation does not contain `apex` and `flash-attn`, you need to disable them in the config file, or via the following command.

```bash
python scripts/inference.py configs/opensora-v1-1/inference/sample.py --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854 --layernorm-kernel False --flash-attn False
```

See [here](docs/commands.md#inference-with-open-sora-11) for more instructions including text-to-image, image-to-video, video-to-video, and infinite time generation.

</details>

### Open-Sora 1.0 Command Line Inference

<details>
<summary>View more</summary>

We have also provided an offline inference script. Run the following commands to generate samples, the required model weights will be automatically downloaded. To change sampling prompts, modify the txt file passed to `--prompt-path`. See [here](docs/structure.md#inference-config-demos) to customize the configuration.

```bash
# Sample 16x512x512 (20s/sample, 100 time steps, 24 GB memory)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x512x512.py --ckpt-path OpenSora-v1-HQ-16x512x512.pth --prompt-path ./assets/texts/t2v_samples.txt

# Sample 16x256x256 (5s/sample, 100 time steps, 22 GB memory)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path OpenSora-v1-HQ-16x256x256.pth --prompt-path ./assets/texts/t2v_samples.txt

# Sample 64x512x512 (40s/sample, 100 time steps)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt

# Sample 64x512x512 with sequence parallelism (30s/sample, 100 time steps)
# sequence parallelism is enabled automatically when nproc_per_node is larger than 1
torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt
```

The speed is tested on H800 GPUs. For inference with other models, see [here](docs/commands.md) for more instructions.
To lower the memory usage, set a smaller `vae.micro_batch_size` in the config (slightly lower sampling speed).

</details>

## Data Processing

High-quality data is crucial for training good generation models.
To this end, we establish a complete pipeline for data processing, which could seamlessly convert raw videos to high-quality video-text pairs.
The pipeline is shown below. For detailed information, please refer to [data processing](docs/data_processing.md).
Also check out the [datasets](docs/datasets.md) we use.

![Data Processing Pipeline](assets/readme/report_data_pipeline.png)

## Training

### Open-Sora 1.2 Training

The training process is the same as Open-Sora 1.1.

```bash
# one node
torchrun --standalone --nproc_per_node 8 scripts/train.py \
    configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
# multiple nodes
colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py \
    configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```

### Open-Sora 1.1 Training

<details>
<summary>View more</summary>

Once you prepare the data in a `csv` file, run the following commands to launch training on a single node.

```bash
# one node
torchrun --standalone --nproc_per_node 8 scripts/train.py \
    configs/opensora-v1-1/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
# multiple nodes
colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py \
    configs/opensora-v1-1/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```

</details>

### Open-Sora 1.0 Training

<details>
<summary>View more</summary>

Once you prepare the data in a `csv` file, run the following commands to launch training on a single node.

```bash
# 1 GPU, 16x256x256
torchrun --nnodes=1 --nproc_per_node=1 scripts/train.py configs/opensora/train/16x256x256.py --data-path YOUR_CSV_PATH
# 8 GPUs, 64x512x512
torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```

To launch training on multiple nodes, prepare a hostfile according
to [ColossalAI](https://colossalai.org/docs/basics/launch_colossalai/#launch-with-colossal-ai-cli), and run the
following commands.

```bash
colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```

For training other models and advanced usage, see [here](docs/commands.md) for more instructions.

</details>

## Evaluation

We support evaluation based on:

- Validation loss
- [VBench](https://github.com/Vchitect/VBench/tree/master) score
- VBench-i2v score
- Batch generation for human evaluation

All the evaluation code is released in the `eval` folder. Check the [README](/eval/README.md) for more details. Our [report](/docs/report_03.md#evaluation) also provides more information about the evaluation during training. The following table shows that Open-Sora 1.2 greatly improves over Open-Sora 1.0.

| Model          | Total Score | Quality Score | Semantic Score |
| -------------- | ----------- | ------------- | -------------- |
| Open-Sora V1.0 | 75.91%      | 78.81%        | 64.28%         |
| Open-Sora V1.2 | 79.23%      | 80.71%        | 73.30%         |

## VAE Training & Evaluation

We train a VAE pipeline that consists of a spatial VAE followed by a temporal VAE.
For more details, refer to [VAE Documentation](docs/vae.md).
Before you run the following commands, follow our [Installation Documentation](docs/installation.md) to install the required dependencies for VAE and Evaluation.

If you want to train your own VAE, you need to prepare data in the csv following the [data processing](#data-processing) pipeline, then run the following commands.
Note that you need to adjust the number of trained epochs (`epochs`) in the config file accordingly with respect to your own csv data size.

```bash
# stage 1 training, 380k steps, 8 GPUs
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH
# stage 2 training, 260k steps, 8 GPUs
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH
# stage 3 training, 540k steps, 24 GPUs
torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH
```

To evaluate the VAE performance, you need to run VAE inference first to generate the videos, then calculate scores on the generated videos:

```bash
# video generation
torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR
# the original videos will be saved to `YOUR_VIDEO_DIR_ori`
# the reconstructed videos through the pipeline will be saved to `YOUR_VIDEO_DIR_rec`
# the reconstructed videos through the spatial VAE only will be saved to `YOUR_VIDEO_DIR_spatial`

# score calculation
python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
```

## Contribution

Thanks goes to these wonderful contributors:

<a href="https://github.com/hpcaitech/Open-Sora/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=hpcaitech/Open-Sora" />
</a>

If you wish to contribute to this project, please refer to the [Contribution Guideline](./CONTRIBUTING.md).

## Acknowledgement

Here we only list a few of the projects. For other works and datasets, please refer to our report.

- [ColossalAI](https://github.com/hpcaitech/ColossalAI): A powerful large model parallel acceleration and optimization
  system.
- [DiT](https://github.com/facebookresearch/DiT): Scalable Diffusion Models with Transformers.
- [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT): An acceleration for DiT training. We adopt valuable acceleration
  strategies for training progress from OpenDiT.
- [PixArt](https://github.com/PixArt-alpha/PixArt-alpha): An open-source DiT-based text-to-image model.
- [Latte](https://github.com/Vchitect/Latte): An attempt to efficiently train DiT for video.
- [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): A powerful image VAE model.
- [CLIP](https://github.com/openai/CLIP): A powerful text-image embedding model.
- [T5](https://github.com/google-research/text-to-text-transfer-transformer): A powerful text encoder.
- [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) and [Yi-34B](https://huggingface.co/01-ai/Yi-34B).
- [PLLaVA](https://github.com/magic-research/PLLaVA): A powerful video captioning model.
- [MiraData](https://github.com/mira-space/MiraData): A large-scale video dataset with long durations and structured caption.

We are grateful for their exceptional work and generous contribution to open source. Special thanks go to the authors of [MiraData](https://github.com/mira-space/MiraData) and [Rectified Flow](https://github.com/gnobitab/RectifiedFlow) for their valuable advice and help. We wish to express gratitude towards AK for sharing this project on social media and Hugging Face for providing free GPU resources for our online Gradio demo.

## Citation

```bibtex
@software{opensora,
  author = {Zangwei Zheng and Xiangyu Peng and Tianji Yang and Chenhui Shen and Shenggui Li and Hongxin Liu and Yukun Zhou and Tianyi Li and Yang You},
  title = {Open-Sora: Democratizing Efficient Video Production for All},
  month = {March},
  year = {2024},
  url = {https://github.com/hpcaitech/Open-Sora}
}
```

## Star History

[![Star History Chart](https://api.star-history.com/svg?repos=hpcaitech/Open-Sora&type=Date)](https://star-history.com/#hpcaitech/Open-Sora&Date)


================================================
FILE: Open-Sora/opensora.egg-info/SOURCES.txt
================================================
LICENSE
README.md
pyproject.toml
setup.py
opensora/__init__.py
opensora/registry.py
opensora.egg-info/PKG-INFO
opensora.egg-info/SOURCES.txt
opensora.egg-info/dependency_links.txt
opensora.egg-info/requires.txt
opensora.egg-info/top_level.txt
opensora/acceleration/__init__.py
opensora/acceleration/checkpoint.py
opensora/acceleration/communications.py
opensora/acceleration/parallel_states.py
opensora/acceleration/plugin.py
opensora/acceleration/shardformer/__init__.py
opensora/acceleration/shardformer/modeling/__init__.py
opensora/acceleration/shardformer/modeling/t5.py
opensora/acceleration/shardformer/policy/__init__.py
opensora/acceleration/shardformer/policy/t5_encoder.py
opensora/datasets/__init__.py
opensora/datasets/aspect.py
opensora/datasets/bucket.py
opensora/datasets/dataloader.py
opensora/datasets/datasets.py
opensora/datasets/read_video.py
opensora/datasets/sampler.py
opensora/datasets/utils.py
opensora/datasets/video_transforms.py
opensora/models/__init__.py
opensora/models/cache_functions/__init__.py
opensora/models/cache_functions/attention.py
opensora/models/cache_functions/cache_cutfresh.py
opensora/models/cache_functions/cache_init.py
opensora/models/cache_functions/force_init.py
opensora/models/cache_functions/force_scheduler.py
opensora/models/cache_functions/fresh_ratio_scheduler.py
opensora/models/cache_functions/global_force_fresh.py
opensora/models/cache_functions/score_evaluate.py
opensora/models/cache_functions/scores.py
opensora/models/cache_functions/token_merge.py
opensora/models/cache_functions/update_cache.py
opensora/models/dit/__init__.py
opensora/models/dit/dit.py
opensora/models/latte/__init__.py
opensora/models/latte/latte.py
opensora/models/layers/__init__.py
opensora/models/layers/blocks.py
opensora/models/pixart/__init__.py
opensora/models/pixart/pixart.py
opensora/models/pixart/pixart_sigma.py
opensora/models/stdit/__init__.py
opensora/models/stdit/stdit.py
opensora/models/stdit/stdit2.py
opensora/models/stdit/stdit3 copy.py
opensora/models/stdit/stdit3.py
opensora/models/text_encoder/__init__.py
opensora/models/text_encoder/classes.py
opensora/models/text_encoder/clip.py
opensora/models/text_encoder/t5.py
opensora/models/vae/__init__.py
opensora/models/vae/discriminator.py
opensora/models/vae/losses.py
opensora/models/vae/lpips.py
opensora/models/vae/utils.py
opensora/models/vae/vae.py
opensora/models/vae/vae_temporal.py
opensora/schedulers/__init__.py
opensora/schedulers/dpms/__init__.py
opensora/schedulers/dpms/dpm_solver.py
opensora/schedulers/iddpm/__init__.py
opensora/schedulers/iddpm/diffusion_utils.py
opensora/schedulers/iddpm/gaussian_diffusion.py
opensora/schedulers/iddpm/respace.py
opensora/schedulers/iddpm/speed.py
opensora/schedulers/iddpm/timestep_sampler.py
opensora/schedulers/rf/__init__.py
opensora/schedulers/rf/rectified_flow.py
opensora/utils/__init__.py
opensora/utils/ckpt_utils.py
opensora/utils/config_utils.py
opensora/utils/inference_utils.py
opensora/utils/lr_scheduler.py
opensora/utils/misc.py
opensora/utils/train_utils.py
tests/test_attn.py
tests/test_lr_scheduler.py
tests/test_np_torch.py
tests/test_pos_emb.py
tests/test_seq_parallel_attention.py
tests/test_stdit3_sequence_parallelism.py
tests/test_t5_shardformer.py
tools/caption/__init__.py
tools/caption/camera_motion_detect.py
tools/caption/caption_gpt4.py
tools/caption/caption_llama3.py
tools/caption/caption_llava.py
tools/caption/utils.py
tools/caption/acceleration/__init__.py
tools/caption/acceleration/llava/__init__.py
tools/caption/acceleration/llava/policies/__init__.py
tools/caption/acceleration/llava/policies/llama.py
tools/caption/acceleration/llava/policies/mistral.py
tools/caption/camera_motion/__init__.py
tools/caption/camera_motion/camera_motion.py
tools/caption/camera_motion/detect.py
tools/caption/camera_motion/utils.py
tools/caption/camera_motion/visualizer.py
tools/datasets/__init__.py
tools/datasets/analyze.py
tools/datasets/convert.py
tools/datasets/datautil.py
tools/datasets/filter_panda10m.py
tools/datasets/split.py
tools/datasets/transform.py
tools/datasets/utils.py
tools/frame_interpolation/__init__.py
tools/frame_interpolation/interpolation.py
tools/frame_interpolation/networks/__init__.py
tools/frame_interpolation/networks/amt_g.py
tools/frame_interpolation/networks/blocks/__init__.py
tools/frame_interpolation/networks/blocks/feat_enc.py
tools/frame_interpolation/networks/blocks/ifrnet.py
tools/frame_interpolation/networks/blocks/multi_flow.py
tools/frame_interpolation/networks/blocks/raft.py
tools/frame_interpolation/utils/__init__.py
tools/frame_interpolation/utils/dist_utils.py
tools/frame_interpolation/utils/flow_utils.py
tools/frame_interpolation/utils/utils.py
tools/scene_cut/__init__.py
tools/scene_cut/convert_id_to_path.py
tools/scene_cut/cut.py
tools/scene_cut/scene_detect.py
tools/scoring/__init__.py
tools/scoring/aesthetic/__init__.py
tools/scoring/aesthetic/inference.py
tools/scoring/matching/__init__.py
tools/scoring/matching/inference.py
tools/scoring/ocr/__init__.py
tools/scoring/ocr/dbnetpp.py
tools/scoring/ocr/inference.py
tools/scoring/optical_flow/__init__.py
tools/scoring/optical_flow/inference.py
tools/scoring/optical_flow/unimatch/__init__.py
tools/scoring/optical_flow/unimatch/attention.py
tools/scoring/optical_flow/unimatch/backbone.py
tools/scoring/optical_flow/unimatch/geometry.py
tools/scoring/optical_flow/unimatch/matching.py
tools/scoring/optical_flow/unimatch/position.py
tools/scoring/optical_flow/unimatch/reg_refine.py
tools/scoring/optical_flow/unimatch/transformer.py
tools/scoring/optical_flow/unimatch/trident_conv.py
tools/scoring/optical_flow/unimatch/unimatch.py
tools/scoring/optical_flow/unimatch/utils.py

================================================
FILE: Open-Sora/opensora.egg-info/dependency_links.txt
================================================


================================================
FILE: Open-Sora/opensora.egg-info/requires.txt
================================================
colossalai>=0.4.0
mmengine>=0.10.3
pandas>=2.0.3
timm==0.9.16
rotary_embedding_torch==0.5.3
ftfy>=6.2.0
diffusers==0.27.2
accelerate==0.29.2
av>=12.0.0
numpy<2.0.0
gradio>=4.26.0
spaces>=0.28.3
ipykernel>=6.29.4
ipywidgets>=8.1.2
wandb>=0.17.0
tensorboard>=2.14.0
pandarallel>=1.6.5
pyarrow>=16.1.0
pre-commit>=3.5.0
openai

[data]
gdown>=5.2.0
ninja>=1.11.1.1
shortuuid>=1.0.13
markdown2[all]
scikit-learn>=1.4.2
einops-exts>=0.0.4
decord==0.6.0
ptvsd==4.3.2
imageio-ffmpeg>=0.4.9
ffmpeg-python==0.2.0
lingua-language-detector==2.0.2
imageio>=2.34.1
setuptools==68.2.2
clip@ git+https://github.com/openai/CLIP.git
mmcv==2.1.0
mmdet==3.1.0
mmocr==1.0.1
detectron2@ git+https://github.com/facebookresearch/detectron2.git@ff53992

[eval]
detectron2@ git+https://github.com/facebookresearch/detectron2.git@ff53992
imageio>=2.34.1
pyiqa==0.1.10
scikit-learn>=1.4.2
scikit-image>=0.20.0
lvis==0.5.3
boto3>=1.34.113
easydict>=1.9
fairscale>=0.4.13
decord==0.6.0
pytorchvideo==0.1.5
lpips==0.1.4

[full]
gdown>=5.2.0
ninja>=1.11.1.1
shortuuid>=1.0.13
markdown2[all]
scikit-learn>=1.4.2
einops-exts>=0.0.4
decord==0.6.0
ptvsd==4.3.2
imageio-ffmpeg>=0.4.9
ffmpeg-python==0.2.0
lingua-language-detector==2.0.2
imageio>=2.34.1
setuptools==68.2.2
clip@ git+https://github.com/openai/CLIP.git
mmcv==2.1.0
mmdet==3.1.0
mmocr==1.0.1
detectron2@ git+https://github.com/facebookresearch/detectron2.git@ff53992
pyiqa==0.1.10
scikit-image>=0.20.0
lvis==0.5.3
boto3>=1.34.113
easydict>=1.9
fairscale>=0.4.13
pytorchvideo==0.1.5
lpips==0.1.4

[vae]
beartype==0.18.5
einops==0.8.0
einops-exts==0.0.4
opencv-python==4.9.0.80
pillow==10.3.0


================================================
FILE: Open-Sora/opensora.egg-info/top_level.txt
================================================
opensora
tools


================================================
FILE: Open-Sora/pyproject.toml
================================================
[tool.autoflake]
remove-unused-variables = true
remove-all-unused-imports = true
ignore-init-module-imports = true

[tool.isort]
line_length = 120
multi_line_output = 3
include_trailing_comma = true
ignore_comments = true
profile = "black"
honor_noqa = true

[tool.black]
line-length = 120
target-version = ["py37", "py38", "py39", "py310"]


================================================
FILE: Open-Sora/requirements/requirements-cu121.txt
================================================
torch==2.2.2 --index-url https://download.pytorch.org/whl/cu121
torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu121
xformers==0.0.25.post1 --index-url https://download.pytorch.org/whl/cu121


================================================
FILE: Open-Sora/requirements/requirements-data.txt
================================================
gdown>=5.2.0

# [caption llava]
ninja>=1.11.1.1
shortuuid>=1.0.13
markdown2[all]
scikit-learn>=1.4.2
einops-exts>=0.0.4

# [camera_motion]
decord==0.6.0
ptvsd==4.3.2
imageio-ffmpeg>=0.4.9

# [datasets]
ffmpeg-python==0.2.0
lingua-language-detector==2.0.2

# [frame interpolation]
imageio>=2.34.1

# [aesthetic]
setuptools==68.2.2
clip @ git+https://github.com/openai/CLIP.git

# [ocr]
mmcv==2.1.0
mmdet==3.1.0
mmocr==1.0.1
detectron2 @ git+https://github.com/facebookresearch/detectron2.git@ff53992


================================================
FILE: Open-Sora/requirements/requirements-eval.txt
================================================
# [vbench]
detectron2 @ git+https://github.com/facebookresearch/detectron2.git@ff53992
imageio>=2.34.1
pyiqa==0.1.10
scikit-learn>=1.4.2
scikit-image>=0.20.0
lvis==0.5.3
boto3>=1.34.113
easydict>=1.9
fairscale>=0.4.13

# [vae]
decord==0.6.0
pytorchvideo==0.1.5
lpips==0.1.4


================================================
FILE: Open-Sora/requirements/requirements-pllava.txt
================================================
absl-py==2.1.0
accelerate==0.26.1
addict==2.4.0
aiofiles==23.2.1
aliyun-python-sdk-core==2.15.0
aliyun-python-sdk-kms==2.16.2
altair==5.2.0
annotated-types==0.6.0
antlr4-python3-runtime==4.9.3
anyio==4.3.0
anykeystore==0.2
apex==0.9.10.dev0
appdirs==1.4.4
argcomplete==3.2.3
attrs==23.2.0
av==10.0.0
beautifulsoup4==4.12.3
blessed==1.20.0
blessings==1.7
boto3==1.34.63
botocore==1.34.63
Brotli==1.1.0
cachetools==5.3.3
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
contourpy==1.2.0
crcmod==1.7
cryptacular==1.6.2
cryptography==42.0.5
cycler==0.12.1
dacite==1.7.0
decorator==4.4.2
decord==0.6.0
deepspeed==0.14.0
defusedxml==0.7.1
Deprecated==1.2.14
dill==0.3.8
distro==1.9.0
dnspython==2.6.1
docker-pycreds==0.4.0
einops==0.6.1
exceptiongroup==1.2.0
fastapi==0.110.0
ffmpeg==1.4
ffmpy==0.3.2
fiftyone==0.23.6
fiftyone-brain==0.16.1
fiftyone_db==1.1.2
filelock==3.9.0
fonttools==4.49.0
fsspec==2024.2.0
ftfy==6.1.3
future==1.0.0
fvcore==0.1.5.post20221221
gdown==5.1.0
gitdb==4.0.11
GitPython==3.1.42
glob2==0.7
google-auth==2.28.2
google-auth-oauthlib==1.2.0
gpustat==1.1.1
gradio==4.21.0
gradio_client==0.12.0
graphql-core==3.2.3
greenlet==3.0.3
grpcio==1.62.1
h11==0.14.0
h2==4.1.0
hjson==3.1.0
hpack==4.0.0
httpcore==1.0.4
httpx==0.27.0
huggingface-hub==0.21.4
humanize==4.9.0
hupper==1.12.1
Hypercorn==0.16.0
hyperframe==6.0.1
idna==3.6
idscheck==2.3.0
imageio==2.27.0
imageio-ffmpeg==0.4.9
importlib_metadata==7.0.2
importlib_resources==6.3.0
inflate64==1.0.0
iopath==0.1.10
Jinja2==3.1.2
jmespath==0.10.0
joblib==1.3.2
jsonlines==4.0.0
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
kaleido==0.2.1
kiwisolver==1.4.5
lazy_loader==0.3
Markdown==3.6
markdown-it-py==3.0.0
MarkupSafe==2.1.3
matplotlib==3.8.3
mdurl==0.1.2
mmcv-full==1.7.2
model-index==0.1.11
mongoengine==0.24.2
motor==3.3.2
moviepy==1.0.3
mpmath==1.3.0
multivolumefile==0.2.3
networkx==3.2.1
ninja==1.11.1.1
numpy==1.23.5
nvidia-ml-py==12.535.133
nvidia-ml-py3==7.352.0
oauthlib==3.2.2
omegaconf==2.3.0
openai==1.14.0
opencv-python==4.9.0.80
opencv-python-headless==4.9.0.80
opendatalab==0.0.10
openmim==0.3.9
openxlab==0.0.36
ordered-set==4.1.0
orjson==3.9.15
oss2==2.17.0
packaging==24.0
pandas==1.5.3
PasteDeploy==3.1.0
pathtools==0.1.2
pbkdf2==1.3
peft==0.10.0
pillow==10.2.0
plaster==1.1.2
plaster-pastedeploy==1.0.1
platformdirs==4.2.0
plotly==5.20.0
portalocker==2.8.2
pprintpp==0.4.0
priority==2.0.0
proglog==0.1.10
protobuf==4.23.4
psutil==5.9.4
py-cpuinfo==9.0.0
py7zr==0.21.0
pyasn1==0.5.1
pyasn1-modules==0.3.0
pybcj==1.0.2
pycparser==2.21
pycryptodome==3.20.0
pycryptodomex==3.20.0
pydantic==2.6.4
pydantic_core==2.16.3
pydub==0.25.1
Pygments==2.17.2
pymongo==4.6.2
pynvml==11.5.0
pyparsing==3.1.2
pyppmd==1.1.0
pyramid==2.0.2
pyramid-mailer==0.15.1
PySocks==1.7.1
python-dateutil==2.9.0.post0
python-multipart==0.0.9
python3-openid==3.2.0
pytz==2023.4
PyYAML==6.0
pyzstd==0.15.9
rarfile==4.1
referencing==0.33.0
regex==2023.12.25
repoze.sendmail==4.4.1
requests==2.28.2
requests-oauthlib==1.4.0
retrying==1.3.4
rich==13.4.2
rpds-py==0.18.0
rsa==4.9
ruff==0.3.2
s3transfer==0.10.1
safetensors==0.4.2
scikit-image==0.22.0
scikit-learn==1.4.1.post1
scipy==1.10.1
semantic-version==2.10.0
sentencepiece==0.2.0
sentry-sdk==1.42.0
setproctitle==1.3.3
shellingham==1.5.4
six==1.16.0
smmap==5.0.1
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.5
SQLAlchemy==2.0.28
sse-starlette==0.10.3
sseclient-py==1.8.0
starlette==0.36.3
strawberry-graphql==0.138.1
sympy==1.12
tabulate==0.9.0
taskgroup==0.0.0a4
tenacity==8.2.3
tensorboard==2.15.1
tensorboard-data-server==0.7.2
tensorboardX==2.6.2.2
termcolor==2.3.0
texttable==1.7.0
threadpoolctl==3.3.0
tifffile==2024.2.12
timm==0.6.12
tokenizers==0.15.2
tomli==2.0.1
tomlkit==0.12.0
toolz==0.12.1
torch==2.2.2
torchaudio
torchvision==0.17.2
tqdm==4.65.2
transaction==4.0
transformers==4.37.1
translationstring==1.4
triton==2.2.0
typer==0.9.0
typing_extensions==4.8.0
tzdata==2024.1
tzlocal==5.2
universal-analytics-python3==1.1.1
urllib3==1.26.18
uvicorn==0.28.0
velruse==1.1.1
venusian==3.1.0
voxel51-eta==0.12.6
wandb==0.14.0
wcwidth==0.2.13
WebOb==1.8.7
websockets==11.0.3
Werkzeug==3.0.1
wrapt==1.16.0
wsproto==1.2.0
WTForms==3.1.2
wtforms-recaptcha==0.3.2
xmltodict==0.13.0
yacs==0.1.8
yapf==0.40.2
zipp==3.18.1
zope.deprecation==5.0
zope.interface==6.2
zope.sqlalchemy==3.1


================================================
FILE: Open-Sora/requirements/requirements-vae.txt
================================================
beartype==0.18.5
einops==0.8.0
einops-exts==0.0.4
opencv-python==4.9.0.80
pillow==10.3.0


================================================
FILE: Open-Sora/requirements/requirements.txt
================================================
colossalai>=0.4.0
mmengine>=0.10.3
pandas>=2.0.3
timm==0.9.16
rotary_embedding_torch==0.5.3
ftfy>=6.2.0 # for t5
diffusers==0.27.2 # for vae
accelerate==0.29.2 # for t5
av>=12.0.0 # for video loading
numpy<2.0.0

# [gradio]
gradio>=4.26.0
spaces>=0.28.3

# [notebook]
ipykernel>=6.29.4
ipywidgets>=8.1.2

# [training]
wandb>=0.17.0
tensorboard>=2.14.0
pandarallel>=1.6.5
pyarrow>=16.1.0 # for parquet

# [dev]
pre-commit>=3.5.0
openai


================================================
FILE: Open-Sora/scripts/inference.py
================================================
import os
import time
from pprint import pformat

import colossalai
import torch
import torch.distributed as dist
from colossalai.cluster import DistCoordinator
from mmengine.runner import set_random_seed
from tqdm import tqdm

from opensora.acceleration.parallel_states import set_sequence_parallel_group
from opensora.datasets import save_sample
from opensora.datasets.aspect import get_image_size, get_num_frames
from opensora.models.text_encoder.t5 import text_preprocessing
from opensora.registry import MODELS, SCHEDULERS, build_module
from opensora.utils.config_utils import parse_configs
from opensora.utils.inference_utils import (
    add_watermark,
    append_generated,
    append_score_to_prompts,
    apply_mask_strategy,
    collect_references_batch,
    dframe_to_frame,
    extract_json_from_prompts,
    extract_prompts_loop,
    get_save_path_name,
    load_prompts,
    merge_prompt,
    prepare_multi_resolution_info,
    refine_prompts_by_openai,
    split_prompt,
)
from opensora.utils.misc import all_exists, create_logger, is_distributed, is_main_process, to_torch_dtype


def main():
    """Run Open-Sora text-to-video inference with ToCa token caching enabled.

    All settings are read from the config returned by ``parse_configs``.
    The function builds the text encoder, VAE, diffusion model and scheduler,
    then walks the prompts in batches. For every batch it generates
    ``num_sample`` samples; each sample consists of ``loop`` clips that are
    concatenated along dim 1 and written under ``cfg.save_dir``.

    The ToCa options forwarded to the model through ``additional_args`` default
    to the values that were previously hard-coded here and may be overridden by
    config keys of the same name: ``cache_type``, ``ratio_scheduler``,
    ``fresh_ratio``, ``fresh_threshold``, ``force_fresh`` and
    ``soft_fresh_weight``.

    Raises:
        AssertionError: if ``dtype`` is not one of fp16/bf16/fp32, if
            ``image_size`` is missing and ``resolution``/``aspect_ratio`` are
            not both provided, or if ``reference_path``/``mask_strategy`` do
            not contain exactly one entry per prompt.
    """
    torch.set_grad_enabled(False)
    # ======================================================
    # configs & runtime variables
    # ======================================================
    # == parse configs ==
    cfg = parse_configs(training=False)

    # == device and dtype ==
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Validate and convert the SAME value. The default stays "bf16", which is
    # what was actually used for conversion when no dtype was configured.
    cfg_dtype = cfg.get("dtype", "bf16")
    assert cfg_dtype in ["fp16", "bf16", "fp32"], f"Unknown mixed precision {cfg_dtype}"
    dtype = to_torch_dtype(cfg_dtype)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # == init distributed env ==
    if is_distributed():
        colossalai.launch_from_torch({})
        coordinator = DistCoordinator()
        # With more than one process, the whole world is used as the
        # sequence-parallel group.
        enable_sequence_parallelism = coordinator.world_size > 1
        if enable_sequence_parallelism:
            set_sequence_parallel_group(dist.group.WORLD)
    else:
        coordinator = None
        enable_sequence_parallelism = False
    set_random_seed(seed=cfg.get("seed", 1024))

    # == init logger ==
    logger = create_logger()
    logger.info("Inference configuration:\n %s", pformat(cfg.to_dict()))
    verbose = cfg.get("verbose", 1)
    # verbose == 1: progress bar over batches; verbose >= 2: per-step progress
    # and per-prompt logging instead (see the uses of `verbose >= 2` below).
    progress_wrap = tqdm if verbose == 1 else (lambda x: x)

    # ======================================================
    # build model & load weights
    # ======================================================
    logger.info("Building models...")
    # == build text-encoder and vae ==
    text_encoder = build_module(cfg.text_encoder, MODELS, device=device)
    vae = build_module(cfg.vae, MODELS).to(device, dtype).eval()

    # == prepare video size ==
    image_size = cfg.get("image_size", None)
    if image_size is None:
        resolution = cfg.get("resolution", None)
        aspect_ratio = cfg.get("aspect_ratio", None)
        assert (
            resolution is not None and aspect_ratio is not None
        ), "resolution and aspect_ratio must be provided if image_size is not provided"
        image_size = get_image_size(resolution, aspect_ratio)
    num_frames = get_num_frames(cfg.num_frames)

    # == build diffusion model ==
    input_size = (num_frames, *image_size)
    latent_size = vae.get_latent_size(input_size)
    model = (
        build_module(
            cfg.model,
            MODELS,
            input_size=latent_size,
            in_channels=vae.out_channels,
            caption_channels=text_encoder.output_dim,
            model_max_length=text_encoder.model_max_length,
            enable_sequence_parallelism=enable_sequence_parallelism,
        )
        .to(device, dtype)
        .eval()
    )
    text_encoder.y_embedder = model.y_embedder  # HACK: for classifier-free guidance

    # == build scheduler ==
    scheduler = build_module(cfg.scheduler, SCHEDULERS)

    # ======================================================
    # inference
    # ======================================================
    # == load prompts ==
    prompts = cfg.get("prompt", None)
    start_idx = cfg.get("start_index", 0)
    # Remember the offset so the final log can report how many prompts this
    # run processed rather than the absolute index it ended on.
    first_sample_idx = start_idx
    if prompts is None:
        if cfg.get("prompt_path", None) is not None:
            prompts = load_prompts(cfg.prompt_path, start_idx, cfg.get("end_index", None))
        else:
            prompts = [cfg.get("prompt_generator", "")] * 1_000_000  # endless loop

    # == prepare reference ==
    reference_path = cfg.get("reference_path", [""] * len(prompts))
    mask_strategy = cfg.get("mask_strategy", [""] * len(prompts))
    assert len(reference_path) == len(prompts), "Length of reference must be the same as prompts"
    assert len(mask_strategy) == len(prompts), "Length of mask_strategy must be the same as prompts"

    # == prepare arguments ==
    fps = cfg.fps
    save_fps = cfg.get("save_fps", fps // cfg.get("frame_interval", 1))
    multi_resolution = cfg.get("multi_resolution", None)
    batch_size = cfg.get("batch_size", 1)
    num_sample = cfg.get("num_sample", 1)
    loop = cfg.get("loop", 1)
    condition_frame_length = cfg.get("condition_frame_length", 5)
    condition_frame_edit = cfg.get("condition_frame_edit", 0.0)
    align = cfg.get("align", None)

    save_dir = cfg.save_dir
    os.makedirs(save_dir, exist_ok=True)
    sample_name = cfg.get("sample_name", None)
    prompt_as_path = cfg.get("prompt_as_path", False)

    # == ToCa settings forwarded to the model via `additional_args` ==
    # These are identical for every batch, so build them once. The defaults
    # are the previously hard-coded values.
    toca_args = {
        "cache_type": cfg.get("cache_type", "attention"),
        "ratio_scheduler": cfg.get("ratio_scheduler", "ToCa"),
        "fresh_ratio": cfg.get("fresh_ratio", 0.1),
        # Note: this does not decide the force activation cycles; see
        # Open-Sora/opensora/models/cache_functions/force_scheduler.py.
        "fresh_threshold": cfg.get("fresh_threshold", 3),
        "force_fresh": cfg.get("force_fresh", "global"),
        "soft_fresh_weight": cfg.get("soft_fresh_weight", 0.25),
    }

    # == Iter over all samples ==
    for i in progress_wrap(range(0, len(prompts), batch_size)):
        # == prepare batch prompts ==
        batch_prompts = prompts[i : i + batch_size]
        ms = mask_strategy[i : i + batch_size]
        refs = reference_path[i : i + batch_size]

        # == get json from prompts ==
        batch_prompts, refs, ms = extract_json_from_prompts(batch_prompts, refs, ms)
        original_batch_prompts = batch_prompts

        # == get reference for condition ==
        refs = collect_references_batch(refs, vae, image_size)

        # == multi-resolution info ==
        model_args = prepare_multi_resolution_info(
            multi_resolution, len(batch_prompts), image_size, num_frames, fps, device, dtype
        )

        # == ToCa caching options ==
        for key, value in toca_args.items():
            model_args[key] = value

        # == Iter over number of sampling for one prompt ==
        for k in range(num_sample):
            # == prepare save paths ==
            save_paths = [
                get_save_path_name(
                    save_dir,
                    sample_name=sample_name,
                    sample_idx=start_idx + idx,
                    prompt=original_batch_prompts[idx],
                    prompt_as_path=prompt_as_path,
                    num_sample=num_sample,
                    k=k,
                )
                for idx in range(len(batch_prompts))
            ]

            # NOTE: Skip if the sample already exists
            # This is useful for resuming sampling VBench
            if prompt_as_path and all_exists(save_paths):
                continue

            # == process prompts step by step ==
            # 0. split prompt
            # each element in the list is [prompt_segment_list, loop_idx_list]
            # NOTE(review): `batch_prompts` is rebound in step 4 below, so for
            # k > 0 this splits the merged prompts produced by the previous
            # pass rather than `original_batch_prompts` - confirm intended.
            batched_prompt_segment_list = []
            batched_loop_idx_list = []
            for prompt in batch_prompts:
                prompt_segment_list, loop_idx_list = split_prompt(prompt)
                batched_prompt_segment_list.append(prompt_segment_list)
                batched_loop_idx_list.append(loop_idx_list)

            # 1. refine prompt by openai
            if cfg.get("llm_refine", False):
                # only call openai API when
                # 1. seq parallel is not enabled
                # 2. seq parallel is enabled and the process is rank 0
                if not enable_sequence_parallelism or (enable_sequence_parallelism and is_main_process()):
                    for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
                        batched_prompt_segment_list[idx] = refine_prompts_by_openai(prompt_segment_list)

                # sync the prompt if using seq parallel
                if enable_sequence_parallelism:
                    coordinator.block_all()
                    prompt_segment_length = [
                        len(prompt_segment_list) for prompt_segment_list in batched_prompt_segment_list
                    ]

                    # flatten the prompt segment list
                    batched_prompt_segment_list = [
                        prompt_segment
                        for prompt_segment_list in batched_prompt_segment_list
                        for prompt_segment in prompt_segment_list
                    ]

                    # create a list of size equal to world size
                    broadcast_obj_list = [batched_prompt_segment_list] * coordinator.world_size
                    dist.broadcast_object_list(broadcast_obj_list, 0)

                    # recover the prompt list
                    batched_prompt_segment_list = []
                    segment_start_idx = 0
                    all_prompts = broadcast_obj_list[0]
                    for num_segment in prompt_segment_length:
                        batched_prompt_segment_list.append(
                            all_prompts[segment_start_idx : segment_start_idx + num_segment]
                        )
                        segment_start_idx += num_segment

            # 2. append score
            for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
                batched_prompt_segment_list[idx] = append_score_to_prompts(
                    prompt_segment_list,
                    aes=cfg.get("aes", None),
                    flow=cfg.get("flow", None),
                    camera_motion=cfg.get("camera_motion", None),
                )

            # 3. clean prompt with T5
            for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
                batched_prompt_segment_list[idx] = [text_preprocessing(prompt) for prompt in prompt_segment_list]

            # 4. merge to obtain the final prompt
            batch_prompts = []
            for prompt_segment_list, loop_idx_list in zip(batched_prompt_segment_list, batched_loop_idx_list):
                batch_prompts.append(merge_prompt(prompt_segment_list, loop_idx_list))

            # == Iter over loop generation ==
            video_clips = []
            for loop_i in range(loop):
                # == get prompt for loop i ==
                batch_prompts_loop = extract_prompts_loop(batch_prompts, loop_i)

                # == add condition frames for loop ==
                if loop_i > 0:
                    refs, ms = append_generated(
                        vae, video_clips[-1], refs, ms, loop_i, condition_frame_length, condition_frame_edit
                    )

                # == sampling ==
                # Use a different seed for each of the `num_sample` samples.
                # NOTE(review): the base 1024 is hard-coded, so the configured
                # `seed` does not influence the initial noise, and every batch
                # and loop iteration with the same k re-seeds identically.
                # Kept as-is to preserve existing outputs - confirm intended.
                torch.manual_seed(1024 + k)
                z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
                masks = apply_mask_strategy(z, refs, ms, loop_i, align=align)
                samples = scheduler.sample(
                    model,
                    text_encoder,
                    z=z,
                    prompts=batch_prompts_loop,
                    device=device,
                    additional_args=model_args,
                    progress=verbose >= 2,
                    mask=masks,
                )
                samples = vae.decode(samples.to(dtype), num_frames=num_frames)
                video_clips.append(samples)

            # == save samples ==
            if is_main_process():
                for idx, batch_prompt in enumerate(batch_prompts):
                    if verbose >= 2:
                        logger.info("Prompt: %s", batch_prompt)
                    save_path = save_paths[idx]
                    # `clip_i` (not `i`) so the outer batch index is not clobbered.
                    video = [video_clips[clip_i][idx] for clip_i in range(loop)]
                    # Every clip after the first has its leading
                    # dframe_to_frame(condition_frame_length) entries along
                    # dim 1 removed before concatenation.
                    for clip_i in range(1, loop):
                        video[clip_i] = video[clip_i][:, dframe_to_frame(condition_frame_length) :]
                    video = torch.cat(video, dim=1)
                    save_path = save_sample(
                        video,
                        fps=save_fps,
                        save_path=save_path,
                        verbose=verbose >= 2,
                    )
                    if save_path.endswith(".mp4") and cfg.get("watermark", False):
                        time.sleep(1)  # prevent loading previous generated video
                        add_watermark(save_path)
        start_idx += len(batch_prompts)
    logger.info("Inference finished.")
    # Report the number of prompts handled in this run; `start_idx` alone would
    # also include the configured `start_index` offset.
    logger.info("Saved %s samples to %s", start_idx - first_sample_idx, save_dir)


# Script entry point: run inference only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/scripts/inference_vae.py
================================================
import os
from pprint import pformat

import colossalai
import torch
from mmengine.runner import set_random_seed
from tqdm import tqdm

from opensora.acceleration.parallel_states import get_data_parallel_group
from opensora.datasets import save_sample
from opensora.datasets.dataloader import prepare_dataloader
from opensora.models.vae.losses import VAELoss
from opensora.registry import DATASETS, MODELS, build_module
from opensora.utils.config_utils import parse_configs
from opensora.utils.misc import create_logger, get_world_size, is_distributed, is_main_process, to_torch_dtype


def main():
    """Run VAE reconstruction over a dataset and log the resulting losses.

    Everything is driven by the parsed config: the dataset/dataloader and the
    VAE are built from it, each batch is encoded and decoded, running averages
    of the VAE losses are maintained, latent statistics are optionally
    accumulated (``cal_stats``), and — when ``save_dir`` is set — the main
    process writes the original videos, the reconstructions and the output of
    ``model.spatial_vae.decode`` to ``<save_dir>_ori``, ``<save_dir>_rec`` and
    ``<save_dir>_spatial``.
    """
    torch.set_grad_enabled(False)
    # ======================================================
    # configs & runtime variables
    # ======================================================
    # == parse configs ==
    cfg = parse_configs(training=False)

    # == device and dtype ==
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Validate the same value that is converted below; the effective default
    # stays "bf16" (validation and conversion previously used different defaults).
    cfg_dtype = cfg.get("dtype", "bf16")
    assert cfg_dtype in ["fp16", "bf16", "fp32"], f"Unknown mixed precision {cfg_dtype}"
    dtype = to_torch_dtype(cfg_dtype)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # == init distributed env ==
    if is_distributed():
        colossalai.launch_from_torch({})
    set_random_seed(seed=cfg.get("seed", 1024))

    # == init logger ==
    logger = create_logger()
    logger.info("Inference configuration:\n %s", pformat(cfg.to_dict()))
    verbose = cfg.get("verbose", 1)

    # ======================================================
    # build dataset and dataloader
    # ======================================================
    logger.info("Building reconstruction dataset...")
    dataset = build_module(cfg.dataset, DATASETS)
    batch_size = cfg.get("batch_size", 1)
    dataloader, _ = prepare_dataloader(
        dataset,
        batch_size=batch_size,
        num_workers=cfg.get("num_workers", 4),
        shuffle=False,
        drop_last=False,
        pin_memory=True,
        process_group=get_data_parallel_group(),
    )
    logger.info("Dataset %s contains %s videos.", cfg.dataset.data_path, len(dataset))
    total_batch_size = batch_size * get_world_size()
    logger.info("Total batch size: %s", total_batch_size)

    total_steps = len(dataloader)
    sample_limit = cfg.get("num_samples", None)
    if sample_limit is not None:
        # Use the local batch_size (which has a default) rather than
        # cfg.batch_size, so a config without "batch_size" still works.
        limited_steps = int(sample_limit // batch_size)
        total_steps = min(limited_steps, total_steps)
        logger.info("limiting test dataset to %s", limited_steps * batch_size)
    dataiter = iter(dataloader)

    # ======================================================
    # build model & loss
    # ======================================================
    logger.info("Building models...")
    model = build_module(cfg.model, MODELS).to(device, dtype).eval()
    vae_loss_fn = VAELoss(
        logvar_init=cfg.get("logvar_init", 0.0),
        perceptual_loss_weight=cfg.get("perceptual_loss_weight", 0.1),
        kl_loss_weight=cfg.get("kl_loss_weight", 1e-6),
        device=device,
        dtype=dtype,
    )

    # ======================================================
    # inference
    # ======================================================
    # == global variables ==
    running_loss = running_nll = running_nll_z = 0.0
    loss_steps = 0
    cal_stats = cfg.get("cal_stats", False)
    if cal_stats:
        num_samples = 0
        running_sum = running_var = 0.0
        running_sum_c = torch.zeros(model.out_channels, dtype=torch.float, device=device)
        running_var_c = torch.zeros(model.out_channels, dtype=torch.float, device=device)

    # prepare arguments
    save_fps = cfg.get("fps", 24) // cfg.get("frame_interval", 1)

    # == output directories (created once, main process only) ==
    save_dir = cfg.get("save_dir", None)
    save_videos = is_main_process() and save_dir is not None
    if save_videos:
        ori_dir = f"{save_dir}_ori"
        rec_dir = f"{save_dir}_rec"
        ref_dir = f"{save_dir}_spatial"
        os.makedirs(ori_dir, exist_ok=True)
        os.makedirs(rec_dir, exist_ok=True)
        os.makedirs(ref_dir, exist_ok=True)

    # Iter over the dataset
    with tqdm(
        range(total_steps),
        disable=not is_main_process() or verbose < 1,
        total=total_steps,
        initial=0,
    ) as pbar:
        for step in pbar:
            batch = next(dataiter)
            x = batch["video"].to(device, dtype)  # [B, C, T, H, W]

            # == vae encoding & decoding ===
            z, posterior, x_z = model.encode(x)
            x_rec, x_z_rec = model.decode(z, num_frames=x.size(2))
            x_ref = model.spatial_vae.decode(x_z)

            # == check z shape ==
            input_size = x.shape[2:]
            latent_size = model.get_latent_size(input_size)
            assert list(z.shape[2:]) == latent_size, f"z shape: {z.shape}, latent_size: {latent_size}"

            # == calculate stats ==
            if cal_stats:
                # Deviations are measured against the running mean estimate
                # available at this step, not against the final mean.
                num_samples += 1
                running_sum += z.mean().item()
                running_var += (z - running_sum / num_samples).pow(2).mean().item()
                running_sum_c += z.mean(dim=(0, 2, 3, 4)).float()
                running_var_c += (
                    (z - running_sum_c[None, :, None, None, None] / num_samples).pow(2).mean(dim=(0, 2, 3, 4)).float()
                )
                if verbose >= 1:
                    pbar.set_postfix(
                        {
                            "mean": running_sum / num_samples,
                            "std": (running_var / num_samples) ** 0.5,
                        }
                    )
                if num_samples % cfg.get("log_stats_every", 100) == 0:
                    # The second value is square-rooted, so it is labelled "std".
                    logger.info(
                        "VAE feature per channel stats: mean %s, std %s",
                        (running_sum_c / num_samples).cpu().tolist(),
                        (running_var_c / num_samples).sqrt().cpu().tolist(),
                    )

            # == loss calculation ==
            nll_loss, weighted_nll_loss, weighted_kl_loss = vae_loss_fn(x, x_rec, posterior)
            nll_loss_z, _, _ = vae_loss_fn(x_z, x_z_rec, posterior, no_perceptual=True)
            vae_loss = weighted_nll_loss + weighted_kl_loss
            loss_steps += 1
            # Incremental mean: new_mean = value / n + old_mean * (n - 1) / n
            running_loss = vae_loss.item() / loss_steps + running_loss * ((loss_steps - 1) / loss_steps)
            running_nll = nll_loss.item() / loss_steps + running_nll * ((loss_steps - 1) / loss_steps)
            running_nll_z = nll_loss_z.item() / loss_steps + running_nll_z * ((loss_steps - 1) / loss_steps)

            # == save samples ==
            if save_videos:
                for idx, vid in enumerate(x):
                    pos = step * batch_size + idx
                    save_sample(vid, fps=save_fps, save_path=f"{ori_dir}/{pos:03d}", verbose=verbose >= 2)
                    save_sample(x_rec[idx], fps=save_fps, save_path=f"{rec_dir}/{pos:03d}", verbose=verbose >= 2)
                    save_sample(x_ref[idx], fps=save_fps, save_path=f"{ref_dir}/{pos:03d}", verbose=verbose >= 2)

    logger.info("VAE loss: %s", running_loss)
    logger.info("VAE nll loss: %s", running_nll)
    logger.info("VAE nll_z loss: %s", running_nll_z)


# Script entry point: run the VAE reconstruction only when executed directly.
if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/scripts/misc/extract_feat.py
================================================
import os
from pprint import pformat

import colossalai
import torch
import torch.distributed as dist
from tqdm import tqdm

from opensora.acceleration.parallel_states import get_data_parallel_group, set_data_parallel_group
from opensora.datasets.dataloader import prepare_dataloader
from opensora.registry import DATASETS, MODELS, build_module
from opensora.utils.config_utils import parse_configs, save_training_config
from opensora.utils.misc import FeatureSaver, Timer, create_logger, format_numel_str, get_model_numel, to_torch_dtype


def main():
    """Encode a dataset with the VAE (and optionally the text encoder) and save the features.

    Driven entirely by the parsed config. For every batch the video is encoded
    with ``vae.encode`` and moved to CPU; when ``save_text_features`` is set
    the captions are encoded as well (optionally passed through
    ``model.encode_text`` when ``save_compressed_text_features`` is set). Each
    resulting dict is handed to a ``FeatureSaver`` writing under
    ``<save_dir>/s<start_index>_e<end_index>``.
    """
    torch.set_grad_enabled(False)
    # ======================================================
    # 1. configs & runtime variables
    # ======================================================
    # == parse configs ==
    cfg = parse_configs(training=False)

    # == device and dtype ==
    assert torch.cuda.is_available(), "Training currently requires at least one GPU."
    cfg_dtype = cfg.get("dtype", "bf16")
    assert cfg_dtype in ["fp16", "bf16"], f"Unknown mixed precision {cfg_dtype}"
    dtype = to_torch_dtype(cfg_dtype)
    # A second, looser dtype validation used to follow here; it was implied by
    # the assertion above and recomputed the same dtype, so it was dropped.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # == colossalai init distributed env ==
    colossalai.launch_from_torch({})
    set_data_parallel_group(dist.group.WORLD)

    # == init logger ==
    logger = create_logger()
    logger.info("Configuration:\n %s", pformat(cfg.to_dict()))

    # ======================================================
    # 2. build dataset and dataloader
    # ======================================================
    logger.info("Building dataset...")
    # == build dataset ==
    dataset = build_module(cfg.dataset, DATASETS)
    logger.info("Dataset contains %s samples.", len(dataset))

    # == build dataloader ==
    dataloader_args = dict(
        dataset=dataset,
        batch_size=cfg.get("batch_size", None),
        num_workers=cfg.get("num_workers", 4),
        seed=cfg.get("seed", 1024),
        shuffle=True,
        drop_last=True,
        pin_memory=True,
        process_group=get_data_parallel_group(),
    )
    dataloader, _ = prepare_dataloader(
        bucket_config=cfg.get("bucket_config", None),
        num_bucket_build_workers=cfg.get("num_bucket_build_workers", 1),
        **dataloader_args,
    )
    num_steps_per_epoch = len(dataloader)

    # ======================================================
    # 3. build model
    # ======================================================
    logger.info("Building models...")
    # == build text-encoder and vae ==
    text_encoder = build_module(cfg.text_encoder, MODELS, device=device, dtype=dtype)
    vae = build_module(cfg.vae, MODELS).to(device, dtype).eval()

    # == build diffusion model ==
    # The diffusion model is only used below for ``encode_text`` when
    # compressed text features are requested.
    input_size = (dataset.num_frames, *dataset.image_size)
    latent_size = vae.get_latent_size(input_size)
    model = (
        build_module(
            cfg.model,
            MODELS,
            input_size=latent_size,
            in_channels=vae.out_channels,
            caption_channels=text_encoder.output_dim,
            model_max_length=text_encoder.model_max_length,
        )
        .to(device, dtype)
        .train()
    )
    model_numel, model_numel_trainable = get_model_numel(model)
    logger.info(
        "[Diffusion] Trainable model params: %s, Total model params: %s",
        format_numel_str(model_numel_trainable),
        format_numel_str(model_numel),
    )

    # =======================================================
    # 4. feature extraction loop
    # =======================================================
    # == global variables ==
    bin_size = cfg.bin_size
    save_text_features = cfg.get("save_text_features", False)
    save_compressed_text_features = cfg.get("save_compressed_text_features", False)

    # == number of bins ==
    num_bin = num_steps_per_epoch // bin_size
    logger.info("Number of batches: %s", num_steps_per_epoch)
    logger.info("Bin size: %s", bin_size)
    logger.info("Number of bins: %s", num_bin)

    # resume from a specific batch index
    start_index = cfg.get("start_index", 0)
    end_index = cfg.get("end_index", num_bin)
    # NOTE(review): start_index is used as a bin index everywhere else in this
    # function but is passed here as a micro-batch index — confirm whether the
    # sampler expects start_index * bin_size.
    dataloader.batch_sampler.load_state_dict({"last_micro_batch_access_index": start_index})
    num_bin_to_process = min(num_bin, end_index) - start_index
    logger.info("Start index: %s", start_index)
    logger.info("End index: %s", end_index)
    logger.info("Number of batches to process: %s", num_bin_to_process)

    # create save directory
    assert cfg.get("save_dir", None) is not None, "Please specify the save_dir in the config file."
    save_dir = os.path.join(cfg.save_dir, f"s{start_index}_e{end_index}")
    os.makedirs(save_dir, exist_ok=True)
    save_training_config(cfg.to_dict(), save_dir)
    logger.info("Saving features to %s", save_dir)

    saver = FeatureSaver(save_dir, bin_size, start_bin=start_index)

    # == extraction loop: bin_size batches per bin ==
    dataloader_iter = iter(dataloader)
    log_time = cfg.get("log_time", False)
    for i in tqdm(range(0, num_bin_to_process * bin_size)):
        with Timer("step", log=log_time):
            with Timer("data loading", log=log_time):
                batch = next(dataloader_iter)
                x = batch.pop("video").to(device, dtype)  # [B, C, T, H, W]
                y = batch.pop("text")

            with Timer("vae", log=log_time):
                x = vae.encode(x)
            with Timer("feature to cpu", log=log_time):
                x = x.cpu()

            batch_dict = {
                "x": x,
                "text": y,
                "fps": batch["fps"].to(dtype),
                "height": batch["height"].to(dtype),
                "width": batch["width"].to(dtype),
                "num_frames": batch["num_frames"].to(dtype),
            }

            if save_text_features:
                with Timer("text", log=log_time):
                    text_infos = text_encoder.encode(y)
                    y_feat = text_infos["y"]
                    y_mask = text_infos["mask"]
                    if save_compressed_text_features:
                        y_feat, y_mask = model.encode_text(y_feat, y_mask)
                        # Converted so that .cpu() below works on the mask.
                        y_mask = torch.tensor(y_mask)
                with Timer("feature to cpu", log=log_time):
                    y_feat = y_feat.cpu()
                    y_mask = y_mask.cpu()
                batch_dict.update({"y": y_feat, "mask": y_mask})

            saver.update(batch_dict)


# Script entry point: run the feature extraction only when executed directly.
if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/scripts/misc/launch_extract_feat.sh
================================================
#!/bin/bash
# Launch one background extract_feat.py process per GPU. GPU i handles the
# index range [START_SPLIT + i * NUM_SPLIT, START_SPLIT + (i + 1) * NUM_SPLIT);
# its output goes to logs/extract_feat/<i>.log.
#
# Usage: launch_extract_feat.sh DATA_PATH SAVE_PATH

set -x
set -e

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 DATA_PATH SAVE_PATH" >&2
    exit 1
fi

START_SPLIT=0
NUM_SPLIT=10

DATA_PATH=$1
SAVE_PATH=$2

# Keep the command in an array so paths containing whitespace are passed
# through as single arguments instead of being word-split.
CMD=(
    torchrun --standalone --nproc_per_node 1
    scripts/misc/extract_feat.py configs/opensora-v1-2/misc/extract.py
    --data-path "$DATA_PATH"
    --save-dir "$SAVE_PATH"
)
declare -a GPUS=(0 1 2 3 4 5 6 7)

mkdir -p logs/extract_feat

for i in "${GPUS[@]}"; do
    CUDA_VISIBLE_DEVICES=$i "${CMD[@]}" \
        --start-index $((START_SPLIT + i * NUM_SPLIT)) \
        --end-index $((START_SPLIT + (i + 1) * NUM_SPLIT)) \
        >"logs/extract_feat/$i.log" 2>&1 &
done


================================================
FILE: Open-Sora/setup.py
================================================
from typing import List

from setuptools import find_packages, setup


def fetch_requirements(paths) -> List[str]:
    """
    Read one or more requirements files.

    Args:
        paths (str | list | tuple): a single path, or a list/tuple of paths,
            to requirements files.

    Returns:
        The stripped requirement lines of all files, in order. Blank lines and
        lines starting with ``#`` are skipped.
    """
    if not isinstance(paths, (list, tuple)):
        paths = [paths]
    requirements = []
    for path in paths:
        with open(path, "r", encoding="utf-8") as fd:
            for raw_line in fd:
                line = raw_line.strip()
                # Blank lines and full-line comments are not requirements.
                if line and not line.startswith("#"):
                    requirements.append(line)
    return requirements


def fetch_readme() -> str:
    """
    Load README.md from the current working directory.

    Returns:
        The full text of the file, decoded as UTF-8.
    """
    with open("README.md", encoding="utf-8") as readme_file:
        contents = readme_file.read()
    return contents


# Top-level directories left out of the package discovery below.
_excluded_packages = (
    "assets",
    "cache",
    "configs",
    "docs",
    "eval",
    "evaluation_results",
    "gradio",
    "logs",
    "notebooks",
    "outputs",
    "pretrained_models",
    "samples",
    "scripts",
    "tests",
    "tools",
    "*.egg-info",
)
_packages = find_packages(exclude=_excluded_packages)

# File-backed metadata, read in the same order as the keyword arguments use it.
_long_description = fetch_readme()
_install_requires = fetch_requirements("requirements/requirements.txt")
_extras_require = {
    "data": fetch_requirements("requirements/requirements-data.txt"),
    "eval": fetch_requirements("requirements/requirements-eval.txt"),
    "vae": fetch_requirements("requirements/requirements-vae.txt"),
    # "full" combines the data and eval requirement files.
    "full": fetch_requirements(
        [
            "requirements/requirements-data.txt",
            "requirements/requirements-eval.txt",
        ]
    ),
}

_project_urls = {
    "Bug Tracker": "https://github.com/hpcaitech/Open-Sora/issues",
    "Examples": "https://hpcaitech.github.io/Open-Sora/",
    "Documentation": "https://github.com/hpcaitech/Open-Sora?tab=readme-ov-file",
    "Github": "https://github.com/hpcaitech/Open-Sora",
}

_classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: Apache Software License",
    "Environment :: GPU :: NVIDIA CUDA",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: System :: Distributed Computing",
]

setup(
    name="opensora",
    version="1.2.0",
    packages=_packages,
    description="Democratizing Efficient Video Production for All",
    long_description=_long_description,
    long_description_content_type="text/markdown",
    license="Apache Software License 2.0",
    url="https://github.com/hpcaitech/Open-Sora",
    project_urls=_project_urls,
    install_requires=_install_requires,
    python_requires=">=3.6",
    classifiers=_classifiers,
    extras_require=_extras_require,
)


================================================
FILE: Open-Sora/tests/test_attn.py
================================================
import torch
from colossalai.accelerator import get_accelerator
from colossalai.utils import get_current_device
from rotary_embedding_torch import RotaryEmbedding

from opensora.models.layers.blocks import Attention

# Benchmark sizes. The input tensor is created as (B, S, H); H and N are the
# first two positional arguments of Attention, and D is the size passed to
# RotaryEmbedding (note H == N * D). Disabled alternative presets:
# B, S, H = 7488, 1, 1152
# B, S, H = 32, 234, 1152
B, S, H = 128, 32, 1152
N, D = 16, 72


def run_attn(enable_flash_attn: bool):
    """Run one forward/backward pass through ``Attention`` and print the peak memory.

    Args:
        enable_flash_attn: forwarded to ``Attention`` as ``enable_flash_attn``.
    """
    get_accelerator().reset_peak_memory_stats()

    # Every module and tensor lives on the current device in bfloat16.
    placement = {"device": get_current_device(), "dtype": torch.bfloat16}

    rotary = RotaryEmbedding(D).to(**placement)
    attention = Attention(
        H,
        N,
        qkv_bias=True,
        rope=rotary.rotate_queries_or_keys,
        enable_flash_attn=enable_flash_attn,
    ).to(**placement)

    inputs = torch.randn(B, S, H, **placement).requires_grad_()
    outputs = attention(inputs)
    outputs.mean().backward()

    peak_mb = get_accelerator().max_memory_allocated() / 1024**2
    print(f"Peak memory: {peak_mb:.2f} MB")


# When run as a script, measure peak memory with flash attention enabled and
# then disabled, printing a label before each run.
if __name__ == "__main__":
    print("Use flashattn")
    run_attn(True)
    print("No flashattn")
    run_attn(False)


================================================
FILE: Open-Sora/tests/test_lr_scheduler.py
================================================
import torch
from torch.optim import Adam
from torchvision.models import resnet50
from tqdm import tqdm

from opensora.utils.lr_scheduler import LinearWarmupLR


def test_lr_scheduler():
    """Check that ``LinearWarmupLR`` rises during warmup and then stays at the base rate.

    Trains a ResNet-50 on one random batch for twice the warmup length. During
    the first ``warmup_steps`` iterations the learning rate must strictly
    increase; afterwards it must equal the optimizer's base rate of 0.01.
    """
    warmup_steps = 200
    base_lr = 0.01

    net = resnet50().cuda()
    optimizer = Adam(net.parameters(), lr=base_lr)
    scheduler = LinearWarmupLR(optimizer, warmup_steps=warmup_steps)
    previous_lr = scheduler.get_lr()[0]
    sample = torch.rand(1, 3, 224, 224).cuda()

    for step in tqdm(range(warmup_steps * 2)):
        prediction = net(sample)
        prediction.mean().backward()
        optimizer.step()
        scheduler.step()

        if step >= warmup_steps:
            # Warmup is over: the rate must have reached the base value.
            assert scheduler.get_lr()[0] == base_lr
            continue

        # Still warming up: every step must raise the rate.
        assert scheduler.get_lr()[0] > previous_lr, f"{scheduler.get_lr()[0]} <= {previous_lr}"
        previous_lr = scheduler.get_lr()[0]


# Allow running the scheduler check directly as a script.
if __name__ == "__main__":
    test_lr_scheduler()


================================================
FILE: Open-Sora/tools/__init__.py
================================================


================================================
FILE: Open-Sora/tools/caption/README.md
================================================
# Video Captioning

Human labeling of videos is expensive and time-consuming. We adopt powerful image captioning models to generate captions for videos. Although GPT-4V achieves a better performance, its 20s/sample speed is too slow for us. As for our v1.2 model, we captioned our training videos with the [PLLaVA](https://github.com/magic-research/PLLaVA) model. PLLaVA performs highly competitively on multiple video-based text generation benchmarks including [MVbench](https://paperswithcode.com/sota/video-question-answering-on-mvbench?p=pllava-parameter-free-llava-extension-from-1).

## PLLaVA Captioning

To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video. We accelerate its inference via (1) batching and (2) offload frame extraction to a separate process such that the GPU computations and frame extraction happen in parallel.

### Installation
Install the required dependencies by following the "Data Dependencies" and "PLLaVA Captioning" sections of our [installation instructions](../../docs/installation.md).


<!-- ### Download the PLLaVA repo

First, make sure you are under the directory of tools/caption/pllava_dir. Then,

```bash
git clone https://github.com/magic-research/PLLaVA.git
cd PLLaVA
git checkout fd9194a


```

### Environment

```bash
conda create -n pllava python=3.10

conda activate pllava

pip install -r requirements.txt # change to your own torch version if neccessary; torch==2.2.2, torchaudio==2.2.2, torchvision==0.17.2 worked for H100 for Tom.

```


### Download weights

```bash
python python_scripts/hf.py # download the weights
``` -->
### Usage

Since PLLaVA is not fashioned as a package, we will use PYTHONPATH to use it.


```bash
cd .. # step back to pllava_dir

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
PYTHONPATH="$PYTHONPATH:OPEN_SORA_HOME/tools/caption/pllava_dir/PLLaVA" \
nohup python caption_pllava.py \
  --pretrained_model_name_or_path PLLaVA/MODELS/pllava-13b \
  --use_lora \
  --lora_alpha 4 \
  --num_frames 4 \
  --weight_dir PLLaVA/MODELS/pllava-13b \
  --csv_path meta.csv \
  --pooling_shape 4-12-12 \
  > pllava_caption.out 2>&1 &
```

### PLLaVA vs. LLaVA
In our previous releases, we used [LLaVA](#llava-captioning) for video captioning.
Qualitatively speaking, we observe that PLLaVA has a somewhat higher chance of accurately capturing the details in the video than LLaVA. See below for their comparison on a video sample.
<!-- <img src="../../assets/readme/llava_vs_pllava_sample.gif" width="300" height="200" alt="LLaVA vs PLLaVA"> -->

<figure>
    <img src="../../assets/readme/llava_vs_pllava_sample.gif" width="300" height="200" alt="LLaVA vs PLLaVA">
</figure>


| LLaVA | PLLaVA |
|----------|----------|
| The video is a close-up shot of two gold wedding rings. The rings are placed on a (black surface)✅, casting a soft shadow beneath them. The rings are positioned in such a way that (they are facing each other)❌, creating a mirror image effect. The rings are (identical in size and design)✅, suggesting they are a pair. The lighting in the video is soft and diffused, highlighting the gold color of the rings and creating a warm and inviting atmosphere. The overall style of the video is minimalist and elegant, focusing solely on the rings and their reflection.	| The video shows a pair of gold wedding rings on a (reflective surface)✅. The rings are placed one on top of the other, (with the top ring slightly tilted to the left)✅. The rings have a (shiny, metallic finish)✅ and are the main focus of the image. The background is a gradient of dark to light gray, providing a neutral backdrop that highlights the rings. There are no texts or other objects in the image. The style of the video is a simple product display with a focus on the rings, likely intended for promotional or sales purposes. The lighting and shadows suggest a soft, even light source, (possibly a studio light)✅, which creates a reflective surface beneath the rings.|
<!-- |Row2Cell1|Row2Cell2| -->


## LLaVA Captioning

We extract three frames from the video for captioning. With batch inference, we can achieve a 10x speedup. With approximately 720p resolution and 1 frame, the speed is 2~3 videos/s on 8 GPUs. If we resize the smaller side to 336, the speed can be 8 videos/s. In Open-Sora v1.1, to lower the cost, we use the 7B model.

### Installation

Install the required dependencies by following the "Data Dependencies" and "LLaVA Captioning" sections of our [installation instructions](../../docs/installation.md).

<!-- ### Requirement

```bash
# create conda env
conda create -n llava python=3.10 -y
conda activate llava

# install torch
pip install torch torchvision

# clone llava
git clone https://github.com/haotian-liu/LLaVA.git
cd LLaVA
# CAUTION: This line is to remove torch dependency in pyproject.toml, which is:
# "torch==2.1.2", "torchvision==0.16.2",
# It is better manually remove it in your local pyproject.toml
sed -i '16d' pyproject.toml

# install llava
pip install --upgrade pip  # enable PEP 660 support
pip install -e .

# install flash attention
pip install flash-attn --no-build-isolation
# install colossalai and decord
pip install colossalai decord
``` -->

### Usage

Prepare a csv file for processing. The csv file can be generated by `convert_dataset.py` according to its [documentation](/tools/datasets/README.md). Then, run the following command to generate captions for videos/images with Llava:

```bash
# caption with mistral-7B
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video

# caption with llava-34B
# NOTE: remember to enable flash attention for this model
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --dp-size 4 --tp-size 2 --model-path liuhaotian/llava-v1.6-34b --prompt image-3ex --flash-attention

# we run this on 8xH800 GPUs
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --tp-size 2 --dp-size 4 --bs 16

# at least two 80G GPUs are required
torchrun --nproc_per_node 2 --standalone -m tools.caption.caption_llava DATA.csv --tp-size 2 --dp-size 1 --bs 16

# can also caption images
torchrun --nproc_per_node 2 --standalone -m tools.caption.caption_llava DATA.csv --tp-size 2 --dp-size 1 --bs 16 --prompt image-3ex
```

Please note that you should add the `--flash-attention` flag when running Llama-based LLaVA models, as it provides a speedup, but turn it off for Mistral-based ones. The reasons can be found in [this issue](https://discuss.huggingface.co/t/flash-attention-has-no-effect-on-inference/73453).

After running the script, with `dp-size=N`, you will get `N` parts of csv files. Run the following command to merge them:

```bash
python -m tools.datasets.datautil DATA_caption_part*.csv --output DATA_caption.csv
```

### Resume

Sometimes the process may be interrupted. We can resume the process by running the following command:

```bash
# merge generated results
python -m tools.datasets.datautil DATA_caption_part*.csv --output DATA_caption.csv

# get the remaining videos
python -m tools.datasets.datautil DATA.csv --difference DATA_caption.csv --output DATA_remaining.csv
```

Then use the output csv file to resume the process.


## GPT-4V Captioning

Run the following command to generate captions for videos with GPT-4V:

```bash
# output: DATA_caption.csv
python -m tools.caption.caption_gpt4 DATA.csv --key $OPENAI_API_KEY
```

The cost is approximately $0.01 per video (3 frames per video).

## Camera Motion Detection

<!-- Install additional required packages: `tools/caption/camera_motion/requirements.txt`. -->
Install required packages with `pip install -v .[data]` (See [installation.md](../../docs/installation.md)).
Run the following command to classify camera motion:

```bash
# output: meta_cmotion.csv
python -m tools.caption.camera_motion.detect tools/caption/camera_motion/meta.csv
```

You may additionally specify `threshold` to indicate how "sensitive" the detection should be as below. For example `threshold = 0.2` means that the video is only counted as `tilt_up` when the pixels moved down by `>20%` of video height between the starting and ending frames.
```bash
# output: meta_cmotion.csv
python -m tools.caption.camera_motion.detect tools/caption/camera_motion/meta.csv --threshold 0.2
```

Each video is classified according to 8 categories:
            `pan_right,
            pan_left,
            tilt_up,
            tilt_down,
            zoom_in,
            zoom_out,
            static,
            unclassified`.
Categories of `tilt`, `pan` and `zoom` can overlap with each other.


## Tagging with Llama3

To understand the overall category distribution of our training dataset, we use Llama3 to generate tags based on the video captions.

After obtaining Llama3 usage permission from huggingface/meta, you may generate tags based on the captions using Llama3 like this:

```bash
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llama3 meta.csv --key objects --output_prefix meta
```

This will generate tags based on the `text` column of `meta.csv` and put the results to `output_prefix + key.csv`. Currently the prompts for `objects` and `actions` are supported.


================================================
FILE: Open-Sora/tools/caption/__init__.py
================================================


================================================
FILE: Open-Sora/tools/caption/acceleration/__init__.py
================================================


================================================
FILE: Open-Sora/tools/caption/acceleration/llava/__init__.py
================================================


================================================
FILE: Open-Sora/tools/caption/acceleration/llava/policies/__init__.py
================================================
from .llama import LlavaLlamaForCausalLMPolicy
from .mistral import LlavaMistralForCausalLMPolicy


================================================
FILE: Open-Sora/tools/caption/acceleration/llava/policies/llama.py
================================================
from typing import Dict, Union

import torch.nn as nn
from colossalai.shardformer.layer import Linear1D_Col, Linear1D_Row
from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription

__all__ = ["LlavaLlamaPolicy", "LlavaLlamaForCausalLMPolicy"]


class LlavaLlamaPolicy(Policy):
    """Shard policy that tensor-parallelizes the ``LlamaDecoderLayer`` modules of a model."""

    def config_sanity_check(self):
        """No configuration checks are performed."""
        pass

    def preprocess(self):
        """Return the model unchanged.

        Resizing the token embeddings to a multiple of the tensor-parallel
        size is disabled; the original logic is kept below for reference.
        """
        # if self.shard_config.enable_tensor_parallelism:
        #     vocab_size = self.model.config.vocab_size
        #     world_size = self.shard_config.tensor_parallel_size
        #     if vocab_size % world_size != 0:
        #         new_vocab_size = vocab_size + world_size - vocab_size % world_size
        #         self.model.resize_token_embeddings(new_vocab_size)
        return self.model

    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
        """Build the module replacement policy.

        Returns:
            An empty dict when tensor parallelism is disabled. Otherwise a dict
            with one entry for ``LlamaDecoderLayer`` that divides the attention
            size attributes by the tensor-parallel size and replaces the
            attention and MLP projections with column-/row-parallel linears.
        """
        from transformers.models.llama.modeling_llama import LlamaDecoderLayer

        policy = {}

        if self.shard_config.enable_tensor_parallelism:
            tp_size = self.shard_config.tensor_parallel_size
            config = self.model.config

            # Each rank holds 1/tp_size of the hidden size and of the heads.
            decoder_attribute_replacement = {
                "self_attn.hidden_size": config.hidden_size // tp_size,
                "self_attn.num_heads": config.num_attention_heads // tp_size,
            }
            # Only present (and truthy) on configs that define key/value heads.
            if getattr(config, "num_key_value_heads", False):
                decoder_attribute_replacement["self_attn.num_key_value_heads"] = (
                    config.num_key_value_heads // tp_size
                )

            # q/k/v, gate and up projections are split by column; the output
            # and down projections by row.
            policy[LlamaDecoderLayer] = ModulePolicyDescription(
                attribute_replacement=decoder_attribute_replacement,
                sub_module_replacement=[
                    SubModuleReplacementDescription(
                        suffix="self_attn.q_proj",
                        target_module=Linear1D_Col,
                    ),
                    SubModuleReplacementDescription(
                        suffix="self_attn.k_proj",
                        target_module=Linear1D_Col,
                    ),
                    SubModuleReplacementDescription(
                        suffix="self_attn.v_proj",
                        target_module=Linear1D_Col,
                    ),
                    SubModuleReplacementDescription(
                        suffix="self_attn.o_proj",
                        target_module=Linear1D_Row,
                    ),
                    SubModuleReplacementDescription(
                        suffix="mlp.gate_proj",
                        target_module=Linear1D_Col,
                    ),
                    SubModuleReplacementDescription(
                        suffix="mlp.up_proj",
                        target_module=Linear1D_Col,
                    ),
                    SubModuleReplacementDescription(
                        suffix="mlp.down_proj",
                        target_module=Linear1D_Row,
                    ),
                ],
            )

        return policy

    def postprocess(self):
        """Return the model unchanged."""
        return self.model


class LlavaLlamaForCausalLMPolicy(LlavaLlamaPolicy):
    """``LlavaLlamaPolicy`` plus a replacement rule for the ``lm_head`` of ``LlamaForCausalLM``."""

    def module_policy(self):
        """Return the parent policy, extended with an ``lm_head`` entry under tensor parallelism.

        When ``shard_config.enable_tensor_parallelism`` is set, ``lm_head`` of
        ``LlamaForCausalLM`` is mapped to ``Linear1D_Col`` with
        ``gather_output=True``; otherwise the parent policy is returned as-is.
        """
        from transformers import LlamaForCausalLM

        policy = super().module_policy()
        if not self.shard_config.enable_tensor_parallelism:
            return policy

        # Add the entry for the causal LM head.
        lm_head_rule = SubModuleReplacementDescription(
            suffix="lm_head",
            target_module=Linear1D_Col,
            kwargs={"gather_output": True},
        )
        policy[LlamaForCausalLM] = ModulePolicyDescription(sub_module_replacement=[lm_head_rule])
        return policy


================================================
FILE: Open-Sora/tools/caption/acceleration/llava/policies/mistral.py
================================================
import warnings
from typing import Dict, Union

import torch.nn as nn
from colossalai.shardformer.layer import Linear1D_Col, Linear1D_Row, VocabParallelEmbedding1D
from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription

__all__ = ["LlavaMistralPolicy", "LlavaMistralForCausalLMPolicy"]


class LlavaMistralPolicy(Policy):
    """Shardformer policy describing tensor-parallel replacements for Mistral modules.

    Under tensor parallelism the attention/MLP projections of every
    ``MistralDecoderLayer`` are swapped for ``Linear1D_Col`` / ``Linear1D_Row``
    and ``embed_tokens`` of ``MistralModel`` for ``VocabParallelEmbedding1D``.
    """

    def config_sanity_check(self):
        """Perform no configuration checks."""
        pass

    def preprocess(self):
        """Grow the token embedding to a multiple of the tensor-parallel size.

        Only acts when tensor parallelism is enabled and the vocabulary size is
        not already divisible by ``tensor_parallel_size``. Returns the model.
        """
        if self.shard_config.enable_tensor_parallelism:
            vocab_size = self.model.config.vocab_size
            tp_size = self.shard_config.tensor_parallel_size
            remainder = vocab_size % tp_size

            if remainder != 0:
                # Round up to the next multiple of the tensor-parallel size.
                self.model.resize_token_embeddings(vocab_size + tp_size - remainder)

        return self.model

    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
        """Build the mapping from module class to its replacement description.

        Sequence parallelism is switched off (with a warning) if it was
        requested. Without tensor parallelism the returned mapping is empty.
        """
        from transformers.models.mistral.modeling_mistral import MistralDecoderLayer, MistralModel

        policy = {}

        if self.shard_config.enable_sequence_parallelism:
            self.shard_config.enable_sequence_parallelism = False
            warnings.warn(
                "Mistral doesn't support sequence parallelism now, will ignore the sequence parallelism flag."
            )

        if not self.shard_config.enable_tensor_parallelism:
            return policy

        config = self.model.config
        tp_size = self.shard_config.tensor_parallel_size

        # Per-rank attention attributes: each value is divided by the tensor-parallel size.
        per_rank_attributes = {
            "self_attn.hidden_size": config.hidden_size // tp_size,
            "self_attn.num_heads": config.num_attention_heads // tp_size,
            "self_attn.num_key_value_heads": config.num_key_value_heads // tp_size,
        }

        # (sub-module suffix, parallel replacement class), in registration order.
        layer_targets = (
            ("self_attn.q_proj", Linear1D_Col),
            ("self_attn.k_proj", Linear1D_Col),
            ("self_attn.v_proj", Linear1D_Col),
            ("self_attn.o_proj", Linear1D_Row),
            ("mlp.gate_proj", Linear1D_Col),
            ("mlp.up_proj", Linear1D_Col),
            ("mlp.down_proj", Linear1D_Row),
        )

        policy[MistralDecoderLayer] = ModulePolicyDescription(
            attribute_replacement=per_rank_attributes,
            sub_module_replacement=[
                SubModuleReplacementDescription(suffix=suffix, target_module=target)
                for suffix, target in layer_targets
            ],
        )

        embedding_rule = SubModuleReplacementDescription(
            suffix="embed_tokens",
            target_module=VocabParallelEmbedding1D,
        )
        self.append_or_create_submodule_replacement(
            description=embedding_rule,
            policy=policy,
            target_key=MistralModel,
        )

        return policy

    def postprocess(self):
        """Return ``self.model`` unchanged."""
        return self.model


class LlavaMistralForCausalLMPolicy(LlavaMistralPolicy):
    """``LlavaMistralPolicy`` plus a replacement rule for the ``lm_head`` of ``MistralForCausalLM``."""

    def module_policy(self):
        """Return the parent policy, extended with an ``lm_head`` entry under tensor parallelism.

        When ``shard_config.enable_tensor_parallelism`` is set, ``lm_head`` of
        ``MistralForCausalLM`` is mapped to ``Linear1D_Col`` with
        ``gather_output=True``; otherwise the parent policy is returned as-is.
        """
        from transformers import MistralForCausalLM

        policy = super().module_policy()
        if not self.shard_config.enable_tensor_parallelism:
            return policy

        # Add the entry for the causal LM head.
        lm_head_rule = SubModuleReplacementDescription(
            suffix="lm_head",
            target_module=Linear1D_Col,
            kwargs={"gather_output": True},
        )
        policy[MistralForCausalLM] = ModulePolicyDescription(sub_module_replacement=[lm_head_rule])
        return policy


================================================
FILE: Open-Sora/tools/caption/camera_motion/__init__.py
================================================


================================================
FILE: Open-Sora/tools/caption/camera_motion/camera_motion.py
================================================
import os

import numpy as np
import torch

from .utils import load_video
from .visualizer import Visualizer


def transform(vector):
    """Average a sequence of 2-component items into a single ``[x, y]`` pair.

    Args:
        vector: Iterable of indexable items; only components 0 and 1 are read.

    Returns:
        A two-element list holding the mean of the first components and the
        mean of the second components.
    """
    first_components = [point[0] for point in vector]
    second_components = [point[1] for point in vector]
    return [np.mean(first_components), np.mean(second_components)]


class CameraPredict:
    """Classify the camera motion of a video from tracked grid points.

    A tracking model loaded through ``torch.hub`` follows a
    ``grid_size`` x ``grid_size`` grid of points. The displacement of points
    on the four grid edges between the first and the last frame is converted
    into direction labels, which are then mapped to motion labels such as
    ``"pan_left"`` or ``"zoom_in"``.
    """

    def __init__(self, device, submodules_list, factor=0.25):
        """Load the tracking model.

        Args:
            device: Device the model and the video tensor are moved to.
            submodules_list: Mapping providing the ``"repo"`` and ``"model"``
                arguments passed to ``torch.hub.load``.
            factor: Fraction of the shorter video side that a mean edge
                displacement must exceed to count as movement.
        """
        self.device = device
        self.grid_size = 10
        self.factor = factor
        try:
            self.model = torch.hub.load(submodules_list["repo"], submodules_list["model"]).to(self.device)
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C / SystemExit are not swallowed.
            # workaround for CERTIFICATE_VERIFY_FAILED (see: https://github.com/pytorch/pytorch/issues/33288#issuecomment-954160699)
            # NOTE(review): this replaces the default HTTPS context for the whole
            # process with an unverified one before retrying the download.
            import ssl

            ssl._create_default_https_context = ssl._create_unverified_context
            self.model = torch.hub.load(submodules_list["repo"], submodules_list["model"]).to(self.device)

    def infer(self, video_path, save_video=False, save_dir="./saved_videos"):
        """Track the point grid through the video at ``video_path``.

        Also stores ``min(height, width)`` of the video in ``self.scale`` for
        the later thresholding in ``get_edge_direction``.

        Args:
            video_path: Path handed to ``load_video``.
            save_video: When true, render the tracks with ``Visualizer``.
            save_dir: Output directory used when ``save_video`` is true.

        Returns:
            The predicted tracks of the first batch element, truncated to
            integers, as a NumPy array.
        """
        # load video
        video = load_video(video_path, return_tensor=False)
        # set scale
        height, width = video.shape[1], video.shape[2]
        self.scale = min(height, width)
        video = torch.from_numpy(video).permute(0, 3, 1, 2)[None].float().to(self.device)  # B T C H W
        pred_tracks, pred_visibility = self.model(video, grid_size=self.grid_size)  # B T N 2,  B T N 1

        if save_video:
            # splitext strips the real extension instead of assuming it is 4 characters long.
            video_name = os.path.splitext(os.path.basename(video_path))[0]
            vis = Visualizer(save_dir=save_dir, pad_value=120, linewidth=3)
            vis.visualize(video, pred_tracks, pred_visibility, filename=video_name)

        return pred_tracks[0].long().detach().cpu().numpy()

    def transform_class(self, vector, min_reso):  # 768*0.05
        """Turn a mean displacement into direction labels.

        Args:
            vector: ``(x, y)`` displacement.
            min_reso: Reference resolution; the threshold is
                ``min_reso * self.factor``.

        Returns:
            A list with at most one horizontal (``"right"``/``"left"``) and
            one vertical (``"down"``/``"up"``) label, or ``["static"]`` when
            neither component exceeds the threshold.
        """
        scale = min_reso * self.factor
        x, y = vector
        direction = []
        if x > scale:
            direction.append("right")
        elif x < -scale:
            direction.append("left")

        if y > scale:
            direction.append("down")
        elif y < -scale:
            direction.append("up")

        return direction if direction else ["static"]

    def get_edge_point(self, track):
        """Pick four points around the middle of each edge of the track grid.

        Args:
            track: Array indexed as ``track[row, col, :]`` with
                ``self.grid_size`` rows and columns.

        Returns:
            ``(top, down, left, right)``, each a list of four points taken
            from the first row, last row, first column and last column.
        """
        middle = self.grid_size // 2
        last = self.grid_size - 1
        centre_span = range(middle - 2, middle + 2)
        top = [list(track[0, i, :]) for i in centre_span]
        down = [list(track[last, i, :]) for i in centre_span]
        left = [list(track[i, 0, :]) for i in centre_span]
        right = [list(track[i, last, :]) for i in centre_span]

        return top, down, left, right

    def get_edge_direction(self, track1, track2):
        """Classify the movement of each grid edge between two track grids.

        Requires ``self.scale``, which ``infer`` sets.

        Returns:
            Four label lists (top, down, left, right) as produced by
            ``transform_class``.
        """
        edge_points1 = self.get_edge_point(track1)
        edge_points2 = self.get_edge_point(track2)

        vector_results = []
        for points1, points2 in zip(edge_points1, edge_points2):
            vectors = [[end[0] - start[0], end[1] - start[1]] for start, end in zip(points1, points2)]
            vector_results.append(vectors)
        vector_results = list(map(transform, vector_results))
        class_results = [self.transform_class(vector, min_reso=self.scale) for vector in vector_results]

        return class_results

    @staticmethod
    def _lookup_pairs(first, second, mapping):
        """Map every ``"<a>_<b>"`` combination of the two label lists through ``mapping``."""
        combos = [f"{a}_{b}" for a in first for b in second]
        matches = [mapping[combo] for combo in combos if combo in mapping]
        return matches if matches else ["None"]

    def classify_top_down(self, top, down):
        """Derive motion labels from the top and bottom edge directions.

        Returns:
            The matched motion labels, or ``["None"]`` when nothing matches.
        """
        results_mapping = {
            "left_left": "pan_right",
            "right_right": "pan_left",
            "down_down": "tilt_up",
            "up_up": "tilt_down",
            "up_down": "zoom_in",
            "down_up": "zoom_out",
            "static_static": "static",
        }
        return self._lookup_pairs(top, down, results_mapping)

    def classify_left_right(self, left, right):
        """Derive motion labels from the left and right edge directions.

        Returns:
            The matched motion labels, or ``["None"]`` when nothing matches.
        """
        results_mapping = {
            "left_left": "pan_right",
            "right_right": "pan_left",
            "down_down": "tilt_up",
            "up_up": "tilt_down",
            "left_right": "zoom_in",
            "right_left": "zoom_out",
            "static_static": "static",
        }
        return self._lookup_pairs(left, right, results_mapping)

    @staticmethod
    def _merge_results(top_results, left_results):
        """Combine both label lists into one de-duplicated, sorted list.

        ``"None"`` and ``"static"`` are dropped whenever another label is
        present; a lone ``"None"`` becomes ``"Undetermined"``. Sorting keeps
        the order (and any string joined from it) independent of set
        iteration order.
        """
        results = sorted(set(top_results + left_results))
        if "None" in results and len(results) > 1:
            results.remove("None")
        if "static" in results and len(results) > 1:
            results.remove("static")
        if len(results) == 1 and results[0] == "None":  # Tom added this to deal with edge cases
            results = ["Undetermined"]
        return results

    def camera_classify(self, track1, track2):
        """Return the motion labels implied by the change from ``track1`` to ``track2``."""
        top, down, left, right = self.get_edge_direction(track1, track2)

        top_results = self.classify_top_down(top, down)
        left_results = self.classify_left_right(left, right)

        return self._merge_results(top_results, left_results)

    def predict(self, video_path):
        """Track the video and classify motion between its first and last frame."""
        pred_track = self.infer(video_path)
        grid_shape = (self.grid_size, self.grid_size, 2)
        track1 = pred_track[0].reshape(grid_shape)
        track2 = pred_track[-1].reshape(grid_shape)
        results = self.camera_classify(track1, track2)
        return results


def compute_camera_motion(device, submodules_dict, video_paths, factor):
    """Predict camera motion for each video and return one label string per video.

    Args:
        device: Passed to ``CameraPredict``.
        submodules_dict: Passed to ``CameraPredict``.
        video_paths: Iterable of video paths.
        factor: Passed to ``CameraPredict``.

    Returns:
        A list in which each entry is the predicted motion labels for one
        video joined with ``"+"``.
    """
    predictor = CameraPredict(device, submodules_dict, factor)
    return ["+".join(predictor.predict(path)) for path in video_paths]


================================================
FILE: Open-Sora/tools/caption/camera_motion/detect.py
================================================
# Originally developed by https://github.com/Vchitect/VBench based on https://github.com/facebookresearch/co-tracker.

import argparse
from typing import List

import pandas as pd

from .camera_motion import compute_camera_motion


def process(paths: List[str], threshold: float) -> List[str]:
    """Run camera-motion detection on ``paths`` using co-tracker on ``"cuda"``.

    Args:
        paths: Video paths to analyse.
        threshold: Forwarded to ``compute_camera_motion`` as ``factor``.

    Returns:
        Whatever ``compute_camera_motion`` returns for the given paths.
    """
    return compute_camera_motion(
        "cuda",
        {"repo": "facebookresearch/co-tracker", "model": "cotracker2"},
        paths,
        factor=threshold,
    )


def cmotion_output_path(input_path):
    """Return the output CSV path for ``input_path``: its stem plus ``_cmotion.csv``.

    Only the final extension is stripped, so directory names containing
    ``.csv`` are left alone and the result never equals ``input_path``.
    """
    import os

    stem, _ = os.path.splitext(input_path)
    return f"{stem}_cmotion.csv"


def main(args):
    """Read the input CSV, add a ``cmotion`` column, and write it next to the input.

    Args:
        args: Namespace with ``input`` (CSV path; the file must have a
            ``path`` column) and ``threshold`` (forwarded to ``process``).
    """
    # Previously built with str.replace, which overwrote the input file when
    # the path did not contain ".csv".
    output_file = cmotion_output_path(args.input)
    data = pd.read_csv(args.input)
    data["cmotion"] = process(data["path"], args.threshold)
    data.to_csv(output_file, index=False)
    print(f"Output saved to {output_file}")


if __name__ == "__main__":
    # Command line: positional input CSV path and an optional --threshold.
    cli = argparse.ArgumentParser()
    cli.add_argument("input", type=str)
    cli.add_argument("--threshold", type=float, default=0.25)
    main(cli.parse_args())


================================================
FILE: Open-Sora/tools/caption/camera_motion/requirements.txt
================================================
decord
ptvsd
imageio-ffmpeg


================================================
FILE: Open-Sora/tools/caption/camera_motion/utils.py
================================================
import numpy as np
import torch
from decord import VideoReader
from PIL import Image, ImageSequence


def get_frame_indices(num_frames, vlen, sample="rand", fix_start=None, input_fps=1, max_num_frames=-1):
    """Select the frame indices to read from a video of ``vlen`` frames.

    Args:
        num_frames: Number of indices wanted for ``"rand"`` / ``"middle"``.
        vlen: Total number of frames in the video.
        sample: ``"rand"`` (one random frame per interval), ``"middle"``
            (centre of each interval) or a string containing ``"fps"`` whose
            characters from position 3 on are the output rate, e.g.
            ``"fps0.5"``.
        fix_start: With ``sample="middle"``, offset added to each interval
            start instead of taking the interval centre.
        input_fps: Frame rate of the source video (``"fps"`` mode only).
        max_num_frames: If positive, cap on the number of indices returned in
            ``"fps"`` mode.

    Returns:
        A list of frame indices. In ``"rand"`` / ``"middle"`` mode the list is
        padded with its last index up to ``num_frames`` entries.

    Raises:
        ValueError: If ``sample`` is not one of the supported modes.
    """
    # Local import: this module does not import ``random`` at file level, so the
    # "rand" branch previously always failed and silently used the fallback.
    import random

    if sample in ["rand", "middle"]:  # uniform sampling
        acc_samples = min(num_frames, vlen)
        # split the video into `acc_samples` intervals, and sample from each interval.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = [(start, end - 1) for start, end in zip(intervals[:-1], intervals[1:])]
        if sample == "rand":
            try:
                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
            except IndexError:
                # An interval was empty (random.choice on an empty range):
                # fall back to a sorted random subset of all frames.
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [x[0] + fix_start for x in ranges]
        elif sample == "middle":
            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
        else:
            raise NotImplementedError

        if len(frame_indices) < num_frames:  # padded with last frame
            padded_frame_indices = [frame_indices[-1]] * num_frames
            padded_frame_indices[: len(frame_indices)] = frame_indices
            frame_indices = padded_frame_indices
    elif "fps" in sample:  # fps0.5, sequentially sample frames at 0.5 fps
        output_fps = float(sample[3:])
        duration = float(vlen) / input_fps
        delta = 1 / output_fps  # gap between frames, this is also the clip length each frame represents
        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
            frame_indices = frame_indices[:max_num_frames]
    else:
        raise ValueError
    return frame_indices


def load_video(video_path, data_transform=None, num_frames=None, return_tensor=True, width=None, height=None):
    """
    Load a video (or single image) from ``video_path`` as a stack of RGB frames.

    The format is chosen from the file extension: ``.gif`` (every frame),
    ``.png`` (one frame) or ``.mp4`` (all frames, decoded with decord's
    ``VideoReader``).

    Parameters:
    - video_path (str): Path of the file to load.
    - data_transform (callable, optional): If given, it is applied to the
      frame array and its result is returned as-is.
    - num_frames (int, optional): If given, frames are subsampled with
      ``get_frame_indices(num_frames, len(frames), sample="middle")``.
    - return_tensor (bool): Only used when ``data_transform`` is not given.
      If true, the frames are wrapped with ``torch.Tensor(...)`` and permuted
      to (T, C, H, W).
    - width, height (int, optional): Passed to ``VideoReader`` for ``.mp4``
      input, and only when ``width`` is truthy.

    Returns:
    - frames: the result of ``data_transform`` if one was given; otherwise a
      (T, C, H, W) tensor when ``return_tensor`` is true, or the uint8 NumPy
      array of shape (T, H, W, C) when it is false.

    Raises:
    - NotImplementedError: If the file extension is not one of the above.
    """
    if video_path.endswith(".gif"):
        # The context manager closes the underlying file once all frames are copied out.
        with Image.open(video_path) as img:
            frame_ls = [
                np.array(frame.convert("RGB")).astype(np.uint8) for frame in ImageSequence.Iterator(img)
            ]
        buffer = np.array(frame_ls).astype(np.uint8)
    elif video_path.endswith(".png"):
        with Image.open(video_path) as img:
            frame = np.array(img.convert("RGB")).astype(np.uint8)
        buffer = np.array([frame])
    elif video_path.endswith(".mp4"):
        import decord

        decord.bridge.set_bridge("native")
        if width:
            video_reader = VideoReader(video_path, width=width, height=height, num_threads=1)
        else:
            video_reader = VideoReader(video_path, num_threads=1)
        frames = video_reader.get_batch(range(len(video_reader)))  # (T, H, W, C)

        buffer = frames.asnumpy().astype(np.uint8)
    else:
        raise NotImplementedError

    frames = buffer
    if num_frames:
        frame_indices = get_frame_indices(num_frames, len(frames), sample="middle")
        frames = frames[frame_indices]

    if data_transform:
        frames = data_transform(frames)
    elif return_tensor:
        # NOTE(review): torch.Tensor(...) builds a tensor of the default tensor
        # type, so the result is presumably float rather than uint8 -- confirm.
        frames = torch.Tensor(frames)
        frames = frames.permute(0, 3, 1, 2)  # (T, C, H, W)

    return frames


================================================
FILE: Open-Sora/tools/caption/camera_motion/visualizer.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the cotracker github repo. https://github.com/facebookresearch/co-tracker.
import os

import imageio
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
from matplotlib import cm
from PIL import Image, ImageDraw


def read_video_from_path(path):
    """Read every frame of the video at ``path`` into one stacked array.

    Returns:
        ``np.stack`` of all frames, or ``None`` (after printing the error)
        when the reader cannot be opened.
    """
    try:
        reader = imageio.get_reader(path)
    except Exception as e:
        print("Error opening video file: ", e)
        return None
    return np.stack([np.array(im) for im in reader])


def draw_circle(rgb, coord, radius, color=(255, 0, 0), visible=True):
    """Draw a circle centred on ``coord`` onto the image ``rgb`` and return it.

    The outline always uses ``color``; the circle is filled with ``color``
    only when ``visible`` is truthy.
    """
    centre_x, centre_y = coord[0], coord[1]
    # Bounding box of the circle: upper-left and lower-right corners.
    bounding_box = [
        (centre_x - radius, centre_y - radius),
        (centre_x + radius, centre_y + radius),
    ]
    fill_color = tuple(color) if visible else None
    canvas = ImageDraw.Draw(rgb)
    canvas.ellipse(bounding_box, fill=fill_color, outline=tuple(color))
    return rgb


def draw_line(rgb, coord_y, coord_x, color, linewidth):
    """Draw a line from ``coord_y`` to ``coord_x`` onto the image ``rgb`` and return it."""
    start_x, start_y = coord_y[0], coord_y[1]
    end_x, end_y = coord_x[0], coord_x[1]
    canvas = ImageDraw.Draw(rgb)
    canvas.line((start_x, start_y, end_x, end_y), fill=tuple(color), width=linewidth)
    return rgb


def add_weighted(rgb, alpha, original, beta, gamma):
    """Return ``rgb * alpha + original * beta + gamma`` cast to ``uint8``."""
    blended = rgb * alpha + original * beta + gamma
    return blended.astype("uint8")


class Visualizer:
    """Render point tracks on top of a video and optionally save the result."""

    def __init__(
        self,
        save_dir: str = "./results",
        grayscale: bool = False,
        pad_value: int = 0,
        fps: int = 10,
        mode: str = "rainbow",  # 'cool', 'optical_flow'
        linewidth: int = 2,
        show_first_frame: int = 10,
        tracks_leave_trace: int = 0,  # -1 for infinite
    ):
        """Store the rendering options.

        Args:
            save_dir: Directory ``save_video`` writes ``.mp4`` files to.
            grayscale: Convert the video to grayscale before drawing.
            pad_value: Number of pixels of padding added on every side.
            fps: Frame rate of the saved video.
            mode: Colouring scheme: ``"rainbow"``, ``"cool"`` or
                ``"optical_flow"``. A colour map is only created for the
                first two.
            linewidth: Line width; point radius is twice this value.
            show_first_frame: How many times the first frame is repeated at
                the start of the output.
            tracks_leave_trace: Number of past frames whose track segments are
                drawn; ``0`` draws none, a negative value draws all.
        """
        self.mode = mode
        self.save_dir = save_dir
        # pyplot.get_cmap is used because matplotlib.cm.get_cmap is no longer
        # available in recent matplotlib releases.
        if mode == "rainbow":
            self.color_map = plt.get_cmap("gist_rainbow")
        elif mode == "cool":
            self.color_map = plt.get_cmap(mode)
        self.show_first_frame = show_first_frame
        self.grayscale = grayscale
        self.tracks_leave_trace = tracks_leave_trace
        self.pad_value = pad_value
        self.linewidth = linewidth
        self.fps = fps

    def visualize(
        self,
        video: torch.Tensor,  # (B,T,C,H,W)
        tracks: torch.Tensor,  # (B,T,N,2)
        visibility: torch.Tensor = None,  # (B, T, N, 1) bool
        gt_tracks: torch.Tensor = None,  # (B,T,N,2)
        segm_mask: torch.Tensor = None,  # (B,1,H,W)
        filename: str = "video",
        writer=None,  # tensorboard Summary Writer, used for visualization during training
        step: int = 0,
        query_frame: int = 0,
        save_video: bool = True,
        compensate_for_camera_motion: bool = False,
    ):
        """Pad the video, draw the tracks on it and optionally save it.

        ``compensate_for_camera_motion`` requires ``segm_mask``. When
        ``save_video`` is true the result is passed to ``save_video`` together
        with ``filename``, ``writer`` and ``step``.

        Returns:
            The rendered video tensor produced by ``draw_tracks_on_video``.
        """
        if compensate_for_camera_motion:
            assert segm_mask is not None
        if segm_mask is not None:
            # Reduce the mask to one value per track, sampled at the query-frame positions.
            coords = tracks[0, query_frame].round().long()
            segm_mask = segm_mask[0, query_frame][coords[:, 1], coords[:, 0]].long()

        video = F.pad(
            video,
            (self.pad_value, self.pad_value, self.pad_value, self.pad_value),
            "constant",
            255,
        )
        print("video shape after pad is: ", video.shape)
        # Shift the tracks by the same amount the frame content moved.
        tracks = tracks + self.pad_value

        print(tracks)
        print("tracks shape after pad is: ", tracks.shape)

        if self.grayscale:
            transform = transforms.Grayscale()
            video = transform(video)
            video = video.repeat(1, 1, 3, 1, 1)

        res_video = self.draw_tracks_on_video(
            video=video,
            tracks=tracks,
            visibility=visibility,
            segm_mask=segm_mask,
            gt_tracks=gt_tracks,
            query_frame=query_frame,
            compensate_for_camera_motion=compensate_for_camera_motion,
        )
        if save_video:
            self.save_video(res_video, filename=filename, writer=writer, step=step)
        return res_video

    def save_video(self, video, filename, writer=None, step=0):
        """Send the video to ``writer`` if given, else write ``<save_dir>/<filename>.mp4``.

        When writing to disk, the first two frames and the last frame are not
        written.
        """
        if writer is not None:
            writer.add_video(
                filename,
                video.to(torch.uint8),
                global_step=step,
                fps=self.fps,
            )
        else:
            os.makedirs(self.save_dir, exist_ok=True)
            wide_list = list(video.unbind(1))
            wide_list = [wide[0].permute(1, 2, 0).cpu().numpy() for wide in wide_list]

            # Prepare the video file path
            save_path = os.path.join(self.save_dir, f"{filename}.mp4")

            # Create a writer object
            video_writer = imageio.get_writer(save_path, fps=self.fps)

            # Write frames to the video file; always close the writer, even on error.
            try:
                for frame in wide_list[2:-1]:
                    video_writer.append_data(frame)
            finally:
                video_writer.close()

            print(f"Video saved to {save_path}")

    def draw_tracks_on_video(
        self,
        video: torch.Tensor,
        tracks: torch.Tensor,
        visibility: torch.Tensor = None,
        segm_mask: torch.Tensor = None,
        gt_tracks=None,
        query_frame: int = 0,
        compensate_for_camera_motion=False,
    ):
        """Draw track traces and points onto every frame of the first batch element.

        Args:
            video: Tensor of shape (B, T, C, H, W) with C == 3.
            tracks: Tensor of shape (B, T, N, 2).
            visibility: Optional per-point visibility, indexed ``[0, t, i]``.
            segm_mask: Optional per-track mask used for colouring and for
                camera-motion compensation.
            gt_tracks: Optional ground-truth tracks drawn as crosses.
            query_frame: Frame from which drawing starts.
            compensate_for_camera_motion: Subtract the mean motion of tracks
                with ``segm_mask <= 0`` and draw only tracks with
                ``segm_mask > 0``.

        Returns:
            A uint8 tensor with a leading batch dimension of 1 holding the
            rendered frames in (T', C, H, W) layout.
        """
        B, T, C, H, W = video.shape
        _, _, N, D = tracks.shape

        assert D == 2
        assert C == 3
        video = video[0].permute(0, 2, 3, 1).byte().detach().cpu().numpy()  # S, H, W, C
        tracks = tracks[0].long().detach().cpu().numpy()  # S, N, 2
        if gt_tracks is not None:
            gt_tracks = gt_tracks[0].detach().cpu().numpy()

        # process input video
        res_video = [rgb.copy() for rgb in video]
        vector_colors = np.zeros((T, N, 3))

        if self.mode == "optical_flow":
            import flow_vis

            vector_colors = flow_vis.flow_to_color(tracks - tracks[query_frame][None])
        elif segm_mask is None:
            if self.mode == "rainbow":
                # Colour each track by its y position in the query frame.
                y_min, y_max = (
                    tracks[query_frame, :, 1].min(),
                    tracks[query_frame, :, 1].max(),
                )
                norm = plt.Normalize(y_min, y_max)
                for n in range(N):
                    color = self.color_map(norm(tracks[query_frame, n, 1]))
                    color = np.array(color[:3])[None] * 255
                    vector_colors[:, n] = np.repeat(color, T, axis=0)
            else:
                # color changes with time
                for t in range(T):
                    color = np.array(self.color_map(t / T)[:3])[None] * 255
                    vector_colors[t] = np.repeat(color, N, axis=0)
        else:
            if self.mode == "rainbow":
                vector_colors[:, segm_mask <= 0, :] = 255

                y_min, y_max = (
                    tracks[0, segm_mask > 0, 1].min(),
                    tracks[0, segm_mask > 0, 1].max(),
                )
                norm = plt.Normalize(y_min, y_max)
                for n in range(N):
                    if segm_mask[n] > 0:
                        color = self.color_map(norm(tracks[0, n, 1]))
                        color = np.array(color[:3])[None] * 255
                        vector_colors[:, n] = np.repeat(color, T, axis=0)

            else:
                # color changes with segm class
                segm_mask = segm_mask.cpu()
                color = np.zeros((segm_mask.shape[0], 3), dtype=np.float32)
                color[segm_mask > 0] = np.array(self.color_map(1.0)[:3]) * 255.0
                color[segm_mask <= 0] = np.array(self.color_map(0.0)[:3]) * 255.0
                vector_colors = np.repeat(color[None], T, axis=0)

        #  draw tracks
        if self.tracks_leave_trace != 0:
            for t in range(query_frame + 1, T):
                first_ind = max(0, t - self.tracks_leave_trace) if self.tracks_leave_trace >= 0 else 0
                curr_tracks = tracks[first_ind : t + 1]
                curr_colors = vector_colors[first_ind : t + 1]
                if compensate_for_camera_motion:
                    diff = (tracks[first_ind : t + 1, segm_mask <= 0] - tracks[t : t + 1, segm_mask <= 0]).mean(1)[
                        :, None
                    ]

                    curr_tracks = curr_tracks - diff
                    curr_tracks = curr_tracks[:, segm_mask > 0]
                    curr_colors = curr_colors[:, segm_mask > 0]

                res_video[t] = self._draw_pred_tracks(
                    res_video[t],
                    curr_tracks,
                    curr_colors,
                )
                if gt_tracks is not None:
                    res_video[t] = self._draw_gt_tracks(res_video[t], gt_tracks[first_ind : t + 1])

        #  draw points
        for t in range(query_frame, T):
            img = Image.fromarray(np.uint8(res_video[t]))
            for i in range(N):
                coord = (tracks[t, i, 0], tracks[t, i, 1])
                visible = True
                if visibility is not None:
                    visible = visibility[0, t, i]
                # Points with a zero coordinate are not drawn.
                if coord[0] != 0 and coord[1] != 0:
                    if not compensate_for_camera_motion or (compensate_for_camera_motion and segm_mask[i] > 0):
                        img = draw_circle(
                            img,
                            coord=coord,
                            radius=int(self.linewidth * 2),
                            color=vector_colors[t, i].astype(int),
                            visible=visible,
                        )
            res_video[t] = np.array(img)

        #  construct the final rgb sequence
        if self.show_first_frame > 0:
            res_video = [res_video[0]] * self.show_first_frame + res_video[1:]
        return torch.from_numpy(np.stack(res_video)).permute(0, 3, 1, 2)[None].byte()

    def _draw_pred_tracks(
        self,
        rgb: np.ndarray,  # H x W x 3
        tracks: np.ndarray,  # T x 2
        vector_colors: np.ndarray,
        alpha: float = 0.5,
    ):
        """Draw the line segments between consecutive track positions onto ``rgb``.

        The ``alpha`` argument is overwritten per step with ``(s / T) ** 2``;
        blending with the pre-draw image only happens when
        ``tracks_leave_trace > 0``.

        Returns:
            The frame with the segments drawn, as a NumPy array.
        """
        T, N, _ = tracks.shape
        rgb = Image.fromarray(np.uint8(rgb))
        for s in range(T - 1):
            vector_color = vector_colors[s]
            original = rgb.copy()
            alpha = (s / T) ** 2
            for i in range(N):
                coord_y = (int(tracks[s, i, 0]), int(tracks[s, i, 1]))
                coord_x = (int(tracks[s + 1, i, 0]), int(tracks[s + 1, i, 1]))
                if coord_y[0] != 0 and coord_y[1] != 0:
                    rgb = draw_line(
                        rgb,
                        coord_y,
                        coord_x,
                        vector_color[i].astype(int),
                        self.linewidth,
                    )
            if self.tracks_leave_trace > 0:
                rgb = Image.fromarray(np.uint8(add_weighted(np.array(rgb), alpha, np.array(original), 1 - alpha, 0)))
        rgb = np.array(rgb)
        return rgb

    def _draw_gt_tracks(
        self,
        rgb: np.ndarray,  # H x W x 3,
        gt_tracks: np.ndarray,  # T x 2
    ):
        """Draw a red cross at every ground-truth point with positive coordinates.

        Returns:
            The frame with the crosses drawn, as a NumPy array.
        """
        T, N, _ = gt_tracks.shape
        color = np.array((211, 0, 0))
        rgb = Image.fromarray(np.uint8(rgb))
        length = self.linewidth * 3
        for t in range(T):
            for i in range(N):
                # Use a separate name: rebinding `gt_tracks` here would replace
                # the whole array with one point and break the next iteration.
                point = gt_tracks[t][i]
                #  draw a red cross
                if point[0] > 0 and point[1] > 0:
                    px, py = int(point[0]), int(point[1])
                    rgb = draw_line(
                        rgb,
                        (px + length, py + length),
                        (px - length, py - length),
                        color,
                        self.linewidth,
                    )
                    rgb = draw_line(
                        rgb,
                        (px - length, py + length),
                        (px + length, py - length),
                        color,
                        self.linewidth,
                    )
        rgb = np.array(rgb)
        return rgb


================================================
FILE: Open-Sora/tools/caption/camera_motion_detect.py
================================================
# ref: https://github.com/antiboredom/camera-motion-detector

import argparse

import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

tqdm.pandas()


def apply(df, func, **kwargs):
    """Apply ``func`` to ``df`` via ``parallel_apply`` if pandarallel loaded, else ``progress_apply``.

    Extra keyword arguments are forwarded to the chosen method.
    """
    runner = df.parallel_apply if pandas_has_parallel else df.progress_apply
    return runner(func, **kwargs)


# Optional dependency: when pandarallel can be imported it is initialised with
# a progress bar and `pandas_has_parallel` is set so `apply` uses
# `parallel_apply`; otherwise the flag is False.
try:
    from pandarallel import pandarallel

    pandarallel.initialize(progress_bar=True)
    pandas_has_parallel = True
except ImportError:
    pandas_has_parallel = False


def make_empty(new_w, new_h):
    """Build a coordinate grid where entry ``[y, x]`` holds ``[x, y]``.

    Vectorised with ``np.indices`` instead of nested Python loops, which
    iterated once per pixel.

    Args:
        new_w: Grid width (number of x positions).
        new_h: Grid height (number of y positions).

    Returns:
        Integer array of shape ``(new_h, new_w, 2)``.
    """
    ys, xs = np.indices((new_h, new_w))
    return np.stack((xs, ys), axis=-1)


def get_type(mag, ang, zoom_in, tau_static=1.0, tau_zoom=(0.4, 0.6)):
    """Label one frame transition from its flow magnitude, angle and zoom ratio.

    Checks run in this order: ``mag`` below ``tau_static`` gives
    ``"static"``; ``zoom_in`` below ``tau_zoom[0]`` gives ``"zoom out"`` and
    above ``tau_zoom[1]`` gives ``"zoom in"``; otherwise ``ang`` (degrees)
    selects a pan/tilt label. ``"unknown"`` is returned when no angle range
    matches.
    """
    if mag < tau_static:
        return "static"
    if zoom_in < tau_zoom[0]:
        return "zoom out"
    if zoom_in > tau_zoom[1]:
        return "zoom in"

    # Half-open angle sectors [lower, upper) and their labels.
    sectors = (
        (45, 135, "tilt up"),
        (135, 225, "pan right"),
        (225, 315, "tilt down"),
    )
    for lower, upper, label in sectors:
        if lower <= ang < upper:
            return label
    # The remaining sector wraps around 0 degrees.
    if ang >= 315 or ang < 45:
        return "pan left"
    return "unknown"


def get_video_type(frame_types):
    """Merge per-frame motion labels into a single label for the video.

    Args:
        frame_types: sequence of per-frame labels.

    Returns:
        - "unknown" when ``frame_types`` is empty: nothing was analysed, so
          no motion can be claimed (previously this fell through to
          "pan/tilt");
        - the majority label when one label covers more than half the frames;
        - "unknown" when there is no majority and some frame was "static";
        - "pan/tilt" when there is no majority and no zoom label occurs;
        - "dynamic" otherwise.
    """
    total = len(frame_types)
    if total == 0:
        return "unknown"

    # count the number of each type
    counts = {}
    for frame_type in frame_types:
        counts[frame_type] = counts.get(frame_type, 0) + 1

    # A strict majority is unique, so any maximal label is the right candidate.
    max_type = max(counts, key=counts.get)
    if counts[max_type] > total / 2:
        return max_type
    if "static" in counts:
        return "unknown"
    if "zoom in" not in counts and "zoom out" not in counts:
        return "pan/tilt"
    return "dynamic"


def process(path: str, frame_interval=15) -> str:
    """Classify the camera motion of the video stored at ``path``.

    Frames are sampled every ``frame_interval`` positions and converted to
    grayscale. The first sampled frame becomes the reference; every later
    sample is compared with it via Farneback dense optical flow, labelled by
    ``get_type`` and the labels are merged by ``get_video_type``.

    Args:
        path: video file to open with ``cv2.VideoCapture``.
        frame_interval: step, in frames, between two sampled positions.

    Returns:
        The video-level label produced by ``get_video_type``.
    """
    capture = cv2.VideoCapture(path)
    position = 0
    reference = None
    labels = []
    while capture.isOpened():
        ok, image = capture.read()
        if not ok:
            # No more frames can be read: release the capture and stop.
            capture.release()
            break

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        if position == 0:
            # NOTE(review): the reference frame is set once and never advanced,
            # so every flow below is measured against the first frame —
            # confirm this is intended rather than frame-to-frame flow.
            reference = gray
            h, w = gray.shape
            grid = make_empty(w, h)
            grid_x = grid.ravel()[::2] - (w / 2)
            grid_y = grid.ravel()[1::2] - (h / 2)
            # Distance of every pixel from the image centre before any motion.
            rest_dists = np.sqrt(np.square(grid_x) + np.square(grid_y))
        else:
            flow = cv2.calcOpticalFlowFarneback(reference, gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
            mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1], angleInDegrees=True)

            # Displaced pixel positions, flattened as x0, y0, x1, y1, ...
            moved = (flow + grid).ravel()
            moved_dists = np.sqrt(np.square(moved[::2] - (w / 2)) + np.square(moved[1::2] - (h / 2)))
            # Fraction of pixels that ended up at least as far from the centre.
            farther = moved_dists >= rest_dists
            zoom_in_factor = np.count_nonzero(farther) / len(farther)
            labels.append(get_type(np.median(mag), np.median(ang), zoom_in_factor))

        position += frame_interval
        capture.set(cv2.CAP_PROP_POS_FRAMES, position)

    return get_video_type(labels)


def main(args):
    """Add a ``cmotion`` column to the input CSV and save it next to the input.

    Reads ``args.input``, runs ``process`` on every entry of its ``path``
    column, and writes the table to the input path with ".csv" replaced by
    "_cmotion.csv".
    """
    source_csv = args.input
    target_csv = source_csv.replace(".csv", "_cmotion.csv")
    table = pd.read_csv(source_csv)
    table["cmotion"] = apply(table["path"], process)
    table.to_csv(target_csv, index=False)
    print(f"Output saved to {target_csv}")


# Command-line entry point: takes the input CSV path as a positional argument.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str)
    parser.add_argument("--disable-parallel", action="store_true")
    args = parser.parse_args()
    if args.disable_parallel:
        # Force the tqdm `progress_apply` path even when pandarallel imported.
        pandas_has_parallel = False
    main(args)


================================================
FILE: Open-Sora/tools/caption/caption_gpt4.py
================================================
import argparse
import base64
import csv
import os
from io import BytesIO

import requests
import tqdm

from .utils import IMG_EXTENSIONS, PROMPTS, VID_EXTENSIONS, VideoTextDataset


def to_base64(image):
    """Serialise ``image`` as JPEG and return the bytes as a base64 string.

    ``image`` only needs a ``save(file, format=...)`` method.
    """
    with BytesIO() as stream:
        image.save(stream, format="JPEG")
        jpeg_bytes = stream.getvalue()
    return base64.b64encode(jpeg_bytes).decode("utf-8")


def get_caption(frame, prompt, api_key):
    """Request a caption for a set of frames from the OpenAI chat completions API.

    Args:
        frame: sequence of base64-encoded JPEG strings. Every element is sent
            as one image attachment, so any number of frames is accepted
            (the previous version indexed exactly the first three and raised
            IndexError on shorter inputs).
        prompt: instruction text sent ahead of the images.
        api_key: token placed in the ``Authorization: Bearer`` header.

    Returns:
        The text of the first choice, with newlines replaced by spaces.

    Raises:
        requests.HTTPError: when the API answers with an error status.
    """
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    content = [{"type": "text", "text": prompt}]
    content.extend(
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}} for encoded in frame
    )
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [{"role": "user", "content": content}],
        "max_tokens": 300,
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload, timeout=60)
    # Surface API failures as an HTTP error instead of a KeyError on "choices".
    response.raise_for_status()
    caption = response.json()["choices"][0]["message"]["content"]
    return caption.replace("\n", " ")


def main(args):
    """Caption every sample listed in ``args.input`` and write the results to CSV.

    The output path is the input path with its extension replaced by
    "_caption.csv"; it holds a "video"/"text" header followed by one row per
    sample.

    Args:
        args: namespace with ``input`` (CSV path), ``prompt`` (key into
            ``PROMPTS``) and ``key`` (API key forwarded to ``get_caption``).

    Raises:
        AssertionError: when the prompt type and the first sample's file
            extension disagree.
        ValueError: when the prompt type is neither "image" nor "video".
    """
    # ======================================================
    # 1. read video list
    # ======================================================
    dataset = VideoTextDataset(args.input)
    output_file = os.path.splitext(args.input)[0] + "_caption.csv"

    # make sure that the prompt type matches the data type
    # (checked before the output file is created, so a failed check leaves no stray file)
    data_extension = "." + dataset.data["path"].iloc[0].split(".")[-1]
    prompt_type = PROMPTS[args.prompt]["type"]
    if prompt_type == "image":
        assert (
            data_extension.lower() in IMG_EXTENSIONS
        ), "The prompt is suitable for an image dataset but the data is not image."
    elif prompt_type == "video":
        assert (
            data_extension.lower() in VID_EXTENSIONS
        ), "The prompt is suitable for a video dataset but the data is not video."
    else:
        raise ValueError(f"Found invalid prompt type {prompt_type}")

    # ======================================================
    # 2. generate captions
    # ======================================================
    prompt_template = PROMPTS[args.prompt]["text"]
    needs_text = "text" in args.prompt

    # The context manager closes the file even if a request fails midway;
    # newline="" is what the csv module requires for files it writes to.
    with open(output_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["video", "text"])

        for sample in tqdm.tqdm(dataset):
            prompt = prompt_template.format(sample["text"]) if needs_text else prompt_template
            frames = [to_base64(frame) for frame in sample["image"]]
            caption = get_caption(frames, prompt, args.key)

            writer.writerow((sample["path"], caption))


# Command-line entry point.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str, help="Path to the input CSV file")
    # Key into PROMPTS; main checks that its "type" matches the data's file extension.
    parser.add_argument("--prompt", type=str, default="video-f3-detail-3ex")
    # Forwarded to get_caption, which sends it as the bearer token.
    parser.add_argument("--key", type=str)
    args = parser.parse_args()

    main(args)


================================================
FILE: Open-Sora/tools/caption/caption_llama3.py
================================================
import argparse
import csv
import os
import warnings
from datetime import timedelta

import pandas as pd
import torch
import torch.distributed as dist
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

from .utils import read_file

# Module-level side effects, executed whenever this file is imported or run.
# NOTE(review): the copy shells out with an unquoted path and its exit status
# is ignored — confirm it is still wanted.
os.system(f"cp {__file__} ~/backup/")  # optionally backup the script
warnings.filterwarnings("ignore")  # silences every Python warning in this process
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from torch.distributed.elastic.multiprocessing.errors import record


class CSVTextDataset(Dataset):
    """Row-wise dataset over a CSV file that must contain a ``text`` column.

    ``set_rank_and_world_size`` narrows the dataset to the contiguous shard
    owned by one distributed rank; afterwards ``len`` and indexing refer to
    that shard only.
    """

    def __init__(self, csv_path):
        """Load ``csv_path`` with pandas and check that a ``text`` column exists."""
        self.df = pd.read_csv(csv_path)
        # assert text is in the columns
        assert "text" in self.df.columns, "text column not found in the csv file"

    def __len__(self):
        """Return the number of rows currently held (the shard size once sharded)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return row ``idx`` by position; out-of-range or negative ``idx`` raises IndexError."""
        if idx < 0 or idx >= len(self.df):
            raise IndexError
        return self.df.iloc[idx]

    def set_rank_and_world_size(self, rank, world_size):
        """Keep only the rows belonging to ``rank`` out of ``world_size`` shards.

        Rows are split into equal contiguous chunks of ``len(self) // world_size``;
        the last rank additionally receives the remainder.
        """
        self.rank = rank
        self.world_size = world_size
        self.data_per_gpu = len(self) // world_size
        self.start_index = rank * self.data_per_gpu
        self.end_index = (rank + 1) * self.data_per_gpu if rank != world_size - 1 else len(self)
        self.df = self.df.iloc[self.start_index : self.end_index]

    def write_to_csv(self, output_file, data, new_key):
        """Write the rows currently held to ``output_file`` with one extra column.

        Args:
            output_file: destination CSV path (overwritten).
            data: sequence indexed by row position; ``data[i]`` is appended to
                the i-th row. Must have at least ``len(self)`` items.
            new_key: header of the appended column.

        Raises:
            IndexError: when ``data`` has fewer items than there are rows.
        """
        # `Index + list` is element-wise in pandas; build the header from a plain list.
        columns = list(self.df.columns) + [new_key]
        # csv.writer objects have no close(); the file handle is what must be closed.
        with open(output_file, "w", newline="") as handle:
            writer = csv.writer(handle)
            writer.writerow(columns)
            # Rows are matched to `data` by position within the shard.
            for position, row in enumerate(self.df.itertuples(index=False, name=None)):
                writer.writerow([*row, data[position]])


def pad_left(sequences, padding_value=0):
    """Left-pad 1-D tensors to a common length and stack them into one batch.

    Each sequence is prefixed with ``padding_value`` entries (same dtype and
    device as the sequence) until it is as long as the longest one.
    """
    target_len = max(seq.size(0) for seq in sequences)
    rows = [
        torch.cat(
            [
                torch.full((target_len - seq.size(0),), padding_value, dtype=seq.dtype, device=seq.device),
                seq,
            ],
            dim=0,
        )
        for seq in sequences
    ]
    return torch.stack(rows)


def _parse_keywords(response):
    """Return the text between the first "[" and the first "]" of ``response``.

    Yields "NONE_FOUND" when either bracket is missing, when they are out of
    order, or when the enclosed text is blank or contains a newline.
    """
    keywords_start = response.find("[")
    keywords_end = response.find("]")
    # str.find returns -1 instead of raising, so missing brackets must be
    # checked explicitly; slicing with -1 would silently return garbage.
    if keywords_start == -1 or keywords_end == -1 or keywords_end < keywords_start:
        return "NONE_FOUND"
    keywords = response[keywords_start + 1 : keywords_end]
    # we empirically observe that it produces newlines when no keywords are found
    if "\n" in keywords or len(keywords.strip()) == 0:
        return "NONE_FOUND"
    return keywords


@record
def main(args):
    """Run a chat LLM over the ``text`` column of a CSV, one data shard per rank.

    Each rank loads the model, processes its contiguous shard of the input,
    and writes ``<output_prefix>_rank<r>_<key>.csv`` holding the input columns
    plus a ``key`` column with the extracted bracketed list (or "NONE_FOUND").
    After a barrier, rank 0 concatenates the per-rank files into
    ``<output_prefix>_<key>.csv``.

    Args:
        args: namespace with ``input``, ``output_prefix``, ``key``, ``prompt``,
            ``batch_size`` and ``model_id``.
    """
    # ======================================================
    # 1. init environment
    # ======================================================
    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    # ======================================================
    # 2. Prep rank-wise dataloader
    # ======================================================
    dataframe = read_file(args.input)
    print("read data from {}".format(args.input))
    dataset = CSVTextDataset(args.input)
    dataset.set_rank_and_world_size(dist.get_rank(), dist.get_world_size())

    # Optional remote debugging hook, only armed on rank 2.
    debug_address = os.getenv("DEBUG_ADDRESS")
    if debug_address is not None and dist.get_rank() == 2:
        import ptvsd

        print("waiting for debugger attachment")
        ptvsd.enable_attach(address=("localhost", int(debug_address)), redirect_output=True)
        ptvsd.wait_for_attach()

    output_file = args.output_prefix + f"_rank{dist.get_rank()}" + f"_{args.key}.csv"
    columns = list(dataframe.columns) + [args.key]

    # add a new key named summary, write in csv file
    print("the processed data saved on this rank will be saved to {}".format(output_file))

    def collate_fn(batch):
        # Keep the batch as a plain list of rows; no tensor collation is wanted.
        return batch

    dataloader = torch.utils.data.DataLoader(
        dataset,
        # num_workers=2,
        batch_size=args.batch_size,
        collate_fn=collate_fn,
        shuffle=False,
    )

    # ======================================================
    # 3. process using llama3 and prompt
    # ======================================================

    print("Using model with the id {}".format(args.model_id))
    model_id = args.model_id
    tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map=dist.get_rank() % torch.cuda.device_count(),
    )
    # .to(dist.get_rank() % torch.cuda.device_count())
    dist.barrier()
    print("======== Process data using LLAMA3 ========")

    def extract_batch(texts, prompt):
        """Generate one response per text, using ``prompt`` as the system message."""
        input_ids_list = [
            tokenizer.apply_chat_template(
                [{"role": "system", "content": prompt}, {"role": "user", "content": text}],
                add_generation_prompt=True,
                return_tensors="pt",
            ).to(model.device)[0]
            for text in texts
        ]

        attention_mask_list = [
            torch.ones(input_ids.shape, dtype=torch.long, device=model.device) for input_ids in input_ids_list
        ]

        # NOTE(review): pad_sequence pads on the right although the tokenizer is
        # created with padding_side="left" and a pad_left helper exists in this
        # file — confirm right padding is intended for generation.
        input_ids_batch = torch.nn.utils.rnn.pad_sequence(
            input_ids_list, batch_first=True, padding_value=tokenizer.eos_token_id
        )

        attention_mask_batch = torch.nn.utils.rnn.pad_sequence(attention_mask_list, batch_first=True, padding_value=0)

        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        ]
        outputs = model.generate(
            input_ids_batch,
            max_new_tokens=512,
            attention_mask=attention_mask_batch,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=terminators,
            # do_sample=True,
            # temperature=0.6,
            # top_p=0.9,
        )

        responses = []
        for output, input_ids in zip(outputs, input_ids_list):
            # Drop the prompt tokens of this sample; special tokens (including
            # the padding) are stripped while decoding.
            response = output[input_ids.shape[-1] :]
            responses.append(tokenizer.decode(response, skip_special_tokens=True))

        return responses

    print("Processing starting...")
    if args.prompt == "" and args.key == "objects":
        prompt = (
            "You are a AI assistant to extract objects from user's text. "
            "For example: user: 'In this video a dog is running around. In addition, a person is laughing at the dog.', you produce a list of objects separated by ',' and wrapped by '[' and ']': '[dog, person]' "
        )
    elif args.prompt == "" and args.key == "actions":
        prompt = (
            "You are a AI assistant to extract actions from user's text. "
            "For example: user: 'In this video a dog is running around. In addition, a person is laughing at the dog.', you produce a list of actions separated by ',' and wrapped by '[' and ']': '[run, laugh]' "
        )
    else:
        prompt = args.prompt

    print("Prompt: {}".format(prompt))

    # The context manager closes the per-rank file even if generation fails.
    with open(output_file, "w", newline="") as output_file_handle:
        writer = csv.writer(output_file_handle)
        writer.writerow(columns)

        for batch in tqdm(dataloader):
            # get the text column from the batch
            texts = [sample["text"] for sample in batch]
            list_keywords = extract_batch(texts, prompt)

            for row, response in zip(batch, list_keywords):
                writer.writerow([*row, _parse_keywords(response)])

    dist.barrier()

    if dist.get_rank() == 0:
        collated_file = args.output_prefix + f"_{args.key}.csv"
        print("All ranks are finished. Collating the processed data to {}".format(collated_file))

        csv_files = [args.output_prefix + f"_rank{i}" + f"_{args.key}.csv" for i in range(dist.get_world_size())]
        # List to hold DataFrames
        dataframes = []
        # Read each CSV into a DataFrame and append to list
        for file in csv_files:
            df = pd.read_csv(file)
            # scan each line in the df, if the ``key`` column is NaN, replace it with "NONE_FOUND"
            df[args.key] = df[args.key].fillna("NONE_FOUND")
            dataframes.append(df)
        # Concatenate all DataFrames
        combined_df = pd.concat(dataframes, ignore_index=True)

        # Save the combined DataFrame to a new CSV file
        combined_df.to_csv(collated_file, index=False)
        print("Collated data saved to {}".format(collated_file))
    # terminate distributed env
    dist.destroy_process_group()


# Command-line entry point.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-id", default="meta-llama/Meta-Llama-3-8B-Instruct")
    parser.add_argument("input", type=str, help="Path to the input CSV file")
    # No default: main concatenates this with a string, so it must be supplied.
    parser.add_argument("--output_prefix", type=str, help="Path to the output CSV file")
    # An empty prompt lets main pick a built-in prompt for --key objects / actions.
    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--batch_size", type=int, default=32)
    # Name of the new output column; also part of the output file names.
    parser.add_argument("--key", type=str)
    args = parser.parse_args()

    main(args)


================================================
FILE: Open-Sora/tools/caption/caption_llava.py
================================================
import argparse
import csv
import time
import warnings
from datetime import timedelta

import torch
import torch.distributed as dist
from colossalai.cluster import DistCoordinator, ProcessGroupMesh
from colossalai.shardformer import ShardConfig, ShardFormer
from colossalai.utils import get_current_device, set_seed
from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from llava.conversation import conv_templates
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm

from ..datasets.utils import IMG_EXTENSIONS, VID_EXTENSIONS
from .acceleration.llava.policies import LlavaLlamaForCausalLMPolicy, LlavaMistralForCausalLMPolicy
from .utils import PROMPTS, Timer, VideoTextDataset, collate_fn

# Runs once at import time, before any model is loaded.
# NOTE(review): presumably skips torch's default weight initialisation to speed
# up model construction — confirm against llava.utils.
disable_torch_init()


class NoPaddingDistributedSampler(DistributedSampler):
    """Distributed sampler that never pads or drops samples.

    Indices are dealt round-robin to the ranks; when the dataset size is not
    a multiple of ``num_replicas`` the first ``len(dataset) % num_replicas``
    ranks get one extra sample. The ``shuffle`` and ``drop_last`` arguments
    are accepted but the parent is always built with both set to ``False``.
    """

    def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0, drop_last=False):
        super().__init__(
            dataset=dataset, num_replicas=num_replicas, rank=rank, seed=seed, shuffle=False, drop_last=False
        )
        per_rank, leftover = divmod(len(self.dataset), self.num_replicas)
        # The leftover items go to the lowest-numbered ranks, one each.
        self.num_samples = per_rank + 1 if self.rank < leftover else per_rank
        self.total_size = len(dataset)

    def __iter__(self):
        dataset_len = len(self.dataset)  # type: ignore[arg-type]
        if not self.shuffle:
            order = list(range(dataset_len))
        else:
            # deterministic permutation derived from the seed and the epoch
            generator = torch.Generator()
            generator.manual_seed(self.seed + self.epoch)
            order = torch.randperm(dataset_len, generator=generator).tolist()

        # cap at total_size, then take every num_replicas-th index from rank
        own = order[: self.total_size][self.rank : self.total_size : self.num_replicas]
        assert len(own) == self.num_samples
        return iter(own)


@torch.inference_mode()
def main(args):
    """Caption the samples listed in ``args.input`` with a LLaVA model.

    The world is arranged as a ``dp_size`` x ``tp_size`` process mesh. Each
    data-parallel rank handles its own slice of the dataset, and the process
    with tensor-parallel rank 0 in each group writes the results to
    ``<input>_caption_part<dp_rank>.csv`` with columns path, text, num_frames.
    With ``args.profile`` set, timing and memory statistics are printed at
    the end.
    """
    # ======================================================
    # 1. init environment
    # ======================================================
    # we set a very large timeout to avoid some processes exit early
    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
    set_seed(1024)
    coordinator = DistCoordinator()

    # prepare the dp and tp groups
    assert (
        args.dp_size * args.tp_size == coordinator.world_size
    ), f"DP size {args.dp_size} * TP size {args.tp_size} must equal to world size {coordinator.world_size}"
    mesh = ProcessGroupMesh(args.dp_size, args.tp_size)
    dp_group = mesh.get_group_along_axis(0)
    tp_group = mesh.get_group_along_axis(1)

    # ======================================================
    # 2. load model
    # ======================================================
    model_path = args.model_path
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # Pytorch non-meta copying warning fills out the console
        tokenizer, model, image_processor, context_len = load_pretrained_model(
            model_path=model_path,
            model_base=None,
            model_name=get_model_name_from_path(model_path),
            device=get_current_device(),
            torch_dtype=torch.float16,
            attn_implementation="flash_attention_2" if args.flash_attention else "eager",
        )
        dist.barrier()

    # ======================================================
    # 3. Apply system optimization
    # ======================================================
    tp_size = dist.get_world_size(tp_group)
    shard_config = ShardConfig(
        tensor_parallel_process_group=tp_group if tp_size > 1 else None,
        enable_tensor_parallelism=True if tp_size > 1 else False,
    )
    shard_former = ShardFormer(shard_config=shard_config)

    # check the model type
    model_name = model.__class__.__name__
    print(model_name)
    if model_name == "LlavaLlamaForCausalLM":
        model = shard_former.optimize(model, policy=LlavaLlamaForCausalLMPolicy())[0].cuda()
    elif model_name == "LlavaMistralForCausalLM":
        model = shard_former.optimize(model, policy=LlavaMistralForCausalLMPolicy())[0].cuda()
    else:
        # Unrecognised model classes are used as loaded, without sharding.
        print(f"The shardformer policy for {model_name} is not implemented, skip")
    torch.cuda.empty_cache()

    # ======================================================
    # 4. Prepare dataloader
    # ======================================================
    # prepare prompt
    query = PROMPTS[args.prompt]["text"]
    if dist.get_rank() == 0:
        print(f"Prompt: {query}")

    if "text" in args.prompt:
        # The prompt template embeds each sample's text, so the input ids
        # have to be rebuilt for every sample.

        def get_text_input_ids(text):
            conv = conv_templates["chatml_direct"].copy()
            query_text = query.format(text)
            conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + query_text)
            prompt = conv.get_prompt()
            # add num_frames images
            t = prompt.split("<image>")
            prompt = t[0] + "<image>" * args.num_frames + t[1]
            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
            input_ids = input_ids.unsqueeze(0)
            return input_ids

    else:
        # The prompt is identical for every sample: tokenise it once and
        # hand back the same tensor on every call.
        conv = conv_templates["chatml_direct"].copy()
        conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + query)
        prompt = conv.get_prompt()
        # add num_frames images
        t = prompt.split("<image>")
        prompt = t[0] + "<image>" * args.num_frames + t[1]
        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        input_ids = input_ids.unsqueeze(0)

        # NOTE: `*args` shadows the argparse namespace, but only inside this helper.
        def get_text_input_ids(*args):
            return input_ids

    # build dataset
    def transform(imgs):
        imgs = process_images(imgs, image_processor, model.config)
        imgs = imgs.to(dtype=torch.float16)
        return imgs

    dataset = VideoTextDataset(
        args.input,
        transform=transform,
        num_frames=args.num_frames,
        get_text_input_ids=get_text_input_ids,
        resize=args.resize,
    )

    # make sure that the prompt type matches the data type
    data_extension = "." + dataset.data["path"].iloc[0].split(".")[-1]
    prompt_type = PROMPTS[args.prompt]["type"]
    if prompt_type == "image":
        assert (
            data_extension.lower() in IMG_EXTENSIONS
        ), f"The prompt is suitable for an image dataset but the data is not image. The first data is of format {data_extension}"
    elif prompt_type == "video":
        assert (
            data_extension.lower() in VID_EXTENSIONS
        ), f"The prompt is suitable for a video dataset but the data is not video. The first data is of format {data_extension}"
    else:
        raise ValueError(f"Found invalid prompt type {prompt_type}")

    total_num_videos = len(dataset)

    # build sampler
    dp_rank = dist.get_rank(dp_group)
    dp_size = dist.get_world_size(dp_group)
    sampler = NoPaddingDistributedSampler(dataset, rank=dp_rank, num_replicas=dp_size)

    # build dataloader
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.bs,
        shuffle=False,
        num_workers=args.num_workers,
        pin_memory=True,
        prefetch_factor=args.prefetch_factor,
        sampler=sampler,
        collate_fn=collate_fn,
    )

    # prepare output file reader
    output_file = args.input.replace(".csv", "_caption.csv")

    # create csv writer
    # only the process with tensor-parallel rank 0 writes results
    has_dp_writter = dist.get_rank(tp_group) == 0

    if has_dp_writter:
        # the dp writer takes care of the files processed on the current dp rank
        # so we use write mode
        output_file_split = output_file.replace(".csv", f"_part{dp_rank}.csv")
        dp_file = open(output_file_split, "w")
        dp_writer = csv.writer(dp_file)
        dp_writer.writerow(["path", "text", "num_frames"])

    # ======================================================
    # 5. generate captions
    # ======================================================
    # show a progress bar only on tensor-parallel rank 0
    if dist.get_rank(tp_group) == 0:
        pbar = tqdm(dataloader, position=dp_rank, desc=f"Data Parallel Rank {dist.get_rank(dp_group)}")
    else:
        pbar = dataloader

    if args.profile:
        encode_time = []
        generate_time = []
        output_length = []
        total_time = []

    for i, batch in enumerate(pbar):
        # measure time
        if args.profile:
            torch.cuda.synchronize()
            start_time = time.time()

        video_files, frames, video_lengths, img_size_list, texts = batch

        # encode the batch of inputs
        with Timer() as encode_timer:
            samples = []
            for imgs, imgs_size, input_ids in zip(frames, img_size_list, texts):
                imgs = imgs.cuda()
                input_ids = input_ids.cuda()
                # only the fifth returned value (the input embeddings) is kept
                _, _, _, _, inputs_embeds, _ = model.prepare_inputs_labels_for_multimodal(
                    input_ids, None, None, None, None, images=imgs, image_sizes=imgs_size
                )
                samples.append(inputs_embeds)

        # padding
        # every sample is left-padded to the longest one along dim 1:
        # the mask is 0 over the padding and 1 over the real positions
        max_len = max([sample.shape[1] for sample in samples])
        attention_mask = torch.tensor(
            [[0] * (max_len - samples[i].shape[1]) + [1] * samples[i].shape[1] for i in range(len(samples))]
        ).to(model.device)
        inputs_embeds = [
            torch.cat(
                [
                    torch.zeros(
                        (1, max_len - samples[i].shape[1], samples[i].shape[-1]),
                        device=model.device,
                        dtype=torch.float16,
                    ),
                    samples[i],
                ],
                dim=1,
            )
            for i in range(len(samples))
        ]
        inputs_embeds = torch.cat(inputs_embeds, dim=0)

        # generate outputs
        with Timer() as generate_timer:
            # super(type(model), model) resolves `generate` on the classes after
            # type(model) in the MRO, skipping any `generate` defined on the
            # model's own class
            output_ids = super(type(model), model).generate(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                do_sample=False,  # sampling is not deterministic and may cause TP to hang
                max_new_tokens=args.max_tokens,
                use_cache=True,
            )

            # skip warmup and add profiling data
            if args.profile and i >= args.profile_warmup:
                # number of elements in output_ids (dim 0 size times dim 1 size)
                output_length.append(output_ids.size(0) * output_ids.size(1))

            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            outputs = [output.replace("\n", " ").strip() for output in outputs]

        # skip warmup and add profiling data
        if args.profile and i >= args.profile_warmup:
            # measure time
            torch.cuda.synchronize()
            time_taken = time.time() - start_time

            total_time.append(time_taken)
            encode_time.append(encode_timer.time_taken)
            generate_time.append(generate_timer.time_taken)

        # save results
        if has_dp_writter:
            result = list(zip(video_files, outputs, video_lengths))
            for t in result:
                dp_writer.writerow(t)

    # display profiling info
    if args.profile:
        print(output_length)
        # NOTE(review): this can be zero or negative when the dataset is smaller
        # than bs * profile_warmup * dp_size; the averages below would then
        # fail or be meaningless.
        num_samples_after_warmup = total_num_videos - args.bs * args.profile_warmup * dp_size
        print(f"throughput (samples/s): {num_samples_after_warmup / sum(total_time)}")
        print(f"average encode time per sample: {sum(encode_time) / num_samples_after_warmup}")
        print(f"average generate time per sample: {sum(generate_time) / num_samples_after_warmup}")
        print(f"average number of tokens characters per sample: {sum(output_length) / num_samples_after_warmup}")
        print(f"Max GPU allocated / GB: {torch.cuda.max_memory_allocated() / 1024**3}")
        print(f"Max GPU reserved / GB: {torch.cuda.max_memory_reserved() / 1024**3}")

    # ======================================================
    # 6. shutdown
    # ======================================================
    # close file writing
    if has_dp_writter:
        dp_file.close()
    dist.barrier()

    # terminate distributed env
    dist.destroy_process_group()


# Command-line entry point.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str, help="Path to the input CSV file")
    parser.add_argument("--model-path", type=str, default="liuhaotian/llava-v1.6-34b")
    # Key into PROMPTS; main checks that its "type" matches the data's file extension.
    parser.add_argument("--prompt", type=str, default="video-f1-detail-3ex")
    parser.add_argument("--resize", type=int, default=336)
    # Number of "<image>" placeholders inserted into the prompt per sample.
    parser.add_argument("--num-frames", type=int, default=1)
    parser.add_argument("--max-tokens", type=int, default=300)
    # speed related
    parser.add_argument("--bs", type=int, default=16)
    # main asserts that tp-size * dp-size equals the distributed world size.
    parser.add_argument("--tp-size", type=int, default=2)
    parser.add_argument("--dp-size", type=int, default=4)
    parser.add_argument("--num-workers", type=int, default=8)
    parser.add_argument("--prefetch-factor", type=int, default=8, help="Prefetch factor")
    parser.add_argument(
        "--flash-attention",
        action="store_true",
        help="Whether to use flash attention. You can turn on this flag for llama model and off for mistral model.",
    )
    # debug related
    parser.add_argument("--profile", action="store_true")
    # Number of initial batches excluded from the profiling statistics.
    parser.add_argument("--profile-warmup", type=int, default=1)

    args = parser.parse_args()
    main(args)


================================================
FILE: Open-Sora/tools/caption/pllava_dir/caption_pllava.py
================================================
import sys
import os
import os
from pathlib import Path

# Import `read_video_av` from <root>/opensora/datasets/read_video.py as a
# top-level module: the directory is added to sys.path only for the duration
# of the import and removed again right after.
current_file = Path(__file__)  # Gets the path of the current file
# parents[0] is this file's directory, so parents[3] is three levels above it.
fourth_level_parent = current_file.parents[3]

datasets_dir = os.path.join(fourth_level_parent, "opensora/datasets")
import sys
sys.path.append(datasets_dir)
from read_video import read_video_av
sys.path.remove(datasets_dir)

import itertools
import logging
import multiprocessing as mp
from argparse import ArgumentParser
from multiprocessing import Process, Queue

import numpy as np
import pandas as pd
import torch
import torchvision
import transformers
from decord import VideoReader, cpu
from PIL import Image
from tasks.eval.eval_utils import Conversation
from tasks.eval.model_utils import load_pllava
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers.feature_extraction_utils import BatchFeature

conv_template = Conversation(
    system="Describe this video. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
    roles=("USER:", "ASSISTANT:"),
    messages=[],
    sep=(" ", "</s>"),
    mm_token="<image>",
)

# Configure the root logger with logging's defaults and let this module's
# logger report INFO and above.
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Passed as ``resolution`` to load_video() by CSVDataset.__getitem__, where it
# becomes the ``size`` argument of the torchvision Resize applied to each frame.
RESOLUTION = 672


def pllava_answer(
    conv: Conversation,
    model,
    processor,
    video_list,
    do_sample=True,
    max_new_tokens=200,
    num_beams=1,
    min_length=1,
    top_p=0.9,
    repetition_penalty=1.0,
    length_penalty=1,
    temperature=1.0,
    stop_criteria_keywords=None,
    print_res=False,
):
    """Generate one answer per video for the prompt held in ``conv``.

    Every entry of ``video_list`` is run through ``processor`` with the same
    prompt, the per-video tensors are concatenated key by key into one batch,
    and ``model.generate`` is called once on that batch.

    Each decoded string is then reduced to the answer: the text after the last
    occurrence of the final role tag, with the separator suffix removed,
    surrounding whitespace stripped and newlines turned into spaces.

    Args:
        conv: conversation providing ``get_prompt()``, ``roles``, ``sep`` and
            ``messages``.
        model: object exposing ``device`` and ``generate``.
        processor: callable producing model inputs; also used for
            ``batch_decode``.
        video_list: non-empty sequence of videos handed to ``processor`` as
            ``images``.
        do_sample, max_new_tokens, num_beams, min_length, top_p,
        repetition_penalty, length_penalty, temperature: forwarded verbatim to
            ``model.generate``.
        stop_criteria_keywords: accepted but not used by this function.
        print_res: when true, print the prompt and each raw decoded output.

    Returns:
        ``(answers, conv)``. ``conv.messages[-1][1]`` is overwritten on every
        iteration, so it ends up holding the answer for the last video.
    """
    # torch.cuda.empty_cache()
    prompt = conv.get_prompt()

    per_video = [processor(text=prompt, images=video, return_tensors="pt") for video in video_list]
    # add batch dimension by cat
    stacked = {key: torch.cat([feat[key] for feat in per_video]) for key in list(per_video[0].keys())}
    inputs_batched = BatchFeature(stacked, tensor_type="pt").to(model.device)

    with torch.no_grad():
        generated = model.generate(
            **inputs_batched,
            media_type="video",
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            min_length=min_length,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty,
            temperature=temperature,
        )
        output_texts = processor.batch_decode(
            generated, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

    # Neither the role tag nor the separator changes while post-processing,
    # so both are resolved once up front.
    last_role = conv.roles[-1]
    split_tag = "<|im_start|> assistant\n" if last_role == "<|im_start|>assistant\n" else last_role
    ending = conv.sep if isinstance(conv.sep, str) else conv.sep[1]

    for i, raw_text in enumerate(output_texts):
        if print_res:  # debug usage
            print("### PROMPTING LM WITH: ", prompt)
            print("### LM OUTPUT TEXT:  ", raw_text)
        answer = raw_text.split(split_tag)[-1]
        answer = answer.removesuffix(ending).strip()
        answer = answer.replace("\n", " ")
        output_texts[i] = answer
        conv.messages[-1][1] = answer
    return output_texts, conv


def get_index(num_frames, num_segments):
    """Pick ``num_segments`` frame indices spread over ``num_frames`` frames.

    The step between picks is ``(num_frames - 1) / num_segments``. The first
    pick sits half a step in (truncated to an int), and each later pick adds
    the rounded multiple of the step to it.

    Returns:
        A NumPy array holding one index per segment.
    """
    step = float(num_frames - 1) / num_segments
    first = int(step / 2)
    picks = []
    for segment in range(num_segments):
        picks.append(first + int(np.round(step * segment)))
    return np.array(picks)


# def load_video(video_path, num_frames, return_msg=False, resolution=336):
#     transforms = torchvision.transforms.Resize(size=resolution)
#     vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
#     total_num_frames = len(vr)
#     frame_indices = get_index(total_num_frames, num_frames)
#     images_group = list()
#     for frame_index in frame_indices:
#         img = Image.fromarray(vr[frame_index].asnumpy())
#         images_group.append(transforms(img))
#     if return_msg:
#         fps = float(vr.get_avg_fps())
#         sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices])
#         # " " should be added in the start and end
#         msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds."
#         return images_group, msg
#     else:
#         return images_group


def load_video(video_path, num_frames, return_msg=False, resolution=336):
    """Decode a video and return ``num_frames`` resized PIL frames.

    The whole video is decoded with ``read_video_av`` (frames requested in
    ``THWC`` layout), frame indices are chosen with ``get_index``, and each
    selected frame is converted to a PIL image and passed through a
    torchvision ``Resize(size=resolution)``.

    Args:
        video_path: path handed to ``read_video_av``.
        num_frames: how many frames to sample.
        return_msg: not supported; must stay false.
        resolution: ``size`` argument for the torchvision ``Resize``.

    Returns:
        A list with ``num_frames`` resized PIL images.

    Raises:
        NotImplementedError: if ``return_msg`` is true.
        ValueError: if the decoded video contains no frames.
    """
    # Fail fast, before any decoding work. Raising keeps this helper usable as
    # a library function, whereas the previous ``exit()`` call terminated the
    # calling process.
    if return_msg:
        raise NotImplementedError("return_msg not implemented yet")

    resize = torchvision.transforms.Resize(size=resolution)
    vframes, _aframes, _info = read_video_av(
        video_path,
        pts_unit="sec",
        output_format="THWC",
    )
    logger.debug("decoded %s with shape %s", video_path, tuple(vframes.shape))

    total_num_frames = len(vframes)
    if total_num_frames == 0:
        # get_index() would otherwise produce indices into an empty tensor.
        raise ValueError(f"no frames decoded from {video_path}")

    frame_indices = get_index(total_num_frames, num_frames)
    return [resize(Image.fromarray(vframes[frame_index].numpy())) for frame_index in frame_indices]


def collate_fn(batch):
    """Identity collate function: hand the list of samples back unchanged."""
    return batch


class CSVDataset(Dataset):
    """Dataset yielding sampled frames for every video listed in a CSV file.

    The CSV must have a ``path`` column. Each item is the list of frames
    returned by ``load_video``, or ``None`` when the video could not be
    loaded.
    """

    def __init__(self, csv_path, num_frames):
        """Read ``csv_path`` and remember how many frames to sample per video."""
        self.df = pd.read_csv(csv_path)
        self.data_list = self.df.path.tolist()
        self.num_frames = num_frames

    def __len__(self):
        """Return the number of videos currently assigned to this dataset."""
        return len(self.data_list)

    def __getitem__(self, idx):
        """Return the frames for video ``idx``, or ``None`` if loading fails.

        Raises:
            IndexError: if ``idx`` is negative or past the end of the list.
        """
        if idx < 0 or idx >= len(self.data_list):
            raise IndexError
        try:
            return load_video(self.data_list[idx], self.num_frames, resolution=RESOLUTION)
        except Exception as err:
            # Best effort: a broken video must not stop the run. Only
            # ``Exception`` is caught so KeyboardInterrupt and SystemExit
            # still propagate.
            logger.warning("failed to load video %s: %s", self.data_list[idx], err)
            return None

    def set_rank_and_world_size(self, rank, world_size):
        """Restrict the dataset to the contiguous shard belonging to ``rank``.

        Every rank receives ``len(self) // world_size`` entries; the last rank
        additionally takes the remainder.
        """
        self.rank = rank
        self.world_size = world_size
        self.data_per_gpu = len(self) // world_size
        start_index = rank * self.data_per_gpu
        end_index = (rank + 1) * self.data_per_gpu if rank != world_size - 1 else len(self)
        self.data_list = self.data_list[start_index:end_index]


def parse_args():
    """Parse the command-line options of the captioning script.

    ``--pretrained_model_name_or_path``, ``--csv_path`` and ``--num_frames``
    are mandatory. Because argparse rejects a missing required option, the
    ``default`` values given for the required options are never used.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = ArgumentParser()
    add = parser.add_argument

    # model / data selection
    add("--pretrained_model_name_or_path", type=str, required=True, default="llava-hf/llava-1.5-7b-hf")
    add("--batch_size", type=int, default=1)
    add("--csv_path", type=str, required=True)
    add("--num_frames", type=int, required=True, default=4)

    # weights
    add("--use_lora", action="store_true")
    add("--lora_alpha", type=int, default=4)
    add("--weight_dir", type=str, default=None)

    # inference behaviour
    add("--conv_mode", type=str, default="eval_mvbench")
    add("--pooling_shape", type=str, default=None)
    add("--error_message", type=str, default='error occured during captioning')

    return parser.parse_args()


def load_model_and_dataset(
    rank,
    world_size,
    pretrained_model_name_or_path,
    num_frames,
    use_lora,
    lora_alpha,
    weight_dir,
    csv_path,
    pooling_shape=(16, 12, 12),
):
    """Load the PLLaVA model onto device ``rank`` and build that rank's dataset shard.

    Args:
        rank: device index for the model; also selects the dataset shard.
        world_size: total number of shards.
        pretrained_model_name_or_path, num_frames, use_lora, lora_alpha,
        weight_dir, pooling_shape: forwarded to ``load_pllava``.
        csv_path: CSV file read by ``CSVDataset``.

    Returns:
        ``(model, processor, dataset)`` with the model in eval mode.
    """
    # remind that, once the model goes larger (30B+) may cause the memory to be heavily used up. Even Tearing Nodes.
    loader_kwargs = dict(
        num_frames=num_frames,
        use_lora=use_lora,
        weight_dir=weight_dir,
        lora_alpha=lora_alpha,
        pooling_shape=pooling_shape,
    )
    model, processor = load_pllava(pretrained_model_name_or_path, **loader_kwargs)
    logger.info("done loading llava")

    # move to this rank's device, then switch to inference mode
    device = torch.device(rank)
    model = model.to(device)
    model = model.eval()

    dataset = CSVDataset(csv_path, num_frames)
    dataset.set_rank_and_world_size(rank, world_size)
    return model, processor, dataset


def infer(
    model,
    processor,
    video_list,
    conv_mode,
    print_res=False,
):
    """Caption every video in ``video_list`` with one batched model call.

    A fresh copy of ``conv_template`` receives a single multimodal user query
    and is passed to ``pllava_answer`` with greedy decoding and at most 256
    new tokens.

    Args:
        model, processor: forwarded to ``pllava_answer``.
        video_list: the videos to caption; none of them may be ``None``.
        conv_mode: accepted but not used by this function.
        print_res: forwarded to ``pllava_answer``.

    Returns:
        The list of answers produced by ``pllava_answer``.

    Raises:
        Exception: if any entry of ``video_list`` is ``None``.
    """
    # a None entry marks a video that could not be loaded; refuse the whole batch
    for video in video_list:
        if video is None:
            raise Exception("Video not loaded properly")

    conv = conv_template.copy()
    conv.user_query("Describe the video in details.", is_mm=True)

    answer_kwargs = dict(
        conv=conv,
        model=model,
        processor=processor,
        video_list=video_list,
        max_new_tokens=256,
        do_sample=False,
        print_res=print_res,
    )
    llm_responses, conv = pllava_answer(**answer_kwargs)
    return llm_responses


def run(rank, args, world_size, output_queue=None):
    """Caption the dataset shard of ``rank`` and report the captions.

    Args:
        rank: index of this worker; also used as the device index.
        args: parsed command-line namespace (see ``parse_args``).
        world_size: total number of workers the dataset is split across.
        output_queue: optional queue; when given, ``(rank, captions)`` is put
            on it once the shard is done. It may be omitted for in-process
            (debug) use, in which case only the return value carries the
            captions.

    Returns:
        The list of captions for this shard, one entry per video. Videos in a
        batch whose inference raised get ``args.error_message`` instead.
    """
    if rank == 0:
        debug_port = os.getenv("DEBUG_ADDRESS")
        if debug_port is not None:
            import ptvsd

            ptvsd.enable_attach(address=("localhost", int(debug_port)), redirect_output=True)
            print("waiting for debugger attachment")
            ptvsd.wait_for_attach()
    if rank != 0:
        # keep non-zero ranks quiet
        transformers.utils.logging.set_verbosity_error()
        logger.setLevel(transformers.logging.ERROR)

    print_res = False
    conv_mode = args.conv_mode

    # Only forward pooling_shape when the user supplied one, so that the
    # default declared by load_model_and_dataset applies otherwise. Previously
    # the name was left unbound in that case and the call below raised
    # UnboundLocalError.
    model_kwargs = {}
    if args.pooling_shape is not None:
        model_kwargs["pooling_shape"] = tuple(int(x) for x in args.pooling_shape.split("-"))

    logger.info(f"loading model and constructing dataset to gpu {rank}...")
    model, processor, dataset = load_model_and_dataset(
        rank,
        world_size,
        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
        num_frames=args.num_frames,
        use_lora=args.use_lora,
        lora_alpha=args.lora_alpha,
        weight_dir=args.weight_dir,
        csv_path=args.csv_path,
        **model_kwargs,
    )
    logger.info("done model and dataset...")
    dataloader = torch.utils.data.DataLoader(
        dataset,
        num_workers=2,
        batch_size=args.batch_size,
        collate_fn=collate_fn,
        shuffle=False,
    )

    result_list = []
    print(len(dataset))
    for batch in tqdm(dataloader):
        try:
            preds = infer(
                model,
                processor,
                batch,
                conv_mode=conv_mode,
                print_res=print_res,
            )
        except Exception as e:
            logger.error(f"error in {batch}: {str(e)}")
            # one placeholder per video keeps the captions aligned with the CSV rows
            preds = [args.error_message] * len(batch)
        result_list.extend(preds)

    if output_queue is not None:
        output_queue.put((rank, result_list))
    return result_list


def main():
    """Caption every video listed in ``--csv_path`` and write ``*_text.csv``.

    One worker process is spawned per visible CUDA device. Each worker
    captions a contiguous shard of the CSV and reports ``(rank, captions)``
    through a queue. The shards are concatenated in rank order, stored in a
    new ``text`` column, rows whose caption equals ``--error_message`` are
    dropped, and the result is written next to the input with ``_text``
    inserted before the file extension.

    Raises:
        RuntimeError: if no CUDA device is available, or if a worker process
            exits with a non-zero code before delivering its captions.
    """
    from queue import Empty  # raised by Queue.get(timeout=...) when nothing arrived

    multiprocess = True
    mp.set_start_method("spawn")
    args = parse_args()

    if multiprocess:
        world_size = torch.cuda.device_count()
        print(f"world_size: {world_size}")
        if world_size == 0:
            raise RuntimeError("no CUDA device available; captioning needs at least one GPU")

        # Create a queue to collect results from each process
        output_queue = Queue()
        processes = []
        for i in range(world_size):
            # Each process also takes the output queue as an argument
            p = Process(target=run, args=(i, args, world_size, output_queue))
            p.daemon = False
            processes.append(p)
            p.start()

        results_by_rank = {}
        while len(results_by_rank) < world_size:
            try:
                rank, results = output_queue.get(timeout=10)  # Retrieve results as they finish
            except Empty:
                # A worker that died without reporting would otherwise make
                # this loop wait forever.
                crashed = [
                    i
                    for i, p in enumerate(processes)
                    if i not in results_by_rank and p.exitcode not in (None, 0)
                ]
                if crashed:
                    for p in processes:
                        if p.is_alive():
                            p.terminate()
                    raise RuntimeError(
                        f"worker process(es) for rank(s) {crashed} exited without returning results"
                    )
                continue
            results_by_rank[rank] = results
            print(f"Results received from rank {rank}")
        logger.info("finished running")
        for p in processes:
            p.join()

        # concatenate the shards in rank order so captions line up with the CSV rows
        results_list = list(itertools.chain.from_iterable(results_by_rank[i] for i in range(world_size)))
    else:
        # debug: run in-process; run() still reports through a queue
        output_queue = Queue()
        run(0, args, 1, output_queue)
        _, results_list = output_queue.get()

    print(results_list)

    df = pd.read_csv(args.csv_path)
    # add a new column to the dataframe
    df["text"] = results_list
    drop_failed = True
    if drop_failed:
        # remove every row whose captioning failed
        df = df[df["text"] != args.error_message]

    # Insert "_text" before the extension only. A plain str.replace would also
    # rewrite ".csv" inside directory names, and would overwrite the input
    # file when its suffix is not exactly ".csv".
    root, ext = os.path.splitext(args.csv_path)
    new_csv_path = f"{root}_text{ext}"
    df.to_csv(new_csv_path, index=False)
    print(f"Results saved to {new_csv_path}")

if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/tools/caption/utils.py
================================================
import time

import pandas as pd
import torch
import torchvision.transforms as transforms
from torchvision.datasets.folder import pil_loader

from tools.datasets.utils import extract_frames, is_video

# Lower-case file suffixes (with the leading dot) for image and video files.
# NOTE(review): neither tuple is referenced by the code visible in this module;
# presumably they are imported elsewhere — confirm before removing.
IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
PROMPTS = {
    "image": {
        "text": "Describe this image and its style to generate a succinct yet informative description. Pay attention to all objects in the image. The description should be useful for AI to re-generate the image. The description should be no more than five sentences. Remember do not exceed 5 sentences.",
        "type": "image",
    },
    "image-text": {
        "text": "Describe this image and its style in a very detailed manner. Pay attention to all objects in the image. The description should be useful for AI to re-generate the image. The description should be no more than six sentences. Some information about the image is '{}'.",
        "type": "image",
    },
    "image-3ex": {
        "text": "An image is given. Describe this image and its style to generate a succinct yet informative description. Pay attention to all objects in the image. The description should be useful for AI to re-generate the video. The description should be no more than five sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick and walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
        "type": "image",
    },
    "video": {
        "text": "Describe this video and its style in a very detailed manner. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences.",
        "type": "video",
    },
    "video-text": {
        "text": "Describe this video and its style in a very detailed manner. Some information about the image is '{}'. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences.",
        "type": "video",
    },
    "video-f1-detail-3ex": {
        "text": "A video is given by providing the middle frame. Describe this video and its style to generate a description. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
        "type": "video",
    },
    "video-f1-detail-2ex-text": {
        "text": "A video is given by providing the middle frame. Some information about the image is '{}'. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.",
        "type": "video",
    },
    "video-f3-detail-3ex": {
        "text": "A video is given by providing three frames in chronological order. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
        "type": "video",
    },
    "video-f3-detail-2ex-text": {
        "text": "A video is given by providing three frames in chronological order. Some information about the image is '{}'. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.",
        "type": "video",
    },
}


# Maps the ``num_frames`` argument of VideoTextDataset to the ``points`` tuple
# it passes to extract_frames(). Only 1, 2 and 3 frames are available; any
# other value raises KeyError on lookup.
# NOTE(review): the values look like relative positions within the video
# (0.5 = middle) — confirm against extract_frames.
NUM_FRAMES_POINTS = {
    1: (0.5,),
    2: (0.25, 0.75),
    3: (0.1, 0.5, 0.9),
}


def read_file(input_path):
    """Load a table from ``input_path`` into a pandas DataFrame.

    The reader is chosen from the file name suffix: ``.csv`` uses
    ``pd.read_csv`` and ``.parquet`` uses ``pd.read_parquet``.

    Raises:
        NotImplementedError: for any other suffix.
    """
    readers = (
        (".csv", pd.read_csv),
        (".parquet", pd.read_parquet),
    )
    for suffix, reader in readers:
        if input_path.endswith(suffix):
            return reader(input_path)
    raise NotImplementedError(f"Unsupported file format: {input_path}")


class VideoTextDataset(torch.utils.data.Dataset):
    """Dataset of images / video frames, optionally paired with text.

    Rows come from the table loaded by ``read_file``. A ``text`` column, when
    present, is used as the text for every sample.
    """

    def __init__(self, csv_path, transform=None, num_frames=3, get_text_input_ids=None, resize=None):
        """Load the table and store the preprocessing options.

        Args:
            csv_path: table file handed to ``read_file``.
            transform: optional callable applied to the list of images.
            num_frames: key into ``NUM_FRAMES_POINTS``.
            get_text_input_ids: optional callable producing the ``text``
                entry; called with the row's text when the table has a
                ``text`` column, otherwise with no arguments.
            resize: optional size; images with a side larger than it are
                resized with a bicubic ``Resize``.
        """
        self.csv_path = csv_path
        self.transform = transform
        self.data = read_file(csv_path)
        self.points = NUM_FRAMES_POINTS[num_frames]
        self.get_text_input_ids = get_text_input_ids
        self.resize_size = resize
        if resize is None:
            self.resize = None
        else:
            self.resize = transforms.Resize(resize, transforms.InterpolationMode.BICUBIC)
        self.use_text = "text" in self.data.columns

    def getitem(self, index):
        """Build the sample dict for row ``index``.

        Returns:
            A dict with keys ``path``, ``image``, ``length``, ``img_size``
            and ``text``.
        """
        row = self.data.iloc[index]
        path = row["path"]
        if is_video(path):
            images, length = extract_frames(row["path"], points=self.points, backend="opencv", return_length=True)
        else:
            images, length = [pil_loader(path)], 1

        limit = self.resize_size
        if limit is not None:
            # only images with a side exceeding the limit are resized
            images = [
                self.resize(img) if (img.size[0] > limit or img.size[1] > limit) else img
                for img in images
            ]

        # sizes are recorded after the optional resize, before ``transform``
        imgs_size = [img.size for img in images]
        if self.transform is not None:
            images = self.transform(images)

        # images are kept in a list because the PyTorch dataloader does not accept PIL images
        out = dict(path=path, image=images, length=length, img_size=imgs_size)

        tokenize = self.get_text_input_ids
        if tokenize is None:
            out["text"] = row["text"] if self.use_text else ""
        elif self.use_text:
            out["text"] = tokenize(row["text"])
        else:
            out["text"] = tokenize()
        return out

    def __len__(self):
        """Return the number of rows in the table."""
        return len(self.data)

    def __getitem__(self, index):
        """Delegate to ``getitem``."""
        return self.getitem(index)


def collate_fn(batch):
    """Turn a list of sample dicts into five parallel lists.

    Returns:
        ``(paths, images, lengths, img_sizes, texts)``, each a list with one
        entry per sample, in batch order.
    """
    fields = ("path", "image", "length", "img_size", "text")
    paths, images, lengths, img_sizes, texts = ([item[key] for item in batch] for key in fields)
    return paths, images, lengths, img_sizes, texts


class Timer:
    """Context manager measuring the time spent inside its ``with`` block.

    After the block exits, ``start_time`` and ``end_time`` hold the
    ``time.time()`` readings taken on entry and exit, and ``time_taken`` holds
    their difference in seconds.
    """

    def __init__(self):
        # all three readings start at zero until the timer is used
        self.time_taken = self.start_time = self.end_time = 0

    def __enter__(self):
        """Record the start time and return the timer itself."""
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        """Record the end time and elapsed time; exceptions are not suppressed."""
        finished = time.time()
        self.end_time = finished
        self.time_taken = finished - self.start_time


================================================
FILE: Open-Sora/tools/datasets/README.md
================================================
# Dataset Management

- [Dataset Management](#dataset-management)
  - [Dataset Format](#dataset-format)
  - [Dataset to CSV](#dataset-to-csv)
  - [Manage datasets](#manage-datasets)
    - [Requirement](#requirement)
    - [Basic Usage](#basic-usage)
    - [Score filtering](#score-filtering)
    - [Documentation](#documentation)
  - [Transform datasets](#transform-datasets)
    - [Resize](#resize)
    - [Frame extraction](#frame-extraction)
    - [Crop Midjourney 4 grid](#crop-midjourney-4-grid)
  - [Analyze datasets](#analyze-datasets)
  - [Data Process Pipeline](#data-process-pipeline)

After preparing the raw dataset according to the [instructions](/docs/datasets.md), you can use the following commands to manage the dataset.

## Dataset Format

All datasets should be provided as a `.csv` file (or `parquet.gzip` to save space), which is used for both training and data preprocessing. The columns should use the names below:

- `path`: the relative/absolute path or url to the image or video file. Required.
- `text`: the caption or description of the image or video. Required for training.
- `num_frames`: the number of frames in the video. Required for training.
- `width`: the width of the video frame. Required for dynamic bucket.
- `height`: the height of the video frame. Required for dynamic bucket.
- `aspect_ratio`: the aspect ratio of the video frame (height / width). Required for dynamic bucket.
- `resolution`: height x width. For analysis.
- `text_len`: the number of tokens in the text. For analysis.
- `aes`: aesthetic score calculated by [aesthetic scorer](/tools/aesthetic/README.md). For filtering.
- `flow`: optical flow score calculated by [UniMatch](/tools/scoring/README.md). For filtering.
- `match`: matching score of an image-text/video-text pair calculated by [CLIP](/tools/scoring/README.md). For filtering.
- `fps`: the frame rate of the video. Optional.
- `cmotion`: the camera motion.

An example ready for training:

```csv
path, text, num_frames, width, height, aspect_ratio
/absolute/path/to/image1.jpg, caption, 1, 720, 1280, 0.5625
/absolute/path/to/video1.mp4, caption, 120, 720, 1280, 0.5625
/absolute/path/to/video2.mp4, caption, 20, 256, 256, 1
```

We use pandas to manage the `.csv` or `.parquet` files. The following code is for reading and writing files:

```python
df = pd.read_csv(input_path)
df.to_csv(output_path, index=False)
# or use parquet, which is smaller
df = pd.read_parquet(input_path)
df.to_parquet(output_path, index=False)
```

## Dataset to CSV

As a starting point, `convert.py` is used to convert the dataset to a CSV file. You can use the following commands to convert the dataset to a CSV file:

```bash
python -m tools.datasets.convert DATASET-TYPE DATA_FOLDER

# general video folder
python -m tools.datasets.convert video VIDEO_FOLDER --output video.csv
# general image folder
python -m tools.datasets.convert image IMAGE_FOLDER --output image.csv
# imagenet
python -m tools.datasets.convert imagenet IMAGENET_FOLDER --split train
# ucf101
python -m tools.datasets.convert ucf101 UCF101_FOLDER --split videos
# vidprom
python -m tools.datasets.convert vidprom VIDPROM_FOLDER --info VidProM_semantic_unique.csv
```

## Manage datasets

Use `datautil` to manage the dataset.

### Requirement

Follow our [installation guide](../../docs/installation.md)'s "Data Dependencies" and "Datasets" section to install the required packages.
<!-- To accelerate processing speed, you can install [pandarallel](https://github.com/nalepae/pandarallel):

```bash
pip install pandarallel
``` -->

<!-- To get image and video information, you need to install [opencv-python](https://github.com/opencv/opencv-python): -->

<!-- ```bash
pip install opencv-python
# If your videos are in av1 codec instead of h264, you need to
# - install ffmpeg first
# - install via conda to support av1 codec
conda install -c conda-forge opencv
``` -->

<!-- Or to get video information, you can install ffmpeg and ffmpeg-python:

```bash
pip install ffmpeg-python
``` -->

<!-- To filter a specific language, you need to install [lingua](https://github.com/pemistahl/lingua-py):

```bash
pip install lingua-language-detector
``` -->

### Basic Usage

You can use the following commands to process the `csv` or `parquet` files. The output file will be saved in the same directory as the input, with different suffixes indicating the processed method.

```bash
# datautil takes multiple CSV files as input and merges them into one CSV file
# output: DATA1+DATA2.csv
python -m tools.datasets.datautil DATA1.csv DATA2.csv

# shard CSV files into multiple CSV files
# output: DATA1_0.csv, DATA1_1.csv, ...
python -m tools.datasets.datautil DATA1.csv --shard 10

# filter frames between 128 and 256, with captions
# output: DATA1_fmin_128_fmax_256.csv
python -m tools.datasets.datautil DATA.csv --fmin 128 --fmax 256

# Disable parallel processing
python -m tools.datasets.datautil DATA.csv --fmin 128 --fmax 256 --disable-parallel

# Compute num_frames, height, width, fps, aspect_ratio for videos or images
# output: IMG_DATA+VID_DATA_vinfo.csv
python -m tools.datasets.datautil IMG_DATA.csv VID_DATA.csv --video-info

# You can run multiple operations at the same time.
python -m tools.datasets.datautil DATA.csv --video-info --remove-empty-caption --remove-url --lang en
```

### Score filtering

To examine and filter the quality of the dataset by aesthetic score and clip score, you can use the following commands:

```bash
# sort the dataset by aesthetic score
# output: DATA_sort.csv
python -m tools.datasets.datautil DATA.csv --sort aesthetic_score
# View examples of high aesthetic score
head -n 10 DATA_sort.csv
# View examples of low aesthetic score
tail -n 10 DATA_sort.csv

# sort the dataset by clip score
# output: DATA_sort.csv
python -m tools.datasets.datautil DATA.csv --sort clip_score

# filter the dataset by aesthetic score
# output: DATA_aesmin_0.5.csv
python -m tools.datasets.datautil DATA.csv --aesmin 0.5
# filter the dataset by clip score
# output: DATA_matchmin_0.5.csv
python -m tools.datasets.datautil DATA.csv --matchmin 0.5
```

### Documentation

You can also use `python -m tools.datasets.datautil --help` to see usage.

| Args                        | File suffix    | Description                                                   |
| --------------------------- | -------------- | ------------------------------------------------------------- |
| `--output OUTPUT`           |                | Output path                                                   |
| `--format FORMAT`           |                | Output format (csv, parquet, parquet.gzip)                    |
| `--disable-parallel`        |                | Disable `pandarallel`                                         |
| `--seed SEED`               |                | Random seed                                                   |
| `--shard SHARD`             | `_0`,`_1`, ... | Shard the dataset                                             |
| `--sort KEY`                | `_sort`        | Sort the dataset by KEY                                       |
| `--sort-descending KEY`     | `_sort`        | Sort the dataset by KEY in descending order                   |
| `--difference DATA.csv`     |                | Remove the paths in DATA.csv from the dataset                 |
| `--intersection DATA.csv`   |                | Keep the paths in DATA.csv from the dataset and merge columns |
| `--info`                    | `_info`        | Get the basic information of each video and image (cv2)       |
| `--ext`                     | `_ext`         | Remove rows if the file does not exist                        |
| `--relpath`                 | `_relpath`     | Modify the path to relative path by root given                |
| `--abspath`                 | `_abspath`     | Modify the path to absolute path by root given                |
| `--remove-empty-caption`    | `_noempty`     | Remove rows with empty caption                                |
| `--remove-url`              | `_nourl`       | Remove rows with url in caption                               |
| `--lang LANG`               | `_lang`        | Remove rows with other language                               |
| `--remove-path-duplication` | `_noduppath`   | Remove rows with duplicated path                              |
| `--remove-text-duplication` | `_noduptext`   | Remove rows with duplicated caption                           |
| `--refine-llm-caption`      | `_llm`         | Modify the caption generated by LLM                           |
| `--clean-caption MODEL`     | `_clean`       | Modify the caption according to T5 pipeline to suit training  |
| `--unescape`                | `_unescape`    | Unescape the caption                                          |
| `--merge-cmotion`           | `_cmotion`     | Merge the camera motion to the caption                        |
| `--count-num-token`         | `_ntoken`      | Count the number of tokens in the caption                     |
| `--load-caption EXT`        | `_load`        | Load the caption from the file                                |
| `--fmin FMIN`               | `_fmin`        | Filter the dataset by minimum number of frames                |
| `--fmax FMAX`               | `_fmax`        | Filter the dataset by maximum number of frames                |
| `--hwmax HWMAX`             | `_hwmax`       | Filter the dataset by maximum height x width                  |
| `--aesmin AESMIN`           | `_aesmin`      | Filter the dataset by minimum aesthetic score                 |
| `--matchmin MATCHMIN`       | `_matchmin`    | Filter the dataset by minimum clip score                      |
| `--flowmin FLOWMIN`         | `_flowmin`     | Filter the dataset by minimum optical flow score              |

## Transform datasets

The `tools.datasets.transform` module provides a set of tools to transform the dataset. The general usage is as follows:

```bash
python -m tools.datasets.transform TRANSFORM_TYPE META.csv ORIGINAL_DATA_FOLDER DATA_FOLDER_TO_SAVE_RESULTS --additional-args
```

### Resize

Sometimes you may need to resize the images or videos to a specific resolution. You can use the following commands to resize the dataset:

```bash
python -m tools.datasets.transform meta.csv /path/to/raw/data /path/to/new/data --length 2160
```

### Frame extraction

To extract frames from videos, you can use the following commands:

```bash
python -m tools.datasets.transform vid_frame_extract meta.csv /path/to/raw/data /path/to/new/data --points 0.1 0.5 0.9
```

### Crop Midjourney 4 grid

Randomly select one of the 4 images in the 4 grid generated by Midjourney.

```bash
python -m tools.datasets.transform img_rand_crop meta.csv /path/to/raw/data /path/to/new/data
```

## Analyze datasets

You can easily get basic information about a `.csv` dataset by using the following commands:

```bash
# examine the first 10 rows of the CSV file
head -n 10 DATA1.csv
# count the number of data in the CSV file (approximately)
wc -l DATA1.csv
```

For the dataset provided in a `.csv` or `.parquet` file, you can easily analyze the dataset using the following commands. Plots will be automatically saved.

```bash
python -m tools.datasets.analyze DATA_info.csv
```

## Data Process Pipeline

```bash
# Suppose videos and images under ~/dataset/
# 1. Convert dataset to CSV
python -m tools.datasets.convert video ~/dataset --output meta.csv

# 2. Get video information
python -m tools.datasets.datautil meta.csv --info --fmin 1

# 3. Get caption
# 3.1. generate caption
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava meta_info_fmin1.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video
# merge generated results
python -m tools.datasets.datautil meta_info_fmin1_caption_part*.csv --output meta_caption.csv
# merge caption and info
python -m tools.datasets.datautil meta_info_fmin1.csv --intersection meta_caption.csv --output meta_caption_info.csv
# clean caption
python -m tools.datasets.datautil meta_caption_info.csv --clean-caption --refine-llm-caption --remove-empty-caption --output meta_caption_processed.csv
# 3.2. extract caption
python -m tools.datasets.datautil meta_info_fmin1.csv --load-caption json --remove-empty-caption --clean-caption

# 4. Scoring
# aesthetic scoring
torchrun --standalone --nproc_per_node 8 -m tools.scoring.aesthetic.inference meta_caption_processed.csv
python -m tools.datasets.datautil meta_caption_processed_part*.csv --output meta_caption_processed_aes.csv
# optical flow scoring
torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference meta_caption_processed.csv
# matching scoring
torchrun --standalone --nproc_per_node 8 -m tools.scoring.matching.inference meta_caption_processed.csv
# camera motion
python -m tools.caption.camera_motion_detect meta_caption_processed.csv
```


================================================
FILE: Open-Sora/tools/datasets/__init__.py
================================================


================================================
FILE: Open-Sora/tools/datasets/analyze.py
================================================
import argparse
import os

import matplotlib.pyplot as plt
import pandas as pd


def read_file(input_path):
    """Load a dataset table from ``input_path``.

    The loader is picked from the file name suffix: ``.csv`` uses
    ``pd.read_csv`` and ``.parquet`` uses ``pd.read_parquet``.

    Raises:
        NotImplementedError: if the path ends with neither suffix.
    """
    loaders = ((".csv", pd.read_csv), (".parquet", pd.read_parquet))
    for suffix, loader in loaders:
        if input_path.endswith(suffix):
            return loader(input_path)
    raise NotImplementedError(f"Unsupported file format: {input_path}")


def parse_args():
    """Parse the command line of the analysis script.

    Returns:
        argparse.Namespace with ``input`` (dataset path) and ``save_img``
        (directory for the generated plots, ``samples/infos/`` by default).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("input", help="Path to the input dataset", type=str)
    cli.add_argument(
        "--save-img",
        help="Path to save the image",
        default="samples/infos/",
        type=str,
    )
    parsed = cli.parse_args()
    return parsed


def plot_data(data, column, bins, name):
    """Draw a histogram of ``data[column]`` and save it to ``name``.

    Args:
        data: table exposing a pandas-style ``hist(column=..., bins=...)``
            method (the callers in this file pass DataFrames).
        column: name of the column to plot.
        bins: number of histogram bins, forwarded to ``hist``.
        name: output image path; its parent directory is created if needed.
    """
    plt.clf()
    data.hist(column=column, bins=bins)
    # ``os.makedirs("")`` raises, so only create a directory when the target
    # path actually has a directory component.
    out_dir = os.path.dirname(name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(name)
    print(f"Saved {name}")


def plot_categorical_data(data, column, name):
    """Draw a bar chart of the value counts of ``data[column]`` and save it.

    Args:
        data: table supporting ``data[column].value_counts()`` (the callers
            in this file pass DataFrames).
        column: name of the categorical column to plot.
        name: output image path; its parent directory is created if needed.
    """
    plt.clf()
    data[column].value_counts().plot(kind="bar")
    # ``os.makedirs("")`` raises, so only create a directory when the target
    # path actually has a directory component.
    out_dir = os.path.dirname(name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(name)
    print(f"Saved {name}")


# Columns that `main` plots when they are present in the dataset.
# The value is the number of histogram bins handed to `plot_data`; `None`
# marks a column that is plotted with `plot_categorical_data` instead.
COLUMNS = {
    "num_frames": 100,
    "resolution": 100,
    "text_len": 100,
    "aes": 100,
    "match": 100,
    "flow": 100,
    "cmotion": None,
}


def _plot_columns(frame, prefix, save_dir, skip=()):
    """Plot every ``COLUMNS`` entry present in ``frame`` into ``save_dir``.

    Files are named ``{prefix}_{column}.png``. Columns listed in ``skip`` are
    ignored.
    """
    for column, bins in COLUMNS.items():
        if column not in frame.columns or column in skip:
            continue
        target = os.path.join(save_dir, f"{prefix}_{column}.png")
        if bins is None:
            plot_categorical_data(frame, column, target)
        else:
            plot_data(frame, column, bins, target)


def main(args):
    """Print summary statistics of a dataset and save per-column plots.

    Rows with ``num_frames == 1`` are reported as images, all other rows as
    videos. Plots are written under ``args.save_img`` when it is non-empty.

    Args:
        args: namespace with ``input`` (dataset path) and ``save_img``
            (output directory for plots), as produced by ``parse_args``.
    """
    data = read_file(args.input)

    # Without a ``num_frames`` column no row can be identified as an image,
    # so every row is reported in the video section. The original indexed the
    # column unconditionally and raised KeyError, which contradicted the
    # explicit column check in the video section below.
    if "num_frames" in data.columns:
        image_index = data["num_frames"] == 1
    else:
        image_index = pd.Series(False, index=data.index)

    # === Image Data Info ===
    if image_index.sum() > 0:
        print("=== Image Data Info ===")
        img_data = data[image_index]
        print(f"Number of images: {len(img_data)}")
        print(img_data.head())
        print(img_data.describe())
        if args.save_img:
            # num_frames is constant (1) for images and cmotion is excluded
            # for images, exactly as before.
            _plot_columns(img_data, "image", args.save_img, skip=("num_frames", "cmotion"))

    # === Video Data Info ===
    if not image_index.all():
        print("=== Video Data Info ===")
        video_data = data[~image_index]
        print(f"Number of videos: {len(video_data)}")
        if "num_frames" in video_data.columns:
            total_num_frames = video_data["num_frames"].sum()
            print(f"Number of frames: {total_num_frames}")
            # Duration is an estimate that assumes every video runs at 30 FPS.
            DEFAULT_FPS = 30
            total_hours = total_num_frames / DEFAULT_FPS / 3600
            print(f"Total hours (30 FPS): {int(total_hours)}")
        print(video_data.head())
        print(video_data.describe())
        if args.save_img:
            _plot_columns(video_data, "video", args.save_img)


if __name__ == "__main__":
    # Script entry point: parse the CLI arguments, then run the analysis.
    args = parse_args()
    main(args)


================================================
FILE: Open-Sora/tools/datasets/convert.py
================================================
import argparse
import os
import time

import pandas as pd
from torchvision.datasets import ImageNet

# Lower-case file suffixes (with leading dot) used to select image / video
# files; `get_filelist` lower-cases each file's suffix before comparing.
IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv", ".m2ts")


def scan_recursively(root):
    """Yield an ``os.DirEntry`` for every file below ``root``, depth first.

    Sub-directories are descended into as they are encountered. A progress
    line is printed for every 100th sub-directory found directly inside the
    directory currently being scanned (the counter is per directory level,
    not global).

    Args:
        root: directory to scan.

    Yields:
        os.DirEntry objects for which ``is_file()`` is true.
    """
    num = 0
    # Use the context manager so the directory handle is closed even when the
    # consumer stops iterating early.
    with os.scandir(root) as entries:
        for entry in entries:
            if entry.is_file():
                yield entry
            elif entry.is_dir():
                num += 1
                if num % 100 == 0:
                    print(f"Scanned {num} directories.")
                yield from scan_recursively(entry.path)


def get_filelist(file_path, exts=None):
    """Collect the paths of all files below ``file_path``.

    Args:
        file_path: directory that is scanned recursively.
        exts: optional container of lower-case suffixes (with leading dot);
            when given, only files whose lower-cased suffix is in it are kept.

    Returns:
        List of file paths in scan order. The file count and the elapsed time
        are printed as a side effect.
    """
    started = time.time()

    def wanted(entry):
        # Keep regular files whose suffix passes the optional filter.
        if not entry.is_file():
            return False
        suffix = os.path.splitext(entry.name)[-1].lower()
        return exts is None or suffix in exts

    filelist = [entry.path for entry in scan_recursively(file_path) if wanted(entry)]

    finished = time.time()
    print(f"Scanned {len(filelist)} files in {finished - started:.2f} seconds.")
    return filelist


def split_by_capital(name):
    """Insert a space before every upper-case character except the first.

    Example: ``BoxingPunchingBag`` -> ``Boxing Punching Bag``.
    """
    return "".join(
        " " + char if position != 0 and char.isupper() else char
        for position, char in enumerate(name)
    )


def process_imagenet(root, split):
    """Write ``imagenet_{split}.csv`` with one ``path``/``text`` row per sample.

    The text of a sample is element 0 of the ``classes`` entry for its label,
    as exposed by torchvision's ``ImageNet`` dataset.

    Args:
        root: ImageNet root folder (``~`` is expanded).
        split: dataset split forwarded to ``ImageNet``.
    """
    dataset = ImageNet(os.path.expanduser(root), split=split)

    samples = []
    for path, label in dataset.samples:
        samples.append((path, dataset.classes[label][0]))

    output = f"imagenet_{split}.csv"
    table = pd.DataFrame(samples, columns=["path", "text"])
    table.to_csv(output, index=False)
    print(f"Saved {len(samples)} samples to {output}.")


def process_ucf101(root, split):
    """Write ``ucf101_{split}.csv`` with one ``path``/``text`` row per video.

    The caption of a video is the name of its parent directory with spaces
    inserted before capital letters (``BoxingPunchingBag`` ->
    ``Boxing Punching Bag``).

    Args:
        root: UCF101 root folder (``~`` is expanded).
        split: sub-folder of ``root`` that is scanned for files.
    """
    root = os.path.expanduser(root)
    video_lists = get_filelist(os.path.join(root, split))
    # Take the parent directory name through os.path rather than splitting on
    # "/", so the class name is also found with a non-"/" path separator.
    classes = [split_by_capital(os.path.basename(os.path.dirname(x))) for x in video_lists]
    samples = list(zip(video_lists, classes))
    output = f"ucf101_{split}.csv"

    df = pd.DataFrame(samples, columns=["path", "text"])
    df.to_csv(output, index=False)
    print(f"Saved {len(samples)} samples to {output}.")


def process_vidprom(root, info):
    """Write ``vidprom.csv`` pairing existing videos with their prompts.

    For every ``uuid`` in the info CSV the expected file is
    ``{root}/pika-{uuid}.mp4``; rows whose file was not found by the scan of
    ``root`` are dropped.

    Args:
        root: folder containing the videos (``~`` is expanded).
        info: path to a CSV with ``uuid`` and ``prompt`` columns.
    """
    root = os.path.expanduser(root)
    existing = set(get_filelist(root))

    # read info csv
    infos = pd.read_csv(info)
    abs_path = infos["uuid"].apply(lambda uuid: os.path.join(root, f"pika-{uuid}.mp4"))
    is_exist = abs_path.apply(lambda candidate: candidate in existing)

    kept_paths = abs_path[is_exist]
    kept_prompts = infos["prompt"][is_exist]
    df = pd.DataFrame({"path": kept_paths, "text": kept_prompts})
    df.to_csv("vidprom.csv", index=False)
    print(f"Saved {len(df)} samples to vidprom.csv.")


def process_general_images(root, output):
    """Index every image below ``root`` into a CSV with ``id`` and ``path``.

    ``id`` is the file name without its suffix.

    Args:
        root: folder to scan (``~`` is expanded). If it does not exist, a
            message is printed and nothing is written.
        output: path of the CSV to write; its parent directory is created
            when the path has one.
    """
    root = os.path.expanduser(root)
    if not os.path.exists(root):
        print(f"Root '{root}' does not exist. Nothing to do.")
        return
    path_list = get_filelist(root, IMG_EXTENSIONS)
    fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]
    df = pd.DataFrame(dict(id=fname_list, path=path_list))

    # A bare file name such as "meta.csv" has an empty dirname, and
    # os.makedirs("") raises, so only create a directory when there is one.
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output, index=False)
    print(f"Saved {len(df)} samples to {output}.")


def process_general_videos(root, output):
    """Index every video below ``root`` into a CSV.

    The CSV has the columns ``path``, ``id`` (file name without suffix) and
    ``relpath`` (path relative to ``root``).

    Args:
        root: folder to scan (``~`` is expanded). If it does not exist, a
            message is printed and nothing is written.
        output: path of the CSV to write; its parent directory is created
            when the path has one.
    """
    root = os.path.expanduser(root)
    if not os.path.exists(root):
        print(f"Root '{root}' does not exist. Nothing to do.")
        return
    path_list = get_filelist(root, VID_EXTENSIONS)
    # Remove duplicates while keeping scan order, so the CSV row order is
    # reproducible (a plain set() would shuffle it between runs).
    path_list = list(dict.fromkeys(path_list))
    fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]
    relpath_list = [os.path.relpath(x, root) for x in path_list]
    df = pd.DataFrame(dict(path=path_list, id=fname_list, relpath=relpath_list))

    # A bare file name such as "meta.csv" has an empty dirname, and
    # os.makedirs("") raises, so only create a directory when there is one.
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output, index=False)
    print(f"Saved {len(df)} samples to {output}.")


if __name__ == "__main__":
    # Command-line entry point: pick the converter for the requested dataset.
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", type=str, choices=["imagenet", "ucf101", "vidprom", "image", "video"])
    parser.add_argument("root", type=str)
    parser.add_argument("--split", type=str, default="train")
    parser.add_argument("--info", type=str, default=None)
    # --output is only consumed by the generic "image"/"video" converters; the
    # other datasets write to fixed file names, so it must not be required
    # for them.
    parser.add_argument(
        "--output", type=str, default=None, help="Output path (required for the 'image' and 'video' datasets)"
    )
    args = parser.parse_args()

    # Validate per-dataset requirements up front with a clear CLI error.
    if args.dataset in ("image", "video") and args.output is None:
        parser.error("--output is required for the 'image' and 'video' datasets")
    if args.dataset == "vidprom" and args.info is None:
        parser.error("--info is required for the 'vidprom' dataset")

    if args.dataset == "imagenet":
        process_imagenet(args.root, args.split)
    elif args.dataset == "ucf101":
        process_ucf101(args.root, args.split)
    elif args.dataset == "vidprom":
        process_vidprom(args.root, args.info)
    elif args.dataset == "image":
        process_general_images(args.root, args.output)
    elif args.dataset == "video":
        process_general_videos(args.root, args.output)
    else:
        raise ValueError("Invalid dataset")


================================================
FILE: Open-Sora/tools/datasets/datautil.py
================================================
import argparse
import html
import json
import os
import random
import re
from functools import partial
from glob import glob

import cv2
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

from opensora.datasets.read_video import read_video

from .utils import IMG_EXTENSIONS

# Register tqdm's `progress_apply` on pandas objects; `apply` below uses it
# as the serial fallback.
tqdm.pandas()

# pandarallel is an optional dependency: when it can be imported, `apply`
# dispatches to `parallel_apply`, otherwise to `progress_apply`.
# NOTE(review): no `pandarallel.initialize(...)` call is visible in this part
# of the file — presumably it happens later; confirm.
try:
    from pandarallel import pandarallel

    PANDA_USE_PARALLEL = True
except ImportError:
    PANDA_USE_PARALLEL = False


def apply(df, func, **kwargs):
    """Apply ``func`` over ``df``, in parallel when pandarallel is available.

    Uses ``df.parallel_apply`` if ``PANDA_USE_PARALLEL`` is true and
    ``df.progress_apply`` otherwise; ``kwargs`` are forwarded unchanged.
    """
    runner = df.parallel_apply if PANDA_USE_PARALLEL else df.progress_apply
    return runner(func, **kwargs)


# Column names of a training-ready table.
# NOTE(review): not referenced in the visible part of this file — presumably
# used further down to select columns; confirm.
TRAIN_COLUMNS = ["path", "text", "num_frames", "fps", "height", "width", "aspect_ratio", "resolution", "text_len"]

# ======================================================
# --info
# ======================================================


def get_video_length(cap, method="header"):
    """Return the frame count of an opened capture object.

    Args:
        cap: object with ``get``/``set`` methods taking cv2 property ids
            (the callers pass a ``cv2.VideoCapture``).
        method: ``"header"`` reads ``CAP_PROP_FRAME_COUNT``; ``"set"`` moves
            the position ratio to 1 and reads ``CAP_PROP_POS_FRAMES``.

    Raises:
        AssertionError: if ``method`` is neither ``"header"`` nor ``"set"``.
    """
    assert method in ["header", "set"]
    if method != "header":
        # Jump to the end of the stream and read the resulting frame index.
        cap.set(cv2.CAP_PROP_POS_AVI_RATIO, 1)
        return int(cap.get(cv2.CAP_PROP_POS_FRAMES))
    return int(cap.get(cv2.CAP_PROP_FRAME_COUNT))


def get_info_old(path):
    """Read basic media information with OpenCV (legacy implementation).

    Args:
        path: image or video path; the type is chosen from the lower-cased
            suffix via ``IMG_EXTENSIONS``.

    Returns:
        Tuple ``(num_frames, height, width, aspect_ratio, fps, hw)`` where
        ``aspect_ratio`` is ``height / width`` and ``hw`` is
        ``height * width``. Images report ``num_frames == 1`` and
        ``fps == nan``. Any failure yields ``(0, 0, 0, nan, nan, nan)``.
    """
    failure = (0, 0, 0, np.nan, np.nan, np.nan)
    try:
        ext = os.path.splitext(path)[1].lower()
        if ext in IMG_EXTENSIONS:
            im = cv2.imread(path)
            if im is None:
                return failure
            height, width = im.shape[:2]
            num_frames, fps = 1, np.nan
        else:
            cap = cv2.VideoCapture(path)
            try:
                num_frames, height, width, fps = (
                    get_video_length(cap, method="header"),
                    int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
                    int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                    float(cap.get(cv2.CAP_PROP_FPS)),
                )
            finally:
                # Release the decoder handle instead of waiting for GC.
                cap.release()
        hw = height * width
        aspect_ratio = height / width if width > 0 else np.nan
        return num_frames, height, width, aspect_ratio, fps, hw
    except Exception:
        # Best effort per file; ``Exception`` (not a bare except) keeps
        # Ctrl-C / SystemExit able to stop a long batch run.
        return failure


def get_info(path):
    """Return basic information for an image or video file.

    Dispatches on the lower-cased suffix: paths in ``IMG_EXTENSIONS`` go to
    ``get_image_info``, everything else to ``get_video_info``.

    Returns:
        Tuple ``(num_frames, height, width, aspect_ratio, fps, hw)``, or
        ``(0, 0, 0, nan, nan, nan)`` if anything fails.
    """
    try:
        ext = os.path.splitext(path)[1].lower()
        if ext in IMG_EXTENSIONS:
            return get_image_info(path)
        return get_video_info(path)
    except Exception:
        # Best effort per file; ``Exception`` (not a bare except) keeps
        # Ctrl-C / SystemExit able to stop a long batch run.
        return 0, 0, 0, np.nan, np.nan, np.nan


def get_image_info(path, backend="pillow"):
    """Read the size of an image.

    Args:
        path: image file path.
        backend: ``"pillow"`` (default) or ``"cv2"``.

    Returns:
        Tuple ``(num_frames, height, width, aspect_ratio, fps, hw)`` with
        ``num_frames == 1``, ``fps == nan``, ``aspect_ratio == height / width``
        and ``hw == height * width``. If the image cannot be read the result
        is ``(0, 0, 0, nan, nan, nan)``.

    Raises:
        ValueError: if ``backend`` is not one of the supported names.
    """
    failure = (0, 0, 0, np.nan, np.nan, np.nan)
    if backend == "pillow":
        try:
            with open(path, "rb") as f:
                img = Image.open(f)
                # Converting inside the ``with`` forces the pixel data to be
                # read while the file is still open.
                img = img.convert("RGB")
            width, height = img.size
        except Exception:
            # Best effort per file; Ctrl-C / SystemExit still propagate.
            return failure
    elif backend == "cv2":
        try:
            im = cv2.imread(path)
            if im is None:
                return failure
            height, width = im.shape[:2]
        except Exception:
            return failure
    else:
        raise ValueError(f"Unsupported image backend: {backend!r}")

    num_frames, fps = 1, np.nan
    hw = height * width
    aspect_ratio = height / width if width > 0 else np.nan
    return num_frames, height, width, aspect_ratio, fps, hw


def get_video_info(path, backend="torchvision"):
    """Read frame count, size and fps of a video.

    Args:
        path: video file path.
        backend: ``"torchvision"`` (default, uses the project's
            ``read_video``) or ``"cv2"``.

    Returns:
        Tuple ``(num_frames, height, width, aspect_ratio, fps, hw)`` with
        ``aspect_ratio == height / width`` and ``hw == height * width``.
        If the video cannot be read the result is
        ``(0, 0, 0, nan, nan, nan)``.

    Raises:
        ValueError: if ``backend`` is not one of the supported names.
    """
    failure = (0, 0, 0, np.nan, np.nan, np.nan)
    if backend == "torchvision":
        try:
            vframes, infos = read_video(path)
            # Dimensions 0, 2 and 3 are read as frames, height and width.
            num_frames, height, width = vframes.shape[0], vframes.shape[2], vframes.shape[3]
            fps = infos["video_fps"] if "video_fps" in infos else np.nan
        except Exception:
            # Best effort per file; Ctrl-C / SystemExit still propagate.
            return failure
    elif backend == "cv2":
        try:
            cap = cv2.VideoCapture(path)
            try:
                num_frames, height, width, fps = (
                    get_video_length(cap, method="header"),
                    int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
                    int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                    float(cap.get(cv2.CAP_PROP_FPS)),
                )
            finally:
                # Release the decoder handle instead of waiting for GC.
                cap.release()
        except Exception:
            return failure
    else:
        raise ValueError(f"Unsupported video backend: {backend!r}")

    hw = height * width
    aspect_ratio = height / width if width > 0 else np.nan
    return num_frames, height, width, aspect_ratio, fps, hw


# ======================================================
# --refine-llm-caption
# ======================================================

# Boilerplate openings stripped from generated captions. Each entry is
# matched as written and in its all-lower-case form.
LLAVA_PREFIX = [
    "The video shows",
    "The video captures",
    "The video features",
    "The video depicts",
    "The video presents",
    "The video is ",
    "In the video,",
    "The image shows",
    "The image captures",
    "The image features",
    "The image depicts",
    "The image presents",
    "The image is ",
    "The image portrays",
    "In the image,",
]


def remove_caption_prefix(caption):
    """Strip a leading boilerplate phrase from ``caption``.

    The first entry of ``LLAVA_PREFIX`` that the caption starts with (as
    written or lower-cased) is removed, surrounding whitespace is stripped
    and the first remaining character is upper-cased. Captions without a
    known prefix are returned unchanged.

    Args:
        caption: caption string.

    Returns:
        The cleaned caption; an empty string if nothing follows the prefix.
    """
    for prefix in LLAVA_PREFIX:
        if caption.startswith((prefix, prefix.lower())):
            caption = caption[len(prefix) :].strip()
            # The remainder may be empty (caption was only the prefix);
            # indexing it unguarded raised IndexError.
            if caption and caption[0].islower():
                caption = caption[0].upper() + caption[1:]
            return caption
    return caption


# ======================================================
# --merge-cmotion
# ======================================================

# Camera-motion label -> text appended to the caption. Labels without an
# entry (e.g. the disabled ones below) leave the caption untouched.
CMOTION_TEXT = {
    "static": "static",
    "pan_right": "pan right",
    "pan_left": "pan left",
    "zoom_in": "zoom in",
    "zoom_out": "zoom out",
    "tilt_up": "tilt up",
    "tilt_down": "tilt down",
    # "pan/tilt": "The camera is panning.",
    # "dynamic": "The camera is moving.",
    # "unknown": None,
}
# Camera-motion label -> probability of appending its text.
CMOTION_PROBS = {
    # hard-coded probabilities
    "static": 1.0,
    "zoom_in": 1.0,
    "zoom_out": 1.0,
    "pan_left": 1.0,
    "pan_right": 1.0,
    "tilt_up": 1.0,
    "tilt_down": 1.0,
    # "dynamic": 1.0,
    # "unknown": 0.0,
    # "pan/tilt": 1.0,
}


def merge_cmotion(caption, cmotion):
    """Append a camera-motion sentence to ``caption``.

    Args:
        caption: caption text.
        cmotion: camera-motion label, looked up in ``CMOTION_TEXT`` and
            ``CMOTION_PROBS``.

    Returns:
        ``"{caption} Camera motion: {text}."`` with the probability configured
        for the label; otherwise the caption unchanged. Labels that are not
        in the tables never modify the caption (previously they raised
        ``KeyError``).
    """
    text = CMOTION_TEXT.get(cmotion)
    prob = CMOTION_PROBS.get(cmotion, 0.0)
    if text is not None and random.random() < prob:
        caption = f"{caption} Camera motion: {text}."
    return caption


# ======================================================
# --lang
# ======================================================


def build_lang_detector(lang_to_detect):
    """Build a predicate telling whether a caption is in the given language.

    ``lingua`` is imported lazily, so it is only needed when language
    filtering is requested.

    Args:
        lang_to_detect: language code; only ``"en"`` is supported.

    Returns:
        Function ``detect_lang(caption) -> bool`` that is true when the target
        language is among the first five entries returned by the detector's
        ``compute_language_confidence_values``.

    Raises:
        AssertionError: if ``lang_to_detect`` is not supported.
    """
    from lingua import Language, LanguageDetectorBuilder

    supported = {"en": Language.ENGLISH}
    assert lang_to_detect in supported
    target = supported[lang_to_detect]

    builder = LanguageDetectorBuilder.from_all_spoken_languages()
    detector = builder.with_low_accuracy_mode().build()

    def detect_lang(caption):
        ranked = detector.compute_language_confidence_values(caption)
        top_languages = [item.language for item in ranked[:5]]
        return target in top_languages

    return detect_lang


# ======================================================
# --clean-caption
# ======================================================


def basic_clean(text):
    """Repair text with ``ftfy``, HTML-unescape it twice and strip it.

    ``ftfy`` is imported lazily, so it is only needed when caption cleaning
    is used.
    """
    import ftfy

    repaired = ftfy.fix_text(text)
    # Two passes decode doubly escaped entities such as "&amp;lt;".
    unescaped_once = html.unescape(repaired)
    unescaped_twice = html.unescape(unescaped_once)
    return unescaped_twice.strip()


# One or more consecutive "bad" punctuation characters: assorted symbols,
# all bracket kinds, "|", backslash, "/" and "*".
# Written as a single raw string: the pattern is identical to the previous
# concatenation, but no longer relies on invalid escape sequences such as
# "\)" in normal string literals (a SyntaxWarning on recent Python versions).
BAD_PUNCT_REGEX = re.compile(r"[#®•©™&@·º½¾¿¡§~\)\(\]\[\}\{\|\\/\*]{1,}")


def clean_caption(caption):
    """Normalize a raw caption with a fixed sequence of regex clean-ups.

    The input is converted to ``str``, URL-unquoted, stripped and lower-cased.
    After that, in order: URLs, HTML markup, ``@mentions`` and characters in
    several CJK blocks are removed; dashes and quotes are normalized; IP
    addresses, ``#`` numbers, long digit runs, file names, repeated
    punctuation and the characters of ``BAD_PUNCT_REGEX`` are dropped;
    ``basic_clean`` is applied; id-like alphanumeric tokens, a few marketing
    phrases and size strings such as ``10x20`` are removed; finally
    whitespace and leading/trailing punctuation are tidied.

    The order of the substitutions matters: later patterns operate on the
    output of earlier ones.

    Args:
        caption: any object; it is converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased caption with surrounding whitespace removed.
    """
    # Imported lazily so these dependencies are only needed when captions
    # are actually cleaned.
    import urllib.parse as ul

    from bs4 import BeautifulSoup

    caption = str(caption)
    caption = ul.unquote_plus(caption)
    caption = caption.strip().lower()
    caption = re.sub("<person>", "person", caption)
    # urls:
    caption = re.sub(
        r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    caption = re.sub(
        r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    # html:
    caption = BeautifulSoup(caption, features="html.parser").text

    # @<nickname>
    caption = re.sub(r"@[\w\d]+\b", "", caption)

    # 31C0—31EF CJK Strokes
    # 31F0—31FF Katakana Phonetic Extensions
    # 3200—32FF Enclosed CJK Letters and Months
    # 3300—33FF CJK Compatibility
    # 3400—4DBF CJK Unified Ideographs Extension A
    # 4DC0—4DFF Yijing Hexagram Symbols
    # 4E00—9FFF CJK Unified Ideographs
    caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
    caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
    caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
    caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
    caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
    caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
    caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
    #######################################################

    # all types of dash --> "-"
    caption = re.sub(
        r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
        "-",
        caption,
    )

    # normalize all quote characters to one standard
    caption = re.sub(r"[`´«»“”¨]", '"', caption)
    caption = re.sub(r"[‘’]", "'", caption)

    # &quot;
    caption = re.sub(r"&quot;?", "", caption)
    # &amp
    caption = re.sub(r"&amp", "", caption)

    # ip addresses:
    caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

    # article ids:
    caption = re.sub(r"\d:\d\d\s+$", "", caption)

    # literal backslash-n sequences (the two characters "\" and "n")
    caption = re.sub(r"\\n", " ", caption)

    # "#123"
    caption = re.sub(r"#\d{1,3}\b", "", caption)
    # "#12345.."
    caption = re.sub(r"#\d{5,}\b", "", caption)
    # "123456.."
    caption = re.sub(r"\b\d{6,}\b", "", caption)
    # filenames:
    caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

    # collapse repeated quotes and runs of dots
    caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
    caption = re.sub(r"[\.]{2,}", r" ", caption)  # "..." -> " "

    caption = re.sub(BAD_PUNCT_REGEX, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
    caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

    # this-is-my-cute-cat / this_is_my_cute_cat
    # Only when there are more than three separators are they all replaced
    # by spaces.
    regex2 = re.compile(r"(?:\-|\_)")
    if len(re.findall(regex2, caption)) > 3:
        caption = re.sub(regex2, " ", caption)

    caption = basic_clean(caption)

    caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
    caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
    caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

    caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
    caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
    caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
    caption = re.sub(r"\bpage\s+\d+\b", "", caption)

    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

    # sizes such as "10x20" (Latin x, Cyrillic х or the multiplication sign)
    caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

    caption = re.sub(r"\b\s+\:\s+", r": ", caption)
    caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
    caption = re.sub(r"\s+", " ", caption)

    # NOTE(review): the result of this strip() is discarded, so the anchored
    # patterns below see any leading/trailing space. Left unchanged because
    # text_preprocessing describes this as the exact training-time cleaning.
    caption.strip()

    caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
    caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
    caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
    caption = re.sub(r"^\.\S+$", "", caption)

    return caption.strip()


def text_preprocessing(text, use_text_preprocessing: bool = True):
    """Prepare a caption for the text encoder.

    Args:
        text: caption to process.
        use_text_preprocessing: when true, ``clean_caption`` is applied twice;
            when false, the text is only lower-cased and stripped.

    Returns:
        The processed caption.
    """
    if not use_text_preprocessing:
        return text.lower().strip()

    # The exact text cleaning as was in the training stage: two passes.
    for _ in range(2):
        text = clean_caption(text)
    return text


# ======================================================
# load caption
# ======================================================


def load_caption(path, ext):
    """Load the caption stored in the JSON file that sits next to a media file.

    The caption file is the media path with its suffix replaced by ``.json``
    (``/data/v1.0/clip.mp4`` -> ``/data/v1.0/clip.json``). For backward
    compatibility the legacy location, i.e. the path cut at its first dot
    plus ``.json``, is tried as a fallback.

    Args:
        path: path of the media file.
        ext: caption file type; only ``"json"`` is supported.

    Returns:
        The value stored under ``"caption"`` in the JSON file, or ``""`` when
        ``ext`` is unsupported or the caption cannot be loaded.
    """
    if ext != "json":
        return ""
    try:
        candidates = [os.path.splitext(path)[0] + ".json"]
        # Legacy rule: cut at the first dot anywhere in the path. That breaks
        # for "./a.mp4" or directories containing dots, so it is only a
        # fallback now.
        legacy = path.split(".")[0] + ".json"
    except (AttributeError, TypeError):
        # Non-string path, e.g. NaN from a missing CSV cell.
        return ""
    if legacy not in candidates:
        candidates.append(legacy)

    for json_path in candidates:
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                return json.load(f)["caption"]
        except (OSError, ValueError, KeyError, TypeError):
            # Missing or unreadable file, invalid JSON, or no "caption" key.
            continue
    return ""


# ======================================================
# --clean-caption
# ======================================================

# Probability with which an available score is left out of the caption.
DROP_SCORE_PROB = 0.2


def score_to_text(data):
    """Append the available scores to the caption of one sample.

    Args:
        data: mapping-like row with a ``"text"`` entry and optional ``"aes"``
            and ``"flow"`` entries.

    Returns:
        The caption, followed by ``" [aesthetic score: X.X, motion score:
        Y.Y]"`` for the scores that are present and were not randomly dropped
        (each one is dropped with probability ``DROP_SCORE_PROB``). Without
        any kept score the caption is returned unchanged.
    """
    caption = data["text"]
    labelled_scores = (("aes", "aesthetic score"), ("flow", "motion score"))

    kept = []
    for key, label in labelled_scores:
        if key not in data:
            continue
        value = data[key]
        if random.random() > DROP_SCORE_PROB:
            kept.append(f"{label}: {value:.1f}")

    if not kept:
        return caption
    return f"{caption} [{', '.join(kept)}]"


# ======================================================
# read & write
# ======================================================


def read_file(input_path):
    """Read a ``.csv`` or ``.parquet`` file into a DataFrame.

    Raises:
        NotImplementedError: if ``input_path`` has neither suffix.
    """
    is_csv = input_path.endswith(".csv")
    is_parquet = input_path.endswith(".parquet")
    if not (is_csv or is_parquet):
        raise NotImplementedError(f"Unsupported file format: {input_path}")

    reader = pd.read_csv if is_csv else pd.read_parquet
    return reader(input_path)


def save_file(data, output_path):
    """Write ``data`` to ``output_path`` as CSV or Parquet, without the index.

    The parent directory is created first when it is missing. The format is
    chosen from the suffix (``.csv`` or ``.parquet``).

    Returns:
        Whatever the pandas writer returns.

    Raises:
        NotImplementedError: if ``output_path`` has neither suffix.
    """
    parent = os.path.dirname(output_path)
    if parent != "" and not os.path.exists(parent):
        os.makedirs(parent)

    if output_path.endswith(".csv"):
        writer = data.to_csv
    elif output_path.endswith(".parquet"):
        writer = data.to_parquet
    else:
        raise NotImplementedError(f"Unsupported file format: {output_path}")
    return writer(output_path, index=False)


def read_data(input_paths):
    """Load and concatenate all input tables.

    Args:
        input_paths: iterable of file paths or glob patterns.

    Returns:
        Tuple ``(data, input_name)``: the concatenated DataFrame (index
        reset) and the base names of the loaded files, each cut at its first
        dot, joined with ``"+"``.

    Raises:
        SystemExit: after printing a message, when no input could be loaded.
    """
    input_list = []
    for input_path in input_paths:
        input_list.extend(glob(input_path))
    print("Input files:", input_list)

    data = []
    names = []
    for input_path in input_list:
        if not os.path.exists(input_path):
            continue
        frame = read_file(input_path)
        data.append(frame)
        names.append(os.path.basename(input_path).split(".")[0])
        print(f"Loaded {len(frame)} samples from '{input_path}'.")

    if not data:
        print("No samples to process. Exit.")
        # Raise SystemExit directly: the ``exit()`` helper is injected by the
        # ``site`` module for interactive use and is not guaranteed to exist.
        raise SystemExit()

    data = pd.concat(data, ignore_index=True, sort=False)
    print(f"Total number of samples: {len(data)}")
    # Joining the collected names cannot leave a dangling "+" when an entry
    # was skipped above.
    return data, "+".join(names)


# ======================================================
# main
# ======================================================
# To add a new method, register it in the main, parse_args, and get_output_path functions, and update the doc at /tools/datasets/README.md#documentation


def main(args):
    """Run the dataset-management steps selected by the parsed command-line ``args``.

    The steps run in the fixed order written below: load the inputs, optionally take
    the difference / intersection with another CSV, compute the output path, then
    IO-related columns, caption/path filtering, path and caption processing, sorting,
    score filtering, shuffle/head, column pruning, and finally saving (optionally in
    shards).

    Args:
        args (argparse.Namespace): options as produced by ``parse_args``.
    """
    # reading data
    data, input_name = read_data(args.input)

    # make difference
    # drop every row whose "path" also appears in the difference csv
    if args.difference is not None:
        data_diff = pd.read_csv(args.difference)
        print(f"Difference csv contains {len(data_diff)} samples.")
        data = data[~data["path"].isin(data_diff["path"])]
        input_name += f"-{os.path.basename(args.difference).split('.')[0]}"
        print(f"Filtered number of samples: {len(data)}.")

    # make intersection
    # inner-join on "path"; only columns that `data` does not already have are brought over
    if args.intersection is not None:
        data_new = pd.read_csv(args.intersection)
        print(f"Intersection csv contains {len(data_new)} samples.")
        cols_to_use = data_new.columns.difference(data.columns)

        col_on = "path"
        # if 'id' in data.columns and 'id' in data_new.columns:
        #     col_on = 'id'
        cols_to_use = cols_to_use.insert(0, col_on)
        data = pd.merge(data, data_new[cols_to_use], on=col_on, how="inner")
        print(f"Intersection number of samples: {len(data)}.")

    # get output path
    output_path = get_output_path(args, input_name)

    # preparation
    # helpers built once up front and used by the filtering / processing steps below
    if args.lang is not None:
        detect_lang = build_lang_detector(args.lang)
    if args.count_num_token == "t5":
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained("DeepFloyd/t5-v1_1-xxl")

    # IO-related
    if args.load_caption is not None:
        assert "path" in data.columns
        data["text"] = apply(data["path"], load_caption, ext=args.load_caption)
    if args.info:
        info = apply(data["path"], get_info)
        (
            data["num_frames"],
            data["height"],
            data["width"],
            data["aspect_ratio"],
            data["fps"],
            data["resolution"],
        ) = zip(*info)
    if args.video_info:
        info = apply(data["path"], get_video_info)
        (
            data["num_frames"],
            data["height"],
            data["width"],
            data["aspect_ratio"],
            data["fps"],
            data["resolution"],
        ) = zip(*info)
    if args.ext:
        # keep only rows whose file exists on disk
        assert "path" in data.columns
        data = data[apply(data["path"], os.path.exists)]

    # filtering
    if args.remove_url:
        assert "text" in data.columns
        data = data[~data["text"].str.contains(r"(?P<url>https?://[^\s]+)", regex=True)]
    if args.lang is not None:
        assert "text" in data.columns
        data = data[data["text"].progress_apply(detect_lang)]  # cannot parallelize
    if args.remove_empty_path:
        assert "path" in data.columns
        data = data[data["path"].str.len() > 0]
        data = data[~data["path"].isna()]
    if args.remove_empty_caption:
        assert "text" in data.columns
        data = data[data["text"].str.len() > 0]
        data = data[~data["text"].isna()]
    if args.remove_path_duplication:
        assert "path" in data.columns
        data = data.drop_duplicates(subset=["path"])
    if args.path_subset:
        # NOTE(review): str.contains is called with its defaults here, so the value is
        # presumably treated as a regular expression — confirm that is intended
        data = data[data["path"].str.contains(args.path_subset)]

    # processing
    if args.relpath is not None:
        data["path"] = apply(data["path"], lambda x: os.path.relpath(x, args.relpath))
    if args.abspath is not None:
        data["path"] = apply(data["path"], lambda x: os.path.join(args.abspath, x))
    if args.path_to_id:
        # id = file name without directory and extension
        data["id"] = apply(data["path"], lambda x: os.path.splitext(os.path.basename(x))[0])
    if args.merge_cmotion:
        data["text"] = apply(data, lambda x: merge_cmotion(x["text"], x["cmotion"]), axis=1)
    if args.refine_llm_caption:
        assert "text" in data.columns
        data["text"] = apply(data["text"], remove_caption_prefix)
    if args.append_text is not None:
        assert "text" in data.columns
        data["text"] = data["text"] + args.append_text
    if args.score_to_text:
        data["text"] = apply(data, score_to_text, axis=1)
    if args.clean_caption:
        assert "text" in data.columns
        data["text"] = apply(
            data["text"],
            partial(text_preprocessing, use_text_preprocessing=True),
        )
    if args.count_num_token is not None:
        assert "text" in data.columns
        data["text_len"] = apply(data["text"], lambda x: len(tokenizer(x)["input_ids"]))
    if args.update_text is not None:
        # overwrite "text" for the paths that also appear in the update csv
        data_new = pd.read_csv(args.update_text)
        num_updated = data.path.isin(data_new.path).sum()
        print(f"Number of updated samples: {num_updated}.")
        data = data.set_index("path")
        data_new = data_new[["path", "text"]].set_index("path")
        data.update(data_new)
        data = data.reset_index()

    # sort
    if args.sort is not None:
        data = data.sort_values(by=args.sort, ascending=False)
    if args.sort_ascending is not None:
        data = data.sort_values(by=args.sort_ascending, ascending=True)

    # filtering
    if args.filesize:
        # file size in MB
        assert "path" in data.columns
        data["filesize"] = apply(data["path"], lambda x: os.stat(x).st_size / 1024 / 1024)
    if args.fsmax is not None:
        assert "filesize" in data.columns
        data = data[data["filesize"] <= args.fsmax]
    if args.remove_empty_caption:
        # second pass of the same filter, this time after the caption processing steps
        assert "text" in data.columns
        data = data[data["text"].str.len() > 0]
        data = data[~data["text"].isna()]
    if args.fmin is not None:
        assert "num_frames" in data.columns
        data = data[data["num_frames"] >= args.fmin]
    if args.fmax is not None:
        assert "num_frames" in data.columns
        data = data[data["num_frames"] <= args.fmax]
    if args.fpsmax is not None:
        # rows whose fps is NaN are kept
        assert "fps" in data.columns
        data = data[(data["fps"] <= args.fpsmax) | np.isnan(data["fps"])]
    if args.hwmax is not None:
        # derive "resolution" from height * width when the column is missing
        if "resolution" not in data.columns:
            height = data["height"]
            width = data["width"]
            data["resolution"] = height * width
        data = data[data["resolution"] <= args.hwmax]
    if args.aesmin is not None:
        assert "aes" in data.columns
        data = data[data["aes"] >= args.aesmin]
    if args.matchmin is not None:
        assert "match" in data.columns
        data = data[data["match"] >= args.matchmin]
    if args.flowmin is not None:
        assert "flow" in data.columns
        data = data[data["flow"] >= args.flowmin]
    if args.remove_text_duplication:
        data = data.drop_duplicates(subset=["text"], keep="first")
    if args.img_only:
        data = data[data["path"].str.lower().str.endswith(IMG_EXTENSIONS)]
    if args.vid_only:
        # "video" here means: anything that does not end with an image extension
        data = data[~data["path"].str.lower().str.endswith(IMG_EXTENSIONS)]

    # process data
    if args.shuffle:
        data = data.sample(frac=1).reset_index(drop=True)  # shuffle
    if args.head is not None:
        data = data.head(args.head)

    # train columns
    # drop every column that is not listed in TRAIN_COLUMNS
    if args.train_column:
        all_columns = data.columns
        columns_to_drop = all_columns.difference(TRAIN_COLUMNS)
        data = data.drop(columns=columns_to_drop)

    print(f"Filtered number of samples: {len(data)}.")

    # shard data
    if args.shard is not None:
        sharded_data = np.array_split(data, args.shard)
        for i in range(args.shard):
            # insert "_<i>" in front of the final extension: name.csv -> name_<i>.csv
            output_path_part = output_path.split(".")
            output_path_s = ".".join(output_path_part[:-1]) + f"_{i}." + output_path_part[-1]
            save_file(sharded_data[i], output_path_s)
            print(f"Saved {len(sharded_data[i])} samples to {output_path_s}.")
    else:
        save_file(data, output_path)
        print(f"Saved {len(data)} samples to {output_path}.")


def parse_args():
    """Declare the dataset manager's command-line options and parse ``sys.argv``.

    Returns:
        argparse.Namespace: switches default to ``False``; valued options default to
        ``None`` except ``--format`` ("csv") and ``--seed`` (42).
    """
    parser = argparse.ArgumentParser()

    def flag(name, doc):
        # boolean switch: False unless present on the command line
        parser.add_argument(name, action="store_true", help=doc)

    def option(name, kind, doc, **extra):
        # valued option that stays None when omitted
        parser.add_argument(name, type=kind, default=None, help=doc, **extra)

    parser.add_argument("input", type=str, nargs="+", help="path to the input dataset")
    option("--output", str, "output path")
    parser.add_argument("--format", type=str, default="csv", help="output format", choices=["csv", "parquet"])
    flag("--disable-parallel", "disable parallel processing")
    option("--num-workers", int, "number of workers")
    parser.add_argument("--seed", type=int, default=42, help="random seed")

    # special case
    option("--shard", int, "shard the dataset")
    option("--sort", str, "sort by column")
    option("--sort-ascending", str, "sort by column (ascending order)")
    option("--difference", str, "get difference from the dataset")
    option("--intersection", str, "keep the paths in csv from the dataset and merge columns")
    flag("--train-column", "only keep the train column")

    # IO-related
    flag("--info", "get the basic information of each video and image")
    flag("--video-info", "get the basic information of each video")
    flag("--ext", "check if the file exists")
    option("--load-caption", str, "load the caption from json or txt", choices=["json", "txt"])

    # path processing
    option("--relpath", str, "modify the path to relative path by root given")
    option("--abspath", str, "modify the path to absolute path by root given")
    flag("--path-to-id", "add id based on path")
    option("--path-subset", str, "extract a subset data containing the given `path-subset` value")
    # empty paths are caused by transform, which cannot read some paths
    flag("--remove-empty-path", "remove rows with empty path")

    # caption filtering
    flag("--remove-empty-caption", "remove rows with empty caption")
    flag("--remove-url", "remove rows with url in caption")
    option("--lang", str, "remove rows with other language")
    flag("--remove-path-duplication", "remove rows with duplicated path")
    flag("--remove-text-duplication", "remove rows with duplicated caption")

    # caption processing
    flag("--refine-llm-caption", "modify the caption generated by LLM")
    flag("--clean-caption", "modify the caption according to T5 pipeline to suit training")
    flag("--merge-cmotion", "merge the camera motion to the caption")
    option("--count-num-token", str, "Count the number of tokens in the caption", choices=["t5"])
    option("--append-text", str, "append text to the caption")
    flag("--score-to-text", "convert score to text")
    option("--update-text", str, "update the text with the given text")

    # score filtering
    flag("--filesize", "get the filesize of each video and image in MB")
    option("--fsmax", int, "filter the dataset by maximum filesize")
    option("--fmin", int, "filter the dataset by minimum number of frames")
    option("--fmax", int, "filter the dataset by maximum number of frames")
    option("--hwmax", int, "filter the dataset by maximum resolution")
    option("--aesmin", float, "filter the dataset by minimum aes score")
    option("--matchmin", float, "filter the dataset by minimum match score")
    option("--flowmin", float, "filter the dataset by minimum flow score")
    option("--fpsmax", float, "filter the dataset by maximum fps")
    flag("--img-only", "only keep the image data")
    flag("--vid-only", "only keep the video data")

    # data processing
    flag("--shuffle", "shuffle the dataset")
    option("--head", int, "return the first n rows of data")

    return parser.parse_args()


def get_output_path(args, input_name):
    """Return where the processed table is written.

    An explicit ``args.output`` is returned as-is. Otherwise the file is placed next
    to the first input and named ``input_name`` followed by one tag per requested
    operation, with ``args.format`` as the extension.

    Args:
        args (argparse.Namespace): parsed command-line options.
        input_name (str): base name derived from the input files.

    Returns:
        str: the output path.

    Raises:
        AssertionError: if both ``--sort`` and ``--sort-ascending`` are given.
    """
    if args.output is not None:
        return args.output

    dir_path = os.path.dirname(args.input[0])
    tags = [input_name]

    # sort: the two directions are mutually exclusive and share one tag
    sort_desc = args.sort is not None
    sort_asc = args.sort_ascending is not None
    assert not (sort_desc and sort_asc)
    if sort_desc or sort_asc:
        tags.append("_sort")

    # IO-related
    # for IO-related, the function must be wrapped in try-except
    for attr, tag in (("info", "_info"), ("video_info", "_vinfo"), ("ext", "_ext")):
        if getattr(args, attr):
            tags.append(tag)
    if args.load_caption:
        tags.append(f"_load{args.load_caption}")

    # path processing
    for attr, tag in (("relpath", "_relpath"), ("abspath", "_abspath")):
        if getattr(args, attr) is not None:
            tags.append(tag)
    if args.remove_empty_path:
        tags.append("_noemptypath")

    # caption filtering
    for attr, tag in (("remove_empty_caption", "_noempty"), ("remove_url", "_nourl")):
        if getattr(args, attr):
            tags.append(tag)
    if args.lang is not None:
        tags.append(f"_{args.lang}")
    for attr, tag in (
        ("remove_path_duplication", "_noduppath"),
        ("remove_text_duplication", "_noduptext"),
        ("path_subset", "_subset"),
    ):
        if getattr(args, attr):
            tags.append(tag)

    # caption processing
    for attr, tag in (
        ("refine_llm_caption", "_llm"),
        ("clean_caption", "_clean"),
        ("merge_cmotion", "_cmcaption"),
        ("count_num_token", "_ntoken"),
    ):
        if getattr(args, attr):
            tags.append(tag)
    if args.append_text is not None:
        tags.append("_appendtext")
    if args.score_to_text:
        tags.append("_score2text")
    if args.update_text is not None:
        tags.append("_update")

    # score filtering: each threshold tag is "_<option><value>"
    if args.filesize:
        tags.append("_filesize")
    for key in ("fsmax", "fmin", "fmax", "fpsmax", "hwmax", "aesmin", "matchmin", "flowmin"):
        threshold = getattr(args, key)
        if threshold is not None:
            tags.append(f"_{key}{threshold}")
    if args.img_only:
        tags.append("_img")
    if args.vid_only:
        tags.append("_vid")

    # processing
    if args.shuffle:
        tags.append(f"_shuffled_seed{args.seed}")
    if args.head is not None:
        tags.append(f"_first_{args.head}_data")

    name = "".join(tags)
    return os.path.join(dir_path, f"{name}.{args.format}")


if __name__ == "__main__":
    # Script entry point: parse options, configure parallelism and seeding, run main().
    args = parse_args()
    if args.disable_parallel:
        PANDA_USE_PARALLEL = False
    if PANDA_USE_PARALLEL:
        # only pass nb_workers when the user asked for a specific worker count
        init_kwargs = {"progress_bar": True}
        if args.num_workers is not None:
            init_kwargs["nb_workers"] = args.num_workers
        pandarallel.initialize(**init_kwargs)
    if args.seed is not None:
        # seed both RNGs used by the script
        for seeder in (random.seed, np.random.seed):
            seeder(args.seed)
    main(args)


================================================
FILE: Open-Sora/tools/datasets/filter_panda10m.py
================================================
# TODO: remove this file before releasing

import argparse
import html
import os
import re

import pandas as pd
from tqdm import tqdm

tqdm.pandas()

# pandarallel is an optional dependency: when it imports, it is initialised once here
# and apply() uses parallel_apply; otherwise apply() falls back to progress_apply.
try:
    from pandarallel import pandarallel

    pandarallel.initialize(progress_bar=True)
    pandas_has_parallel = True
except ImportError:
    pandas_has_parallel = False


def apply(df, func, **kwargs):
    """Apply ``func`` over ``df``, in parallel when pandarallel is available.

    Args:
        df: pandas object offering ``parallel_apply`` / ``progress_apply``.
        func: callable forwarded to the chosen apply method.
        **kwargs: forwarded unchanged (e.g. ``axis=1``).

    Returns:
        The result of the chosen apply method.
    """
    runner = df.parallel_apply if pandas_has_parallel else df.progress_apply
    return runner(func, **kwargs)


def basic_clean(text):
    """Repair the text with ``ftfy``, unescape HTML entities twice and strip whitespace.

    Args:
        text (str): text to clean.

    Returns:
        str: the cleaned text.
    """
    import ftfy

    repaired = ftfy.fix_text(text)
    # unescape twice so doubly-escaped entities are resolved as well
    unescaped = html.unescape(html.unescape(repaired))
    return unescaped.strip()


# One or more consecutive "bad" punctuation / symbol characters: # ® • © ™ & @ · º ½ ¾ ¿ ¡ § ~
# ) ( ] [ } { | \ / *
# Written as a single raw string: the former concatenation of non-raw pieces such as "\)"
# relied on invalid escape sequences, which Python 3.12+ reports as a SyntaxWarning.
# The compiled pattern is unchanged.
BAD_PUNCT_REGEX = re.compile(r"[#®•©™&@·º½¾¿¡§~\)\(\]\[\}\{\|\\/\*]{1,}")


def clean_caption(caption):
    """Normalise a caption through a fixed sequence of regex substitutions.

    The caption is converted to ``str``, URL-unquoted, stripped and lower-cased; then
    URLs, HTML markup, @nicknames, CJK character ranges, IP addresses, ids, file names,
    bad punctuation and several boilerplate phrases are removed or replaced, dashes and
    quotes are unified, and whitespace is collapsed. The order of the steps matters:
    later patterns run on the output of earlier ones.

    Args:
        caption: the raw caption; any object accepted by ``str()``.

    Returns:
        str: the cleaned caption with leading/trailing whitespace stripped.
    """
    import urllib.parse as ul

    from bs4 import BeautifulSoup

    caption = str(caption)
    caption = ul.unquote_plus(caption)
    caption = caption.strip().lower()
    caption = re.sub("<person>", "person", caption)
    # urls:
    caption = re.sub(
        r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    caption = re.sub(
        r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
        "",
        caption,
    )  # regex for urls
    # html:
    caption = BeautifulSoup(caption, features="html.parser").text

    # @<nickname>
    caption = re.sub(r"@[\w\d]+\b", "", caption)

    # 31C0—31EF CJK Strokes
    # 31F0—31FF Katakana Phonetic Extensions
    # 3200—32FF Enclosed CJK Letters and Months
    # 3300—33FF CJK Compatibility
    # 3400—4DBF CJK Unified Ideographs Extension A
    # 4DC0—4DFF Yijing Hexagram Symbols
    # 4E00—9FFF CJK Unified Ideographs
    caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
    caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
    caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
    caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
    caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
    caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
    caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
    #######################################################

    # all types of dash --> "-"
    caption = re.sub(
        r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
        "-",
        caption,
    )

    # normalise quotes to one standard
    caption = re.sub(r"[`´«»“”¨]", '"', caption)
    caption = re.sub(r"[‘’]", "'", caption)

    # &quot;
    caption = re.sub(r"&quot;?", "", caption)
    # &amp
    caption = re.sub(r"&amp", "", caption)

    # ip addresses:
    caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

    # article ids:
    caption = re.sub(r"\d:\d\d\s+$", "", caption)

    # literal backslash-n sequences (the two characters "\" and "n"), not real newlines
    caption = re.sub(r"\\n", " ", caption)

    # "#123"
    caption = re.sub(r"#\d{1,3}\b", "", caption)
    # "#12345.."
    caption = re.sub(r"#\d{5,}\b", "", caption)
    # "123456.."
    caption = re.sub(r"\b\d{6,}\b", "", caption)
    # filenames:
    caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

    # collapse repeated quotes and runs of dots
    caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
    caption = re.sub(r"[\.]{2,}", r" ", caption)  # "..." -> " "

    caption = re.sub(BAD_PUNCT_REGEX, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
    caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

    # this-is-my-cute-cat / this_is_my_cute_cat
    # only split on "-" / "_" when the caption contains more than three of them
    regex2 = re.compile(r"(?:\-|\_)")
    if len(re.findall(regex2, caption)) > 3:
        caption = re.sub(regex2, " ", caption)

    caption = basic_clean(caption)

    caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
    caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
    caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

    # boilerplate phrases
    caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
    caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
    caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
    caption = re.sub(r"\bpage\s+\d+\b", "", caption)

    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

    # dimensions such as 1920x1080
    caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

    caption = re.sub(r"\b\s+\:\s+", r": ", caption)
    caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
    caption = re.sub(r"\s+", " ", caption)

    # NOTE(review): the result of this strip() is discarded, so the ^/$-anchored patterns
    # below still see any leading/trailing space. Left as-is because changing it would
    # change the cleaned text that other code compares against — confirm before fixing.
    caption.strip()

    caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
    caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
    caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
    caption = re.sub(r"^\.\S+$", "", caption)

    return caption.strip()


def get_10m_set():
    """Collect every cleaned caption of the Panda-70M 10M training split into a set.

    Reads the metadata CSV from a hard-coded path, cleans each caption listed in its
    ``caption`` column with ``clean_caption`` and gathers the results.

    Returns:
        set: all cleaned captions.
    """
    meta_path_10m = "/mnt/hdd/data/Panda-70M/raw/meta/train/panda70m_training_10m.csv"
    meta_10m = pd.read_csv(meta_path_10m)

    def process_single_caption(row):
        # NOTE(review): eval() executes whatever the CSV cell contains; only safe while
        # the metadata file is trusted.
        captions = eval(row["caption"])
        return str([clean_caption(text) for text in captions])

    ret = apply(meta_10m, process_single_caption, axis=1)
    print("==> text processed.")

    # each element of `ret` is the string form of a list of cleaned captions
    text_set = set()
    for serialized in ret:
        text_set.update(eval(serialized))

    print(f"==> Loaded meta_10m from '{meta_path_10m}'")
    return text_set


def filter_panda10m_text(meta_path, text_set):
    """Keep only the rows of a metadata CSV whose ``text`` is in ``text_set``.

    The filtered table is written next to the input as ``<name>_filter-10m<ext>``.

    Args:
        meta_path (str): path of the metadata CSV to filter.
        text_set: container of accepted captions (membership is tested with ``in``).
    """

    def process_single_row(row):
        return row["text"] in text_set

    meta = pd.read_csv(meta_path)
    keep_mask = apply(meta, process_single_row, axis=1)
    meta = meta[keep_mask]

    stem, suffix = os.path.splitext(meta_path)
    out_path = f"{stem}_filter-10m{suffix}"
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) saved to '{out_path}'.")


def filter_panda10m_timestamp(meta_path):
    """Keep only the rows whose (video id, timestamp) pair occurs in the 10M split.

    The video id is the file name of ``path`` up to its last underscore. The filtered
    table is written next to the input as ``<name>_filter-10m<ext>``.

    Args:
        meta_path (str): path of the metadata CSV to filter.
    """
    meta_path_10m = "/mnt/hdd/data/Panda-70M/raw/meta/train/panda70m_training_10m.csv"
    meta_10m = pd.read_csv(meta_path_10m)

    # videoID -> list of stringified timestamp tuples
    id2t = {}
    for _, entry in tqdm(meta_10m.iterrows(), total=len(meta_10m)):
        video_id = entry["videoID"]
        # NOTE(review): eval() executes whatever the CSV cell contains; only safe while
        # the metadata file is trusted.
        id2t[video_id] = [str(tuple(span)) for span in eval(entry["timestamp"])]

    print(f"==> Loaded meta_10m from '{meta_path_10m}'")

    def process_single_row(row):
        path = row["path"]
        t = row["timestamp"]
        fname = os.path.basename(path)
        video_id = fname[: fname.rindex("_")]
        # unknown video ids fall back to an empty tuple, i.e. never match
        return t in id2t.get(video_id, ())

    meta = pd.read_csv(meta_path)
    keep_mask = apply(meta, process_single_row, axis=1)
    meta = meta[keep_mask]

    stem, suffix = os.path.splitext(meta_path)
    out_path = f"{stem}_filter-10m{suffix}"
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) saved to '{out_path}'.")


def parse_args():
    """Parse the command line of the Panda-10M filter script.

    Returns:
        argparse.Namespace: ``meta_path`` (list of str, or None when omitted) and
        ``num_workers`` (int, default 5).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--meta_path", type=str, nargs="+")
    parser.add_argument("--num_workers", default=5, type=int)
    return parser.parse_args()


if __name__ == "__main__":
    # Script entry point: build the caption set once, then filter every given metadata file.
    args = parse_args()

    text_set = get_10m_set()
    for meta_file in args.meta_path:
        filter_panda10m_text(meta_file, text_set)


================================================
FILE: Open-Sora/tools/datasets/split.py
================================================
import argparse
from typing import List

import pandas as pd
from mmengine.config import Config

from opensora.datasets.bucket import Bucket


def split_by_bucket(
    bucket: Bucket,
    input_files: List[str],
    output_path: str,
    limit: int,
    frame_interval: int,
):
    """Select up to ``limit`` rows per bucket from the input CSVs and save them as one CSV.

    Rows are visited file by file in order; a row is kept when
    ``bucket.get_bucket_id`` assigns it a bucket that still has room. Scanning stops
    once ``len(bucket) * limit`` rows have been collected.

    Args:
        bucket: bucket definition providing ``ar_criteria``, ``get_bucket_id`` and ``len()``.
        input_files: CSV files with ``num_frames``, ``height`` and ``width`` columns.
        output_path: destination CSV.
        limit: maximum number of rows kept per bucket.
        frame_interval: forwarded to ``bucket.get_bucket_id``.
    """
    print(f"Split {len(input_files)} files into {len(bucket)} buckets")
    total_limit = len(bucket) * limit
    # one zeroed counter per (hw_id, t_id, ar_id) bucket
    bucket_cnt = {
        (hw_id, t_id, ar_id): 0
        for hw_id, per_time in bucket.ar_criteria.items()
        for t_id, per_ar in per_time.items()
        for ar_id in per_ar.keys()
    }
    output_df = None
    for path in input_files:
        df = pd.read_csv(path)
        if output_df is None:
            # the first file fixes the output columns
            output_df = pd.DataFrame(columns=df.columns)
        for i in range(len(df)):
            row = df.iloc[i]
            bucket_id = bucket.get_bucket_id(row["num_frames"], row["height"], row["width"], frame_interval)
            # skip rows that fit no bucket or whose bucket is already full
            if bucket_id is None or bucket_cnt[bucket_id] >= limit:
                continue
            bucket_cnt[bucket_id] += 1
            output_df = pd.concat([output_df, pd.DataFrame([row])], ignore_index=True)
            if len(output_df) >= total_limit:
                break
        if len(output_df) >= total_limit:
            break
    kept = len(output_df)
    assert kept <= total_limit
    if kept == total_limit:
        summary = f"All buckets are full ({total_limit} samples)"
    else:
        summary = f"Only {kept} files are used"
    print(summary)
    output_df.to_csv(output_path, index=False)


if __name__ == "__main__":
    # Script entry point: parse options, load the bucket config and split the inputs.
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str, nargs="+")
    parser.add_argument("-o", "--output", required=True)
    parser.add_argument("-c", "--config", required=True)
    parser.add_argument("-l", "--limit", default=200, type=int)
    args = parser.parse_args()
    assert args.limit > 0

    cfg = Config.fromfile(args.config)
    bucket_config = cfg.bucket_config
    # rewrite bucket_config: every positive first entry of a (p, bs) pair becomes 1.0
    for _, per_frames in bucket_config.items():
        for frames, (p, bs) in per_frames.items():
            per_frames[frames] = (1.0 if p > 0.0 else p, bs)
    bucket = Bucket(bucket_config)
    split_by_bucket(bucket, args.input, args.output, args.limit, cfg.dataset.frame_interval)


================================================
FILE: Open-Sora/tools/datasets/transform.py
================================================
import argparse
import os
import random

import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

from .utils import IMG_EXTENSIONS, extract_frames

tqdm.pandas()

# pandarallel is an optional dependency: when it imports, it is initialised once here
# and apply() uses parallel_apply; otherwise apply() falls back to progress_apply.
try:
    from pandarallel import pandarallel

    pandarallel.initialize(progress_bar=True)
    pandas_has_parallel = True
except ImportError:
    pandas_has_parallel = False


def apply(df, func, **kwargs):
    """Apply ``func`` over ``df``, in parallel when pandarallel is available.

    Args:
        df: pandas object offering ``parallel_apply`` / ``progress_apply``.
        func: callable forwarded to the chosen apply method.
        **kwargs: forwarded unchanged (e.g. ``axis=1``).

    Returns:
        The result of the chosen apply method.
    """
    runner = df.parallel_apply if pandas_has_parallel else df.progress_apply
    return runner(func, **kwargs)


def get_new_path(path, input_dir, output):
    """Mirror ``path`` (taken relative to ``input_dir``) under ``output``.

    The parent directory of the mirrored path is created if needed.

    Args:
        path (str): source file path.
        input_dir (str): root that ``path`` is made relative to.
        output (str): root of the mirrored tree.

    Returns:
        str: the mirrored path under ``output``.
    """
    relative = os.path.relpath(path, input_dir)
    path_new = os.path.join(output, relative)
    parent = os.path.dirname(path_new)
    os.makedirs(parent, exist_ok=True)
    return path_new


def resize(path, length, input_dir, output):
    """Copy an image into the output tree, downscaling it when both sides exceed ``length``.

    When ``min(h, w) > length`` the larger side is set to ``length`` and the other side
    is scaled proportionally (truncated to int); otherwise the image is written unchanged.

    Args:
        path (str): source image path; its extension must be in ``IMG_EXTENSIONS``.
        length (int): size threshold and target side length.
        input_dir (str): root that ``path`` is made relative to.
        output (str): root of the output tree.

    Returns:
        str: the written path, or ``""`` when the image could not be read.
    """
    path_new = get_new_path(path, input_dir, output)
    ext = os.path.splitext(path)[1].lower()
    assert ext in IMG_EXTENSIONS
    img = cv2.imread(path)
    if img is None:
        # unreadable image: signal it with an empty path
        return ""
    h, w = img.shape[:2]
    if min(h, w) > length:
        # cv2.resize takes the target size as (width, height)
        if h > w:
            target_size = (int(w * length / h), length)
        else:
            target_size = (length, int(h * length / w))
        img = cv2.resize(img, target_size)
    cv2.imwrite(path_new, img)
    return path_new


def rand_crop(path, input_dir, output):
    """Write one randomly chosen quadrant of an image into the output tree.

    The image array is halved along its first two axes and one of the four resulting
    quadrants is picked with ``random.randint(0, 3)``.

    Args:
        path (str): source image path; its extension must be in ``IMG_EXTENSIONS``.
        input_dir (str): root that ``path`` is made relative to.
        output (str): root of the output tree.

    Returns:
        str: the written path, or ``""`` when the image could not be read.
    """
    ext = os.path.splitext(path)[1].lower()
    path_new = get_new_path(path, input_dir, output)
    assert ext in IMG_EXTENSIONS
    img = cv2.imread(path)
    if img is None:
        # unreadable image: signal it with an empty path
        return ""
    axis0, axis1, _ = img.shape
    half0, half1 = axis0 // 2, axis1 // 2
    pos = random.randint(0, 3)
    # pos 0/2 take the first half of axis 0; pos 0/1 take the first half of axis 1
    sel0 = slice(None, half0) if pos in (0, 2) else slice(half0, None)
    sel1 = slice(None, half1) if pos in (0, 1) else slice(half1, None)
    img_cropped = img[sel0, sel1]
    cv2.imwrite(path_new, img_cropped)
    return path_new


def main(args):
    """Run the selected transform over every row of the input CSV and save a new CSV.

    ``img_rand_crop`` and ``img_resize`` rewrite the ``path`` column with the path of
    the transformed image. ``vid_frame_extract`` duplicates every row once per requested
    point, stores the point in a new ``point`` column and rewrites ``path`` with the
    result of ``extract_frames``. The output CSV is written next to the input with a
    method-specific suffix.

    Args:
        args (argparse.Namespace): options as produced by ``parse_args``.

    Raises:
        ValueError: if ``vid_frame_extract`` is requested without ``--points`` or
            ``--points_index``.
    """
    data = pd.read_csv(args.input)
    if args.method == "img_rand_crop":
        data["path"] = apply(data["path"], lambda x: rand_crop(x, args.input_dir, args.output))
        output_csv = args.input.replace(".csv", "_rand_crop.csv")
    elif args.method == "img_resize":
        data["path"] = apply(data["path"], lambda x: resize(x, args.length, args.input_dir, args.output))
        output_csv = args.input.replace(".csv", f"_resized{args.length}.csv")
    elif args.method == "vid_frame_extract":
        points = args.points if args.points is not None else args.points_index
        if not points:
            raise ValueError("vid_frame_extract requires --points or --points_index")
        num_points = len(points)
        # Repeat every row once per point. The repeat count must equal the stride used
        # below; it used to be hard-coded to 3, which mis-assigned points for any other count.
        data = pd.DataFrame(np.repeat(data.values, num_points, axis=0), columns=data.columns)
        data["point"] = np.nan
        for i, point in enumerate(points):
            if isinstance(point, int):
                # integer points are used as-is
                data.loc[i::num_points, "point"] = point
            else:
                # other points are scaled by the row's num_frames
                data.loc[i::num_points, "point"] = data.loc[i::num_points, "num_frames"] * point
        # NOTE(review): this passes (path, input_dir, output, point) positionally; confirm
        # that matches the signature of the extract_frames helper imported by this module.
        data["path"] = apply(data, lambda x: extract_frames(x["path"], args.input_dir, args.output, x["point"]), axis=1)
        output_csv = args.input.replace(".csv", "_vid_frame_extract.csv")

    data.to_csv(output_csv, index=False)
    print(f"Saved to {output_csv}")


def parse_args():
    """Parse the command line of the transform script.

    Returns:
        argparse.Namespace: the positionals ``method``, ``input``, ``input_dir`` and
        ``output`` plus the options ``disable_parallel``, ``length`` (default 2160),
        ``seed`` (default 42), ``points`` and ``points_index``.
    """
    parser = argparse.ArgumentParser()
    # positionals, in the order they must be given
    parser.add_argument("method", type=str, choices=["img_resize", "img_rand_crop", "vid_frame_extract"])
    for positional in ("input", "input_dir", "output"):
        parser.add_argument(positional, type=str)
    parser.add_argument("--disable-parallel", action="store_true")
    parser.add_argument("--length", type=int, default=2160)
    parser.add_argument("--seed", type=int, default=42, help="seed for random")
    # frame positions: floats via --points, ints via --points_index
    for option, caster in (("--points", float), ("--points_index", int)):
        parser.add_argument(option, nargs="+", type=caster, default=None)
    return parser.parse_args()


if __name__ == "__main__":
    # Script entry point: parse options, seed the RNG used by rand_crop, run main().
    args = parse_args()
    random.seed(args.seed)
    # --disable-parallel makes apply() use progress_apply even when pandarallel imported
    pandas_has_parallel = pandas_has_parallel and not args.disable_parallel
    main(args)


================================================
FILE: Open-Sora/tools/datasets/utils.py
================================================
import os

import cv2
import numpy as np
from PIL import Image

# Lower-case file extensions (with the leading dot) treated as images / videos.
# Kept as tuples so they can be used both with ``in`` and with ``str.endswith``.
IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")


def is_video(filename):
    """Tell whether ``filename`` ends with one of ``VID_EXTENSIONS`` (case-insensitive).

    Args:
        filename (str): file name or path.

    Returns:
        bool: True when the lower-cased extension is a known video extension.
    """
    _, ext = os.path.splitext(filename)
    return ext.lower() in VID_EXTENSIONS


def extract_frames(
    video_path,
    frame_inds=None,
    points=None,
    backend="opencv",
    return_length=False,
    num_frames=None,
):
    """
    Extract selected frames from a video as PIL images.

    Exactly one of ``frame_inds`` and ``points`` must be given. Indices at or beyond the
    total frame count are clamped to the last frame.

    Args:
        video_path (str): path to video
        frame_inds (List[int]): indices of frames to extract
        points (List[float]): values within [0, 1); multiply #frames to get frame indices
        backend (str): decoding backend, one of "av", "opencv" or "decord"
        return_length (bool): if True, also return the total number of frames
        num_frames (int): if given, used as the total frame count instead of querying the video
    Return:
        List[PIL.Image], or (List[PIL.Image], int) when return_length is True
    Raises:
        ValueError: if neither frame_inds nor points is given
    """
    assert backend in ["av", "opencv", "decord"]
    assert (frame_inds is None) or (points is None)
    if frame_inds is None and points is None:
        # Fail early with a clear message instead of "NoneType is not iterable" further down.
        raise ValueError("Either frame_inds or points must be provided.")

    if backend == "av":
        import av

        container = av.open(video_path)
        try:
            if num_frames is not None:
                total_frames = num_frames
            else:
                total_frames = container.streams.video[0].frames

            if points is not None:
                frame_inds = [int(p * total_frames) for p in points]

            frames = []
            for idx in frame_inds:
                if idx >= total_frames:
                    idx = total_frames - 1
                # Convert the frame index to a timestamp in av.time_base units, seek there and
                # decode the next frame.
                # NOTE(review): the seek presumably lands on a keyframe, so the decoded frame
                # may not be exactly frame idx — confirm.
                target_timestamp = int(idx * av.time_base / container.streams.video[0].average_rate)
                container.seek(target_timestamp)
                frame = next(container.decode(video=0)).to_image()
                frames.append(frame)
        finally:
            # Always release the demuxer/decoder, even when seeking or decoding fails.
            container.close()

        if return_length:
            return frames, total_frames
        return frames

    elif backend == "decord":
        import decord

        container = decord.VideoReader(video_path, num_threads=1)
        if num_frames is not None:
            total_frames = num_frames
        else:
            total_frames = len(container)

        if points is not None:
            frame_inds = [int(p * total_frames) for p in points]

        frame_inds = np.array(frame_inds).astype(np.int32)
        frame_inds[frame_inds >= total_frames] = total_frames - 1
        frames = container.get_batch(frame_inds).asnumpy()  # [N, H, W, C]
        frames = [Image.fromarray(x) for x in frames]

        if return_length:
            return frames, total_frames
        return frames

    elif backend == "opencv":
        cap = cv2.VideoCapture(video_path)
        try:
            if num_frames is not None:
                total_frames = num_frames
            else:
                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            if points is not None:
                frame_inds = [int(p * total_frames) for p in points]

            frames = []
            for idx in frame_inds:
                if idx >= total_frames:
                    idx = total_frames - 1

                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)

                # HACK: sometimes OpenCV fails to read frames, return a black frame instead
                try:
                    _, frame = cap.read()
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame = Image.fromarray(frame)
                except Exception as e:
                    print(f"[Warning] Error reading frame {idx} from {video_path}: {e}")
                    # First, try to read the first frame
                    try:
                        print("[Warning] Try reading first frame.")
                        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
                        _, frame = cap.read()
                        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        frame = Image.fromarray(frame)
                    # If that fails, return a black frame
                    except Exception as e:
                        print(f"[Warning] Error in reading first frame from {video_path}: {e}")
                        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                        frame = Image.new("RGB", (width, height), (0, 0, 0))

                # HACK: if height or width is 0, return a black frame instead
                if frame.height == 0 or frame.width == 0:
                    height = width = 256
                    frame = Image.new("RGB", (width, height), (0, 0, 0))

                frames.append(frame)
        finally:
            # Always free the capture handle, even when reading fails.
            cap.release()

        if return_length:
            return frames, total_frames
        return frames
    else:
        # Unreachable while the backend assert above is active; kept as a safeguard.
        raise ValueError(f"Unsupported backend: {backend}")


================================================
FILE: Open-Sora/tools/frame_interpolation/README.md
================================================
# Frame Interpolation

In the current version, we sample 1 out of every 3 frames of the video. Although we plan to use a VAE to avoid this frame loss, for now we provide a frame interpolation tool to interpolate the video. The frame interpolation tool is based on [AMT](https://github.com/MCG-NKU/AMT).

Interpolation can be useful for scenery videos, but it may not be suitable for videos with fast motion.

## Requirement

Install the required dependencies by following the "Data Dependencies" and "Frame Interpolation" sections of our [installation instructions](../../docs/installation.md).

<!-- ```bash
conda install -c conda-forge opencv
pip install imageio
``` -->

## Model

We use **AMT** as our frame interpolation model. After sampling, you can use the frame interpolation model to interpolate your video smoothly.

## Usage

The checkpoint file will be downloaded automatically to the user's `.cache` directory. You can apply frame interpolation to a single video file or to a folder of videos.

1. Process a video file

```bash
python -m tools.frame_interpolation.interpolation your_video.mp4
```

2. Process all video files in the target directory

```bash
python -m tools.frame_interpolation.interpolation your_video_dir --output_path samples/interpolation
```

The output video will be stored in `output_path`, and its duration equals `the total number of frames after frame interpolation / the frame rate`.

### Command Line Arguments

* `input`: Path of the input video. **Video path** or **Folder path(with --folder)**
* `--ckpt`: Pretrained model of [AMT](https://github.com/MCG-NKU/AMT). Default path: `./pretrained_models/amt-g.pth`.
* `--niters`: Iterations of interpolation. With $m$ input frames, `--niters` $=n$ produces $2^n\times (m-1)+1$ output frames.
* `--fps`: Frame rate of the input video. (Default: 8)
* `--output_path`: **Folder Path** of the output video.


================================================
FILE: Open-Sora/tools/frame_interpolation/__init__.py
================================================


================================================
FILE: Open-Sora/tools/frame_interpolation/interpolation.py
================================================
# this script is modified from https://github.com/MCG-NKU/AMT/blob/main/demos/demo_2x.py
import argparse
import os
import os.path as osp

import cv2
import numpy as np
import torch

from opensora.utils.ckpt_utils import download_model

from .networks.amt_g import Model
from .utils.utils import InputPadder, img2tensor, tensor2img

# Base URL used for the checkpoint download; the HF_ENDPOINT environment variable overrides it.
hf_endpoint = os.environ.get("HF_ENDPOINT", "https://huggingface.co")
# Lower-case file extensions accepted as video inputs.
VID_EXT = [".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".webm"]
# Keyword arguments forwarded to the interpolation network's constructor.
network_cfg = {"params": dict(corr_radius=3, corr_lvls=4, num_flows=5)}
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


def init():
    """
    select the anchor constants used to derive the resize scale for the current device.

    returns:
        anchor_resolution, anchor_memory, anchor_memory_bias, vram_avail
    """
    if device != "cuda":
        # Do not resize in cpu mode
        return 8192 * 8192, 1, 0, 1

    mib = 1024**2
    vram_avail = torch.cuda.get_device_properties(device).total_memory
    print("VRAM available: {:.1f} MB".format(vram_avail / mib))
    return 1024 * 512, 1500 * mib, 2500 * mib, vram_avail


def get_input_video_from_path(input_path):
    """
    Read every frame of a video as a padded tensor and compute the resize scale.

    params:
        input_path: str, the path of the input video.
    returns:
        inputs: list, the list of the input frames.
        scale: float, the scale of the input frames.
        padder: InputPadder, the padder to pad the input frames.
    raises:
        TypeError: if the extension of input_path is not listed in VID_EXT.
        IOError: if the video cannot be opened.
    """

    anchor_resolution, anchor_memory, anchor_memory_bias, vram_avail = init()

    if osp.splitext(input_path)[-1].lower() not in VID_EXT:
        raise TypeError("Input should be a video.")

    vcap = cv2.VideoCapture(input_path)
    try:
        if not vcap.isOpened():
            # Otherwise width/height read as 0 and the scale computation below divides by zero.
            raise IOError(f"Cannot open video: {input_path}")

        w = int(vcap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(vcap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Scale shrinks with the frame area and grows with the memory left after the bias.
        scale = anchor_resolution / (h * w) * np.sqrt((vram_avail - anchor_memory_bias) / anchor_memory)
        # Never upscale.
        scale = 1 if scale > 1 else scale
        # Quantize: scale becomes 16 / k with k = floor(16 / sqrt(scale)), so 16 / scale is an integer.
        scale = 1 / np.floor(1 / np.sqrt(scale) * 16) * 16
        if scale < 1:
            print(f"Due to the limited VRAM, the video will be scaled by {scale:.2f}")
        padding = int(16 / scale)
        padder = InputPadder((h, w), padding)

        inputs = []
        while True:
            ret, frame = vcap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_t = img2tensor(frame).to(device)
            frame_t = padder.pad(frame_t)
            inputs.append(frame_t)
    finally:
        # Always free the capture handle, even when decoding or padding fails.
        vcap.release()

    print(f"Loading the [video] from {input_path}, the number of frames [{len(inputs)}]")

    return inputs, scale, padder


def load_model(ckpt):
    """
    build the frame interpolation model and load its weights.

    params:
        ckpt: mapping whose "state_dict" entry holds the model weights.
    returns:
        the model, moved to `device` and switched to eval mode.
    """
    net = Model(**network_cfg.get("params", {}))
    net.load_state_dict(ckpt["state_dict"])
    return net.to(device).eval()


def interpolater(model, inputs, scale, padder, iters=1):
    """
    interpolating with the interpolation model.

    Each iteration inserts one predicted frame between every pair of neighbouring frames,
    turning m frames into 2 * m - 1 frames.

    params:
        model: nn.Module, the frame interpolation model.
        inputs: list, the list of the input frames.
        scale: float, the scale of the input frames.
        padder: the padder that padded the input frames; its `unpad` is applied to the result.
        iters: int, the number of iterations of interpolation. The final frames model generating is 2 ** iters * (m - 1) + 1 and m is input frames.
    returns:
        outputs: list, the list of the output frames.
    raises:
        ValueError: if `inputs` is empty.
    """
    if len(inputs) == 0:
        # Fail with a clear message instead of an IndexError on inputs[0] below.
        raise ValueError("No input frames to interpolate.")

    print("Start frame interpolation:")
    # Time embedding of 1/2, passed to the model as `embt` for every frame pair.
    embt = torch.tensor(1 / 2).float().view(1, 1, 1, 1).to(device)

    # With iters <= 0 the loop never runs and the input frames pass through unchanged.
    outputs = inputs
    for i in range(iters):
        print(f"Iter {i+1}. input_frames={len(inputs)} output_frames={2*len(inputs)-1}")
        outputs = [inputs[0]]
        for in_0, in_1 in zip(inputs[:-1], inputs[1:]):
            in_0 = in_0.to(device)
            in_1 = in_1.to(device)
            with torch.no_grad():
                imgt_pred = model(in_0, in_1, embt, scale_factor=scale, eval=True)["imgt_pred"]
            # Move results to the CPU so the growing frame list does not stay on the device.
            outputs += [imgt_pred.cpu(), in_1.cpu()]
        inputs = outputs

    outputs = padder.unpad(*outputs)
    return outputs


def write(outputs, input_path, output_path, fps=30):
    """
    write results to the output_path.

    The video is saved as `{output_path}/fps{fps}_{input file stem}.mp4`.

    params:
        outputs: sequence of frame tensors; each is converted with `tensor2img` before writing.
        input_path: str, path of the source video; only its base name is used.
        output_path: str, folder that receives the video (created if missing).
        fps: frame rate of the written video.
    """

    # exist_ok avoids the check-then-create race of a separate existence test.
    os.makedirs(output_path, exist_ok=True)

    # Last two dimensions of a frame, reversed.
    # NOTE(review): assumes frames are (N, C, H, W) so this yields (W, H) — confirm.
    size = outputs[0].shape[2:][::-1]

    file_name, _ = osp.splitext(osp.basename(input_path))

    save_video_path = f"{output_path}/fps{fps}_{file_name}.mp4"
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(save_video_path, fourcc, fps, size)

    try:
        for imgt_pred in outputs:
            imgt_pred = tensor2img(imgt_pred)
            imgt_pred = cv2.cvtColor(imgt_pred, cv2.COLOR_RGB2BGR)
            writer.write(imgt_pred)
        print(f"Demo video is saved to [{save_video_path}]")
    finally:
        # Release the writer even if a frame conversion fails, so the file handle is not leaked.
        writer.release()


def process(model, image_path, output_path, fps, iters):
    """
    interpolate one video end to end: load its frames, run the model, write the result.
    """
    frames, scale, padder = get_input_video_from_path(image_path)
    interpolated = interpolater(model, frames, scale, padder, iters)
    write(interpolated, image_path, output_path, fps)


def parse_args():
    """
    parse the command-line arguments and resolve the derived settings.

    After parsing:
        - `fps` is multiplied by 2 ** niters, so it holds the output frame rate;
        - `~` is expanded in `input` and `ckpt`;
        - `folder` is recomputed: True when the extension of `input` is not in VID_EXT;
        - `ckpt` is replaced by the return value of `download_model`.

    returns:
        args: argparse.Namespace with the fields described above.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Input video.")
    parser.add_argument("--ckpt", type=str, default="./pretrained_models/amt-g.pth", help="The pretrained model.")
    parser.add_argument(
        "--niters",
        type=int,
        default=1,
        help="Number of interpolation iterations. Each iteration roughly doubles the number of frames.",
    )
    parser.add_argument("--output_path", type=str, default="samples", help="Output path.")
    parser.add_argument(
        "--fps",
        type=int,
        default=8,
        help="Frame rate of the input video. The output frame rate is fps * 2 ** niters.",
    )
    parser.add_argument(
        "--folder",
        action="store_true",
        help="If the input is a folder, set this flag. (Currently detected automatically from the input extension.)",
    )
    args = parser.parse_args()

    if args.niters < 0:
        # A negative count would give a fractional frame-rate multiplier and zero iterations.
        parser.error("--niters must be non-negative.")

    times_frame = 2**args.niters
    old_fps = args.fps
    args.fps = args.fps * times_frame
    print(f"Interpolation will turn {old_fps}fps video to {args.fps}fps video.")
    args.input = os.path.expanduser(args.input)
    args.ckpt = os.path.expanduser(args.ckpt)
    # Folder mode is derived from the input's extension; this overrides the --folder flag.
    args.folder = osp.splitext(args.input)[-1].lower() not in VID_EXT
    args.ckpt = download_model(local_path=args.ckpt, url=hf_endpoint + "/lalala125/AMT/resolve/main/amt-g.pth")
    return args


if __name__ == "__main__":
    # Script entry point: interpolate a single video, or every video inside a folder.
    args = parse_args()
    input_path, output_path = args.input, args.output_path
    iters, fps = int(args.niters), int(args.fps)

    model = load_model(args.ckpt)

    if args.folder:
        # Only direct children of the folder whose extension is in VID_EXT are processed.
        video_paths = [
            os.path.join(input_path, entry)
            for entry in os.listdir(input_path)
            if osp.splitext(entry)[-1].lower() in VID_EXT
        ]
    else:
        video_paths = [input_path]

    for vid_path in video_paths:
        process(model, vid_path, output_path, fps, iters)

    print("Interpolation is done.")
    print(f"Output path: {output_path}")


================================================
FILE: Open-Sora/tools/frame_interpolation/networks/__init__.py
================================================
from .amt_g import Model


================================================
FILE: Open-Sora/tools/frame_interpolation/networks/amt_g.py
================================================
import torch
import torch.nn as nn

from .blocks.feat_enc import LargeEncoder
from .blocks.ifrnet import Encoder, InitDecoder, IntermediateDecoder, resize
from .blocks.multi_flow import MultiFlowDecoder, multi_flow_combine
from .blocks.raft import BasicUpdateBlock, BidirCorrBlock, coords_grid


class Model(nn.Module):
    """
    Frame interpolation network that predicts an intermediate frame from two input frames.

    It combines a correlation feature encoder (`feat_encoder`), a four-level pyramid encoder
    (`encoder`), three coarse-to-fine decoders (`decoder4`..`decoder2`) refined by
    correlation-based update blocks, a final multi-flow decoder (`decoder1`), and a small
    convolutional block (`comb_block`) used when merging the multiple predictions.

    Args:
        corr_radius: radius passed to the correlation block and the update blocks.
        corr_lvls: number of correlation levels passed to the correlation block and the update blocks.
        num_flows: number of flows produced by the final decoder; also sizes `comb_block`.
        channels: channel widths of the four pyramid levels.
        skip_channels: skip-channel width passed to every decoder.
    """

    def __init__(self, corr_radius=3, corr_lvls=4, num_flows=5, channels=[84, 96, 112, 128], skip_channels=84):
        super(Model, self).__init__()
        self.radius = corr_radius
        self.corr_levels = corr_lvls
        self.num_flows = num_flows

        self.feat_encoder = LargeEncoder(output_dim=128, norm_fn="instance", dropout=0.0)
        self.encoder = Encoder(channels, large=True)
        self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels)
        self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels)
        self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels)
        self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows)

        # NOTE(review): the cdim values below are hard-coded and equal the default
        # channels[2], channels[1], channels[0]; presumably they must be kept in sync
        # with a custom `channels` — confirm.
        self.update4 = self._get_updateblock(112, None)
        self.update3_low = self._get_updateblock(96, 2.0)
        self.update2_low = self._get_updateblock(84, 4.0)

        self.update3_high = self._get_updateblock(96, None)
        self.update2_high = self._get_updateblock(84, None)

        # Maps 3 * num_flows input channels to a 3-channel output; passed to multi_flow_combine.
        self.comb_block = nn.Sequential(
            nn.Conv2d(3 * self.num_flows, 6 * self.num_flows, 7, 1, 3),
            nn.PReLU(6 * self.num_flows),
            nn.Conv2d(6 * self.num_flows, 3, 7, 1, 3),
        )

    def _get_updateblock(self, cdim, scale_factor=None):
        """Build a BasicUpdateBlock for `cdim` content channels using the model's correlation settings."""
        return BasicUpdateBlock(
            cdim=cdim,
            hidden_dim=192,
            flow_dim=64,
            corr_dim=256,
            corr_dim2=192,
            fc_dim=188,
            scale_factor=scale_factor,
            corr_levels=self.corr_levels,
            radius=self.radius,
        )

    def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1):
        """
        Look up correlation features at coordinates displaced by the rescaled flows.

        When `downsample` is not 1, both flows are first resized by 1 / downsample and
        their values multiplied by the same factor.

        Returns:
            (corr, flow): the two correlation lookups concatenated along dim 1, and the
            two (possibly downsampled) flows concatenated along dim 1.
        """
        # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0
        # based on linear assumption
        t1_scale = 1.0 / embt
        t0_scale = 1.0 / (1.0 - embt)
        if downsample != 1:
            inv = 1 / downsample
            flow0 = inv * resize(flow0, scale_factor=inv)
            flow1 = inv * resize(flow1, scale_factor=inv)

        corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale)
        corr = torch.cat([corr0, corr1], dim=1)
        flow = torch.cat([flow0, flow1], dim=1)
        return corr, flow

    def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs):
        """
        Predict the intermediate frame between `img0` and `img1`.

        Args:
            img0, img1: the two input frames as 4-D tensors (unpacked below as b, _, h, w).
            embt: time embedding; it is used as a divisor (1 / embt and 1 / (1 - embt)),
                so values of exactly 0 or 1 would divide by zero.
            scale_factor: the inputs are resized by this factor before encoding, and the
                final flows, mask and residual are resized back by its inverse.
            eval: if True, return only the predicted frame.
            **kwargs: accepted and ignored.

        Returns:
            dict with "imgt_pred" (clamped to [0, 1]); when `eval` is False it also holds
            "flow0_pred", "flow1_pred" and "ft_pred" with the per-level flows and features.
        """
        # Subtract the per-sample mean taken over both frames, all channels and all pixels.
        mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True)
        img0 = img0 - mean_
        img1 = img1 - mean_
        img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0
        img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1
        b, _, h, w = img0_.shape
        # Coordinate grid at 1/8 of the (resized) input resolution, matching the correlation features.
        coord = coords_grid(b, h // 8, w // 8, img0.device)

        fmap0, fmap1 = self.feat_encoder([img0_, img1_])  # [1, 128, H//8, W//8]
        corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels)

        # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4]
        # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16]
        f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
        f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)

        ######################################### the 4th decoder #########################################
        up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt)
        corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord, up_flow0_4, up_flow1_4, embt, downsample=1)

        # residue update with lookup corr
        delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4)
        delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1)
        up_flow0_4 = up_flow0_4 + delta_flow0_4
        up_flow1_4 = up_flow1_4 + delta_flow1_4
        ft_3_ = ft_3_ + delta_ft_3_

        ######################################### the 3rd decoder #########################################
        up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
        corr_3, flow_3 = self._corr_scale_lookup(corr_fn, coord, up_flow0_3, up_flow1_3, embt, downsample=2)

        # residue update with lookup corr
        delta_ft_2_, delta_flow_3 = self.update3_low(ft_2_, flow_3, corr_3)
        delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1)
        up_flow0_3 = up_flow0_3 + delta_flow0_3
        up_flow1_3 = up_flow1_3 + delta_flow1_3
        ft_2_ = ft_2_ + delta_ft_2_

        # residue update with lookup corr (hr)
        corr_3 = resize(corr_3, scale_factor=2.0)
        up_flow_3 = torch.cat([up_flow0_3, up_flow1_3], dim=1)
        delta_ft_2_, delta_up_flow_3 = self.update3_high(ft_2_, up_flow_3, corr_3)
        # The updates below are in-place on tensors created above.
        ft_2_ += delta_ft_2_
        up_flow0_3 += delta_up_flow_3[:, 0:2]
        up_flow1_3 += delta_up_flow_3[:, 2:4]

        ######################################### the 2nd decoder #########################################
        up_flow0_2, up_flow1_2, ft_1_ = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
        corr_2, flow_2 = self._corr_scale_lookup(corr_fn, coord, up_flow0_2, up_flow1_2, embt, downsample=4)

        # residue update with lookup corr
        delta_ft_1_, delta_flow_2 = self.update2_low(ft_1_, flow_2, corr_2)
        delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1)
        up_flow0_2 = up_flow0_2 + delta_flow0_2
        up_flow1_2 = up_flow1_2 + delta_flow1_2
        ft_1_ = ft_1_ + delta_ft_1_

        # residue update with lookup corr (hr)
        corr_2 = resize(corr_2, scale_factor=4.0)
        up_flow_2 = torch.cat([up_flow0_2, up_flow1_2], dim=1)
        delta_ft_1_, delta_up_flow_2 = self.update2_high(ft_1_, up_flow_2, corr_2)
        # The updates below are in-place on tensors created above.
        ft_1_ += delta_ft_1_
        up_flow0_2 += delta_up_flow_2[:, 0:2]
        up_flow1_2 += delta_up_flow_2[:, 2:4]

        ######################################### the 1st decoder #########################################
        up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)

        if scale_factor != 1.0:
            # Undo the input resize: flows are resized and their values rescaled by 1 / scale_factor;
            # mask and residual are only resized.
            up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0 / scale_factor)) * (1.0 / scale_factor)
            up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0 / scale_factor)) * (1.0 / scale_factor)
            mask = resize(mask, scale_factor=(1.0 / scale_factor))
            img_res = resize(img_res, scale_factor=(1.0 / scale_factor))

        # Merge multiple predictions
        imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, up_flow1_1, mask, img_res, mean_)
        imgt_pred = torch.clamp(imgt_pred, 0, 1)

        if eval:
            return {
                "imgt_pred": imgt_pred,
            }
        else:
            # NOTE(review): h and w come from the resized inputs, while the flows were resized
            # back above when scale_factor != 1.0; presumably this reshape only holds for
            # scale_factor == 1.0 — confirm.
            up_flow0_1 = up_flow0_1.reshape(b, self.num_flows, 2, h, w)
            up_flow1_1 = up_flow1_1.reshape(b, self.num_flows, 2, h, w)
            return {
                "imgt_pred": imgt_pred,
                "flow0_pred": [up_flow0_1, up_flow0_2, up_flow0_3, up_flow0_4],
                "flow1_pred": [up_flow1_1, up_flow1_2, up_flow1_3, up_flow1_4],
                "ft_pred": [ft_1_, ft_2_, ft_3_],
            }


================================================
FILE: Open-Sora/tools/frame_interpolation/networks/blocks/__init__.py
================================================


================================================
FILE: Open-Sora/tools/frame_interpolation/networks/blocks/feat_enc.py
================================================
import torch
import torch.nn as nn


class BottleneckBlock(nn.Module):
    """
    Residual bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions, each followed by a
    normalization layer and ReLU, added to a shortcut and passed through a final ReLU.

    When `stride` is 1 the input itself is the shortcut, so `in_planes` must equal
    `planes`; otherwise the shortcut is a strided 1x1 convolution followed by a norm.

    Args:
        in_planes (int): number of input channels.
        planes (int): number of output channels; the inner convolutions use planes // 4.
        norm_fn (str): one of "group", "batch", "instance" or "none".
        stride (int): stride of the 3x3 convolution and of the shortcut projection.

    Raises:
        ValueError: if `norm_fn` is not one of the supported names.
    """

    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
        super(BottleneckBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_planes, planes // 4, kernel_size=1, padding=0)
        self.conv2 = nn.Conv2d(planes // 4, planes // 4, kernel_size=3, padding=1, stride=stride)
        self.conv3 = nn.Conv2d(planes // 4, planes, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)

        # Only used by group normalization.
        num_groups = planes // 8

        self.norm1 = self._make_norm(norm_fn, planes // 4, num_groups)
        self.norm2 = self._make_norm(norm_fn, planes // 4, num_groups)
        self.norm3 = self._make_norm(norm_fn, planes, num_groups)

        if stride == 1:
            self.downsample = None
        else:
            # norm4 stays a named attribute (as well as a member of `downsample`) so that
            # state_dict keys remain the same as before.
            self.norm4 = self._make_norm(norm_fn, planes, num_groups)
            self.downsample = nn.Sequential(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4)

    @staticmethod
    def _make_norm(norm_fn, num_channels, num_groups):
        """Create the normalization layer named by `norm_fn` for `num_channels` channels."""
        if norm_fn == "group":
            return nn.GroupNorm(num_groups=num_groups, num_channels=num_channels)
        if norm_fn == "batch":
            return nn.BatchNorm2d(num_channels)
        if norm_fn == "instance":
            return nn.InstanceNorm2d(num_channels)
        if norm_fn == "none":
            return nn.Sequential()
        # Previously an unknown name left the norm layers undefined and failed later
        # with an opaque AttributeError.
        raise ValueError(f"Unsupported norm_fn: {norm_fn!r}")

    def forward(self, x):
        y = x
        y = self.relu(self.norm1(self.conv1(y)))
        y = self.relu(self.norm2(self.conv2(y)))
        y = self.relu(self.norm3(self.conv3(y)))

        if self.downsample is not None:
            # Project the shortcut so it matches the main branch's channels and resolution.
            x = self.downsample(x)

        return self.relu(x + y)


class ResidualBlock(nn.Module):
    """
    Basic residual block: two 3x3 convolutions, each followed by a normalization layer
    and ReLU, added to a shortcut and passed through a final ReLU.

    When `stride` is 1 the input itself is the shortcut, so `in_planes` must equal
    `planes`; otherwise the shortcut is a strided 1x1 convolution followed by a norm.

    Args:
        in_planes (int): number of input channels.
        planes (int): number of output channels.
        norm_fn (str): one of "group", "batch", "instance" or "none".
        stride (int): stride of the first convolution and of the shortcut projection.

    Raises:
        ValueError: if `norm_fn` is not one of the supported names.
    """

    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
        super(ResidualBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

        # Only used by group normalization.
        num_groups = planes // 8

        self.norm1 = self._make_norm(norm_fn, planes, num_groups)
        self.norm2 = self._make_norm(norm_fn, planes, num_groups)

        if stride == 1:
            self.downsample = None
        else:
            # norm3 stays a named attribute (as well as a member of `downsample`) so that
            # state_dict keys remain the same as before.
            self.norm3 = self._make_norm(norm_fn, planes, num_groups)
            self.downsample = nn.Sequential(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)

    @staticmethod
    def _make_norm(norm_fn, num_channels, num_groups):
        """Create the normalization layer named by `norm_fn` for `num_channels` channels."""
        if norm_fn == "group":
            return nn.GroupNorm(num_groups=num_groups, num_channels=num_channels)
        if norm_fn == "batch":
            return nn.BatchNorm2d(num_channels)
        if norm_fn == "instance":
            return nn.InstanceNorm2d(num_channels)
        if norm_fn == "none":
            return nn.Sequential()
        # Previously an unknown name left the norm layers undefined and failed later
        # with an opaque AttributeError.
        raise ValueError(f"Unsupported norm_fn: {norm_fn!r}")

    def forward(self, x):
        y = x
        y = self.relu(self.norm1(self.conv1(y)))
        y = self.relu(self.norm2(self.conv2(y)))

        if self.downsample is not None:
            # Project the shortcut so it matches the main branch's channels and resolution.
            x = self.downsample(x)

        return self.relu(x + y)


class SmallEncoder(nn.Module):
    """
    Convolutional feature encoder built from BottleneckBlock stages.

    A stride-2 7x7 stem is followed by three two-block stages (32, 64 and 96 channels,
    the last two with stride 2) and a 1x1 output convolution to `output_dim` channels.

    Args:
        output_dim (int): number of output channels.
        norm_fn (str): "group", "batch", "instance" or "none".
        dropout (float): Dropout2d probability applied in training mode; disabled when <= 0.
    """

    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
        super().__init__()
        self.norm_fn = norm_fn

        # Stem normalization; an unrecognized norm_fn leaves norm1 unset.
        stem_norms = {
            "group": lambda: nn.GroupNorm(num_groups=8, num_channels=32),
            "batch": lambda: nn.BatchNorm2d(32),
            "instance": lambda: nn.InstanceNorm2d(32),
            "none": lambda: nn.Sequential(),
        }
        build_norm = stem_norms.get(self.norm_fn)
        if build_norm is not None:
            self.norm1 = build_norm()

        self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        # _make_layer reads and updates in_planes as the stages are chained.
        self.in_planes = 32
        self.layer1 = self._make_layer(32, stride=1)
        self.layer2 = self._make_layer(64, stride=2)
        self.layer3 = self._make_layer(96, stride=2)

        self.dropout = nn.Dropout2d(p=dropout) if dropout > 0 else None

        self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1)

        # Kaiming init for every convolution; unit weight / zero bias for affine norms.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
                continue
            if not isinstance(module, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                continue
            if module.weight is not None:
                nn.init.constant_(module.weight, 1)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Return a stage of two BottleneckBlocks; only the first one applies `stride`."""
        first = BottleneckBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        second = BottleneckBlock(dim, dim, self.norm_fn, stride=1)

        self.in_planes = dim
        return nn.Sequential(first, second)

    def forward(self, x):
        """
        Encode a tensor, or a list/tuple of two equally sized batches.

        A list/tuple input is concatenated along the batch dimension, encoded in one
        pass and split back into two outputs.
        """
        paired = isinstance(x, (tuple, list))
        if paired:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.relu1(self.norm1(self.conv1(x)))

        for stage in (self.layer1, self.layer2, self.layer3):
            x = stage(x)
        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if paired:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)

        return x


class BasicEncoder(nn.Module):
    """
    Convolutional feature encoder built from ResidualBlock stages.

    A stride-2 7x7 stem is followed by three two-block stages (64, 72 and 128 channels,
    the last two with stride 2) and a 1x1 output convolution to `output_dim` channels.

    Args:
        output_dim (int): number of output channels.
        norm_fn (str): "group", "batch", "instance" or "none".
        dropout (float): Dropout2d probability applied in training mode; disabled when <= 0.
    """

    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
        super().__init__()
        self.norm_fn = norm_fn

        # Stem normalization; an unrecognized norm_fn leaves norm1 unset.
        stem_norms = {
            "group": lambda: nn.GroupNorm(num_groups=8, num_channels=64),
            "batch": lambda: nn.BatchNorm2d(64),
            "instance": lambda: nn.InstanceNorm2d(64),
            "none": lambda: nn.Sequential(),
        }
        build_norm = stem_norms.get(self.norm_fn)
        if build_norm is not None:
            self.norm1 = build_norm()

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        # _make_layer reads and updates in_planes as the stages are chained.
        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(72, stride=2)
        self.layer3 = self._make_layer(128, stride=2)

        # output convolution
        self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)

        self.dropout = nn.Dropout2d(p=dropout) if dropout > 0 else None

        # Kaiming init for every convolution; unit weight / zero bias for affine norms.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
                continue
            if not isinstance(module, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                continue
            if module.weight is not None:
                nn.init.constant_(module.weight, 1)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Return a stage of two ResidualBlocks; only the first one applies `stride`."""
        first = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        second = ResidualBlock(dim, dim, self.norm_fn, stride=1)

        self.in_planes = dim
        return nn.Sequential(first, second)

    def forward(self, x):
        """
        Encode a tensor, or a list/tuple of two equally sized batches.

        A list/tuple input is concatenated along the batch dimension, encoded in one
        pass and split back into two outputs.
        """
        paired = isinstance(x, (tuple, list))
        if paired:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.relu1(self.norm1(self.conv1(x)))

        for stage in (self.layer1, self.layer2, self.layer3):
            x = stage(x)

        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if paired:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)

        return x


class LargeEncoder(nn.Module):
    """
    Convolutional feature encoder built from ResidualBlock stages.

    A stride-2 7x7 stem is followed by four two-block stages (64, 112, 160 and 160
    channels; the second and third use stride 2) and a 1x1 output convolution to
    `output_dim` channels.

    Args:
        output_dim (int): number of output channels.
        norm_fn (str): "group", "batch", "instance" or "none".
        dropout (float): Dropout2d probability applied in training mode; disabled when <= 0.
    """

    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
        super().__init__()
        self.norm_fn = norm_fn

        # Stem normalization; an unrecognized norm_fn leaves norm1 unset.
        stem_norms = {
            "group": lambda: nn.GroupNorm(num_groups=8, num_channels=64),
            "batch": lambda: nn.BatchNorm2d(64),
            "instance": lambda: nn.InstanceNorm2d(64),
            "none": lambda: nn.Sequential(),
        }
        build_norm = stem_norms.get(self.norm_fn)
        if build_norm is not None:
            self.norm1 = build_norm()

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        # _make_layer reads and updates in_planes as the stages are chained.
        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(112, stride=2)
        self.layer3 = self._make_layer(160, stride=2)
        self.layer3_2 = self._make_layer(160, stride=1)

        # output convolution; its input width is the channel count of the last stage
        self.conv2 = nn.Conv2d(self.in_planes, output_dim, kernel_size=1)

        self.dropout = nn.Dropout2d(p=dropout) if dropout > 0 else None

        # Kaiming init for every convolution; unit weight / zero bias for affine norms.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
                continue
            if not isinstance(module, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                continue
            if module.weight is not None:
                nn.init.constant_(module.weight, 1)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Return a stage of two ResidualBlocks; only the first one applies `stride`."""
        first = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        second = ResidualBlock(dim, dim, self.norm_fn, stride=1)

        self.in_planes = dim
        return nn.Sequential(first, second)

    def forward(self, x):
        """
        Encode a tensor, or a list/tuple of two equally sized batches.

        A list/tuple input is concatenated along the batch dimension, encoded in one
        pass and split back into two outputs.
        """
        paired = isinstance(x, (tuple, list))
        if paired:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.relu1(self.norm1(self.conv1(x)))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer3_2):
            x = stage(x)

        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if paired:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)

        return x


================================================
FILE: Open-Sora/tools/frame_interpolation/networks/blocks/ifrnet.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from tools.frame_interpolation.utils.flow_utils import warp


def resize(x, scale_factor):
    """Rescale ``x`` by ``scale_factor`` using bilinear interpolation (``align_corners=False``)."""
    interp_options = dict(mode="bilinear", align_corners=False)
    return F.interpolate(x, scale_factor=scale_factor, **interp_options)


def convrelu(in_channels, out_channels, kernel_size=3, stride=1, padding=1, dilation=1, groups=1, bias=True):
    """Return ``nn.Sequential(Conv2d, PReLU)``; the PReLU has one slope per output channel."""
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=groups,
        bias=bias,
    )
    activation = nn.PReLU(out_channels)
    return nn.Sequential(conv, activation)


class ResBlock(nn.Module):
    """Residual block whose last ``side_channels`` channels get extra convolutions.

    ``conv1``/``conv3``/``conv5`` act on all ``in_channels`` channels, while
    ``conv2``/``conv4`` act only on the trailing ``side_channels`` channels of
    the preceding output.  The block input is added back before a final PReLU.
    """

    def __init__(self, in_channels, side_channels, bias=True):
        super().__init__()
        self.side_channels = side_channels

        def conv3x3(channels):
            # Shape-preserving 3x3 convolution with equal in/out channels.
            return nn.Conv2d(channels, channels, kernel_size=3, stride=1, padding=1, bias=bias)

        # Submodules are created in the order conv1..conv5, prelu so that
        # parameter registration order stays the same.
        self.conv1 = nn.Sequential(conv3x3(in_channels), nn.PReLU(in_channels))
        self.conv2 = nn.Sequential(conv3x3(side_channels), nn.PReLU(side_channels))
        self.conv3 = nn.Sequential(conv3x3(in_channels), nn.PReLU(in_channels))
        self.conv4 = nn.Sequential(conv3x3(side_channels), nn.PReLU(side_channels))
        self.conv5 = conv3x3(in_channels)
        self.prelu = nn.PReLU(in_channels)

    def _refine_side(self, feat, side_conv):
        """Apply ``side_conv`` to the trailing side channels and re-join them with the rest."""
        side = self.side_channels
        untouched = feat[:, :-side, ...]
        refined = side_conv(feat[:, -side:, :, :])
        return torch.cat([untouched, refined], 1)

    def forward(self, x):
        feat = self._refine_side(self.conv1(x), self.conv2)
        feat = self._refine_side(self.conv3(feat), self.conv4)
        residual = self.conv5(feat)
        return self.prelu(x + residual)


class Encoder(nn.Module):
    """Feature pyramid: one stride-2 stage per entry of ``channels``.

    Each stage is registered as ``pyramid{i}`` (1-based) and consists of a
    stride-2 ``convrelu`` followed by a stride-1 ``convrelu``.  The first stage
    takes 3 input channels; with ``large=True`` it uses a 7x7 kernel
    (padding 3) instead of 3x3 (padding 1).
    """

    def __init__(self, channels, large=False):
        super().__init__()
        self.channels = channels
        in_ch = 3
        for level, out_ch in enumerate(channels, start=1):
            if large and level == 1:
                kernel, pad = 7, 3
            else:
                kernel, pad = 3, 1
            stage = nn.Sequential(
                convrelu(in_ch, out_ch, kernel, 2, pad),
                convrelu(out_ch, out_ch, 3, 1, 1),
            )
            self.register_module(f"pyramid{level}", stage)
            in_ch = out_ch

    def forward(self, in_x):
        """Return the list of per-stage outputs, first stage first."""
        features = []
        feat = in_x
        for level in range(1, len(self.channels) + 1):
            feat = getattr(self, f"pyramid{level}")(feat)
            features.append(feat)
        return features


class InitDecoder(nn.Module):
    """First decoder stage: predicts two 2-channel flows and a feature map.

    The conv block takes ``2 * in_ch + 1`` channels (``f0``, ``f1`` and the
    spatially tiled ``embt``) and outputs ``out_ch + 4`` channels at twice the
    spatial resolution (transposed convolution with stride 2).
    """

    def __init__(self, in_ch, out_ch, skip_ch) -> None:
        super().__init__()
        width = in_ch * 2
        self.convblock = nn.Sequential(
            convrelu(width + 1, width),
            ResBlock(width, skip_ch),
            nn.ConvTranspose2d(width, out_ch + 4, 4, 2, 1, bias=True),
        )

    def forward(self, f0, f1, embt):
        height, width = f0.shape[2:]
        # Tile embt over the spatial grid of f0 so it can be concatenated.
        # NOTE(review): presumably embt is (B, 1, 1, 1) - confirm with caller.
        tiled_embt = embt.repeat(1, 1, height, width)
        decoded = self.convblock(torch.cat([f0, f1, tiled_embt], 1))
        # First 4 channels hold the two flows, the remainder the features.
        flow0, flow1 = torch.chunk(decoded[:, :4, ...], 2, 1)
        features = decoded[:, 4:, ...]
        return flow0, flow1, features


class IntermediateDecoder(nn.Module):
    """Decoder stage that refines incoming flows and features at 2x resolution.

    ``f0``/``f1`` are warped by the incoming flows, concatenated with the
    incoming features and flows, and decoded into ``out_ch + 4`` channels.
    The predicted flows are added to the incoming flows after those have been
    resized by 2 and multiplied by 2.
    """

    def __init__(self, in_ch, out_ch, skip_ch) -> None:
        super().__init__()
        width = in_ch * 3
        self.convblock = nn.Sequential(
            convrelu(width + 4, width),
            ResBlock(width, skip_ch),
            nn.ConvTranspose2d(width, out_ch + 4, 4, 2, 1, bias=True),
        )

    def forward(self, ft_, f0, f1, flow0_in, flow1_in):
        warped0 = warp(f0, flow0_in)
        warped1 = warp(f1, flow1_in)
        decoder_in = torch.cat([ft_, warped0, warped1, flow0_in, flow1_in], 1)
        decoded = self.convblock(decoder_in)

        # First 4 channels hold the two flow residuals, the rest the features.
        delta0, delta1 = torch.chunk(decoded[:, :4, ...], 2, 1)
        features = decoded[:, 4:, ...]

        flow0 = delta0 + 2.0 * resize(flow0_in, scale_factor=2.0)
        flow1 = delta1 + 2.0 * resize(flow1_in, scale_factor=2.0)
        return flow0, flow1, features


================================================
FILE: Open-Sora/tools/frame_interpolation/networks/blocks/multi_flow.py
================================================
import torch
import torch.nn as nn

from tools.frame_interpolation.utils.flow_utils import warp

from .ifrnet import ResBlock, convrelu, resize


def multi_flow_combine(comb_block, img0, img1, flow0, flow1, mask=None, img_res=None, mean=None):
    """
    A parallel implementation of multiple flow field warping.

    comb_block: callable (e.g. an nn.Sequential) applied to the warped
        candidates reshaped to [b, 3*num_flows, h, w]; its output is added to
        the mean of the candidates.
    img shape: [b, 3, h, w]
    flow shape: [b, 2*num_flows, h, w]
    mask (opt): reshaped to [b*num_flows, 1, h, w] and used as the blending
        weight of the img0 warp (img1 gets 1 - mask).
        If 'mask' is None, the function conducts a simple average.
    img_res (opt): reshaped to [b*num_flows, 3, h, w] and added to the blend.
        If 'img_res' is None, the function adds zero instead.
    mean (opt): repeated per flow and added to the blend.
        If 'mean' is None, the function adds zero instead.

    Returns the combined prediction of shape [b, 3, h, w].
    """
    b, c, h, w = flow0.shape
    num_flows = c // 2
    flow0 = flow0.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)
    flow1 = flow1.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)

    if mask is None:
        # A constant weight of 0.5 gives the simple average of the two warps
        # (the previous `None` raised a TypeError in the blend below).
        mask = 0.5
    else:
        mask = mask.reshape(b, num_flows, 1, h, w).reshape(-1, 1, h, w)
    img_res = img_res.reshape(b, num_flows, 3, h, w).reshape(-1, 3, h, w) if img_res is not None else 0
    # Repeat each image once per flow candidate so all warps run in one batch.
    img0 = torch.stack([img0] * num_flows, 1).reshape(-1, 3, h, w)
    img1 = torch.stack([img1] * num_flows, 1).reshape(-1, 3, h, w)
    mean = torch.stack([mean] * num_flows, 1).reshape(-1, 1, 1, 1) if mean is not None else 0

    img0_warp = warp(img0, flow0)
    img1_warp = warp(img1, flow1)
    img_warps = mask * img0_warp + (1 - mask) * img1_warp + mean + img_res
    img_warps = img_warps.reshape(b, num_flows, 3, h, w)
    imgt_pred = img_warps.mean(1) + comb_block(img_warps.reshape(b, -1, h, w))
    return imgt_pred


class MultiFlowDecoder(nn.Module):
    """Final decoder stage producing ``num_flows`` flow pairs, masks and image residuals.

    The conv block outputs ``8 * num_flows`` channels, split into
    ``2n`` + ``2n`` flow residuals, ``n`` mask logits and ``3n`` residual
    image channels (``n = num_flows``).
    """

    def __init__(self, in_ch, skip_ch, num_flows=3):
        super().__init__()
        self.num_flows = num_flows
        width = in_ch * 3
        self.convblock = nn.Sequential(
            convrelu(width + 4, width),
            ResBlock(width, skip_ch),
            nn.ConvTranspose2d(width, 8 * num_flows, 4, 2, 1, bias=True),
        )

    def forward(self, ft_, f0, f1, flow0, flow1):
        n = self.num_flows
        warped0 = warp(f0, flow0)
        warped1 = warp(f1, flow1)
        decoded = self.convblock(torch.cat([ft_, warped0, warped1, flow0, flow1], 1))

        delta_flow0, delta_flow1, mask_logits, img_res = torch.split(decoded, [2 * n, 2 * n, n, 3 * n], 1)
        mask = torch.sigmoid(mask_logits)

        # Each incoming flow is resized by 2, tiled once per candidate and
        # multiplied by 2 before the predicted residual is added.
        base0 = 2.0 * resize(flow0, scale_factor=2.0).repeat(1, self.num_flows, 1, 1)
        base1 = 2.0 * resize(flow1, scale_factor=2.0).repeat(1, self.num_flows, 1, 1)

        return delta_flow0 + base0, delta_flow1 + base1, mask, img_res


================================================
FILE: Open-Sora/tools/frame_interpolation/networks/blocks/raft.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F


def resize(x, scale_factor):
    """Bilinear ``F.interpolate`` of ``x`` by ``scale_factor`` with ``align_corners=False``."""
    resized = F.interpolate(x, scale_factor=scale_factor, mode="bilinear", align_corners=False)
    return resized


def bilinear_sampler(img, coords, mask=False):
    """Sample ``img`` with ``grid_sample`` at pixel coordinates.

    ``coords[..., 0]`` is x and ``coords[..., 1]`` is y, in pixels; they are
    mapped to [-1, 1] with ``align_corners=True`` semantics.  When ``mask`` is
    truthy, also returns a float mask that is 1 where the normalized
    coordinates lie strictly inside (-1, 1) on both axes.
    """
    height, width = img.shape[-2:]
    x_pix, y_pix = coords.split([1, 1], dim=-1)
    x_norm = 2 * x_pix / (width - 1) - 1
    y_norm = 2 * y_pix / (height - 1) - 1

    sampled = F.grid_sample(img, torch.cat([x_norm, y_norm], dim=-1), align_corners=True)
    if not mask:
        return sampled

    inside = (x_norm > -1) & (y_norm > -1) & (x_norm < 1) & (y_norm < 1)
    return sampled, inside.float()


def coords_grid(batch, ht, wd, device):
    """Return a float ``(batch, 2, ht, wd)`` grid; channel 0 is the column (x) index, channel 1 the row (y) index."""
    rows = torch.arange(ht, device=device)
    cols = torch.arange(wd, device=device)
    yy, xx = torch.meshgrid(rows, cols, indexing="ij")
    grid = torch.stack((xx, yy), dim=0).float()
    return grid.unsqueeze(0).repeat(batch, 1, 1, 1)


class SmallUpdateBlock(nn.Module):
    """Update block predicting a feature residual and a 4-channel flow residual.

    The correlation input must have ``2 * corr_levels * (2 * radius + 1) ** 2``
    channels and the flow input 4 channels.  When ``scale_factor`` is set,
    ``net`` is resized by ``1 / scale_factor`` on entry and both outputs are
    resized back by ``scale_factor`` (the flow residual is also multiplied
    by it).
    """

    def __init__(self, cdim, hidden_dim, flow_dim, corr_dim, fc_dim, corr_levels=4, radius=3, scale_factor=None):
        super().__init__()
        cor_planes = corr_levels * (2 * radius + 1) ** 2
        self.scale_factor = scale_factor

        def conv_lrelu_conv(in_ch, out_ch):
            # conv3x3 -> LeakyReLU(0.1) -> conv3x3 with hidden_dim in the middle.
            return nn.Sequential(
                nn.Conv2d(in_ch, hidden_dim, 3, padding=1),
                nn.LeakyReLU(negative_slope=0.1, inplace=True),
                nn.Conv2d(hidden_dim, out_ch, 3, padding=1),
            )

        # Creation order is kept (convc1, convf1, convf2, conv, gru,
        # feat_head, flow_head) so parameter registration is unchanged.
        self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
        self.convf1 = nn.Conv2d(4, flow_dim * 2, 7, padding=3)
        self.convf2 = nn.Conv2d(flow_dim * 2, flow_dim, 3, padding=1)
        self.conv = nn.Conv2d(corr_dim + flow_dim, fc_dim, 3, padding=1)

        self.gru = conv_lrelu_conv(fc_dim + 4 + cdim, hidden_dim)
        self.feat_head = conv_lrelu_conv(hidden_dim, cdim)
        self.flow_head = conv_lrelu_conv(hidden_dim, 4)

        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)

    def forward(self, net, flow, corr):
        if self.scale_factor is not None:
            net = resize(net, 1 / self.scale_factor)

        corr_feat = self.lrelu(self.convc1(corr))
        flow_feat = self.lrelu(self.convf1(flow))
        flow_feat = self.lrelu(self.convf2(flow_feat))
        fused = self.lrelu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))

        hidden = self.gru(torch.cat([fused, flow, net], dim=1))
        delta_net = self.feat_head(hidden)
        delta_flow = self.flow_head(hidden)

        if self.scale_factor is not None:
            delta_net = resize(delta_net, scale_factor=self.scale_factor)
            delta_flow = self.scale_factor * resize(delta_flow, scale_factor=self.scale_factor)

        return delta_net, delta_flow


class BasicUpdateBlock(nn.Module):
    """Update block with a two-layer correlation encoder.

    Returns a feature residual with ``cdim`` channels and a flow residual with
    ``4 * out_num`` channels.  The correlation input must have
    ``2 * corr_levels * (2 * radius + 1) ** 2`` channels and the flow input 4
    channels.  When ``scale_factor`` is set, ``net`` is resized by
    ``1 / scale_factor`` on entry and both outputs are resized back by
    ``scale_factor`` (the flow residual is also multiplied by it).
    """

    def __init__(
        self,
        cdim,
        hidden_dim,
        flow_dim,
        corr_dim,
        corr_dim2,
        fc_dim,
        corr_levels=4,
        radius=3,
        scale_factor=None,
        out_num=1,
    ):
        super().__init__()
        cor_planes = corr_levels * (2 * radius + 1) ** 2
        self.scale_factor = scale_factor

        def conv_lrelu_conv(in_ch, out_ch):
            # conv3x3 -> LeakyReLU(0.1) -> conv3x3 with hidden_dim in the middle.
            return nn.Sequential(
                nn.Conv2d(in_ch, hidden_dim, 3, padding=1),
                nn.LeakyReLU(negative_slope=0.1, inplace=True),
                nn.Conv2d(hidden_dim, out_ch, 3, padding=1),
            )

        # Creation order is kept so parameter registration is unchanged.
        self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
        self.convc2 = nn.Conv2d(corr_dim, corr_dim2, 3, padding=1)
        self.convf1 = nn.Conv2d(4, flow_dim * 2, 7, padding=3)
        self.convf2 = nn.Conv2d(flow_dim * 2, flow_dim, 3, padding=1)
        self.conv = nn.Conv2d(flow_dim + corr_dim2, fc_dim, 3, padding=1)

        self.gru = conv_lrelu_conv(fc_dim + 4 + cdim, hidden_dim)
        self.feat_head = conv_lrelu_conv(hidden_dim, cdim)
        self.flow_head = conv_lrelu_conv(hidden_dim, 4 * out_num)

        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)

    def forward(self, net, flow, corr):
        if self.scale_factor is not None:
            net = resize(net, 1 / self.scale_factor)

        corr_feat = self.lrelu(self.convc1(corr))
        corr_feat = self.lrelu(self.convc2(corr_feat))
        flow_feat = self.lrelu(self.convf1(flow))
        flow_feat = self.lrelu(self.convf2(flow_feat))
        fused = self.lrelu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))

        hidden = self.gru(torch.cat([fused, flow, net], dim=1))
        delta_net = self.feat_head(hidden)
        delta_flow = self.flow_head(hidden)

        if self.scale_factor is not None:
            delta_net = resize(delta_net, scale_factor=self.scale_factor)
            delta_flow = self.scale_factor * resize(delta_flow, scale_factor=self.scale_factor)
        return delta_net, delta_flow


class BidirCorrBlock:
    """All-pairs correlation pyramid between two feature maps, in both directions.

    ``corr_pyramid`` holds the fmap1-to-fmap2 correlation volume at
    ``num_levels`` resolutions (each level is a 2x average pooling of the
    previous one); ``corr_pyramid_T`` holds the transposed (fmap2-to-fmap1)
    volumes.  Calling the block samples a ``(2 * radius + 1)`` square window
    around the given coordinates at every level.
    """

    def __init__(self, fmap1, fmap2, num_levels=4, radius=4):
        self.num_levels = num_levels
        self.radius = radius

        volume = BidirCorrBlock.corr(fmap1, fmap2)
        batch, h1, w1, dim, h2, w2 = volume.shape
        volume_T = volume.clone().permute(0, 4, 5, 3, 1, 2)

        # One 2-D map per source pixel, so it can be pooled/sampled as an image.
        level = volume.reshape(batch * h1 * w1, dim, h2, w2)
        level_T = volume_T.reshape(batch * h2 * w2, dim, h1, w1)

        self.corr_pyramid = [level]
        self.corr_pyramid_T = [level_T]
        for _ in range(1, self.num_levels):
            level = F.avg_pool2d(level, 2, stride=2)
            level_T = F.avg_pool2d(level_T, 2, stride=2)
            self.corr_pyramid.append(level)
            self.corr_pyramid_T.append(level_T)

    def __call__(self, coords0, coords1):
        """Look up both pyramids; returns two float ``(batch, C, h, w)`` tensors."""
        r = self.radius
        coords0 = coords0.permute(0, 2, 3, 1)
        coords1 = coords1.permute(0, 2, 3, 1)
        assert coords0.shape == coords1.shape, f"coords0 shape: [{coords0.shape}] is not equal to [{coords1.shape}]"
        batch, h1, w1, _ = coords0.shape

        # The window of integer offsets is the same at every level.
        window = 2 * r + 1
        offsets = torch.linspace(-r, r, window, device=coords0.device)
        delta = torch.stack(torch.meshgrid(offsets, offsets, indexing="ij"), dim=-1)
        delta = delta.view(1, window, window, 2)

        sampled = []
        sampled_T = []
        for lvl in range(self.num_levels):
            # Coordinates shrink by 2 per level to follow the pooled volume.
            centers0 = coords0.reshape(batch * h1 * w1, 1, 1, 2) / 2**lvl
            centers1 = coords1.reshape(batch * h1 * w1, 1, 1, 2) / 2**lvl

            corr = bilinear_sampler(self.corr_pyramid[lvl], centers0 + delta)
            corr_T = bilinear_sampler(self.corr_pyramid_T[lvl], centers1 + delta)
            sampled.append(corr.view(batch, h1, w1, -1))
            sampled_T.append(corr_T.view(batch, h1, w1, -1))

        out = torch.cat(sampled, dim=-1).permute(0, 3, 1, 2).contiguous().float()
        out_T = torch.cat(sampled_T, dim=-1).permute(0, 3, 1, 2).contiguous().float()
        return out, out_T

    @staticmethod
    def corr(fmap1, fmap2):
        """Dot-product correlation of every pixel pair, divided by ``sqrt(dim)``.

        Returns a tensor of shape ``(batch, ht, wd, 1, ht, wd)``.
        """
        batch, dim, ht, wd = fmap1.shape
        flat1 = fmap1.view(batch, dim, ht * wd)
        flat2 = fmap2.view(batch, dim, ht * wd)

        pairwise = torch.matmul(flat1.transpose(1, 2), flat2)
        pairwise = pairwise.view(batch, ht, wd, 1, ht, wd)
        return pairwise / torch.sqrt(torch.tensor(dim).float())


================================================
FILE: Open-Sora/tools/frame_interpolation/utils/__init__.py
================================================


================================================
FILE: Open-Sora/tools/frame_interpolation/utils/dist_utils.py
================================================
import os

import torch


def get_world_size():
    """Return the world size from the environment, without calling MPI.

    Checks ``PMI_SIZE`` then ``OMPI_COMM_WORLD_SIZE``; a variable that is set
    but empty counts as 1.  Falls back to ``torch.cuda.device_count()``.
    :rtype: int
    """
    for var in ("PMI_SIZE", "OMPI_COMM_WORLD_SIZE"):
        value = os.environ.get(var)
        if value is not None:
            return int(value or 1)
    return torch.cuda.device_count()


def get_global_rank():
    """Return the global rank from the environment, without calling MPI.

    Checks ``PMI_RANK`` then ``OMPI_COMM_WORLD_RANK``; a variable that is set
    but empty counts as 0.  Returns 0 when neither is set.
    :rtype: int
    """
    for var in ("PMI_RANK", "OMPI_COMM_WORLD_RANK"):
        value = os.environ.get(var)
        if value is not None:
            return int(value or 0)
    return 0


def get_local_rank():
    """Return the local rank from the environment, without calling MPI.

    Checks ``MPI_LOCALRANKID`` then ``OMPI_COMM_WORLD_LOCAL_RANK``; a variable
    that is set but empty counts as 0.  Returns 0 when neither is set.
    :rtype: int
    """
    for var in ("MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK"):
        value = os.environ.get(var)
        if value is not None:
            return int(value or 0)
    return 0


def get_master_ip():
    """Return the master node address taken from the environment.

    Uses the part of ``AZ_BATCH_MASTER_NODE`` before the first ``:`` if that
    variable is set, else ``AZ_BATCHAI_MPI_MASTER_NODE`` as is, else
    ``"127.0.0.1"``.
    """
    batch_master = os.environ.get("AZ_BATCH_MASTER_NODE")
    if batch_master is not None:
        return batch_master.split(":")[0]
    return os.environ.get("AZ_BATCHAI_MPI_MASTER_NODE", "127.0.0.1")


================================================
FILE: Open-Sora/tools/frame_interpolation/utils/flow_utils.py
================================================
import numpy as np
import torch
import torch.nn.functional as F
from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


def warp(img, flow):
    """Sample ``img`` at positions displaced by ``flow`` (bilinear, border padding).

    ``flow[:, 0]`` is the horizontal and ``flow[:, 1]`` the vertical
    displacement in pixels; both are converted to the normalized [-1, 1]
    coordinates used by ``grid_sample`` with ``align_corners=True``.
    """
    batch, _, height, width = flow.shape

    # Identity sampling grid in normalized coordinates.
    x_base = torch.linspace(-1.0, 1.0, width).view(1, 1, 1, width).expand(batch, -1, height, -1)
    y_base = torch.linspace(-1.0, 1.0, height).view(1, 1, height, 1).expand(batch, -1, -1, width)
    base_grid = torch.cat([x_base, y_base], 1).to(img)

    # Pixel displacements expressed in normalized units.
    x_shift = flow[:, 0:1, :, :] / ((width - 1.0) / 2.0)
    y_shift = flow[:, 1:2, :, :] / ((height - 1.0) / 2.0)
    normalized_flow = torch.cat([x_shift, y_shift], 1)

    sample_grid = (base_grid + normalized_flow).permute(0, 2, 3, 1)
    return F.grid_sample(input=img, grid=sample_grid, mode="bilinear", padding_mode="border", align_corners=True)


def make_colorwheel():
    """
    Generates a color wheel for optical flow visualization as presented in:
        Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
        URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf
    Code follows the original C++ source code of Daniel Scharstein.
    Code follows the Matlab source code of Deqing Sun.
    Returns:
        np.ndarray: Color wheel of shape [55, 3] with values in [0, 255]
    """
    # Each segment: (length, channel held at 255, ramping channel, ramp falls?).
    segments = (
        (15, 0, 1, False),  # RY: red full, green rising
        (6, 1, 0, True),  # YG: green full, red falling
        (4, 1, 2, False),  # GC: green full, blue rising
        (11, 2, 1, True),  # CB: blue full, green falling
        (13, 2, 0, False),  # BM: blue full, red rising
        (6, 0, 2, True),  # MR: red full, blue falling
    )

    ncols = sum(length for length, _, _, _ in segments)
    colorwheel = np.zeros((ncols, 3))

    start = 0
    for length, full_ch, ramp_ch, falling in segments:
        ramp = np.floor(255 * np.arange(length) / length)
        rows = slice(start, start + length)
        colorwheel[rows, full_ch] = 255
        colorwheel[rows, ramp_ch] = 255 - ramp if falling else ramp
        start += length
    return colorwheel


def flow_uv_to_colors(u, v, convert_to_bgr=False):
    """
    Applies the flow color wheel to (possibly clipped) flow components u and v.
    According to the C++ source code of Daniel Scharstein
    According to the Matlab source code of Deqing Sun
    Args:
        u (np.ndarray): Input horizontal flow of shape [H,W]
        v (np.ndarray): Input vertical flow of shape [H,W]
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3] (uint8)
    """
    height, width = u.shape[0], u.shape[1]
    flow_image = np.zeros((height, width, 3), np.uint8)
    colorwheel = make_colorwheel()  # shape [55x3]
    ncols = colorwheel.shape[0]

    magnitude = np.sqrt(np.square(u) + np.square(v))
    angle = np.arctan2(-v, -u) / np.pi  # in [-1, 1]

    # Fractional position on the wheel and its two neighbouring entries.
    wheel_pos = (angle + 1) / 2 * (ncols - 1)
    lower = np.floor(wheel_pos).astype(np.int32)
    upper = lower + 1
    upper[upper == ncols] = 0  # wrap around the wheel
    frac = wheel_pos - lower

    in_range = magnitude <= 1
    for ch in range(colorwheel.shape[1]):
        channel = colorwheel[:, ch]
        low_color = channel[lower] / 255.0
        high_color = channel[upper] / 255.0
        blended = (1 - frac) * low_color + frac * high_color
        # Inside the unit radius: fade towards white as the magnitude drops.
        blended[in_range] = 1 - magnitude[in_range] * (1 - blended[in_range])
        blended[~in_range] = blended[~in_range] * 0.75  # out of range
        # Note the 2-ch => BGR instead of RGB
        out_ch = 2 - ch if convert_to_bgr else ch
        flow_image[:, :, out_ch] = np.floor(255 * blended)
    return flow_image


def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False):
    """
    Convert a two-channel flow field of shape [H,W,2] into a color image.

    The flow is divided by its largest magnitude (plus a small epsilon)
    before being mapped to colors.
    Args:
        flow_uv (np.ndarray): Flow UV image of shape [H,W,2]
        clip_flow (float, optional): Maximum absolute value of each flow
            component; components are clipped to [-clip_flow, clip_flow].
            Defaults to None (no clipping).
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    assert flow_uv.ndim == 3, "input flow must have three dimensions"
    assert flow_uv.shape[2] == 2, "input flow must have shape [H,W,2]"
    if clip_flow is not None:
        # Flow components are signed: clip symmetrically.  A lower bound of 0
        # would erase every negative component (leftward/upward motion).
        flow_uv = np.clip(flow_uv, -clip_flow, clip_flow)
    u = flow_uv[:, :, 0]
    v = flow_uv[:, :, 1]
    rad = np.sqrt(np.square(u) + np.square(v))
    rad_max = np.max(rad)
    epsilon = 1e-5  # avoids division by zero for an all-zero flow
    u = u / (rad_max + epsilon)
    v = v / (rad_max + epsilon)
    return flow_uv_to_colors(u, v, convert_to_bgr)


================================================
FILE: Open-Sora/tools/frame_interpolation/utils/utils.py
================================================
import random
import re
import sys

import numpy as np
import torch
import torch.nn.functional as F
from imageio import imread, imwrite
from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


class AverageMeter:
    """Tracks the latest value, weighted sum, count and running average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = self.avg = self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


class AverageMeterGroups:
    """A set of named ``AverageMeter`` objects, created on first update."""

    def __init__(self) -> None:
        self.meter_dict = {}

    def update(self, dict, n=1):
        """Feed every ``name -> value`` pair of ``dict`` to the meter of that name."""
        for name, val in dict.items():
            meter = self.meter_dict.get(name)
            if meter is None:
                meter = self.meter_dict[name] = AverageMeter()
            meter.update(val, n)

    def reset(self, name=None):
        """Reset the meter called ``name`` (if it exists), or all meters when ``name`` is None."""
        if name is None:
            for meter in self.meter_dict.values():
                meter.reset()
            return
        meter = self.meter_dict.get(name)
        if meter is not None:
            meter.reset()

    def avg(self, name):
        """Return the running average of meter ``name``, or None if there is no such meter."""
        meter = self.meter_dict.get(name)
        return None if meter is None else meter.avg


class InputPadder:
    """Pads images such that dimensions are divisible by divisor"""

    def __init__(self, dims, divisor=16):
        self.ht, self.wd = dims[-2:]
        # Amount missing to reach the next multiple of `divisor` (0 if aligned).
        pad_ht = (-self.ht) % divisor
        pad_wd = (-self.wd) % divisor
        left, top = pad_wd // 2, pad_ht // 2
        # F.pad order: [left, right, top, bottom]; the odd pixel goes right/bottom.
        self._pad = [left, pad_wd - left, top, pad_ht - top]

    def pad(self, *inputs):
        """Replicate-pad every input; returns a tensor for one input, else a list."""
        padded = [F.pad(x, self._pad, mode="replicate") for x in inputs]
        return padded[0] if len(padded) == 1 else padded

    def unpad(self, *inputs):
        """Crop the padding off every input; returns a tensor for one input, else a list."""
        cropped = [self._unpad(x) for x in inputs]
        return cropped[0] if len(cropped) == 1 else cropped

    def _unpad(self, x):
        ht, wd = x.shape[-2:]
        left, right, top, bottom = self._pad
        return x[..., top : ht - bottom, left : wd - right]


def img2tensor(img):
    """Convert an H x W x C image array to a ``(1, C, H, W)`` tensor divided by 255.

    Only the first three channels are kept when there are more than three.
    """
    rgb = img[:, :, :3] if img.shape[-1] > 3 else img
    chw = torch.tensor(rgb).permute(2, 0, 1)
    return chw.unsqueeze(0) / 255.0


def tensor2img(img_t):
    """Convert a ``(1, C, H, W)`` tensor to an H x W x C uint8 array (scaled by 255, clipped to [0, 255])."""
    scaled = (img_t * 255.0).detach()
    hwc = scaled.squeeze(0).permute(1, 2, 0)
    return hwc.cpu().numpy().clip(0, 255).astype(np.uint8)


def seed_all(seed):
    """Seed Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices) with ``seed``."""
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)


def read(file):
    """Load ``file`` with the reader selected by its extension.

    Supports ``.float3``, ``.flo``, ``.ppm``/``.pgm``/``.png``/``.jpg`` and
    ``.pfm`` (for which only the data, not the scale, is returned).  Raises
    ``Exception`` for any other extension.
    """
    if file.endswith(".float3"):
        return readFloat(file)
    if file.endswith(".flo"):
        return readFlow(file)
    if file.endswith((".ppm", ".pgm", ".png", ".jpg")):
        return readImage(file)
    if file.endswith(".pfm"):
        return readPFM(file)[0]
    raise Exception("don't know how to read %s" % file)


def write(file, data):
    """Save ``data`` to ``file`` with the writer selected by its extension.

    Supports ``.float3``, ``.flo``, ``.ppm``/``.pgm``/``.png``/``.jpg`` and
    ``.pfm``.  Raises ``Exception`` for any other extension.
    """
    if file.endswith(".float3"):
        return writeFloat(file, data)
    if file.endswith(".flo"):
        return writeFlow(file, data)
    if file.endswith((".ppm", ".pgm", ".png", ".jpg")):
        return writeImage(file, data)
    if file.endswith(".pfm"):
        return writePFM(file, data)
    raise Exception("don't know how to write %s" % file)


def readPFM(file):
    """Read a PFM image.

    Args:
        file: path of the PFM file.

    Returns:
        ``(data, scale)``: ``data`` is a float array of shape (H, W, 3) for a
        ``PF`` header or (H, W) for ``Pf``, flipped vertically relative to the
        order stored in the file; ``scale`` is the absolute value of the
        header's scale field.

    Raises:
        Exception: if the header is not ``PF``/``Pf`` or the dimension line is
        malformed.
    """
    # The context manager guarantees the handle is closed, also on errors.
    with open(file, "rb") as pfm:
        header = pfm.readline().rstrip().decode("ascii")
        if header == "PF":
            color = True
        elif header == "Pf":
            color = False
        else:
            raise Exception("Not a PFM file.")

        dim_match = re.match(r"^(\d+)\s(\d+)\s$", pfm.readline().decode("ascii"))
        if not dim_match:
            raise Exception("Malformed PFM header.")
        width, height = map(int, dim_match.groups())

        # The sign of the scale encodes the byte order of the payload.
        scale = float(pfm.readline().decode("ascii").rstrip())
        if scale < 0:
            endian = "<"
            scale = -scale
        else:
            endian = ">"

        data = np.fromfile(pfm, endian + "f")

    shape = (height, width, 3) if color else (height, width)
    data = np.reshape(data, shape)
    data = np.flipud(data)
    return data, scale


def writePFM(file, image, scale=1):
    """Write a float32 image to ``file`` in PFM format.

    Args:
        file: destination path.
        image: float32 array of shape (H, W, 3) (written as ``PF``), or
            (H, W) / (H, W, 1) (written as ``Pf``).
        scale: scale factor stored in the header; it is written negated when
            the data is little-endian.

    Raises:
        Exception: if ``image`` is not float32 or has an unsupported shape.
    """
    # Validate before opening so a rejected image does not truncate the file.
    if image.dtype.name != "float32":
        raise Exception("Image dtype must be float32.")

    # Rows are stored in the file in reverse order.
    image = np.flipud(image)

    if len(image.shape) == 3 and image.shape[2] == 3:
        color = True
    elif len(image.shape) == 2 or (len(image.shape) == 3 and image.shape[2] == 1):
        color = False
    else:
        raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")

    endian = image.dtype.byteorder
    if endian == "<" or (endian == "=" and sys.byteorder == "little"):
        scale = -scale

    with open(file, "wb") as pfm:
        # Parenthesised so that BOTH header variants are encoded to bytes;
        # without the parentheses the colour branch wrote a str and raised.
        pfm.write(("PF\n" if color else "Pf\n").encode())
        pfm.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))
        pfm.write("%f\n".encode() % scale)
        image.tofile(pfm)


def readFlow(name):
    """Read an optical-flow file and return a float32 array of shape (H, W, 2).

    ``.pfm``/``.PFM`` files are read through ``readPFM`` and their first two
    channels are returned.  Any other file must start with the 4-byte tag
    ``PIEH`` followed by int32 width, int32 height and
    ``width * height * 2`` float32 values.

    Raises:
        Exception: if the file does not start with ``PIEH``.
    """
    if name.endswith((".pfm", ".PFM")):
        return readPFM(name)[0][:, :, 0:2]

    with open(name, "rb") as f:
        # Compare raw bytes: decoding an arbitrary binary header could raise
        # UnicodeDecodeError instead of the intended error below.
        if f.read(4) != b"PIEH":
            raise Exception("Flow file header does not contain PIEH")

        width = int(np.fromfile(f, np.int32, 1).squeeze())
        height = int(np.fromfile(f, np.int32, 1).squeeze())

        flow = np.fromfile(f, np.float32, width * height * 2).reshape((height, width, 2))

    return flow.astype(np.float32)


def readImage(name):
    """Read an image file.

    ``.pfm``/``.PFM`` files go through ``readPFM`` (keeping at most the first
    three channels of 3-D data); everything else is loaded with ``imread``.
    """
    if not (name.endswith(".pfm") or name.endswith(".PFM")):
        return imread(name)

    data = readPFM(name)[0]
    return data[:, :, 0:3] if len(data.shape) == 3 else data


def writeImage(name, data):
    """Write ``data`` to ``name``: via ``writePFM`` (scale 1) for ``.pfm``/``.PFM``, else via ``imwrite``."""
    is_pfm = name.endswith(".pfm") or name.endswith(".PFM")
    if not is_pfm:
        return imwrite(name, data)
    return writePFM(name, data, 1)


def writeFlow(name, flow):
    """Write ``flow`` to ``name`` in the ``PIEH`` flow format.

    The file consists of the 4-byte tag ``PIEH``, then ``flow.shape[1]`` and
    ``flow.shape[0]`` (width, height) as int32, then the flow values as
    float32.
    """
    # The context manager flushes and closes the file deterministically.
    with open(name, "wb") as f:
        f.write("PIEH".encode("utf-8"))
        np.array([flow.shape[1], flow.shape[0]], dtype=np.int32).tofile(f)
        flow.astype(np.float32).tofile(f)


def readFloat(name):
    """Read an array from a ``float`` file.

    The file starts with the line ``float``, then the number of dimensions,
    then one size per line, followed by the float32 payload.  The sizes are
    reversed before reshaping; data with more than two dimensions is further
    transposed with axes ``(2, 1, 0)`` and then ``(1, 0, 2)``.

    Raises:
        Exception: if the first line is not ``float``.
    """
    # The context manager guarantees the handle is closed, also on errors.
    with open(name, "rb") as f:
        if (f.readline().decode("utf-8")) != "float\n":
            raise Exception("float file %s did not contain <float> keyword" % name)

        dim = int(f.readline())

        dims = []
        count = 1
        for _ in range(dim):
            d = int(f.readline())
            dims.append(d)
            count *= d

        dims = list(reversed(dims))

        data = np.fromfile(f, np.float32, count).reshape(dims)

    if dim > 2:
        data = np.transpose(data, (2, 1, 0))
        data = np.transpose(data, (1, 0, 2))

    return data


def writeFloat(name, data):
    """Write a 1-, 2- or 3-dimensional array to ``name`` in the ``float`` format.

    The header is the line ``float``, the number of dimensions, and one size
    per line (``shape[1]`` before ``shape[0]`` for 2-D/3-D data).  The payload
    is float32: written as is for 1-D and 2-D data, and transposed to axes
    ``(2, 0, 1)`` for 3-D data.

    Raises:
        Exception: if ``data`` does not have 1 to 3 dimensions.
    """
    dim = len(data.shape)
    # Validate before opening so a rejected array does not truncate the file.
    if dim < 1 or dim > 3:
        raise Exception("bad float file dimension: %d" % dim)

    with open(name, "wb") as f:
        f.write(("float\n").encode("ascii"))
        f.write(("%d\n" % dim).encode("ascii"))

        if dim == 1:
            f.write(("%d\n" % data.shape[0]).encode("ascii"))
        else:
            f.write(("%d\n" % data.shape[1]).encode("ascii"))
            f.write(("%d\n" % data.shape[0]).encode("ascii"))
            for i in range(2, dim):
                f.write(("%d\n" % data.shape[i]).encode("ascii"))

        data = data.astype(np.float32)
        if dim == 3:
            np.transpose(data, (2, 0, 1)).tofile(f)
        else:
            # 1-D and 2-D data are stored as is; the 3-axis transpose above
            # would raise for them.
            data.tofile(f)


def check_dim_and_resize(tensor_list):
    """Make all tensors share the spatial size (dims 2 onward) of the first one.

    If the sizes already agree, ``tensor_list`` itself is returned.
    Otherwise a message is printed and a new list is returned in which every
    tensor has been bilinearly interpolated to the first tensor's size.
    """
    spatial_shapes = [t.shape[2:] for t in tensor_list]
    if len(set(spatial_shapes)) <= 1:
        return tensor_list

    desired_shape = spatial_shapes[0]
    print(f"Inconsistent size of input video frames. All frames will be resized to {desired_shape}")
    target_size = tuple(desired_shape)
    return [torch.nn.functional.interpolate(t, size=target_size, mode="bilinear") for t in tensor_list]


================================================
FILE: Open-Sora/tools/scene_cut/README.md
================================================
# Scene Detection and Video Splitting

- [Scene Detection and Video Splitting](#scene-detection-and-video-splitting)
    - [Prepare Meta Files](#prepare-meta-files)
    - [Scene Detection](#scene-detection)
    - [Video Splitting](#video-splitting)

In many cases, raw videos contain several scenes and are too long for training. Thus, it is essential to split them into shorter
clips based on scenes. Here, we provide code for scene detection and video splitting.

## Prepare Meta Files
At this step, you should have a raw video dataset prepared. A meta file of the dataset information is needed for data processing. To create a meta file from a folder, run:

```bash
python -m tools.datasets.convert video /path/to/video/folder --output /path/to/save/meta.csv
```
This should output a `.csv` file with column `path`.

If you already have a meta file for the videos and want to keep the information.
**Make sure** the meta file has column `id`, which is the id for each video, and the video is named as `{id}.mp4`.
The following command will add a new column `path` to the meta file.

```bash
python tools/scene_cut/convert_id_to_path.py /path/to/meta.csv --folder_path /path/to/video/folder
```
This should output
- `{prefix}_path-filtered.csv` with column `path` (broken videos filtered)
- `{prefix}_path_intact.csv` with column `path` and `intact` (`intact` indicating a video is intact or not)


## Scene Detection

Install the required dependancies by following our [installation instructions](../../docs/installation.md)'s "Data Dependencies" and "Scene Detection" sections.

<!-- The next step is to detect scenes in a video.
We use [`PySceneDetect`](https://github.com/Breakthrough/PySceneDetect) for this job.
```bash
pip install scenedetect[opencv] --upgrade
``` -->

**Make sure** the input meta file has column `path`, which is the path of a video.

```bash
python tools/scene_cut/scene_detect.py /path/to/meta.csv
```
The output is `{prefix}_timestamp.csv` with column `timestamp`. Each cell in column `timestamp` is a list of tuples,
with each tuple indicating the start and end timestamp of a scene
(e.g., `[('00:00:01.234', '00:00:02.345'), ('00:00:03.456', '00:00:04.567')]`).

## Video Splitting
After obtaining timestamps for scenes, we conduct video splitting (cutting).
**Make sure** the meta file contains column `timestamp`.

```bash
python tools/scene_cut/cut.py /path/to/meta.csv --save_dir /path/to/output/dir
```

This will save video clips to `/path/to/output/dir`. The video clips are named as `{video_id}_scene-{scene_id}.mp4`

To create a new meta file for the generated clips, run:
```bash
python -m tools.datasets.convert video /path/to/video/folder --output /path/to/save/meta.csv
```


================================================
FILE: Open-Sora/tools/scene_cut/__init__.py
================================================


================================================
FILE: Open-Sora/tools/scene_cut/convert_id_to_path.py
================================================
import argparse
import json
import os
from functools import partial

import cv2
import numpy as np
import pandas as pd
from mmengine.logging import print_log
from moviepy.editor import VideoFileClip
from pandarallel import pandarallel
from tqdm import tqdm

# Register tqdm's pandas integration (adds the `progress_apply` family to pandas objects).
tqdm.pandas()


def is_intact_video(video_path, mode="moviepy", verbose=False, logger=None):
    """Check whether a video file exists and can be opened by the chosen backend.

    Args:
        video_path (str): path of the video file to probe.
        mode (str): backend used to open the file, ``"moviepy"`` or ``"cv2"``.
        verbose (bool): if True, report the outcome through ``print_log``.
        logger: forwarded to ``print_log`` as its ``logger`` argument.

    Returns:
        bool: True if the file exists and the backend opened it, False otherwise.

    Raises:
        ValueError: if ``mode`` is not a supported backend. As before, this is
            only checked once the file is known to exist.
    """
    if not os.path.exists(video_path):
        if verbose:
            print_log(f"Could not find '{video_path}'", logger=logger)
        return False

    if mode not in ("moviepy", "cv2"):
        raise ValueError(f"Unsupported mode '{mode}'; expected 'moviepy' or 'cv2'.")

    try:
        if mode == "moviepy":
            # Building the clip raises when the file cannot be read. Close it
            # right away so the underlying reader is not leaked.
            VideoFileClip(video_path).close()
            intact = True
        else:
            # An unreadable file yields an unopened capture rather than an
            # exception, so the result of isOpened() must be returned explicitly
            # (the previous code fell through and returned None in that case).
            cap = cv2.VideoCapture(video_path)
            try:
                intact = bool(cap.isOpened())
            finally:
                cap.release()
    except Exception as e:
        if verbose:
            print_log(f"Error: {e}", logger=logger)
        intact = False

    if verbose:
        if intact:
            print_log(f"The video file '{video_path}' is intact.", logger=logger)
        else:
            print_log(f"The video file '{video_path}' is not intact.", logger=logger)
    return intact


def has_downloaded_success(json_path):
    """Tell whether a download-status JSON file reports success.

    Returns True only when ``json_path`` exists, parses as JSON and carries a
    ``"success"`` entry that is exactly the boolean ``True``. A missing file,
    unreadable or invalid JSON, or a missing / non-boolean / false flag all
    yield False.
    """
    if not os.path.exists(json_path):
        return False

    try:
        with open(json_path, "r") as fp:
            status = json.load(fp)
        # `is True` rejects truthy non-booleans such as 1 or "yes".
        succeeded = "success" in status and status["success"] is True
    except Exception:
        return False

    return succeeded


def parse_args():
    """Parse the command line of the id-to-path conversion script.

    Returns:
        argparse.Namespace: with ``meta_path``, ``folder_path``, ``mode`` and
        ``num_workers``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("meta_path", type=str)

    optional_flags = (
        ("--folder_path", dict(type=str, required=True)),
        ("--mode", dict(type=str, default=None)),
        ("--num_workers", dict(type=int, default=None, help="#workers for pandarallel")),
    )
    for flag, options in optional_flags:
        parser.add_argument(flag, **options)

    return parser.parse_args()


def main():
    """Add a ``path`` column to a meta CSV and write intact/filtered variants.

    For every row, the video path is built as ``<folder_path>/<id>.mp4``.
    Depending on ``--mode`` the row is additionally checked:

    * ``".mp4"``: the video must open via ``is_intact_video``;
    * ``".json"``: ``<folder_path>/<id>.json`` must report a successful download;
    * not given: every row is considered intact.

    Two files are written next to the input meta file:
    ``{prefix}_path_intact.csv`` (all rows, with ``path`` and ``intact``) and
    ``{prefix}_path-filtered.csv`` (intact rows only, without ``intact``).

    Raises:
        ValueError: if ``--mode`` is not one of the supported values.
    """
    args = parse_args()

    meta_path = args.meta_path
    folder_path = args.folder_path
    mode = args.mode

    # Fail fast on a bad mode instead of raising inside every worker.
    if mode not in (None, ".mp4", ".json"):
        raise ValueError(f"Unsupported mode '{mode}'; expected '.mp4', '.json' or nothing.")

    def is_intact(row, mode=None):
        """Return ``(intact, video_path)`` for one meta row."""
        video_id = row["id"]
        video_path = os.path.join(folder_path, f"{video_id}.mp4")

        if mode == ".mp4":
            return bool(is_intact_video(video_path)), video_path
        if mode == ".json":
            json_path = os.path.join(folder_path, f"{video_id}.json")
            return bool(has_downloaded_success(json_path)), video_path
        if mode is None:
            return True, video_path
        raise ValueError

    meta_dirpath = os.path.dirname(meta_path)
    meta_fname = os.path.basename(meta_path)
    wo_ext = os.path.splitext(meta_fname)[0]

    if args.num_workers is not None:
        pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers)
    else:
        pandarallel.initialize(progress_bar=True)
    is_intact_partial = partial(is_intact, mode=mode)

    meta = pd.read_csv(meta_path)
    ret = meta.parallel_apply(is_intact_partial, axis=1)
    intact, paths = list(zip(*ret))

    meta["intact"] = intact
    meta["path"] = paths
    out_path = os.path.join(meta_dirpath, f"{wo_ext}_path_intact.csv")
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) with intact info saved to '{out_path}'")

    # Build a new frame instead of dropping in place on a filtered slice, which
    # can trigger pandas' SettingWithCopyWarning.
    meta_format = meta[np.array(intact)].drop(columns="intact")
    out_path = os.path.join(meta_dirpath, f"{wo_ext}_path-filtered.csv")
    meta_format.to_csv(out_path, index=False)
    print(f"New meta (shape={meta_format.shape}) with format info saved to '{out_path}'")


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/tools/scene_cut/cut.py
================================================
import cv2  # isort:skip

import argparse
import os
import subprocess
from functools import partial

import pandas as pd
from imageio_ffmpeg import get_ffmpeg_exe
from pandarallel import pandarallel
from scenedetect import FrameTimecode
from tqdm import tqdm

# Register tqdm's pandas integration (adds the `progress_apply` family to pandas objects).
tqdm.pandas()


def print_log(s, logger=None):
    """Emit ``s`` through ``logger.info`` when a logger is given, else ``print`` it."""
    if logger is None:
        print(s)
        return
    logger.info(s)


def process_single_row(row, args):
    """Validate one meta row's timestamps and, unless only validating, cut its clips.

    Args:
        row: a meta row providing ``path`` and optionally ``timestamp`` (string
            form of a list of ``(start, end)`` timecode pairs), ``relpath``,
            ``height`` and ``width``.
        args: parsed CLI arguments (``save_dir``, ``min_seconds``,
            ``max_seconds``, ``target_fps``, ``shorter_size``,
            ``drop_invalid_timestamps``).

    Returns:
        bool: False when the ``timestamp`` cell is missing its brackets or cannot
        be parsed; True otherwise. With ``args.drop_invalid_timestamps`` set the
        function only validates and never calls ``split_video``.
    """
    # Local import: parse the cell as a Python literal instead of executing it.
    import ast

    video_path = row["path"]

    logger = None

    try:
        if "timestamp" in row:
            timestamp = row["timestamp"]
            if not (timestamp.startswith("[") and timestamp.endswith("]")):
                return False
            # literal_eval only accepts literals, so CSV content can never run code.
            scene_list = ast.literal_eval(timestamp)
            scene_list = [(FrameTimecode(s, fps=100), FrameTimecode(t, fps=100)) for s, t in scene_list]
        else:
            scene_list = [None]
    except Exception:
        # Previously, without --drop_invalid_timestamps, execution continued with
        # an undefined or half-parsed scene_list and crashed the whole run.
        return False

    if args.drop_invalid_timestamps:
        return True

    if "relpath" in row:
        save_dir = os.path.dirname(os.path.join(args.save_dir, row["relpath"]))
        os.makedirs(save_dir, exist_ok=True)
    else:
        save_dir = args.save_dir

    # Never upscale: drop the resize when the video is already small enough.
    shorter_size = args.shorter_size
    if (shorter_size is not None) and ("height" in row) and ("width" in row):
        min_size = min(row["height"], row["width"])
        if min_size <= shorter_size:
            shorter_size = None

    split_video(
        video_path,
        scene_list,
        save_dir=save_dir,
        min_seconds=args.min_seconds,
        max_seconds=args.max_seconds,
        target_fps=args.target_fps,
        shorter_size=shorter_size,
        logger=logger,
    )
    return True

def split_video(
    video_path,
    scene_list,
    save_dir,
    min_seconds=2,
    max_seconds=15,
    target_fps=30,
    shorter_size=None,
    verbose=False,
    logger=None,
):
    """
    Cut a video into one clip per scene with ffmpeg.

    scenes shorter than min_seconds will be ignored;
    scenes longer than max_seconds will be cut to save the beginning max_seconds.
    Currently, the saved file name pattern is f'{fname}_scene-{idx}'.mp4
    Clips whose output file already exists are skipped.

    Args:
        video_path (str): source video.
        scene_list (List[Tuple[FrameTimecode, FrameTimecode] | None]): each element is (s, t): start and end
            of a scene; a ``None`` element means "process the whole video".
        save_dir (str): directory receiving the clips.
        min_seconds (float | None)
        max_seconds (float | None)
        target_fps (int | None)
        shorter_size (int | None)
        verbose (bool): log every saved clip.
        logger: forwarded to ``print_log``.

    Returns:
        List[str]: paths of the clips written by this call.
    """
    FFMPEG_PATH = get_ffmpeg_exe()

    # The output name depends only on the source file, so compute it once.
    fname_wo_ext = os.path.splitext(os.path.basename(video_path))[0]

    save_path_list = []
    for idx, scene in enumerate(scene_list):
        if scene is not None:
            s, t = scene  # FrameTimecode
            duration = t - s
            if min_seconds is not None and duration.get_seconds() < min_seconds:
                continue

            if max_seconds is not None:
                # FrameTimecode reads an int as a frame count and a float as
                # seconds; cast so an int max_seconds is not taken for frames.
                max_duration = FrameTimecode(float(max_seconds), fps=s.framerate)
                duration = min(max_duration, duration)

        # save path
        # TODO: fname pattern
        save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4")
        if os.path.exists(save_path):
            continue

        # ffmpeg cmd
        cmd = [FFMPEG_PATH]

        # clip to cut
        # Note: -ss after -i is very slow; put -ss before -i !!!
        if scene is None:
            cmd += ["-nostdin", "-y", "-i", video_path]
        else:
            cmd += ["-nostdin", "-y", "-ss", str(s.get_seconds()), "-i", video_path, "-t", str(duration.get_seconds())]

        # target fps
        if target_fps is not None:
            cmd += ["-r", f"{target_fps}"]

        # aspect ratio: scale the shorter side to shorter_size, keep the ratio
        if shorter_size is not None:
            cmd += ["-vf", f"scale='if(gt(iw,ih),-2,{shorter_size})':'if(gt(iw,ih),{shorter_size},-2)'"]

        cmd += ["-map", "0:v", save_path]
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if proc.returncode != 0:
            # Do not report a clip that ffmpeg failed to produce.
            print_log(f"ffmpeg exited with code {proc.returncode} for '{save_path}'", logger=logger)
            continue

        # Record the clip that was written (previously the source path was appended).
        save_path_list.append(save_path)
        if verbose:
            print_log(f"Video clip saved to '{save_path}'", logger=logger)

    return save_path_list


def parse_args():
    """Parse the command line of the video cutting script.

    Returns:
        argparse.Namespace: with ``meta_path``, ``save_dir``, ``min_seconds``,
        ``max_seconds``, ``target_fps``, ``shorter_size``, ``num_workers``,
        ``disable_parallel`` and ``drop_invalid_timestamps``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("meta_path", type=str)
    parser.add_argument("--save_dir", type=str)

    # Optional numeric limits; all default to None (= not applied).
    numeric_options = (
        ("--min_seconds", float, "if not None, clip shorter than min_seconds is ignored"),
        ("--max_seconds", float, "if not None, clip longer than max_seconds is truncated"),
        ("--target_fps", int, "target fps of clips"),
        ("--shorter_size", int, "resize the shorter size by keeping ratio; will not do upscale"),
        ("--num_workers", int, "#workers for pandarallel"),
    )
    for flag, value_type, help_text in numeric_options:
        parser.add_argument(flag, type=value_type, default=None, help=help_text)

    # Boolean switches.
    switches = (
        ("--disable_parallel", "disable parallel processing"),
        ("--drop_invalid_timestamps", "drop rows with invalid timestamps"),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    return parser.parse_args()


def main():
    """Cut the videos listed in a meta CSV, or only validate its timestamps.

    Without ``--drop_invalid_timestamps`` every row is passed to
    ``process_single_row`` to write clips into ``--save_dir``. With the flag,
    rows are only validated and the rows that pass are written to
    ``*correct_timestamp.csv`` next to the input ``*timestamp.csv``.

    Raises:
        ValueError: if ``--drop_invalid_timestamps`` is used on a file not named
            ``*timestamp.csv``, or if ``--save_dir`` is missing when clips have
            to be written.
    """
    args = parse_args()
    meta_path = args.meta_path
    if not os.path.exists(meta_path):
        print(f"Meta file '{meta_path}' not found. Exit.")
        exit()

    # Validate before doing any work. The former post-hoc `assert` is stripped
    # under `python -O`, in which case the output path equalled the input path
    # and the input meta file was overwritten.
    if args.drop_invalid_timestamps and not meta_path.endswith("timestamp.csv"):
        raise ValueError("Only support *timestamp.csv")

    # create save_dir (not needed in validation-only mode)
    if args.save_dir is not None:
        os.makedirs(args.save_dir, exist_ok=True)
    elif not args.drop_invalid_timestamps:
        raise ValueError("--save_dir is required unless --drop_invalid_timestamps is given")

    # initialize pandarallel
    if not args.disable_parallel:
        if args.num_workers is not None:
            pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers)
        else:
            pandarallel.initialize(progress_bar=True)
    process_single_row_partial = partial(process_single_row, args=args)

    # process
    meta = pd.read_csv(meta_path)
    if not args.disable_parallel:
        results = meta.parallel_apply(process_single_row_partial, axis=1)
    else:
        results = meta.apply(process_single_row_partial, axis=1)

    if args.drop_invalid_timestamps:
        meta = meta[results]
        out_path = meta_path.replace("timestamp.csv", "correct_timestamp.csv")
        meta.to_csv(out_path, index=False)
        print(f"Corrected timestamp file saved to '{out_path}'")


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/tools/scene_cut/scene_detect.py
================================================
import argparse
import os

import numpy as np
import pandas as pd
from pandarallel import pandarallel
from scenedetect import AdaptiveDetector, detect
from tqdm import tqdm

# Register tqdm's pandas integration (adds the `progress_apply` family to pandas objects).
tqdm.pandas()


def process_single_row(row):
    """Run scene detection on the video referenced by ``row["path"]``.

    Returns:
        tuple[bool, str]: ``(True, timestamps)`` on success, where ``timestamps``
        is the string form of a list of ``(start, end)`` timecode pairs, or
        ``(False, "")`` when detection raised (the error is printed).
    """
    video_path = row["path"]
    detector = AdaptiveDetector(adaptive_threshold=3.0)

    try:
        scenes = detect(video_path, detector, start_in_scene=True)
        encoded = str([(start.get_timecode(), end.get_timecode()) for start, end in scenes])
    except Exception as e:
        print(f"Video '{video_path}' with error {e}")
        return False, ""

    return True, encoded


def parse_args():
    """Parse the command line of the scene detection script.

    Returns:
        argparse.Namespace: with ``meta_path`` and ``num_workers``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("meta_path", type=str)
    cli.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel")
    return cli.parse_args()


def main():
    """Detect scenes for every video in a meta CSV and save the timestamps.

    Rows whose detection failed are dropped; the result is written to
    ``{prefix}_timestamp{ext}`` next to the input file with a new
    ``timestamp`` column.
    """
    args = parse_args()
    meta_path = args.meta_path
    if not os.path.exists(meta_path):
        print(f"Meta file '{meta_path}' not found. Exit.")
        exit()

    # Only pass nb_workers when the user asked for a specific count.
    init_kwargs = dict(progress_bar=True)
    if args.num_workers is not None:
        init_kwargs["nb_workers"] = args.num_workers
    pandarallel.initialize(**init_kwargs)

    meta = pd.read_csv(meta_path)
    per_row = meta.parallel_apply(process_single_row, axis=1)

    # per_row holds (success, timestamp) pairs; transpose into two sequences.
    succ, timestamps = list(zip(*per_row))
    meta["timestamp"] = timestamps
    meta = meta[np.array(succ)]

    stem, suffix = os.path.splitext(meta_path)
    out_path = f"{stem}_timestamp{suffix}"
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) with timestamp saved to '{out_path}'.")


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/tools/scoring/README.md
================================================
# Scoring and Filtering

- [Scoring and Filtering](#scoring-and-filtering)
  - [Aesthetic Score](#aesthetic-score)
  - [Optical Flow Score](#optical-flow-score)
  - [OCR](#ocr)
  - [Matching Score](#matching-score)
  - [Filtering](#filtering)

## Aesthetic Score

To evaluate the aesthetic quality of videos, we use the scoring model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs.

The aesthetic score is between 1 and 10, where 5.5 can be considered as the threshold for fair aesthetics, and 6.5 for high aesthetics. Good text-to-image models can achieve a score of 7.0 or higher.

For videos, we extract the first, last, and the middle frames for evaluation. The script also supports images as input.
The throughput of our code is ~1K videos/s on a single H800 GPU. It also supports running on multiple GPUs for further acceleration.

First, install the required packages following our [installation instructions](../../docs/installation.md)'s "Data Dependencies".

Next, download the scoring model to `./pretrained_models/aesthetic.pth`.

```bash
wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
```

<!-- First, install the required packages and download the scoring model to `./pretrained_models/aesthetic.pth`.
```bash
# pip install
pip install git+https://github.com/openai/CLIP.git
pip install decord

# get pretrained model
wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
``` -->

Then, run the following command. **Make sure** the meta file has column `path` (path to the sample).
```bash
torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference /path/to/meta.csv --bs 1024 --num_workers 16
```
This will generate multiple part files, each corresponding to a node. Run `python -m tools.datasets.datautil /path/to/meta_aes_part*.csv --output /path/to/meta_aes.csv` to merge them.

## Optical Flow Score

Optical flow scores are used to assess the motion of a video. Higher optical flow scores indicate larger movement.
We use the [UniMatch](https://github.com/autonomousvision/unimatch) model for this task.

First, install the required packages following our [installation instructions](../../docs/installation.md)'s "Data Dependencies".

Next, download the pretrained model to `./pretrained_models/unimatch/`
```bash
wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P ./pretrained_models/unimatch/
```

Then, run the following command. **Make sure** the meta file has column `path` (path to the sample).
```bash
torchrun --standalone --nproc_per_node 8 tools/scoring/optical_flow/inference.py /path/to/meta.csv
```

This should output `/path/to/meta_flow.csv` with column `flow`.

## OCR
Some videos are of dense text scenes, such as news broadcasts and advertisements, which are not desired for training.
We apply Optical Character Recognition (OCR) to detect texts and drop samples with dense texts. Here, we use
the [DBNet++](https://arxiv.org/abs/2202.10304) model implemented by [MMOCR](https://github.com/open-mmlab/mmocr/).

First, install the required packages following our [installation instructions](../../docs/installation.md)'s "Data Dependencies" and "OCR" sections.

<!-- First, install [MMOCR](https://mmocr.readthedocs.io/en/dev-1.x/get_started/install.html).
For reference, we install packages of these versions.
```
torch==2.0.1
mmcv==2.0.1
mmdet==3.1.0
mmocr==1.0.1
``` -->

Then, run the following command. **Make sure** the meta file has column `path` (path to the sample).
<!-- ```bash
torchrun --standalone --nproc_per_node 8 tools/scoring/ocr/inference.py /path/to/meta.csv
``` -->
```bash
torchrun --standalone --nproc_per_node 8 -m tools.scoring.ocr.inference /path/to/meta.csv
```
This should output `/path/to/meta_ocr.csv` with column `ocr`, indicating the number of text regions with detection confidence > 0.3.


## Matching Score

Matching scores are calculated to evaluate the alignment between an image/video and its caption.
Here, we use the [CLIP](https://github.com/openai/CLIP) model, which is trained on image-text pairs.
We simply use the cosine similarity as the matching score.
For videos, we extract the middle frame and compare it with the caption.

First, install OpenAI CLIP.
```bash
pip install git+https://github.com/openai/CLIP.git
```

Then, run the following command. **Make sure** the meta file has column `path` (path to the sample) and `text` (caption of the sample).

```bash
torchrun --standalone --nproc_per_node 8 tools/scoring/matching/inference.py /path/to/meta.csv
```

This should output `/path/to/meta_match.csv` with column `match`. Higher matching scores indicate better image-text/video-text alignment.


## Filtering
Once scores are obtained, it is simple to filter samples based on these scores. Here is an example to remove
samples of aesthetic score < 5.0.
```
python -m tools.datasets.datautil /path/to/meta.csv --aesmin 5.0
```
This should output `/path/to/meta_aesmin5.0.csv` with column `aes` >= 5.0


================================================
FILE: Open-Sora/tools/scoring/__init__.py
================================================


================================================
FILE: Open-Sora/tools/scoring/aesthetic/__init__.py
================================================


================================================
FILE: Open-Sora/tools/scoring/aesthetic/inference.py
================================================
# adapted from https://github.com/christophschuhmann/improved-aesthetic-predictor/blob/main/simple_inference.py
import cv2  # isort:skip

import argparse
import gc
import os
from datetime import timedelta

import clip
import numpy as np
import pandas as pd
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch.utils.data import DataLoader, DistributedSampler
from torchvision.datasets.folder import pil_loader
from tqdm import tqdm

from tools.datasets.utils import extract_frames, is_video

# Maps the requested number of frames to the sampling positions that
# VideoTextDataset passes to `extract_frames(points=...)`.
# NOTE(review): the values look like fractions of the video length — confirm
# against `tools.datasets.utils.extract_frames`.
NUM_FRAMES_POINTS = {
    1: (0.5,),
    2: (0.25, 0.5),
    3: (0.1, 0.5, 0.9),
}


def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
    """Write gathered per-rank scores into ``meta[column]`` and return the scored rows.

    Args:
        gathered_list: one ``(indices, scores)`` pair per rank.
        meta: meta table, updated in place at the scored indices.
        column: name of the column receiving the scores.

    Returns:
        pd.DataFrame: the rows of ``meta`` that received a score, ordered by index.
    """
    per_rank_indices = [entry[0] for entry in gathered_list]
    per_rank_scores = [entry[1] for entry in gathered_list]

    # Interleave the ranks round-robin (rank0[0], rank1[0], ..., rank0[1], ...).
    flat_indices = np.array([idx for group in zip(*per_rank_indices) for idx in group])
    flat_scores = np.array([score for group in zip(*per_rank_scores) for score in group])

    # Keep the first occurrence of each index to drop duplicates.
    unique_indices, first_positions = np.unique(flat_indices, return_index=True)
    meta.loc[unique_indices, column] = flat_scores[first_positions]

    # Only rows that actually received a score are returned.
    return meta.loc[unique_indices]


class VideoTextDataset(torch.utils.data.Dataset):
    """Dataset yielding transformed frames for every sample of a meta CSV.

    Each item is a dict with the sample ``index`` and a stacked ``images``
    tensor: the frames picked at ``self.points`` for videos, or the single
    image otherwise.
    """

    def __init__(self, meta_path, transform=None, num_frames=3):
        self.meta_path = meta_path
        self.meta = pd.read_csv(meta_path)
        self.transform = transform
        # Sampling positions for the requested number of frames.
        self.points = NUM_FRAMES_POINTS[num_frames]

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, index):
        sample = self.meta.iloc[index]
        path = sample["path"]

        if is_video(path):
            # Forward the frame count when the meta file provides it.
            num_frames = sample["num_frames"] if "num_frames" in sample else None
            frames = extract_frames(path, points=self.points, backend="opencv", num_frames=num_frames)
        else:
            frames = [pil_loader(path)]

        images = torch.stack([self.transform(frame) for frame in frames])
        return dict(index=index, images=images)


class MLP(nn.Module):
    """Stack of linear layers with dropout mapping ``input_size`` features to one score.

    The layer order inside ``self.layers`` is fixed because checkpoint keys
    (``layers.<i>.weight`` / ``layers.<i>.bias``) depend on it.
    """

    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size
        stack = [
            nn.Linear(self.input_size, 1024),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.Dropout(0.1),
            nn.Linear(64, 16),
            nn.Linear(16, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        scores = self.layers(x)
        return scores


class AestheticScorer(nn.Module):
    """CLIP ViT-L/14 image encoder followed by the ``MLP`` scoring head.

    ``self.preprocess`` is the image transform returned by ``clip.load`` and is
    meant to be applied to inputs before calling the model.
    """

    def __init__(self, input_size, device):
        super().__init__()
        self.mlp = MLP(input_size)
        clip_model, clip_preprocess = clip.load("ViT-L/14", device=device)
        self.clip = clip_model
        self.preprocess = clip_preprocess

        self.eval()
        self.to(device)

    def forward(self, x):
        # L2-normalise the CLIP embedding and cast to float32 before scoring.
        embedding = self.clip.encode_image(x)
        embedding = F.normalize(embedding, p=2, dim=-1).float()
        return self.mlp(embedding)


def parse_args():
    """Parse the command line of the aesthetic scoring script.

    Returns:
        argparse.Namespace: with ``meta_path``, ``bs``, ``num_workers``,
        ``prefetch_factor``, ``num_frames`` and ``skip_if_existing``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("meta_path", type=str, help="Path to the input CSV file")

    integer_options = (
        ("--bs", 1024, "Batch size"),
        ("--num_workers", 16, "Number of workers"),
        ("--prefetch_factor", 3, "Prefetch factor"),
        ("--num_frames", 3, "Number of frames to extract"),
    )
    for flag, default, help_text in integer_options:
        parser.add_argument(flag, type=int, default=default, help=help_text)

    parser.add_argument("--skip_if_existing", action="store_true")
    return parser.parse_args()


def main():
    """Score every sample of a meta CSV with the aesthetic predictor.

    Runs under ``torch.distributed`` (NCCL). Each rank scores its shard, writes
    its own part file under ``<dir of meta>/parts/``, and rank 0 finally writes
    the merged ``{prefix}_aes{ext}`` file with the new ``aes`` column. For
    multi-frame samples the score is the mean over the frames.
    """
    args = parse_args()

    meta_path = args.meta_path
    if not os.path.exists(meta_path):
        print(f"Meta file '{meta_path}' not found. Exit.")
        exit()

    wo_ext, ext = os.path.splitext(meta_path)
    out_path = f"{wo_ext}_aes{ext}"
    if args.skip_if_existing and os.path.exists(out_path):
        print(f"Output meta file '{out_path}' already exists. Exit.")
        exit()

    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    # build model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AestheticScorer(768, device)
    model.mlp.load_state_dict(torch.load("pretrained_models/aesthetic.pth", map_location=device))
    preprocess = model.preprocess

    # build dataset
    dataset = VideoTextDataset(args.meta_path, transform=preprocess, num_frames=args.num_frames)

    # Honour --prefetch_factor (it used to be parsed but ignored). DataLoader
    # only accepts the option when worker processes are used.
    loader_kwargs = {}
    if args.num_workers > 0:
        loader_kwargs["prefetch_factor"] = args.prefetch_factor
    dataloader = DataLoader(
        dataset,
        batch_size=args.bs,
        num_workers=args.num_workers,
        sampler=DistributedSampler(
            dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=False,
            drop_last=False,
        ),
        **loader_kwargs,
    )

    # compute aesthetic scores
    indices_list = []
    scores_list = []
    model.eval()
    for batch in tqdm(dataloader, disable=dist.get_rank() != 0):
        indices = batch["index"]
        images = batch["images"].to(device, non_blocking=True)

        # Fold the per-sample frame axis into the batch axis for the model.
        B = images.shape[0]
        images = rearrange(images, "B N C H W -> (B N) C H W")

        # compute score
        with torch.no_grad():
            scores = model(images)

        # Average the frame scores of each sample.
        scores = rearrange(scores, "(B N) 1 -> B N", B=B)
        scores = scores.mean(dim=1)
        scores_np = scores.to(torch.float32).cpu().numpy()

        indices_list.extend(indices.tolist())
        scores_list.extend(scores_np.tolist())

    # save local results
    meta_local = merge_scores([(indices_list, scores_list)], dataset.meta, column="aes")
    save_dir_local = os.path.join(os.path.dirname(out_path), "parts")
    os.makedirs(save_dir_local, exist_ok=True)
    # Build the part name from the real extension so ranks never collide on a
    # shared file name when the meta file is not named "*.csv".
    part_stem, part_ext = os.path.splitext(os.path.basename(out_path))
    out_path_local = os.path.join(save_dir_local, f"{part_stem}_part_{dist.get_rank()}{part_ext}")
    meta_local.to_csv(out_path_local, index=False)

    # wait for all ranks to finish data processing
    dist.barrier()

    torch.cuda.empty_cache()
    gc.collect()
    gathered_list = [None] * dist.get_world_size()
    dist.all_gather_object(gathered_list, (indices_list, scores_list))
    if dist.get_rank() == 0:
        meta_new = merge_scores(gathered_list, dataset.meta, column="aes")
        meta_new.to_csv(out_path, index=False)
        print(f"New meta with aesthetic scores saved to '{out_path}'.")


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/tools/scoring/matching/__init__.py
================================================


================================================
FILE: Open-Sora/tools/scoring/matching/inference.py
================================================
import argparse
import os

import clip
import colossalai
import numpy as np
import pandas as pd
import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch.utils.data import DataLoader, DistributedSampler
from torchvision.datasets.folder import pil_loader
from tqdm import tqdm

from tools.datasets.utils import extract_frames, is_video


def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
    """Write gathered per-rank scores into ``meta[column]`` in place.

    Args:
        gathered_list: one ``(indices, scores)`` pair per rank.
        meta: meta table, updated in place at the scored indices.
        column: name of the column receiving the scores.

    Returns:
        pd.DataFrame: the same ``meta`` object, including rows without a score.
    """
    per_rank_indices = [entry[0] for entry in gathered_list]
    per_rank_scores = [entry[1] for entry in gathered_list]

    # Interleave the ranks round-robin (rank0[0], rank1[0], ..., rank0[1], ...).
    flat_indices = np.array([idx for group in zip(*per_rank_indices) for idx in group])
    flat_scores = np.array([score for group in zip(*per_rank_scores) for score in group])

    # Keep the first occurrence of each index to drop duplicates.
    unique_indices, first_positions = np.unique(flat_indices, return_index=True)
    meta.loc[unique_indices, column] = flat_scores[first_positions]
    return meta


class VideoTextDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(image, tokenized text, index)`` for each meta row.

    For videos the frame at relative position 0.5 is used; other paths are
    loaded as images. The caption comes from the ``text`` column.
    """

    def __init__(self, meta_path, transform):
        self.meta_path = meta_path
        self.meta = pd.read_csv(meta_path)
        self.transform = transform

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, index):
        row = self.meta.iloc[index]
        path = row["path"]

        frame = extract_frames(path, points=[0.5], backend="opencv")[0] if is_video(path) else pil_loader(path)
        img = self.transform(frame)

        # truncate=True lets captions longer than CLIP's context length through.
        tokens = clip.tokenize(row["text"], truncate=True).squeeze()

        return img, tokens, index


def parse_args():
    """Parse the command line of the matching-score script.

    Returns:
        argparse.Namespace: with ``meta_path``, ``bs``, ``num_workers`` and
        ``skip_if_existing``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("meta_path", type=str, help="Path to the input CSV file")
    for flag, default, help_text in (
        ("--bs", 16, "Batch size"),
        ("--num_workers", 16, "Number of workers"),
    ):
        cli.add_argument(flag, type=int, default=default, help=help_text)
    cli.add_argument("--skip_if_existing", action="store_true")
    return cli.parse_args()


def main():
    """Compute CLIP image-text matching scores for every row of a meta CSV.

    Runs distributed (launched through ``colossalai.launch_from_torch``). Each
    rank scores its shard; rank 0 merges the gathered results and writes
    ``{prefix}_match{ext}`` with the new ``match`` column. The score is the
    cosine similarity of the normalised CLIP features multiplied by the
    model's logit scale.
    """
    args = parse_args()

    meta_path = args.meta_path
    if not os.path.exists(meta_path):
        print(f"Meta file '{meta_path}' not found. Exit.")
        exit()

    wo_ext, ext = os.path.splitext(meta_path)
    out_path = f"{wo_ext}_match{ext}"
    if args.skip_if_existing and os.path.exists(out_path):
        print(f"Output meta file '{out_path}' already exists. Exit.")
        exit()

    colossalai.launch_from_torch({})

    # build model
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model, preprocess = clip.load("ViT-L/14", device=device)
    logit_scale = model.logit_scale.exp().item()

    # build dataset
    dataset = VideoTextDataset(meta_path=meta_path, transform=preprocess)
    dataloader = DataLoader(
        dataset,
        batch_size=args.bs,
        num_workers=args.num_workers,
        sampler=DistributedSampler(
            dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=False,
            drop_last=False,
        ),
    )

    # compute scores
    indices_list = []
    scores_list = []
    model.eval()
    for imgs, text, indices in tqdm(dataloader, disable=dist.get_rank() != 0):
        imgs = imgs.to(device)
        text = text.to(device)

        with torch.no_grad():
            feat_img = model.encode_image(imgs)
            feat_text = model.encode_text(text)

        feat_img = F.normalize(feat_img, dim=1)
        feat_text = F.normalize(feat_text, dim=1)
        clip_scores = logit_scale * (feat_img * feat_text).sum(dim=1)
        clip_scores = clip_scores.cpu().tolist()
        # Store plain ints, not 0-d tensors, so the gathered object stays small
        # and can be used directly as DataFrame index labels when merging.
        indices_list.extend(indices.tolist())
        scores_list.extend(clip_scores)

    gathered_list = [None] * dist.get_world_size()
    dist.all_gather_object(gathered_list, (indices_list, scores_list))
    if dist.get_rank() == 0:
        meta_new = merge_scores(gathered_list, dataset.meta, column="match")
        meta_new.to_csv(out_path, index=False)
        print(f"New meta with matching scores saved to '{out_path}'.")


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/tools/scoring/ocr/__init__.py
================================================


================================================
FILE: Open-Sora/tools/scoring/ocr/dbnetpp.py
================================================
# MMOCR-style configuration of the DBNet++ text detector used by the OCR
# scoring tool: model definition, test-time pipeline and visualizer.
model = dict(
    type="DBNet",
    backbone=dict(
        type="CLIPResNet",
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type="BN", requires_grad=True),
        norm_eval=False,
        style="pytorch",
        dcn=dict(type="DCNv2", deform_groups=1, fallback_on_stride=False),
        # Backbone-only pretrained weights are disabled here; the full-model
        # checkpoint is configured through the top-level `init_cfg` below.
        # init_cfg=dict(
        #     type='Pretrained',
        #     checkpoint='https://download.openmmlab.com/mmocr/backbone/resnet50-oclip-7ba0c533.pth'),
        stage_with_dcn=(False, True, True, True),
    ),
    neck=dict(
        type="FPNC",
        in_channels=[256, 512, 1024, 2048],
        lateral_channels=256,
        asf_cfg=dict(attention_type="ScaleChannelSpatial"),
    ),
    det_head=dict(
        type="DBHead",
        in_channels=256,
        module_loss=dict(type="DBModuleLoss"),
        postprocessor=dict(
            type="DBPostprocessor",
            text_repr_type="quad",
            epsilon_ratio=0.002,
        ),
    ),
    data_preprocessor=dict(
        type="TextDetDataPreprocessor",
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        # NOTE(review): implies inputs arrive in BGR channel order — confirm
        # against the frames produced by the inference script.
        bgr_to_rgb=True,
        pad_size_divisor=32,
    ),
    # Pretrained weights for the whole detector (checkpoint name refers to ICDAR2015).
    init_cfg=dict(
        type="Pretrained",
        checkpoint="https://download.openmmlab.com/mmocr/textdet/dbnetpp/"
        "dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015/"
        "dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015_20221101_124139-4ecb39ac.pth",
    ),
)

# Test-time transforms. The file-loading step is commented out.
# NOTE(review): presumably the caller supplies already-decoded images — confirm.
test_pipeline = [
    # dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
    dict(type="Resize", scale=(4068, 1024), keep_ratio=True),
    dict(
        type="PackTextDetInputs",
        # meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor'),
        meta_keys=("img_shape", "scale_factor"),
    ),
]

# Visualization
vis_backends = [dict(type="LocalVisBackend")]
visualizer = dict(
    type="TextDetLocalVisualizer",
    name="visualizer",
    vis_backends=vis_backends,
)


================================================
FILE: Open-Sora/tools/scoring/ocr/inference.py
================================================
import argparse
import os

import colossalai
import numpy as np
import pandas as pd
import torch
import torch.distributed as dist
from mmengine import Config
from mmengine.dataset import Compose, default_collate
from mmengine.registry import DefaultScope
from mmocr.datasets import PackTextDetInputs
from mmocr.registry import MODELS
from torch.utils.data import DataLoader, DistributedSampler
from torchvision.datasets.folder import pil_loader
from torchvision.transforms import CenterCrop, Compose, Resize
from tqdm import tqdm

from tools.datasets.utils import extract_frames, is_video


def merge_scores(gathered_list: list, meta: pd.DataFrame):
    """Write the OCR scores gathered from every rank into ``meta["ocr"]`` in place.

    Args:
        gathered_list: one entry per rank; ``entry[0]`` is the list of row
            indices that rank processed and ``entry[1]`` the matching scores.
        meta: DataFrame whose ``"ocr"`` column is updated at the gathered indices.

    Results are interleaved round-robin across ranks. When an index occurs more
    than once, the score of its first occurrence in that interleaved order is
    the one kept.
    """
    from itertools import zip_longest

    # Pair each index with its score so both stay aligned while interleaving.
    per_rank_pairs = [list(zip(entry[0], entry[1])) for entry in gathered_list]

    # Round-robin over the ranks. A plain zip() would stop at the shortest rank
    # and silently drop the remaining results of the longer ones; zip_longest
    # pads with None, which is filtered out again (pairs are tuples, never None).
    flat_pairs = []
    for step in zip_longest(*per_rank_pairs):
        flat_pairs.extend(pair for pair in step if pair is not None)

    flat_indices = np.array([pair[0] for pair in flat_pairs])
    flat_scores = np.array([pair[1] for pair in flat_pairs])

    # filter duplicates: keep the first occurrence of every index
    unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True)
    meta.loc[unique_indices, "ocr"] = flat_scores[unique_indices_idx]


class VideoTextDataset(torch.utils.data.Dataset):
    """Dataset over the rows of a meta CSV, yielding packed inputs for text detection.

    Each item is one image: either the file itself or, when ``is_video`` reports
    a video, the frame requested with ``frame_inds=[10]``.
    """

    def __init__(self, meta_path, transform):
        self.meta_path = meta_path
        self.meta = pd.read_csv(meta_path)
        # NOTE: the ``transform`` argument is accepted but not used; a fixed
        # resize + center crop to 1024 is always applied instead.
        self.transform = Compose([Resize(1024), CenterCrop(1024)])
        self.formatting = PackTextDetInputs(meta_keys=["scale_factor"])

    def __getitem__(self, index):
        path = self.meta.iloc[index]["path"]

        if is_video(path):
            frames = extract_frames(path, frame_inds=[10], backend="opencv")
            img = frames[0]
        else:
            img = pil_loader(path)

        # Reverse the channel axis (marked "bgr" by the original author) and
        # copy so the array no longer has a negative stride.
        img_array = np.array(self.transform(img))[:, :, ::-1].copy()

        results = self.formatting(
            {
                "img": img_array,
                "scale_factor": 1.0,
            }
        )
        # Keep the row index so scores can be written back to the meta table.
        results["index"] = index
        return results

    def __len__(self):
        return len(self.meta)


def parse_args():
    """Parse the command-line options of the OCR scoring script.

    Returns:
        argparse.Namespace with ``meta_path``, ``bs``, ``num_workers`` and
        ``skip_if_existing``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("meta_path", type=str, help="Path to the input CSV file")
    cli.add_argument("--bs", type=int, default=16, help="Batch size")
    cli.add_argument("--num_workers", type=int, default=16, help="Number of workers")
    cli.add_argument("--skip_if_existing", action="store_true")
    return cli.parse_args()


def main():
    """Score every row of the meta CSV with the text detector and save the result.

    The number of detections whose score exceeds 0.3 is stored per row in a new
    ``"ocr"`` column. Work is split across ranks with a ``DistributedSampler``;
    rank 0 merges the gathered results and writes ``<meta>_ocr<ext>``.
    """
    args = parse_args()

    meta_path = args.meta_path
    if not os.path.exists(meta_path):
        print(f"Meta file '{meta_path}' not found. Exit.")
        exit()

    wo_ext, ext = os.path.splitext(meta_path)
    out_path = f"{wo_ext}_ocr{ext}"
    if args.skip_if_existing and os.path.exists(out_path):
        print(f"Output meta file '{out_path}' already exists. Exit.")
        exit()

    cfg = Config.fromfile("./tools/scoring/ocr/dbnetpp.py")
    colossalai.launch_from_torch({})

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    DefaultScope.get_instance("ocr", scope_name="mmocr")  # use mmocr Registry as default

    # build model
    model = MODELS.build(cfg.model)
    model.init_weights()
    model.to(device)  # set data_preprocessor._device
    print("==> Model built.")

    # build dataset
    # NOTE(review): VideoTextDataset currently ignores the transform it is
    # given and applies its own resize + crop — confirm this is intended.
    transform = Compose(cfg.test_pipeline)
    dataset = VideoTextDataset(meta_path=meta_path, transform=transform)
    dataloader = DataLoader(
        dataset,
        batch_size=args.bs,
        num_workers=args.num_workers,
        sampler=DistributedSampler(
            dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=False,
            drop_last=False,
        ),
        collate_fn=default_collate,
    )
    print("==> Dataloader built.")

    # compute scores
    dataset.meta["ocr"] = np.nan
    indices_list = []
    scores_list = []
    model.eval()
    # Pure inference: disable autograd so no graph or activations are kept alive.
    with torch.no_grad():
        for data in tqdm(dataloader, disable=dist.get_rank() != 0):
            indices_i = data["index"]
            indices_list.extend(indices_i.tolist())
            # "index" is bookkeeping only and must not reach the model.
            del data["index"]

            pred = model.test_step(data)  # this line will cast data to device

            # score = number of detected text instances above the 0.3 threshold
            num_texts_i = [(x.pred_instances.scores > 0.3).sum().item() for x in pred]
            scores_list.extend(num_texts_i)

    gathered_list = [None] * dist.get_world_size()
    dist.all_gather_object(gathered_list, (indices_list, scores_list))

    if dist.get_rank() == 0:
        merge_scores(gathered_list, dataset.meta)
        dataset.meta.to_csv(out_path, index=False)
        print(f"New meta (shape={dataset.meta.shape}) with ocr results saved to '{out_path}'.")


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/tools/scoring/optical_flow/__init__.py
================================================


================================================
FILE: Open-Sora/tools/scoring/optical_flow/inference.py
================================================
import cv2  # isort:skip

import argparse
import gc
import os
from datetime import timedelta

import numpy as np
import pandas as pd
import torch
import torch.distributed as dist
import torch.nn.functional as F
from einops import rearrange
from torch.utils.data import DataLoader, DistributedSampler
from torchvision.transforms.functional import pil_to_tensor
from tqdm import tqdm

from tools.datasets.utils import extract_frames
from tools.scoring.optical_flow.unimatch import UniMatch

# torch.backends.cudnn.enabled = False # This line enables large batch, but the speed is similar


def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
    """Merge per-rank scores into ``meta[column]`` and return the scored rows.

    Args:
        gathered_list: one entry per rank; ``entry[0]`` is the list of row
            indices that rank processed and ``entry[1]`` the matching scores.
        meta: DataFrame updated in place at the gathered indices.
        column: name of the column that receives the scores.

    Returns:
        ``meta`` restricted to the rows that received a score.

    Results are interleaved round-robin across ranks. When an index occurs more
    than once, the score of its first occurrence in that interleaved order is
    the one kept.
    """
    from itertools import zip_longest

    # Pair each index with its score so both stay aligned while interleaving.
    per_rank_pairs = [list(zip(entry[0], entry[1])) for entry in gathered_list]

    # Round-robin over the ranks. A plain zip() would stop at the shortest rank
    # and silently drop the remaining results of the longer ones; zip_longest
    # pads with None, which is filtered out again (pairs are tuples, never None).
    flat_pairs = []
    for step in zip_longest(*per_rank_pairs):
        flat_pairs.extend(pair for pair in step if pair is not None)

    flat_indices = np.array([pair[0] for pair in flat_pairs])
    flat_scores = np.array([pair[1] for pair in flat_pairs])

    # filter duplicates: keep the first occurrence of every index
    unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True)
    meta.loc[unique_indices, column] = flat_scores[unique_indices_idx]

    # drop indices in meta not in unique_indices
    meta = meta.loc[unique_indices]
    return meta


class VideoTextDataset(torch.utils.data.Dataset):
    """Dataset over the rows of a meta CSV, yielding a few frames per video.

    Args:
        meta_path: CSV readable by ``pandas.read_csv``; must have a ``path`` column.
        frame_inds: frame indices handed to ``extract_frames``. ``None``
            (the default) selects ``[0, 10, 20, 30]``.
    """

    def __init__(self, meta_path, frame_inds=None):
        self.meta_path = meta_path
        self.meta = pd.read_csv(meta_path)
        # A fresh list per instance: a list literal in the signature would be
        # shared (and mutable) across every dataset built with the default.
        self.frame_inds = [0, 10, 20, 30] if frame_inds is None else frame_inds

    def __getitem__(self, index):
        sample = self.meta.iloc[index]
        path = sample["path"]

        # extract frames
        images = extract_frames(path, frame_inds=self.frame_inds, backend="opencv")

        # transform
        images = torch.stack([pil_to_tensor(x) for x in images])

        # stack
        # shape: [N, C, H, W]; dtype: torch.uint8
        images = images.float()
        H, W = images.shape[-2:]
        if H > W:
            # Swap the two spatial axes so the longer side comes last before resizing.
            images = rearrange(images, "N C H W -> N C W H")
        images = F.interpolate(images, size=(320, 576), mode="bilinear", align_corners=True)

        ret = dict(index=index, images=images)
        return ret

    def __len__(self):
        return len(self.meta)


def parse_args():
    """Parse the command-line options of the optical-flow scoring script.

    Returns:
        argparse.Namespace with ``meta_path``, ``bs``, ``num_workers`` and
        ``skip_if_existing``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("meta_path", type=str, help="Path to the input CSV file")
    # don't use too large bs for unimatch
    cli.add_argument("--bs", type=int, default=4, help="Batch size")
    cli.add_argument("--num_workers", type=int, default=16, help="Number of workers")
    cli.add_argument("--skip_if_existing", action="store_true")
    return cli.parse_args()


def main():
    """Compute an optical-flow score for every row of the meta CSV.

    For each video the mean absolute flow between consecutive sampled frames is
    stored in a ``"flow"`` column. Each rank first saves its own partial result
    under ``parts/``; rank 0 then merges the gathered scores and writes
    ``<meta>_flow<ext>``.
    """
    args = parse_args()

    meta_path = args.meta_path
    if not os.path.exists(meta_path):
        print(f"Meta file '{meta_path}' not found. Exit.")
        exit()

    wo_ext, ext = os.path.splitext(meta_path)
    out_path = f"{wo_ext}_flow{ext}"
    if args.skip_if_existing and os.path.exists(out_path):
        print(f"Output meta file '{out_path}' already exists. Exit.")
        exit()

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    # build model
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = UniMatch(
        feature_channels=128,
        num_scales=2,
        upsample_factor=4,
        num_head=1,
        ffn_dim_expansion=4,
        num_transformer_layers=6,
        reg_refine=True,
        task="flow",
    )
    # Load onto the CPU first so the tensors are not restored to whichever
    # device they were saved from; the model is moved to `device` right after.
    ckpt = torch.load(
        "./pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth",
        map_location="cpu",
    )
    model.load_state_dict(ckpt["model"])
    model = model.to(device)

    # build dataset
    dataset = VideoTextDataset(meta_path=meta_path, frame_inds=[0, 10, 20, 30])
    dataloader = DataLoader(
        dataset,
        batch_size=args.bs,
        num_workers=args.num_workers,
        sampler=DistributedSampler(
            dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=False,
            drop_last=False,
        ),
    )

    # compute optical flow scores
    indices_list = []
    scores_list = []
    model.eval()
    for batch in tqdm(dataloader, disable=dist.get_rank() != 0):
        indices = batch["index"]
        images = batch["images"].to(device, non_blocking=True)

        B = images.shape[0]
        # Pair every frame with its successor: frames [0..N-2] vs frames [1..N-1].
        batch_0 = rearrange(images[:, :-1], "B N C H W -> (B N) C H W").contiguous()
        batch_1 = rearrange(images[:, 1:], "B N C H W -> (B N) C H W").contiguous()

        with torch.no_grad():
            res = model(
                batch_0,
                batch_1,
                attn_type="swin",
                attn_splits_list=[2, 8],
                corr_radius_list=[-1, 4],
                prop_radius_list=[-1, 1],
                num_reg_refine=6,
                task="flow",
                pred_bidir_flow=False,
            )
            flow_maps = res["flow_preds"][-1].cpu()  # [B * (N-1), 2, H, W]
            flow_maps = rearrange(flow_maps, "(B N) C H W -> B N H W C", B=B)
            # one score per video: mean absolute flow over pairs, pixels and components
            flow_scores = flow_maps.abs().mean(dim=[1, 2, 3, 4])
            flow_scores = flow_scores.tolist()

        indices_list.extend(indices.tolist())
        scores_list.extend(flow_scores)

    # save local results
    meta_local = merge_scores([(indices_list, scores_list)], dataset.meta, column="flow")
    save_dir_local = os.path.join(os.path.dirname(out_path), "parts")
    os.makedirs(save_dir_local, exist_ok=True)
    # Build the part name from the split stem/extension so every rank gets a
    # distinct file whatever the extension is (a ".csv" text replace would
    # leave the name unchanged for other extensions and make ranks collide).
    part_name = f"{os.path.basename(wo_ext)}_flow_part_{dist.get_rank()}{ext}"
    out_path_local = os.path.join(save_dir_local, part_name)
    meta_local.to_csv(out_path_local, index=False)

    # wait for all ranks to finish data processing
    dist.barrier()

    torch.cuda.empty_cache()
    gc.collect()
    gathered_list = [None] * dist.get_world_size()
    dist.all_gather_object(gathered_list, (indices_list, scores_list))
    if dist.get_rank() == 0:
        meta_new = merge_scores(gathered_list, dataset.meta, column="flow")
        meta_new.to_csv(out_path, index=False)
        print(f"New meta with optical flow scores saved to '{out_path}'.")


if __name__ == "__main__":
    main()


================================================
FILE: Open-Sora/tools/scoring/optical_flow/unimatch/__init__.py
================================================
from .unimatch import UniMatch


================================================
FILE: Open-Sora/tools/scoring/optical_flow/unimatch/attention.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from .utils import merge_splits, merge_splits_1d, split_feature, split_feature_1d


def single_head_full_attention(q, k, v):
    """Single-head scaled dot-product attention over the whole sequence.

    q, k, v are 3-D tensors laid out as [B, L, C]; the logits are scaled by the
    square root of q's channel size and the softmax runs over the key axis.
    """
    assert q.dim() == k.dim() == v.dim() == 3

    scale = q.size(2) ** 0.5
    logits = torch.matmul(q, k.transpose(1, 2)) / scale  # [B, L, L]
    weights = logits.softmax(dim=2)  # [B, L, L]
    return torch.matmul(weights, v)  # [B, L, C]


def single_head_full_attention_1d(q, k, v, h=None, w=None):
    """Single-head attention restricted to each row of an h x w grid.

    q, k, v are [B, H*W, C]. They are viewed as [B, H, W, C] and every position
    attends only to the W positions of its own row. Returns [B, H*W, C].
    """
    # q, k, v: [B, L, C]
    assert h is not None and w is not None
    assert q.size(1) == h * w

    b, _, c = q.size()
    scale_factor = c**0.5

    q_rows = q.view(b, h, w, c)  # [B, H, W, C]
    k_rows = k.view(b, h, w, c)
    v_rows = v.view(b, h, w, c)

    logits = torch.matmul(q_rows, k_rows.transpose(2, 3)) / scale_factor  # [B, H, W, W]
    weights = torch.softmax(logits, dim=-1)

    return torch.matmul(weights, v_rows).view(b, -1, c)  # [B, H*W, C]


def single_head_split_window_attention(
    q,
    k,
    v,
    num_splits=1,
    with_shift=False,
    h=None,
    w=None,
    attn_mask=None,
):
    """Single-head attention computed inside num_splits x num_splits windows.

    q, k, v are [B, H*W, C]. The grid is cut into windows with
    ``split_feature``, attention runs independently per window and the result
    is stitched back with ``merge_splits``. With ``with_shift`` the grid is
    rolled by half a window beforehand (and rolled back afterwards) and
    ``attn_mask`` is added to the logits. Returns [B, H*W, C].
    """
    # ref: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
    # q, k, v: [B, L, C]
    assert q.dim() == k.dim() == v.dim() == 3

    assert h is not None and w is not None
    assert q.size(1) == h * w

    b, _, c = q.size()
    b_new = b * num_splits * num_splits
    win_h = h // num_splits
    win_w = w // num_splits
    scale_factor = c**0.5

    q = q.view(b, h, w, c)  # [B, H, W, C]
    k = k.view(b, h, w, c)
    v = v.view(b, h, w, c)

    if with_shift:
        assert attn_mask is not None  # compute once
        shift_h = win_h // 2
        shift_w = win_w // 2
        rolled = [torch.roll(t, shifts=(-shift_h, -shift_w), dims=(1, 2)) for t in (q, k, v)]
        q, k, v = rolled

    # [B*K*K, H/K, W/K, C] each
    q = split_feature(q, num_splits=num_splits, channel_last=True)
    k = split_feature(k, num_splits=num_splits, channel_last=True)
    v = split_feature(v, num_splits=num_splits, channel_last=True)

    q_flat = q.view(b_new, -1, c)
    k_flat = k.view(b_new, -1, c)
    scores = torch.matmul(q_flat, k_flat.permute(0, 2, 1)) / scale_factor  # [B*K*K, H/K*W/K, H/K*W/K]

    if with_shift:
        scores += attn_mask.repeat(b, 1, 1)

    attn = torch.softmax(scores, dim=-1)
    out = torch.matmul(attn, v.view(b_new, -1, c))  # [B*K*K, H/K*W/K, C]

    out = merge_splits(out.view(b_new, win_h, win_w, c), num_splits=num_splits, channel_last=True)  # [B, H, W, C]

    # shift back
    if with_shift:
        out = torch.roll(out, shifts=(shift_h, shift_w), dims=(1, 2))

    return out.view(b, -1, c)


def single_head_split_window_attention_1d(
    q,
    k,
    v,
    relative_position_bias=None,
    num_splits=1,
    with_shift=False,
    h=None,
    w=None,
    attn_mask=None,
):
    """Single-head attention inside 1-D windows along each row.

    q, k, v are [B, H*W, C]. Every row is cut into ``num_splits`` windows with
    ``split_feature_1d``, attention runs per window and ``merge_splits_1d``
    restores the layout. With ``with_shift`` the rows are rolled by half a
    window first (and rolled back afterwards) and ``attn_mask`` is added to the
    logits. ``relative_position_bias`` is accepted but not used here.
    Returns [B, H*W, C].
    """
    # q, k, v: [B, L, C]
    assert h is not None and w is not None
    assert q.size(1) == h * w

    b, _, c = q.size()
    b_new = b * num_splits * h
    win_w = w // num_splits
    scale_factor = c**0.5

    q = q.view(b * h, w, c)  # [B*H, W, C]
    k = k.view(b * h, w, c)
    v = v.view(b * h, w, c)

    if with_shift:
        assert attn_mask is not None  # compute once
        shift_w = win_w // 2
        q = torch.roll(q, shifts=-shift_w, dims=1)
        k = torch.roll(k, shifts=-shift_w, dims=1)
        v = torch.roll(v, shifts=-shift_w, dims=1)

    # [B*H*K, W/K, C] each
    q = split_feature_1d(q, num_splits=num_splits)
    k = split_feature_1d(k, num_splits=num_splits)
    v = split_feature_1d(v, num_splits=num_splits)

    q_flat = q.view(b_new, -1, c)
    k_flat = k.view(b_new, -1, c)
    scores = torch.matmul(q_flat, k_flat.permute(0, 2, 1)) / scale_factor  # [B*H*K, W/K, W/K]

    if with_shift:
        # attn_mask: [K, W/K, W/K]
        scores += attn_mask.repeat(b * h, 1, 1)  # [B*H*K, W/K, W/K]

    attn = torch.softmax(scores, dim=-1)
    out = torch.matmul(attn, v.view(b_new, -1, c))  # [B*H*K, W/K, C]

    out = merge_splits_1d(out, h, num_splits=num_splits)  # [B, H, W, C]

    # shift back
    if with_shift:
        out = torch.roll(out, shifts=shift_w, dims=2)

    return out.view(b, -1, c)


class SelfAttnPropagation(nn.Module):
    """
    Propagate a flow field with self-attention computed on image features.
    query: feature0, key: feature0, value: flow
    """

    def __init__(self, in_channels, **kwargs):
        super().__init__()

        self.q_proj = nn.Linear(in_channels, in_channels)
        self.k_proj = nn.Linear(in_channels, in_channels)

        # Xavier-initialise the weight matrices; 1-D parameters keep their default init.
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def forward(self, feature0, flow, local_window_attn=False, local_window_radius=1, **kwargs):
        """Return the attention-propagated flow, [B, 2, H, W].

        feature0 is [B, C, H, W] and flow is [B, 2, H, W]. With
        ``local_window_attn`` the attention is limited to a
        (2 * radius + 1)^2 neighbourhood, otherwise it is global.
        """
        # q, k: feature [B, C, H, W], v: flow [B, 2, H, W]
        if local_window_attn:
            return self.forward_local_window_attn(feature0, flow, local_window_radius=local_window_radius)

        b, c, h, w = feature0.size()
        tokens = feature0.view(b, c, h * w).permute(0, 2, 1)  # [B, H*W, C]

        # a note: the ``correct'' implementation should be:
        # ``query = self.q_proj(query), key = self.k_proj(query)''
        # this problem is observed while cleaning up the code
        # however, this doesn't affect the performance since the projection is a linear operation,
        # thus the two projection matrices for key can be merged
        # so I just leave it as is in order to not re-train all models :)
        query = self.q_proj(tokens)  # [B, H*W, C]
        key = self.k_proj(query)  # [B, H*W, C] (deliberately fed the projected query)

        value = flow.view(b, flow.size(1), h * w).permute(0, 2, 1)  # [B, H*W, 2]

        logits = torch.matmul(query, key.permute(0, 2, 1)) / (c**0.5)  # [B, H*W, H*W]
        prob = torch.softmax(logits, dim=-1)

        propagated = torch.matmul(prob, value)  # [B, H*W, 2]
        return propagated.view(b, h, w, value.size(-1)).permute(0, 3, 1, 2)  # [B, 2, H, W]

    def forward_local_window_attn(self, feature0, flow, local_window_radius=1):
        """Propagate ``flow`` with attention restricted to a square local window."""
        assert flow.size(1) == 2 or flow.size(1) == 1  # flow or disparity or depth
        assert local_window_radius > 0

        b, c, h, w = feature0.size()
        value_channel = flow.size(1)
        kernel_size = 2 * local_window_radius + 1
        window_area = kernel_size**2

        tokens = feature0.view(b, c, -1).permute(0, 2, 1)  # [B, H*W, C]

        # One query vector per pixel.
        query = self.q_proj(tokens).reshape(b * h * w, 1, c)  # [B*H*W, 1, C]

        # Keys are projected, put back on the grid, then unfolded into windows.
        key_map = self.k_proj(tokens).permute(0, 2, 1).reshape(b, c, h, w)
        key_window = F.unfold(key_map, kernel_size=kernel_size, padding=local_window_radius)  # [B, C*(2R+1)^2, H*W]
        key_window = (
            key_window.view(b, c, window_area, h, w).permute(0, 3, 4, 1, 2).reshape(b * h * w, c, window_area)
        )  # [B*H*W, C, (2R+1)^2]

        flow_window = F.unfold(flow, kernel_size=kernel_size, padding=local_window_radius)  # [B, 2*(2R+1)^2, H*W]
        flow_window = (
            flow_window.view(b, value_channel, window_area, h, w)
            .permute(0, 3, 4, 2, 1)
            .reshape(b * h * w, window_area, value_channel)
        )  # [B*H*W, (2R+1)^2, 2]

        logits = torch.matmul(query, key_window) / (c**0.5)  # [B*H*W, 1, (2R+1)^2]
        prob = torch.softmax(logits, dim=-1)

        propagated = torch.matmul(prob, flow_window).view(b, h, w, value_channel)
        return propagated.permute(0, 3, 1, 2).contiguous()  # [B, 2, H, W]


================================================
FILE: Open-Sora/tools/scoring/optical_flow/unimatch/backbone.py
================================================
import torch.nn as nn

from .trident_conv import MultiScaleTridentConv


class ResidualBlock(nn.Module):
    """Two 3x3 conv + norm + ReLU layers with a residual shortcut.

    The shortcut is the identity unless the stride or the channel count
    changes, in which case it is a strided 1x1 convolution followed by a norm.
    """

    def __init__(self, in_planes, planes, norm_layer=nn.InstanceNorm2d, stride=1, dilation=1):
        super().__init__()

        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, dilation=dilation, padding=dilation, stride=stride, bias=False
        )
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, dilation=dilation, padding=dilation, bias=False)
        self.relu = nn.ReLU(inplace=True)

        self.norm1 = norm_layer(planes)
        self.norm2 = norm_layer(planes)

        # A projection is needed whenever the shortcut cannot be the identity.
        needs_projection = stride != 1 or in_planes != planes
        if needs_projection:
            # norm3 stays registered as its own attribute as well as inside
            # `downsample`, so the module/state-dict names are unchanged.
            self.norm3 = norm_layer(planes)
            self.downsample = nn.Sequential(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)
        else:
            self.downsample = None

    def forward(self, x):
        residual = self.relu(self.norm1(self.conv1(x)))
        residual = self.relu(self.norm2(self.conv2(residual)))

        shortcut = x if self.downsample is None else self.downsample(x)

        return self.relu(shortcut + residual)


class CNNEncoder(nn.Module):
    """Convolutional feature encoder returning a list of feature maps.

    A 7x7 stem and three residual stages feed a 1x1 projection to
    ``output_dim`` channels. With ``num_output_scales > 1`` the projected map
    is passed through a ``MultiScaleTridentConv`` to obtain one map per scale;
    otherwise a single-element list is returned.
    """

    def __init__(self, output_dim=128, norm_layer=nn.InstanceNorm2d, num_output_scales=1, **kwargs):
        super().__init__()
        self.num_branch = num_output_scales

        feature_dims = [64, 96, 128]

        self.conv1 = nn.Conv2d(3, feature_dims[0], kernel_size=7, stride=2, padding=3, bias=False)  # 1/2
        self.norm1 = norm_layer(feature_dims[0])
        self.relu1 = nn.ReLU(inplace=True)

        # _make_layer reads and then updates self.in_planes, so the stages
        # must be created in order.
        self.in_planes = feature_dims[0]
        self.layer1 = self._make_layer(feature_dims[0], stride=1, norm_layer=norm_layer)  # 1/2
        self.layer2 = self._make_layer(feature_dims[1], stride=2, norm_layer=norm_layer)  # 1/4

        # highest resolution 1/4 or 1/8
        last_stride = 2 if num_output_scales == 1 else 1
        self.layer3 = self._make_layer(feature_dims[2], stride=last_stride, norm_layer=norm_layer)  # 1/4 or 1/8

        self.conv2 = nn.Conv2d(feature_dims[2], output_dim, 1, 1, 0)

        if self.num_branch > 1:
            # Supported branch counts and the stride used by each branch.
            branch_strides = {
                4: (1, 2, 4, 8),
                3: (1, 2, 4),
                2: (1, 2),
            }
            if self.num_branch not in branch_strides:
                raise ValueError
            strides = branch_strides[self.num_branch]

            self.trident_conv = MultiScaleTridentConv(
                output_dim,
                output_dim,
                kernel_size=3,
                strides=strides,
                paddings=1,
                num_branch=self.num_branch,
            )

        # Weight init: Kaiming-normal for convolutions, unit scale / zero shift
        # for norm layers that actually carry affine parameters.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(module, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if module.weight is not None:
                    nn.init.constant_(module.weight, 1)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

    def _make_layer(self, dim, stride=1, dilation=1, norm_layer=nn.InstanceNorm2d):
        """Build a stage of two residual blocks; only the first one applies ``stride``."""
        first = ResidualBlock(self.in_planes, dim, norm_layer=norm_layer, stride=stride, dilation=dilation)
        second = ResidualBlock(dim, dim, norm_layer=norm_layer, stride=1, dilation=dilation)

        self.in_planes = dim
        return nn.Sequential(first, second)

    def forward(self, x):
        x = self.relu1(self.norm1(self.conv1(x)))

        x = self.layer1(x)  # 1/2
        x = self.layer2(x)  # 1/4
        x = self.layer3(x)  # 1/8 or 1/4

        x = self.conv2(x)

        if self.num_branch > 1:
            return self.trident_conv([x] * self.num_branch)  # high to low res
        return [x]


================================================
FILE: Open-Sora/tools/scoring/optical_flow/unimatch/geometry.py
================================================
import torch
import torch.nn.functional as F


def coords_grid(b, h, w, homogeneous=False, device=None):
    """Return a batch of pixel-coordinate grids.

    Args:
        b: batch size (the same grid is repeated ``b`` times).
        h, w: grid height and width.
        homogeneous: when True a third channel of ones is appended.
        device: device to create the grid on; ``None`` uses the default device.

    Returns:
        Float tensor [B, 2, H, W] (or [B, 3, H, W]); channel 0 holds the x
        (column) coordinate and channel 1 the y (row) coordinate.
    """
    # indexing="ij" is the layout the code relied on implicitly; passing it
    # explicitly avoids torch.meshgrid's deprecation warning. Building on
    # `device` directly also avoids a CPU allocation followed by a copy.
    y, x = torch.meshgrid(
        torch.arange(h, device=device),
        torch.arange(w, device=device),
        indexing="ij",
    )  # [H, W]

    stacks = [x, y]

    if homogeneous:
        ones = torch.ones_like(x)  # [H, W]
        stacks.append(ones)

    grid = torch.stack(stacks, dim=0).float()  # [2, H, W] or [3, H, W]

    grid = grid[None].repeat(b, 1, 1, 1)  # [B, 2, H, W] or [B, 3, H, W]

    return grid


def generate_window_grid(h_min, h_max, w_min, w_max, len_h, len_w, device=None):
    """Return a [len_h, len_w, 2] grid of (x, y) offsets.

    x runs linearly from ``w_min`` to ``w_max`` along the second axis and y
    from ``h_min`` to ``h_max`` along the first axis. ``device`` is required.
    """
    assert device is not None

    # Explicit indexing="ij" avoids torch.meshgrid's deprecation warning;
    # ordering the inputs (rows, cols) yields [H, W] maps directly, so no
    # transpose is needed afterwards.
    y, x = torch.meshgrid(
        torch.linspace(h_min, h_max, len_h, device=device),
        torch.linspace(w_min, w_max, len_w, device=device),
        indexing="ij",
    )
    grid = torch.stack((x, y), -1).float()  # [H, W, 2]

    return grid


def normalize_coords(coords, h, w):
    """Map pixel coordinates to [-1, 1], where 0 -> -1 and (size - 1) -> 1.

    ``coords`` is [B, H, W, 2] with (x, y) in the last axis.
    """
    # coords: [B, H, W, 2]
    half_extent = torch.tensor([(w - 1) / 2.0, (h - 1) / 2.0], dtype=torch.float32, device=coords.device)
    centered = coords - half_extent
    return centered / half_extent  # [-1, 1]


def bilinear_sample(img, sample_coords, mode="bilinear", padding_mode="zeros", return_mask=False):
    """Sample ``img`` at pixel-space coordinates with ``F.grid_sample``.

    ``img`` is [B, C, H, W]; ``sample_coords`` is [B, 2, H, W] in pixel units
    (a tensor whose second dimension is not 2 is treated as [B, H, W, 2]).
    With ``return_mask`` a boolean [B, H, W] mask of in-bounds samples is
    returned as well.
    """
    # img: [B, C, H, W]
    # sample_coords: [B, 2, H, W] in image scale
    if sample_coords.size(1) != 2:  # [B, H, W, 2]
        sample_coords = sample_coords.permute(0, 3, 1, 2)

    _, _, h, w = sample_coords.shape
    xs, ys = sample_coords[:, 0], sample_coords[:, 1]

    # Normalize to [-1, 1]
    x_grid = 2 * xs / (w - 1) - 1
    y_grid = 2 * ys / (h - 1) - 1
    grid = torch.stack([x_grid, y_grid], dim=-1)  # [B, H, W, 2]

    sampled = F.grid_sample(img, grid, mode=mode, padding_mode=padding_mode, align_corners=True)

    if not return_mask:
        return sampled

    mask = (x_grid >= -1) & (y_grid >= -1) & (x_grid <= 1) & (y_grid <= 1)  # [B, H, W]
    return sampled, mask


def flow_warp(feature, flow, mask=False, padding_mode="zeros"):
    """Sample ``feature`` at every pixel position displaced by ``flow``.

    ``feature`` is [B, C, H, W] and ``flow`` is [B, 2, H, W]. ``mask`` is
    forwarded to ``bilinear_sample`` as ``return_mask``.
    """
    b, _, h, w = feature.size()
    assert flow.size(1) == 2

    sample_coords = coords_grid(b, h, w).to(flow.device) + flow  # [B, 2, H, W]

    return bilinear_sample(feature, sample_coords, padding_mode=padding_mode, return_mask=mask)


def forward_backward_consistency_check(fwd_flow, bwd_flow, alpha=0.01, beta=0.5):
    """Flag pixels whose forward and backward flows disagree.

    Both flows are [B, 2, H, W]. A pixel is flagged when the norm of the flow
    plus the warped opposite flow exceeds ``alpha * flow_mag + beta``. Returns
    two float [B, H, W] maps (forward, backward) that are 1.0 at flagged pixels.
    """
    # fwd_flow, bwd_flow: [B, 2, H, W]
    # alpha and beta values are following UnFlow (https://arxiv.org/abs/1711.07837)
    assert fwd_flow.dim() == 4 and bwd_flow.dim() == 4
    assert fwd_flow.size(1) == 2 and bwd_flow.size(1) == 2

    flow_mag = torch.norm(fwd_flow, dim=1) + torch.norm(bwd_flow, dim=1)  # [B, H, W]
    threshold = alpha * flow_mag + beta

    warped_bwd_flow = flow_warp(bwd_flow, fwd_flow)  # [B, 2, H, W]
    warped_fwd_flow = flow_warp(fwd_flow, bwd_flow)  # [B, 2, H, W]

    # For consistent flows the two terms of each sum cancel out.
    diff_fwd = torch.norm(fwd_flow + warped_bwd_flow, dim=1)  # [B, H, W]
    diff_bwd = torch.norm(bwd_flow + warped_fwd_flow, dim=1)

    fwd_occ = (diff_fwd > threshold).float()  # [B, H, W]
    bwd_occ = (diff_bwd > threshold).float()

    return fwd_occ, bwd_occ


def back_project(depth, intrinsics):
    """Back-project every pixel to a 3D point using its depth.

    ``depth`` is [B, H, W] and ``intrinsics`` is [B, 3, 3]; returns [B, 3, H, W].
    """
    # Back project 2D pixel coords to 3D points
    # depth: [B, H, W]
    # intrinsics: [B, 3, 3]
    b, h, w = depth.shape
    pixels = coords_grid(b, h, w, homogeneous=True, device=depth.device)  # [B, 3, H, W]

    intrinsics_inv = torch.inverse(intrinsics)  # [B, 3, 3]

    # Product of the inverse intrinsics and the homogeneous pixels, scaled by depth.
    rays = intrinsics_inv.bmm(pixels.view(b, 3, -1)).view(b, 3, h, w)
    return rays * depth.unsqueeze(1)  # [B, 3, H, W]


def camera_transform(points_ref, extrinsics_ref=None, extrinsics_tgt=None, extrinsics_rel=None):
    """Transform 3D points from the reference camera to the target camera.

    ``points_ref`` is [B, 3, H, W]. The relative pose ``extrinsics_rel``
    ([B, 4, 4]) is used when given; otherwise it is computed as
    ``extrinsics_tgt @ inverse(extrinsics_ref)``. Returns [B, 3, H, W].
    """
    # Transform 3D points from reference camera to target camera
    # points_ref: [B, 3, H, W]
    # extrinsics_ref: [B, 4, 4]
    # extrinsics_tgt: [B, 4, 4]
    # extrinsics_rel: [B, 4, 4], relative pose transform
    b, _, h, w = points_ref.shape

    if extrinsics_rel is None:
        extrinsics_rel = torch.bmm(extrinsics_tgt, torch.inverse(extrinsics_ref))  # [B, 4, 4]

    rotation = extrinsics_rel[:, :3, :3]
    translation = extrinsics_rel[:, :3, -1:]

    points_tgt = torch.bmm(rotation, points_ref.view(b, 3, -1)) + translation  # [B, 3, H*W]

    return points_tgt.view(b, 3, h, w)  # [B, 3, H, W]


def reproject(points_tgt, intrinsics, return_mask=False):
    """Project 3D points to pixel coordinates with the given intrinsics.

    ``points_tgt`` is [B, 3, H, W] and ``intrinsics`` is [B, 3, 3]. Returns
    [B, 2, H, W] pixel coordinates and, with ``return_mask``, a boolean
    [B, H, W] mask of the coordinates that fall inside the image.
    """
    # reproject to target view
    # points_tgt: [B, 3, H, W]
    # intrinsics: [B, 3, 3]
    b, _, h, w = points_tgt.shape

    proj_points = torch.bmm(intrinsics, points_tgt.view(b, 3, -1)).view(b, 3, h, w)  # [B, 3, H, W]

    # Clamp the depth so the perspective division never divides by less than 1e-3.
    depth = proj_points[:, 2].clamp(min=1e-3)
    u = proj_points[:, 0] / depth
    v = proj_points[:, 1] / depth

    pixel_coords = torch.stack([u, v], dim=1).view(b, 2, h, w)  # [B, 2, H, W] in image scale

    if not return_mask:
        return pixel_coords

    # valid mask in pixel space
    px, py = pixel_coords[:, 0], pixel_coords[:, 1]
    mask = (px >= 0) & (px <= (w - 1)) & (py >= 0) & (py <= (h - 1))  # [B, H, W]

    return pixel_coords, mask


def reproject_coords(
    depth_ref, intrinsics, extrinsics_ref=None, extrinsics_tgt=None, extrinsics_rel=None, return_mask=False
):
    """Compute where each reference pixel lands in the target view.

    Chains ``back_project`` -> ``camera_transform`` -> ``reproject``. Returns
    the [B, 2, H, W] pixel coordinates, plus the validity mask from
    ``reproject`` when ``return_mask`` is set.
    """
    # Compute reprojection sample coords
    points_ref = back_project(depth_ref, intrinsics)  # [B, 3, H, W]
    points_tgt = camera_transform(points_ref, extrinsics_ref, extrinsics_tgt, extrinsics_rel=extrinsics_rel)

    # reproject already returns either the coordinates alone or the
    # (coordinates, mask) pair depending on return_mask.
    return reproject(points_tgt, intrinsics, return_mask=return_mask)  # [B, 2, H, W] in image scale


def compute_flow_with_depth_pose(
    depth_ref, intrinsics, extrinsics_ref=None, extrinsics_tgt=None, extrinsics_rel=None, return_mask=False
):
    """Compute the flow induced by depth and camera pose.

    The flow is the reprojected pixel position minus the original pixel
    position, [B, 2, H, W]. With ``return_mask`` the validity mask from
    ``reproject_coords`` is returned as a second value.
    """
    b, h, w = depth_ref.shape
    coords_init = coords_grid(b, h, w, device=depth_ref.device)  # [B, 2, H, W]

    reprojected = reproject_coords(
        depth_ref,
        intrinsics,
        extrinsics_ref,
        extrinsics_tgt,
        extrinsics_rel=extrinsics_rel,
        return_mask=return_mask,
    )

    if return_mask:
        reproj_coords, mask = reprojected  # [B, 2, H, W], [B, H, W]
        return reproj_coords - coords_init, mask

    return reprojected - coords_init


================================================
FILE: Open-Sora/tools/scoring/optical_flow/unimatch/matching.py
================================================
import torch
import torch.nn.functional as F

from .geometry import coords_grid, generate_window_grid, normalize_coords


def global_correlation_softmax(feature0, feature1, pred_bidir_flow=False):
    """Estimate flow from a softmax over the all-pairs feature correlation.

    Both features are [B, C, H, W]. Each pixel's correspondence is the
    softmax-weighted average of all pixel coordinates; the flow is that
    correspondence minus the pixel's own coordinate. With ``pred_bidir_flow``
    the batch is doubled: forward flows first, then backward flows.

    Returns:
        (flow [B, 2, H, W], prob [B, H*W, H*W]), with B doubled in the
        bidirectional case.
    """
    # global correlation
    b, c, h, w = feature0.shape
    num_pixels = h * w
    feat0 = feature0.view(b, c, -1).permute(0, 2, 1)  # [B, H*W, C]
    feat1 = feature1.view(b, c, -1)  # [B, C, H*W]

    correlation = torch.matmul(feat0, feat1).view(b, num_pixels, num_pixels) / (c**0.5)  # [B, H*W, H*W]

    # flow from softmax
    init_grid = coords_grid(b, h, w).to(correlation.device)  # [B, 2, H, W]
    grid = init_grid.view(b, 2, -1).permute(0, 2, 1)  # [B, H*W, 2]

    if pred_bidir_flow:
        # The transposed correlation gives the matching scores in the opposite direction.
        correlation = torch.cat((correlation, correlation.permute(0, 2, 1)), dim=0)  # [2*B, H*W, H*W]
        init_grid = init_grid.repeat(2, 1, 1, 1)  # [2*B, 2, H, W]
        grid = grid.repeat(2, 1, 1)  # [2*B, H*W, 2]
        b = b * 2

    prob = F.softmax(correlation, dim=-1)  # [B, H*W, H*W]

    correspondence = torch.matmul(prob, grid).view(b, h, w, 2).permute(0, 3, 1, 2)  # [B, 2, H, W]

    # when predicting bidirectional flow, flow is the concatenation of forward flow and backward flow
    flow = correspondence - init_grid

    return flow, prob


def local_correlation_softmax(
    feature0,
    feature1,
    local_radius,
    padding_mode="zeros",
):
    """Estimate flow by softmax matching inside a square window of ``feature1``.

    For every pixel of ``feature0`` a (2R+1) x (2R+1) window of candidate
    coordinates centred on that pixel is sampled from ``feature1``. Candidates
    outside the image are masked before the softmax, and the expected candidate
    coordinate minus the pixel coordinate is returned as flow.

    Args:
        feature0: source features, unpacked as [B, C, H, W].
        feature1: target features sampled with ``F.grid_sample``.
        local_radius: window radius R.
        padding_mode: forwarded to ``F.grid_sample``.

    Returns:
        Tuple ``(flow, match_prob)``: flow is [B, 2, H, W], match_prob is
        [B, H*W, (2R+1)^2].
    """
    batch, channels, height, width = feature0.size()
    device = feature0.device
    num_pixels = height * width

    pixel_grid = coords_grid(batch, height, width).to(device)  # [B, 2, H, W]
    pixel_coords = pixel_grid.view(batch, 2, -1).permute(0, 2, 1)  # [B, H*W, 2]

    window = 2 * local_radius + 1
    offsets = generate_window_grid(
        -local_radius, local_radius, -local_radius, local_radius, window, window, device=device
    )  # [2R+1, 2R+1, 2]
    offsets = offsets.reshape(-1, 2).repeat(batch, 1, 1, 1)  # [B, 1, (2R+1)^2, 2]
    candidates = pixel_coords.unsqueeze(-2) + offsets  # [B, H*W, (2R+1)^2, 2]

    # candidates that fall inside the image; the rest are excluded from the softmax
    cand_x = candidates[..., 0]
    cand_y = candidates[..., 1]
    inside = ((cand_x >= 0) & (cand_x < width)) & ((cand_y >= 0) & (cand_y < height))  # [B, H*W, (2R+1)^2]

    # grid_sample expects coordinates normalised to [-1, 1]
    candidates_norm = normalize_coords(candidates, height, width)
    sampled = F.grid_sample(feature1, candidates_norm, padding_mode=padding_mode, align_corners=True)
    sampled = sampled.permute(0, 2, 1, 3)  # [B, H*W, C, (2R+1)^2]
    source = feature0.permute(0, 2, 3, 1).view(batch, num_pixels, 1, channels)  # [B, H*W, 1, C]

    corr = torch.matmul(source, sampled).view(batch, num_pixels, -1) / (channels**0.5)  # [B, H*W, (2R+1)^2]
    corr[~inside] = -1e9

    prob = F.softmax(corr, -1)  # [B, H*W, (2R+1)^2]

    matched = torch.matmul(prob.unsqueeze(-2), candidates).squeeze(-2)  # [B, H*W, 2]
    matched = matched.view(batch, height, width, 2).permute(0, 3, 1, 2)  # [B, 2, H, W]

    return matched - pixel_grid, prob


def local_correlation_with_flow(
    feature0,
    feature1,
    flow,
    local_radius,
    padding_mode="zeros",
    dilation=1,
):
    """Build a local correlation volume around flow-displaced positions.

    For every pixel of ``feature0`` a (2R+1) x (2R+1) window of offsets
    (scaled by ``dilation``) is placed at the pixel position shifted by
    ``flow``. ``feature1`` is sampled there and correlated with the source
    feature.

    Args:
        feature0: source features, unpacked as [B, C, H, W].
        feature1: target features sampled with ``F.grid_sample``.
        flow: tensor viewable as [B, 2, H*W], or the float ``0.0`` to skip the
            shift (any other float trips the assertion).
        local_radius: window radius R.
        padding_mode: forwarded to ``F.grid_sample``.
        dilation: multiplier applied to the window offsets.

    Returns:
        Correlation volume of shape [B, (2R+1)^2, H, W].
    """
    batch, channels, height, width = feature0.size()
    device = feature0.device
    num_pixels = height * width

    pixel_grid = coords_grid(batch, height, width).to(device)  # [B, 2, H, W]
    pixel_coords = pixel_grid.view(batch, 2, -1).permute(0, 2, 1)  # [B, H*W, 2]

    window = 2 * local_radius + 1
    offsets = generate_window_grid(
        -local_radius, local_radius, -local_radius, local_radius, window, window, device=device
    )  # [2R+1, 2R+1, 2]
    offsets = offsets.reshape(-1, 2).repeat(batch, 1, 1, 1)  # [B, 1, (2R+1)^2, 2]
    candidates = pixel_coords.unsqueeze(-2) + offsets * dilation  # [B, H*W, (2R+1)^2, 2]

    # flow can be zero when using features after transformer
    if isinstance(flow, float):
        assert flow == 0.0
    else:
        flow_per_pixel = flow.view(batch, 2, -1).permute(0, 2, 1)  # [B, H*W, 2]
        candidates = candidates + flow_per_pixel.unsqueeze(-2)  # [B, H*W, (2R+1)^2, 2]

    # grid_sample expects coordinates normalised to [-1, 1]
    candidates_norm = normalize_coords(candidates, height, width)
    sampled = F.grid_sample(feature1, candidates_norm, padding_mode=padding_mode, align_corners=True)
    sampled = sampled.permute(0, 2, 1, 3)  # [B, H*W, C, (2R+1)^2]
    source = feature0.permute(0, 2, 3, 1).view(batch, num_pixels, 1, channels)  # [B, H*W, 1, C]

    corr = torch.matmul(source, sampled).view(batch, num_pixels, -1) / (channels**0.5)  # [B, H*W, (2R+1)^2]

    return corr.view(batch, height, width, -1).permute(0, 3, 1, 2).contiguous()  # [B, (2R+1)^2, H, W]


def global_correlation_softmax_stereo(
    feature0,
    feature1,
):
    """Estimate disparity by softmax matching along each image row.

    Every pixel of ``feature0`` is correlated with the pixels of the same row
    in ``feature1``. Target columns to the right of the source column are
    masked out, so the expected matched column never exceeds the source column
    and the returned disparity (source column minus matched column) is
    non-negative.

    Args:
        feature0: source features, unpacked as [B, C, H, W].
        feature1: target features with the same layout.

    Returns:
        Tuple ``(disparity, prob)``: disparity is [B, 1, H, W] at feature
        resolution and prob is the row-wise matching distribution [B, H, W, W].
    """
    batch, channels, height, width = feature0.shape

    columns = torch.linspace(0, width - 1, width, device=feature0.device)  # [W]

    source_rows = feature0.permute(0, 2, 3, 1)  # [B, H, W, C]
    target_rows = feature1.permute(0, 2, 1, 3)  # [B, H, C, W]

    correlation = torch.matmul(source_rows, target_rows) / (channels**0.5)  # [B, H, W, W]

    # strictly upper-triangular entries pair a source column with a later
    # target column; masking them keeps the disparity positive
    upper = torch.triu(torch.ones((width, width)), diagonal=1).type_as(feature0)  # [W, W]
    invalid = (upper != 0)[None, None].expand(batch, height, width, width)  # [B, H, W, W]

    correlation[invalid] = -1e9

    prob = F.softmax(correlation, dim=-1)  # [B, H, W, W]

    # expected matched column per source pixel
    matched_column = (columns.view(1, 1, 1, width) * prob).sum(-1)  # [B, H, W]

    # NOTE: unlike flow, disparity is typically positive
    disparity = columns.view(1, 1, width) - matched_column  # [B, H, W]

    return disparity.unsqueeze(1), prob  # feature resolution


def local_correlation_softmax_stereo(
    feature0,
    feature1,
    local_radius,
):
    """Estimate disparity by softmax matching inside a horizontal window.

    For each pixel of ``feature0`` a 1 x (2R+1) window of candidate positions
    on the same row is sampled from ``feature1``. Out-of-image candidates are
    masked before the softmax. The horizontal component of the resulting flow
    is negated and returned.

    Args:
        feature0: source features, unpacked as [B, C, H, W].
        feature1: target features sampled with ``F.grid_sample``.
        local_radius: horizontal window radius R.

    Returns:
        Tuple ``(flow_x, match_prob)``: flow_x is [B, 1, H, W] (negated
        x-flow) and match_prob is [B, H*W, 2R+1].
    """
    batch, channels, height, width = feature0.size()
    device = feature0.device
    num_pixels = height * width

    pixel_grid = coords_grid(batch, height, width).to(device)  # [B, 2, H, W]
    pixel_coords = pixel_grid.view(batch, 2, -1).permute(0, 2, 1).contiguous()  # [B, H*W, 2]

    # the window spans a single row: no vertical extent
    window_h = 1
    window_w = 2 * local_radius + 1
    offsets = generate_window_grid(
        0, 0, -local_radius, local_radius, window_h, window_w, device=device
    )  # [1, 2R+1, 2]
    offsets = offsets.reshape(-1, 2).repeat(batch, 1, 1, 1)  # [B, 1, (2R+1), 2]
    candidates = pixel_coords.unsqueeze(-2) + offsets  # [B, H*W, (2R+1), 2]

    # candidates that fall inside the image; the rest are excluded from the softmax
    cand_x = candidates[..., 0]
    cand_y = candidates[..., 1]
    inside = ((cand_x >= 0) & (cand_x < width)) & ((cand_y >= 0) & (cand_y < height))  # [B, H*W, (2R+1)]

    # grid_sample expects coordinates normalised to [-1, 1]
    candidates_norm = normalize_coords(candidates, height, width)
    sampled = F.grid_sample(feature1, candidates_norm, padding_mode="zeros", align_corners=True)
    sampled = sampled.permute(0, 2, 1, 3)  # [B, H*W, C, (2R+1)]
    source = feature0.permute(0, 2, 3, 1).contiguous().view(batch, num_pixels, 1, channels)  # [B, H*W, 1, C]

    corr = torch.matmul(source, sampled).view(batch, num_pixels, -1) / (channels**0.5)  # [B, H*W, (2R+1)]
    corr[~inside] = -1e9

    prob = F.softmax(corr, -1)  # [B, H*W, (2R+1)]

    matched = torch.matmul(prob.unsqueeze(-2), candidates).squeeze(-2)  # [B, H*W, 2]
    matched = matched.view(batch, height, width, 2).permute(0, 3, 1, 2).contiguous()  # [B, 2, H, W]

    flow = matched - pixel_grid  # flow at feature resolution

    # keep only the x component, sign-flipped
    return -flow[:, :1], prob


def correlation_softmax_depth(
    feature0,
    feature1,
    intrinsics,
    pose,
    depth_candidates,
    depth_from_argmax=False,
    pred_bidir_depth=False,
):
    """Pick a value per pixel by softmax matching over depth candidates.

    ``feature1`` is warped into the view of ``feature0`` once per candidate
    (the reciprocal of ``depth_candidates`` is what gets passed to the warp).
    Each warped feature is correlated with ``feature0`` and a softmax over the
    candidate axis yields the matching distribution.

    Args:
        feature0: reference features, unpacked as [B, C, H, W].
        feature1: features of the other view.
        intrinsics: camera intrinsics, repeated on the batch axis when
            ``pred_bidir_depth`` is set.
        pose: relative pose; its inverse is appended on the batch axis when
            ``pred_bidir_depth`` is set.
        depth_candidates: 4-D tensor [B, D, H, W].
        depth_from_argmax: return the most probable candidate instead of the
            probability-weighted sum.
        pred_bidir_depth: also predict for the swapped image pair; all inputs
            are doubled along the batch axis.

    Returns:
        Tuple ``(depth, match_prob)``: depth is [B, 1, H, W] and match_prob is
        [B, D, H, W] (batch doubled when bidirectional).
    """
    _, channels, _, _ = feature0.size()
    assert depth_candidates.dim() == 4  # [B, D, H, W]

    if pred_bidir_depth:
        # second half of the batch handles the pair in the opposite direction
        feature0, feature1 = torch.cat((feature0, feature1), dim=0), torch.cat((feature1, feature0), dim=0)
        intrinsics = intrinsics.repeat(2, 1, 1)
        pose = torch.cat((pose, torch.inverse(pose)), dim=0)
        depth_candidates = depth_candidates.repeat(2, 1, 1, 1)

    # depth candidates are actually inverse depth
    warped = warp_with_pose_depth_candidates(
        feature1,
        intrinsics,
        pose,
        1.0 / depth_candidates,
    )  # [B, C, D, H, W]

    correlation = (feature0.unsqueeze(2) * warped).sum(1) / (channels**0.5)  # [B, D, H, W]
    match_prob = F.softmax(correlation, dim=1)  # [B, D, H, W]

    if depth_from_argmax:
        # for cross-task transfer (flow -> depth), extract depth with argmax at test time
        best = torch.argmax(match_prob, dim=1, keepdim=True)
        return torch.gather(depth_candidates, dim=1, index=best), match_prob

    expected = (match_prob * depth_candidates).sum(dim=1, keepdim=True)  # [B, 1, H, W]
    return expected, match_prob


def warp_with_pose_depth_candidates(
    feature1,
    intrinsics,
    pose,
    depth,
    clamp_min_depth=1e-3,
):
    """Warp ``feature1`` into the reference view once per depth candidate.

    Pixels are back-projected with the inverse intrinsics, rotated and
    translated by ``pose`` at each candidate depth, re-projected with the
    intrinsics, and the resulting coordinates are used to bilinearly sample
    ``feature1``. The coordinate computation runs under ``torch.no_grad()``.

    Args:
        feature1: [B, C, H, W]
        intrinsics: [B, 3, 3]
        pose: [B, 4, 4]
        depth: [B, D, H, W]
        clamp_min_depth: lower bound on the projected z before the
            perspective division.

    Returns:
        Warped features of shape [B, C, D, H, W].
    """
    assert intrinsics.size(1) == intrinsics.size(2) == 3
    assert pose.size(1) == pose.size(2) == 4
    assert depth.dim() == 4

    batch, num_depth, height, width = depth.size()
    channels = feature1.size(1)
    num_pixels = height * width

    with torch.no_grad():
        # homogeneous pixel coordinates
        pixels = coords_grid(batch, height, width, homogeneous=True, device=depth.device)  # [B, 3, H, W]

        # back project to 3D rays and rotate into the other view
        rays = torch.inverse(intrinsics).bmm(pixels.view(batch, 3, -1))  # [B, 3, H*W]
        rotation = pose[:, :3, :3]
        translation = pose[:, :3, -1:].unsqueeze(-1)  # [B, 3, 1, 1]
        rotated = torch.bmm(rotation, rays).unsqueeze(2)  # [B, 3, 1, H*W]

        # scale each ray by every candidate depth, then translate
        points = rotated * depth.view(batch, 1, num_depth, num_pixels)  # [B, 3, D, H*W]
        points = points + translation  # [B, 3, D, H*W]

        # reproject to the 2D image plane
        projected = torch.bmm(intrinsics, points.view(batch, 3, -1))
        projected = projected.view(batch, 3, num_depth, num_pixels)  # [B, 3, D, H*W]
        pixel_coords = projected[:, :2] / projected[:, -1:].clamp(min=clamp_min_depth)  # [B, 2, D, H*W]

        # normalize to [-1, 1] for grid_sample
        x_norm = 2 * pixel_coords[:, 0] / (width - 1) - 1
        y_norm = 2 * pixel_coords[:, 1] / (height - 1) - 1
        sample_grid = torch.stack([x_norm, y_norm], dim=-1)  # [B, D, H*W, 2]

    # sample features; the depth axis is folded into the row axis for the 2D sampler
    warped = F.grid_sample(
        feature1,
        sample_grid.view(batch, num_depth * height, width, 2),
        mode="bilinear",
        padding_mode="zeros",
        align_corners=True,
    )

    return warped.view(batch, channels, num_depth, height, width)  # [B, C, D, H, W]


================================================
FILE: Open-Sora/tools/scoring/optical_flow/unimatch/position.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# https://github.com/facebookresearch/detr/blob/main/models/position_encoding.py

import math

import torch
import torch.nn as nn


class PositionEmbeddingSine(nn.Module):
    """
    Sinusoidal 2D position embedding in the style of "Attention is all you
    need", generalized to images: sin/cos features of the row index and of the
    column index are concatenated along the channel axis.
    """

    def __init__(self, num_pos_feats=64, temperature=10000, normalize=True, scale=None):
        """
        num_pos_feats: number of channels produced per axis (output has 2x this).
        temperature: base of the geometric frequency progression.
        normalize: rescale positions to (0, scale] before encoding.
        scale: range used when normalizing; defaults to 2*pi.
        """
        super().__init__()
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    def forward(self, x):
        """Return the embedding [B, 2*num_pos_feats, H, W] for ``x`` of shape [B, C, H, W]."""
        batch, _, height, width = x.size()

        # no padding mask is used: every location counts, so positions are 1..H and 1..W
        ones = torch.ones((batch, height, width), device=x.device)  # [B, H, W]
        row_pos = ones.cumsum(1, dtype=torch.float32)
        col_pos = ones.cumsum(2, dtype=torch.float32)

        if self.normalize:
            eps = 1e-6
            row_pos = row_pos / (row_pos[:, -1:, :] + eps) * self.scale
            col_pos = col_pos / (col_pos[:, :, -1:] + eps) * self.scale

        # consecutive channel pairs share one frequency (sin for even, cos for odd)
        channel_idx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        freq = self.temperature ** (2 * (channel_idx // 2) / self.num_pos_feats)

        def encode(positions):
            angles = positions[:, :, :, None] / freq  # [B, H, W, num_pos_feats]
            sin_part = angles[:, :, :, 0::2].sin()
            cos_part = angles[:, :, :, 1::2].cos()
            # interleave sin/cos back into the original channel order
            return torch.stack((sin_part, cos_part), dim=4).flatten(3)

        embedding = torch.cat((encode(row_pos), encode(col_pos)), dim=3)  # [B, H, W, 2*num_pos_feats]
        return embedding.permute(0, 3, 1, 2)


================================================
FILE: Open-Sora/tools/scoring/optical_flow/unimatch/reg_refine.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F


class FlowHead(nn.Module):
    """Two-layer 3x3 convolutional head: conv -> ReLU -> conv.

    Maps ``input_dim`` channels to ``out_dim`` channels through a
    ``hidden_dim``-channel intermediate; padding of 1 keeps the spatial size.
    """

    def __init__(
        self,
        input_dim=128,
        hidden_dim=256,
        out_dim=2,
    ):
        super().__init__()

        self.conv1 = nn.Conv2d(input_dim, hidden_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(hidden_dim, out_dim, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply the head to ``x`` ([B, input_dim, H, W]) and return [B, out_dim, H, W]."""
        hidden = self.conv1(x)
        hidden = self.relu(hidden)
        return self.conv2(hidden)


class SepConvGRU(nn.Module):
    """Convolutional GRU whose update is split into two 1-D passes.

    The hidden state is first updated with (1 x k) convolutions (horizontal)
    and then again with (k x 1) convolutions (vertical). Each pass has its own
    update (z), reset (r) and candidate (q) convolution.
    """

    def __init__(
        self,
        hidden_dim=128,
        input_dim=192 + 128,
        kernel_size=5,
    ):
        super().__init__()

        pad = (kernel_size - 1) // 2  # keeps the spatial size for odd kernels
        in_channels = hidden_dim + input_dim

        # horizontal pass: 1 x k kernels
        row_kernel, row_pad = (1, kernel_size), (0, pad)
        self.convz1 = nn.Conv2d(in_channels, hidden_dim, row_kernel, padding=row_pad)
        self.convr1 = nn.Conv2d(in_channels, hidden_dim, row_kernel, padding=row_pad)
        self.convq1 = nn.Conv2d(in_channels, hidden_dim, row_kernel, padding=row_pad)

        # vertical pass: k x 1 kernels
        col_kernel, col_pad = (kernel_size, 1), (pad, 0)
        self.convz2 = nn.Conv2d(in_channels, hidden_dim, col_kernel, padding=col_pad)
        self.convr2 = nn.Conv2d(in_channels, hidden_dim, col_kernel, padding=col_pad)
        self.convq2 = nn.Conv2d(in_channels, hidden_dim, col_kernel, padding=col_pad)

    @staticmethod
    def _gru_step(h, x, conv_z, conv_r, conv_q):
        """One GRU update of hidden state ``h`` given input ``x``."""
        joint = torch.cat([h, x], dim=1)
        update_gate = torch.sigmoid(conv_z(joint))
        reset_gate = torch.sigmoid(conv_r(joint))
        candidate = torch.tanh(conv_q(torch.cat([reset_gate * h, x], dim=1)))
        return (1 - update_gate) * h + update_gate * candidate

    def forward(self, h, x):
        """Update hidden state ``h`` ([B, hidden_dim, H, W]) with input ``x`` ([B, input_dim, H, W])."""
        h = self._gru_step(h, x, self.convz1, self.convr1, self.convq1)  # horizontal
        h = self._gru_step(h, x, self.convz2, self.convr2, self.convq2)  # vertical
        return h


class BasicMotionEncoder(nn.Module):
    """Encode a correlation volume and the current flow into motion features.

    Correlation and flow go through separate two-layer conv stacks, are fused
    by a 3x3 convolution into ``128 - flow_channels`` channels, and the raw
    flow is appended so the output always has 128 channels.
    """

    def __init__(
        self,
        corr_channels=324,
        flow_channels=2,
    ):
        super().__init__()

        # correlation branch
        self.convc1 = nn.Conv2d(corr_channels, 256, kernel_size=1, padding=0)
        self.convc2 = nn.Conv2d(256, 192, kernel_size=3, padding=1)
        # flow branch
        self.convf1 = nn.Conv2d(flow_channels, 128, kernel_size=7, padding=3)
        self.convf2 = nn.Conv2d(128, 64, kernel_size=3, padding=1)
        # fusion; leaves room for the raw flow appended in forward()
        self.conv = nn.Conv2d(64 + 192, 128 - flow_channels, kernel_size=3, padding=1)

    def forward(self, flow, corr):
        """Return [B, 128, H, W] features from ``flow`` and ``corr``."""
        corr_feat = F.relu(self.convc2(F.relu(self.convc1(corr))))
        flow_feat = F.relu(self.convf2(F.relu(self.convf1(flow))))

        fused = F.relu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))
        return torch.cat([fused, flow], dim=1)


class BasicUpdateBlock(nn.Module):
    """Single refinement step: motion encoder -> separable conv GRU -> flow head.

    Optionally also predicts an upsampling mask with
    ``downsample_factor**2 * 9`` channels. When ``bilinear_up`` is True no mask
    head is built and ``forward`` returns ``None`` in its place.
    """

    def __init__(
        self,
        corr_channels=324,
        hidden_dim=128,
        context_dim=128,
        downsample_factor=8,
        flow_dim=2,
        bilinear_up=False,
    ):
        super().__init__()

        self.encoder = BasicMotionEncoder(
            corr_channels=corr_channels,
            flow_channels=flow_dim,
        )

        # GRU input is the context features concatenated with the motion features
        self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=context_dim + hidden_dim)

        self.flow_head = FlowHead(
            hidden_dim,
            hidden_dim=256,
            out_dim=flow_dim,
        )

        self.mask = (
            None
            if bilinear_up
            else nn.Sequential(
                nn.Conv2d(hidden_dim, 256, 3, padding=1),
                nn.ReLU(inplace=True),
                nn.Conv2d(256, downsample_factor**2 * 9, 1, padding=0),
            )
        )

    def forward(self, net, inp, corr, flow):
        """Run one update.

        Args:
            net: GRU hidden state.
            inp: context features, concatenated with the motion features.
            corr: correlation volume fed to the motion encoder.
            flow: current flow estimate fed to the motion encoder.

        Returns:
            ``(net, mask, delta_flow)``: updated hidden state, the upsampling
            mask (or ``None`` without a mask head) and the flow residual.
        """
        motion_features = self.encoder(flow, corr)
        gru_input = torch.cat([inp, motion_features], dim=1)

        net = self.gru(net, gru_input)
        delta_flow = self.flow_head(net)

        mask = None if self.mask is None else self.mask(net)

        return net, mask, delta_flow


================================================
FILE: Open-Sora/tools/scoring/optical_flow/unimatch/transformer.py
================================================
import torch
import torch.nn as nn

from .attention import (
    single_head_full_attention,
    single_head_full_attention_1d,
    single_head_split_window_attention,
    single_head_split_window_attention_1d,
)
from .utils import generate_shift_window_attn_mask, generate_shift_window_attn_mask_1d


class TransformerLayer(nn.Module):
    """Single-head attention layer with an optional feed-forward network.

    ``source`` provides the queries and ``target`` the keys and values. The
    attention variant is chosen at call time from ``attn_type``:

    * ``"swin"`` with ``attn_num_splits > 1``: split-window 2D attention.
    * ``"self_swin2d_cross_1d"``: split-window/full 2D attention when source
      and target are (numerically) identical, full 1D attention otherwise.
    * ``"self_swin2d_cross_swin1d"``: as above, but the cross-attention case
      uses split-window 1D attention when ``attn_num_splits > 1``.
    * anything else: full 2D attention.

    ``attn_num_splits`` is compared with ``> 1`` in the windowed modes, so it
    must be a number there (the ``None`` default is not accepted).
    """

    def __init__(
        self,
        d_model=128,
        nhead=1,
        no_ffn=False,
        ffn_dim_expansion=4,
    ):
        super().__init__()

        self.dim = d_model
        self.nhead = nhead
        self.no_ffn = no_ffn

        # attention projections (bias-free)
        self.q_proj = nn.Linear(d_model, d_model, bias=False)
        self.k_proj = nn.Linear(d_model, d_model, bias=False)
        self.v_proj = nn.Linear(d_model, d_model, bias=False)

        self.merge = nn.Linear(d_model, d_model, bias=False)

        self.norm1 = nn.LayerNorm(d_model)

        # no ffn after self-attn, with ffn after cross-attn
        if not no_ffn:
            # the ffn consumes [source, message] concatenated on the channel axis
            ffn_in = d_model * 2
            ffn_hidden = ffn_in * ffn_dim_expansion
            self.mlp = nn.Sequential(
                nn.Linear(ffn_in, ffn_hidden, bias=False),
                nn.GELU(),
                nn.Linear(ffn_hidden, d_model, bias=False),
            )

            self.norm2 = nn.LayerNorm(d_model)

    def forward(
        self,
        source,
        target,
        height=None,
        width=None,
        shifted_window_attn_mask=None,
        shifted_window_attn_mask_1d=None,
        attn_type="swin",
        with_shift=False,
        attn_num_splits=None,
    ):
        """Attend from ``source`` to ``target`` (both [B, L, C]) and return ``source + message``.

        Raises:
            NotImplementedError: if ``nhead > 1`` in any of the windowed /
                1D attention modes.
        """
        # for stereo: 2d attn in self-attn, 1d attn in cross-attn
        is_self_attn = (source - target).abs().max() < 1e-6

        # single-head attention
        query = self.q_proj(source)  # [B, L, C]
        key = self.k_proj(target)  # [B, L, C]
        value = self.v_proj(target)  # [B, L, C]

        def window_attention_2d():
            return single_head_split_window_attention(
                query,
                key,
                value,
                num_splits=attn_num_splits,
                with_shift=with_shift,
                h=height,
                w=width,
                attn_mask=shifted_window_attn_mask,
            )

        def full_attention_1d():
            return single_head_full_attention_1d(
                query,
                key,
                value,
                h=height,
                w=width,
            )

        def self_attention_2d():
            # shifted windows when splitting, otherwise full 2d attention
            if attn_num_splits > 1:
                return window_attention_2d()
            return single_head_full_attention(query, key, value)  # [N, L, C]

        if attn_type == "swin" and attn_num_splits > 1:  # self, cross-attn: both swin 2d
            if self.nhead > 1:
                # we observe that multihead attention slows down the speed and increases the memory consumption
                # without bringing obvious performance gains and thus the implementation is removed
                raise NotImplementedError
            message = window_attention_2d()

        elif attn_type in ("self_swin2d_cross_1d", "self_swin2d_cross_swin1d"):
            # self-attn: swin 2d; cross-attn: full 1d or swin 1d depending on the mode
            if self.nhead > 1:
                raise NotImplementedError

            if is_self_attn:
                message = self_attention_2d()
            elif attn_type == "self_swin2d_cross_1d":
                message = full_attention_1d()
            elif attn_num_splits > 1:
                assert shifted_window_attn_mask_1d is not None
                # cross attn 1d shift
                message = single_head_split_window_attention_1d(
                    query,
                    key,
                    value,
                    num_splits=attn_num_splits,
                    with_shift=with_shift,
                    h=height,
                    w=width,
                    attn_mask=shifted_window_attn_mask_1d,
                )
            else:
                message = full_attention_1d()

        else:
            message = single_head_full_attention(query, key, value)  # [B, L, C]

        message = self.norm1(self.merge(message))  # [B, L, C]

        if not self.no_ffn:
            message = self.norm2(self.mlp(torch.cat([source, message], dim=-1)))

        return source + message


class TransformerBlock(nn.Module):
    """self attention + cross attention + FFN

    Two ``TransformerLayer`` instances applied in sequence: the first attends
    from ``source`` to itself without a feed-forward network, the second
    attends from the updated ``source`` to ``target`` and includes the FFN.
    """

    def __init__(
        self,
        d_model=128,
        nhead=1,
        ffn_dim_expansion=4,
    ):
        super().__init__()

        layer_kwargs = dict(d_model=d_model, nhead=nhead, ffn_dim_expansion=ffn_dim_expansion)

        self.self_attn = TransformerLayer(no_ffn=True, **layer_kwargs)
        self.cross_attn_ffn = TransformerLayer(**layer_kwargs)

    def forward(
        self,
        source,
        target,
        height=None,
        width=None,
        shifted_window_attn_mask=None,
        shifted_window_attn_mask_1d=None,
        attn_type="swin",
        with_shift=False,
        attn_num_splits=None,
    ):
        """Update ``source`` ([B, L, C]) by self-attention, then cross-attention to ``target``."""
        # options shared by both attention layers
        shared = dict(
            height=height,
            width=width,
            shifted_window_attn_mask=shifted_window_attn_mask,
            attn_type=attn_type,
            with_shift=with_shift,
            attn_num_splits=attn_num_splits,
        )

        # self attention (the 1d mask is not passed here)
        source = self.self_attn(source, source, **shared)

        # cross attention and ffn
        return self.cross_attn_ffn(
            source,
            target,
            shifted_window_attn_mask_1d=shifted_window_attn_mask_1d,
            **shared,
        )


class FeatureTransformer(nn.Module):
    """Stack of ``TransformerBlock`` layers that jointly enhance two feature maps.

    Both maps are processed in one pass by concatenating them on the batch
    axis, once in each order, so each half attends to the other.
    """

    def __init__(
        self,
        num_layers=6,
        d_model=128,
        nhead=1,
        ffn_dim_expansion=4,
    ):
        super().__init__()

        self.d_model = d_model
        self.nhead = nhead

        blocks = [
            TransformerBlock(
                d_model=d_model,
                nhead=nhead,
                ffn_dim_expansion=ffn_dim_expansion,
            )
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(blocks)

        # Xavier init for every weight matrix; 1-D parameters keep their defaults
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def forward(
        self,
        feature0,
        feature1,
        attn_type="swin",
        attn_num_splits=None,
        **kwargs,
    ):
        """Return the two enhanced feature maps, each [B, C, H, W].

        Args:
            feature0, feature1: inputs of shape [B, C, H, W] with
                ``C == d_model``.
            attn_type: attention variant forwarded to every layer. Types
                containing ``"swin"`` are compared against ``attn_num_splits``.
            attn_num_splits: number of window splits per axis; must be a
                number whenever ``attn_type`` contains ``"swin"``.
            **kwargs: accepted and ignored.
        """
        b, c, h, w = feature0.shape
        assert self.d_model == c

        feature0 = feature0.flatten(-2).permute(0, 2, 1)  # [B, H*W, C]
        feature1 = feature1.flatten(-2).permute(0, 2, 1)  # [B, H*W, C]

        device = feature0.device
        use_windows = "swin" in attn_type and attn_num_splits > 1

        # 2d attention: compute the shifted-window mask once for all layers
        shifted_window_attn_mask = None
        if use_windows:
            # global and refine use different number of splits
            win_h = h // attn_num_splits
            win_w = w // attn_num_splits
            shifted_window_attn_mask = generate_shift_window_attn_mask(
                input_resolution=(h, w),
                window_size_h=win_h,
                window_size_w=win_w,
                shift_size_h=win_h // 2,
                shift_size_w=win_w // 2,
                device=device,
            )  # [K*K, H/K*W/K, H/K*W/K]

        # 1d attention: same idea along the width only
        shifted_window_attn_mask_1d = None
        if "swin1d" in attn_type and attn_num_splits > 1:
            win_w = w // attn_num_splits
            shifted_window_attn_mask_1d = generate_shift_window_attn_mask_1d(
                input_w=w,
                window_size_w=win_w,
                shift_size_w=win_w // 2,
                device=device,
            )  # [K, W/K, W/K]

        # concat feature0 and feature1 in batch dimension to compute in parallel
        concat0 = torch.cat((feature0, feature1), dim=0)  # [2B, H*W, C]
        concat1 = torch.cat((feature1, feature0), dim=0)  # [2B, H*W, C]

        for index, layer in enumerate(self.layers):
            # windows are shifted on every second layer
            shift_now = use_windows and index % 2 == 1
            concat0 = layer(
                concat0,
                concat1,
                height=h,
                width=w,
                attn_type=attn_type,
                with_shift=shift_now,
                attn_num_splits=attn_num_splits,
                shifted_window_attn_mask=shifted_window_attn_mask,
                shifted_window_attn_mask_1d=shifted_window_attn_mask_1d,
            )

            # the target for the next layer is the updated source with its halves swapped
            halves = concat0.chunk(chunks=2, dim=0)
            concat1 = torch.cat(halves[::-1], dim=0)

        feature0, feature1 = concat0.chunk(chunks=2, dim=0)  # [B, H*W, C]

        # reshape back
        feature0 = feature0.view(b, h, w, c).permute(0, 3, 1, 2).contiguous()  # [B, C, H, W]
        feature1 = feature1.view(b, h, w, c).permute(0, 3, 1, 2).contiguous()  # [B, C, H, W]

        return feature0, feature1


================================================
FILE: Open-Sora/tools/scoring/optical_flow/unimatch/trident_conv.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# https://github.com/facebookresearch/detectron2/blob/main/projects/TridentNet/tridentnet/trident_conv.py

import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.modules.utils import _pair


class MultiScaleTridentConv(nn.Module):
    """Weight-sharing convolution applied to several inputs ("branches").

    A single weight (and optional bias) is shared by all branches; each branch
    has its own stride and padding. In training mode, or whenever
    ``test_branch_idx == -1``, ``forward`` expects one input per branch and
    returns one output per branch. Otherwise it expects a single input and
    runs only the branch selected by ``test_branch_idx``.

    Args:
        in_channels, out_channels, kernel_size, groups, bias: as for
            ``nn.Conv2d``.
        stride: stored as ``self.stride``; the convolution itself uses the
            per-branch ``strides``.
        strides, paddings: int (shared by all branches) or one value per branch.
        dilations: int or per-branch values; stored as ``self.dilations`` but
            the convolution uses the scalar ``dilation`` for every branch.
        dilation: dilation passed to every convolution.
        num_branch: number of branches.
        test_branch_idx: branch used at inference; ``-1`` keeps all branches.
        norm, activation: optional callables applied to every output.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        strides=1,
        paddings=0,
        dilations=1,
        dilation=1,
        groups=1,
        num_branch=1,
        test_branch_idx=-1,
        bias=False,
        norm=None,
        activation=None,
    ):
        super(MultiScaleTridentConv, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = _pair(kernel_size)
        self.num_branch = num_branch
        self.stride = _pair(stride)
        self.groups = groups
        self.with_bias = bias
        self.dilation = dilation
        # broadcast scalar settings to one entry per branch
        if isinstance(paddings, int):
            paddings = [paddings] * self.num_branch
        if isinstance(dilations, int):
            dilations = [dilations] * self.num_branch
        if isinstance(strides, int):
            strides = [strides] * self.num_branch
        self.paddings = [_pair(padding) for padding in paddings]
        self.dilations = [_pair(dilation) for dilation in dilations]
        self.strides = [_pair(stride) for stride in strides]
        self.test_branch_idx = test_branch_idx
        self.norm = norm
        self.activation = activation

        assert len({self.num_branch, len(self.paddings), len(self.strides)}) == 1

        self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels // groups, *self.kernel_size))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_channels))
        else:
            self.bias = None

        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
        if self.bias is not None:
            nn.init.constant_(self.bias, 0)

    def forward(self, inputs):
        """Convolve ``inputs`` (a list of tensors) and return a list of outputs.

        With all branches active the list must hold ``num_branch`` tensors;
        in single-branch inference it must hold exactly one.
        """
        use_all_branches = self.training or self.test_branch_idx == -1
        num_branch = self.num_branch if use_all_branches else 1
        assert len(inputs) == num_branch

        if use_all_branches:
            outputs = [
                F.conv2d(branch_input, self.weight, self.bias, stride, padding, self.dilation, self.groups)
                for branch_input, stride, padding in zip(inputs, self.strides, self.paddings)
            ]
        else:
            # Single-branch inference: use the stride/padding of the selected
            # branch. The previous code tested `test_branch_idx == -1` here,
            # which is never true on this path, so it always fell back to the
            # last branch and ignored the selection.
            branch = self.test_branch_idx
            outputs = [
                F.conv2d(
                    inputs[0],
                    self.weight,
                    self.bias,
                    self.strides[branch],
                    self.paddings[branch],
                    self.dilation,
                    self.groups,
                )
            ]

        if self.norm is not None:
            outputs = [self.norm(x) for x in outputs]
        if self.activation is not None:
            outputs = [self.activation(x) for x in outputs]
        return outputs


================================================
FILE: Open-Sora/tools/scoring/optical_flow/unimatch/unimatch.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from .attention import SelfAttnPropagation
from .backbone import CNNEncoder
from .geometry import compute_flow_with_depth_pose, flow_warp
from .matching import (
    correlation_softmax_depth,
    global_correlation_softmax,
    global_correlation_softmax_stereo,
    local_correlation_softmax,
    local_correlation_softmax_stereo,
    local_correlation_with_flow,
)
from .reg_refine import BasicUpdateBlock
from .transformer import FeatureTransformer
from .utils import feature_add_position, normalize_img, upsample_flow_with_mask


class UniMatch(nn.Module):
    """Unified matching network whose behaviour is selected by ``task``.

    The forward pass runs a CNN backbone, a feature Transformer, a
    correlation + softmax matching step, a self-attention propagation step
    and finally an upsampling step; an optional local regression refinement
    replaces the plain upsampling when ``reg_refine`` is enabled. The code
    branches on ``task`` values ``"flow"``, ``"stereo"`` and ``"depth"``.
    """

    def __init__(
        self,
        num_scales=1,
        feature_channels=128,
        upsample_factor=8,
        num_head=1,
        ffn_dim_expansion=4,
        num_transformer_layers=6,
        reg_refine=False,  # optional local regression refinement
        task="flow",
    ):
        """Build the sub-modules.

        Args:
            num_scales: number of feature scales requested from the backbone
                and iterated over in ``forward``.
            feature_channels: backbone output dimension and Transformer width.
            upsample_factor: upsampling factor used at the last scale.
            num_head: forwarded to ``FeatureTransformer`` as ``nhead``.
            ffn_dim_expansion: forwarded to ``FeatureTransformer``.
            num_transformer_layers: forwarded to ``FeatureTransformer``.
            reg_refine: when true, build the refinement projection and update
                block used at the last scale.
            task: ``"flow"`` selects a 2-channel refinement output, anything
                else a 1-channel one; ``"depth"`` additionally keeps the
                convex upsampler and enables bilinear upsampling in the
                update block.
        """
        super().__init__()

        self.feature_channels = feature_channels
        self.num_scales = num_scales
        self.upsample_factor = upsample_factor
        self.reg_refine = reg_refine

        # CNN
        self.backbone = CNNEncoder(output_dim=feature_channels, num_output_scales=num_scales)

        # Transformer
        self.transformer = FeatureTransformer(
            num_layers=num_transformer_layers,
            d_model=feature_channels,
            nhead=num_head,
            ffn_dim_expansion=ffn_dim_expansion,
        )

        # propagation with self-attn
        self.feature_flow_attn = SelfAttnPropagation(in_channels=feature_channels)

        if not self.reg_refine or task == "depth":
            # convex upsampling similar to RAFT
            # concat feature0 and low res flow as input
            self.upsampler = nn.Sequential(
                nn.Conv2d(2 + feature_channels, 256, 3, 1, 1),
                nn.ReLU(inplace=True),
                nn.Conv2d(256, upsample_factor**2 * 9, 1, 1, 0),
            )
            # thus far, all the learnable parameters are task-agnostic

        if reg_refine:
            # optional task-specific local regression refinement
            # The projection consumes the Transformer features, so its input
            # width must follow feature_channels (previously hard-coded to
            # 128, which only worked for the default). The 256 outputs are
            # split into two equal halves in forward().
            self.refine_proj = nn.Conv2d(feature_channels, 256, 1)
            self.refine = BasicUpdateBlock(
                corr_channels=(2 * 4 + 1) ** 2,
                downsample_factor=upsample_factor,
                flow_dim=2 if task == "flow" else 1,
                bilinear_up=task == "depth",
            )

    def extract_feature(self, img0, img1):
        """Run the backbone on both images at once and split the result.

        Args:
            img0: first image batch.
            img1: second image batch (concatenated with ``img0`` on dim 0).

        Returns:
            Two lists (one per image) of per-scale features, ordered from low
            to high resolution.
        """
        concat = torch.cat((img0, img1), dim=0)  # [2B, C, H, W]
        features = self.backbone(concat)  # list of [2B, C, H, W], resolution from high to low

        # reverse: resolution from low to high
        features = features[::-1]

        feature0, feature1 = [], []

        for feature in features:
            chunks = torch.chunk(feature, 2, 0)  # tuple
            feature0.append(chunks[0])
            feature1.append(chunks[1])

        return feature0, feature1

    def upsample_flow(self, flow, feature, bilinear=False, upsample_factor=8, is_depth=False):
        """Upsample a low-resolution prediction.

        Args:
            flow: low-resolution prediction to upsample.
            feature: features concatenated with ``flow`` to predict the convex
                upsampling mask; unused when ``bilinear`` is true.
            bilinear: use bilinear interpolation instead of convex upsampling.
            upsample_factor: scale factor for the bilinear path only. The
                convex path always uses ``self.upsample_factor`` because the
                mask width of ``self.upsampler`` was fixed at construction.
            is_depth: when true the values are not multiplied by the
                upsampling factor.

        Returns:
            The upsampled prediction.
        """
        if bilinear:
            multiplier = 1 if is_depth else upsample_factor
            up_flow = (
                F.interpolate(flow, scale_factor=upsample_factor, mode="bilinear", align_corners=True) * multiplier
            )
        else:
            concat = torch.cat((flow, feature), dim=1)
            mask = self.upsampler(concat)
            up_flow = upsample_flow_with_mask(flow, mask, upsample_factor=self.upsample_factor, is_depth=is_depth)

        return up_flow

    def forward(
        self,
        img0,
        img1,
        attn_type=None,
        attn_splits_list=None,
        corr_radius_list=None,
        prop_radius_list=None,
        num_reg_refine=1,
        pred_bidir_flow=False,
        task="flow",
        intrinsics=None,
        pose=None,  # relative pose transform
        min_depth=1.0 / 0.5,  # inverse depth range
        max_depth=1.0 / 10,
        num_depth_candidates=64,
        depth_from_argmax=False,
        pred_bidir_depth=False,
        **kwargs,
    ):
        """Predict flow, disparity or depth between ``img0`` and ``img1``.

        Args:
            img0, img1: input image batches. For ``task == "flow"`` they are
                normalized here; for other tasks they are used as given.
            attn_type: forwarded to the Transformer.
            attn_splits_list: per-scale attention split counts; required
                (``len()`` is taken unconditionally).
            corr_radius_list: per-scale correlation radius, ``-1`` selecting
                global matching; required unless ``task == "depth"``.
            prop_radius_list: per-scale propagation radius; values ``> 0``
                enable local-window propagation. Required.
            num_reg_refine: number of refinement iterations when the model was
                built with ``reg_refine``; must be positive.
            pred_bidir_flow: also predict the reverse direction (flow only).
            task: ``"flow"``, ``"stereo"`` or ``"depth"``.
            intrinsics, pose: camera intrinsics and relative pose, used only
                for ``task == "depth"``.
            min_depth, max_depth: endpoints of the inverse-depth candidate
                range, also used (in ascending order) as clamp bounds.
            num_depth_candidates: number of inverse-depth candidates.
            depth_from_argmax: forwarded to ``correlation_softmax_depth``.
            pred_bidir_depth: also predict depth for the swapped image pair.
            **kwargs: accepted and ignored.

        Returns:
            ``{"flow_preds": [...]}``. For stereo each entry has dim 1
            squeezed; for depth each entry is converted from inverse depth to
            depth (``1 / x``) with dim 1 squeezed.
        """
        if pred_bidir_flow:
            assert task == "flow"

        if task == "depth":
            assert self.num_scales == 1  # multi-scale depth model is not supported yet

        # torch.clamp returns `max` everywhere when min > max, which is the
        # case for the signature defaults (2.0 and 0.1). Order the bounds so
        # the clamp is a real range; callers that already pass min <= max are
        # unaffected. The candidate linspace below keeps the caller's order.
        clamp_lo, clamp_hi = min_depth, max_depth
        if task == "depth" and min_depth > max_depth:
            clamp_lo, clamp_hi = max_depth, min_depth

        results_dict = {}
        flow_preds = []

        if task == "flow":
            # stereo and depth tasks have normalized img in dataloader
            img0, img1 = normalize_img(img0, img1)  # [B, 3, H, W]

        # list of features, resolution low to high
        feature0_list, feature1_list = self.extract_feature(img0, img1)  # list of features

        flow = None

        if task != "depth":
            assert len(attn_splits_list) == len(corr_radius_list) == len(prop_radius_list) == self.num_scales
        else:
            assert len(attn_splits_list) == len(prop_radius_list) == self.num_scales == 1

        for scale_idx in range(self.num_scales):
            feature0, feature1 = feature0_list[scale_idx], feature1_list[scale_idx]

            if pred_bidir_flow and scale_idx > 0:
                # predicting bidirectional flow with refinement
                feature0, feature1 = torch.cat((feature0, feature1), dim=0), torch.cat((feature1, feature0), dim=0)

            # keep the pre-Transformer features for the refinement correlation
            feature0_ori, feature1_ori = feature0, feature1

            upsample_factor = self.upsample_factor * (2 ** (self.num_scales - 1 - scale_idx))

            if task == "depth":
                # scale intrinsics
                intrinsics_curr = intrinsics.clone()
                intrinsics_curr[:, :2] = intrinsics_curr[:, :2] / upsample_factor

            if scale_idx > 0:
                assert task != "depth"  # not supported for multi-scale depth model
                flow = F.interpolate(flow, scale_factor=2, mode="bilinear", align_corners=True) * 2

            if flow is not None:
                assert task != "depth"
                flow = flow.detach()

                if task == "stereo":
                    # construct flow vector for disparity
                    # flow here is actually disparity
                    zeros = torch.zeros_like(flow)  # [B, 1, H, W]
                    # NOTE: reverse disp, disparity is positive
                    displace = torch.cat((-flow, zeros), dim=1)  # [B, 2, H, W]
                    feature1 = flow_warp(feature1, displace)  # [B, C, H, W]
                elif task == "flow":
                    feature1 = flow_warp(feature1, flow)  # [B, C, H, W]
                else:
                    raise NotImplementedError

            attn_splits = attn_splits_list[scale_idx]
            if task != "depth":
                corr_radius = corr_radius_list[scale_idx]
            prop_radius = prop_radius_list[scale_idx]

            # add position to features
            feature0, feature1 = feature_add_position(feature0, feature1, attn_splits, self.feature_channels)

            # Transformer
            feature0, feature1 = self.transformer(
                feature0,
                feature1,
                attn_type=attn_type,
                attn_num_splits=attn_splits,
            )

            # correlation and softmax
            if task == "depth":
                # first generate depth candidates
                b, _, h, w = feature0.size()
                depth_candidates = torch.linspace(min_depth, max_depth, num_depth_candidates).type_as(feature0)
                depth_candidates = depth_candidates.view(1, num_depth_candidates, 1, 1).repeat(
                    b, 1, h, w
                )  # [B, D, H, W]

                flow_pred = correlation_softmax_depth(
                    feature0,
                    feature1,
                    intrinsics_curr,
                    pose,
                    depth_candidates=depth_candidates,
                    depth_from_argmax=depth_from_argmax,
                    pred_bidir_depth=pred_bidir_depth,
                )[0]

            else:
                if corr_radius == -1:  # global matching
                    if task == "flow":
                        flow_pred = global_correlation_softmax(feature0, feature1, pred_bidir_flow)[0]
                    elif task == "stereo":
                        flow_pred = global_correlation_softmax_stereo(feature0, feature1)[0]
                    else:
                        raise NotImplementedError
                else:  # local matching
                    if task == "flow":
                        flow_pred = local_correlation_softmax(feature0, feature1, corr_radius)[0]
                    elif task == "stereo":
                        flow_pred = local_correlation_softmax_stereo(feature0, feature1, corr_radius)[0]
                    else:
                        raise NotImplementedError

            # flow or residual flow
            flow = flow + flow_pred if flow is not None else flow_pred

            if task == "stereo":
                flow = flow.clamp(min=0)  # positive disparity

            # upsample to the original resolution for supervision at training time only
            if self.training:
                flow_bilinear = self.upsample_flow(
                    flow, None, bilinear=True, upsample_factor=upsample_factor, is_depth=task == "depth"
                )
                flow_preds.append(flow_bilinear)

            # flow propagation with self-attn
            if (pred_bidir_flow or pred_bidir_depth) and scale_idx == 0:
                feature0 = torch.cat((feature0, feature1), dim=0)  # [2*B, C, H, W] for propagation

            flow = self.feature_flow_attn(
                feature0,
                flow.detach(),
                local_window_attn=prop_radius > 0,
                local_window_radius=prop_radius,
            )

            # bilinear exclude the last one
            if self.training and scale_idx < self.num_scales - 1:
                flow_up = self.upsample_flow(
                    flow, feature0, bilinear=True, upsample_factor=upsample_factor, is_depth=task == "depth"
                )
                flow_preds.append(flow_up)

            if scale_idx == self.num_scales - 1:
                if not self.reg_refine:
                    # upsample to the original image resolution

                    if task == "stereo":
                        flow_pad = torch.cat((-flow, torch.zeros_like(flow)), dim=1)  # [B, 2, H, W]
                        flow_up_pad = self.upsample_flow(flow_pad, feature0)
                        flow_up = -flow_up_pad[:, :1]  # [B, 1, H, W]
                    elif task == "depth":
                        depth_pad = torch.cat((flow, torch.zeros_like(flow)), dim=1)  # [B, 2, H, W]
                        depth_up_pad = self.upsample_flow(depth_pad, feature0, is_depth=True).clamp(
                            min=clamp_lo, max=clamp_hi
                        )
                        flow_up = depth_up_pad[:, :1]  # [B, 1, H, W]
                    else:
                        flow_up = self.upsample_flow(flow, feature0)

                    flow_preds.append(flow_up)
                else:
                    # task-specific local regression refinement
                    # supervise current flow
                    if self.training:
                        flow_up = self.upsample_flow(
                            flow, feature0, bilinear=True, upsample_factor=upsample_factor, is_depth=task == "depth"
                        )
                        flow_preds.append(flow_up)

                    assert num_reg_refine > 0
                    for refine_iter_idx in range(num_reg_refine):
                        flow = flow.detach()

                        if task == "stereo":
                            zeros = torch.zeros_like(flow)  # [B, 1, H, W]
                            # NOTE: reverse disp, disparity is positive
                            displace = torch.cat((-flow, zeros), dim=1)  # [B, 2, H, W]
                            correlation = local_correlation_with_flow(
                                feature0_ori,
                                feature1_ori,
                                flow=displace,
                                local_radius=4,
                            )  # [B, (2R+1)^2, H, W]
                        elif task == "depth":
                            if pred_bidir_depth and refine_iter_idx == 0:
                                # duplicate the geometry once so both directions are refined
                                intrinsics_curr = intrinsics_curr.repeat(2, 1, 1)
                                pose = torch.cat((pose, torch.inverse(pose)), dim=0)

                                feature0_ori, feature1_ori = torch.cat((feature0_ori, feature1_ori), dim=0), torch.cat(
                                    (feature1_ori, feature0_ori), dim=0
                                )

                            flow_from_depth = compute_flow_with_depth_pose(
                                1.0 / flow.squeeze(1),
                                intrinsics_curr,
                                extrinsics_rel=pose,
                            )

                            correlation = local_correlation_with_flow(
                                feature0_ori,
                                feature1_ori,
                                flow=flow_from_depth,
                                local_radius=4,
                            )  # [B, (2R+1)^2, H, W]

                        else:
                            correlation = local_correlation_with_flow(
                                feature0_ori,
                                feature1_ori,
                                flow=flow,
                                local_radius=4,
                            )  # [B, (2R+1)^2, H, W]

                        proj = self.refine_proj(feature0)

                        net, inp = torch.chunk(proj, chunks=2, dim=1)

                        net = torch.tanh(net)
                        inp = torch.relu(inp)

                        net, up_mask, residual_flow = self.refine(
                            net,
                            inp,
                            correlation,
                            flow.clone(),
                        )

                        if task == "depth":
                            flow = (flow - residual_flow).clamp(min=clamp_lo, max=clamp_hi)
                        else:
                            flow = flow + residual_flow

                        if task == "stereo":
                            flow = flow.clamp(min=0)  # positive

                        if self.training or refine_iter_idx == num_reg_refine - 1:
                            if task == "depth":
                                if refine_iter_idx < num_reg_refine - 1:
                                    # bilinear upsampling
                                    flow_up = self.upsample_flow(
                                        flow, feature0, bilinear=True, upsample_factor=upsample_factor, is_depth=True
                                    )
                                else:
                                    # last one convex upsampling
                                    # NOTE: clamp depth due to the zero padding in the unfold in the convex upsampling
                                    # pad depth to 2 channels as flow
                                    depth_pad = torch.cat((flow, torch.zeros_like(flow)), dim=1)  # [B, 2, H, W]
                                    depth_up_pad = self.upsample_flow(depth_pad, feature0, is_depth=True).clamp(
                                        min=clamp_lo, max=clamp_hi
                                    )
                                    flow_up = depth_up_pad[:, :1]  # [B, 1, H, W]

                            else:
                                flow_up = upsample_flow_with_mask(
                                    flow, up_mask, upsample_factor=self.upsample_factor, is_depth=task == "depth"
                                )

                            flow_preds.append(flow_up)

        if task == "stereo":
            flow_preds = [pred.squeeze(1) for pred in flow_preds]  # [B, H, W]

        # convert inverse depth to depth
        if task == "depth":
            flow_preds = [1.0 / pred.squeeze(1) for pred in flow_preds]  # [B, H, W]

        results_dict.update({"flow_preds": flow_preds})

        return results_dict


================================================
FILE: Open-Sora/tools/scoring/optical_flow/unimatch/utils.py
================================================
import torch
import torch.nn.functional as F

from .position import PositionEmbeddingSine


def generate_window_grid(h_min, h_max, w_min, w_max, len_h, len_w, device=None):
    """Build a regular (x, y) coordinate grid.

    Args:
        h_min, h_max: first and last y coordinate.
        w_min, w_max: first and last x coordinate.
        len_h, len_w: number of samples along height and width.
        device: device on which the grid is created; must not be ``None``.

    Returns:
        Float tensor of shape ``[len_h, len_w, 2]`` whose last dimension is
        ``(x, y)``.
    """
    assert device is not None

    xs = torch.linspace(w_min, w_max, len_w, device=device)
    ys = torch.linspace(h_min, h_max, len_h, device=device)
    # "ij" is what torch.meshgrid did by default; passing it explicitly keeps
    # the result unchanged and avoids the missing-`indexing` warning.
    x, y = torch.meshgrid(xs, ys, indexing="ij")  # each [len_w, len_h]
    grid = torch.stack((x, y), -1).transpose(0, 1).float()  # [H, W, 2]

    return grid


def normalize_coords(coords, h, w):
    """Map pixel coordinates to the range [-1, 1].

    Args:
        coords: tensor whose last dimension is ``(x, y)``; the original
            comment gives the shape as ``[B, H, W, 2]``.
        h, w: height and width of the pixel grid.

    Returns:
        ``(coords - c) / c`` with ``c = ((w - 1) / 2, (h - 1) / 2)``.
    """
    half_extent = torch.tensor(
        [(w - 1) / 2.0, (h - 1) / 2.0],
        dtype=torch.float32,
        device=coords.device,
    )
    centered = coords - half_extent
    return centered / half_extent  # [-1, 1]


def normalize_img(img0, img1):
    """Normalize two image batches with ImageNet statistics.

    The inputs are expected in the range [0, 255] (per the original comment);
    each is divided by 255, then the per-channel mean is subtracted and the
    result divided by the per-channel std. The statistics are placed on
    ``img1``'s device.

    Returns:
        Tuple ``(img0, img1)`` of normalized tensors.
    """
    target = img1.device
    imagenet_mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(target)
    imagenet_std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(target)

    def _standardize(img):
        return (img / 255.0 - imagenet_mean) / imagenet_std

    return _standardize(img0), _standardize(img1)


def split_feature(
    feature,
    num_splits=2,
    channel_last=False,
):
    """Cut a feature map into ``num_splits x num_splits`` windows.

    Args:
        feature: ``[B, C, H, W]`` tensor, or ``[B, H, W, C]`` when
            ``channel_last`` is true. H and W must be divisible by
            ``num_splits``.
        num_splits: number of windows along each spatial axis (K).
        channel_last: layout selector for ``feature``.

    Returns:
        ``[B*K*K, C, H/K, W/K]`` (or ``[B*K*K, H/K, W/K, C]``) with the
        windows stacked along the batch dimension in row-major order.
    """
    k = num_splits
    if channel_last:  # [B, H, W, C]
        batch, height, width, channels = feature.size()
    else:  # [B, C, H, W]
        batch, channels, height, width = feature.size()
    assert height % k == 0 and width % k == 0

    win_h, win_w = height // k, width // k
    stacked = batch * k * k

    if channel_last:
        windows = feature.view(batch, k, win_h, k, win_w, channels).permute(0, 1, 3, 2, 4, 5)
        return windows.reshape(stacked, win_h, win_w, channels)  # [B*K*K, H/K, W/K, C]

    windows = feature.view(batch, channels, k, win_h, k, win_w).permute(0, 2, 4, 1, 3, 5)
    return windows.reshape(stacked, channels, win_h, win_w)  # [B*K*K, C, H/K, W/K]


def merge_splits(
    splits,
    num_splits=2,
    channel_last=False,
):
    """Reassemble windows stacked along the batch dimension into full maps.

    Args:
        splits: ``[B*K*K, C, H/K, W/K]`` tensor, or ``[B*K*K, H/K, W/K, C]``
            when ``channel_last`` is true, with windows in row-major order.
        num_splits: number of windows along each spatial axis (K).
        channel_last: layout selector for ``splits``.

    Returns:
        ``[B, C, H, W]`` (or ``[B, H, W, C]``) tensor.
    """
    k = num_splits
    if channel_last:  # [B*K*K, H/K, W/K, C]
        stacked, win_h, win_w, channels = splits.size()
        batch = stacked // k // k
        tiles = splits.view(batch, k, k, win_h, win_w, channels).permute(0, 1, 3, 2, 4, 5)
        return tiles.contiguous().view(batch, k * win_h, k * win_w, channels)  # [B, H, W, C]

    # [B*K*K, C, H/K, W/K]
    stacked, channels, win_h, win_w = splits.size()
    batch = stacked // k // k
    tiles = splits.view(batch, k, k, channels, win_h, win_w).permute(0, 3, 1, 4, 2, 5)
    return tiles.contiguous().view(batch, channels, k * win_h, k * win_w)  # [B, C, H, W]


def generate_shift_window_attn_mask(
    input_resolution, window_size_h, window_size_w, shift_size_h, shift_size_w, device=torch.device("cuda")
):
    """Build the additive attention mask for shifted-window attention.

    Each of the 3 x 3 regions delimited by the window and shift sizes gets a
    distinct id; after window partitioning, position pairs with different ids
    receive -100 and pairs with the same id receive 0.

    ref: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py

    Args:
        input_resolution: ``(h, w)`` of the feature map.
        window_size_h, window_size_w: window size along each axis.
        shift_size_h, shift_size_w: shift along each axis.
        device: device for the mask.

    Returns:
        Tensor of shape ``[num_windows, window_size_h * window_size_w,
        window_size_h * window_size_w]``.
    """
    height, width = input_resolution
    region_ids = torch.zeros((1, height, width, 1)).to(device)  # 1 H W 1

    row_bands = (slice(0, -window_size_h), slice(-window_size_h, -shift_size_h), slice(-shift_size_h, None))
    col_bands = (slice(0, -window_size_w), slice(-window_size_w, -shift_size_w), slice(-shift_size_w, None))
    band_pairs = ((rows, cols) for rows in row_bands for cols in col_bands)
    for region, (rows, cols) in enumerate(band_pairs):
        region_ids[:, rows, cols, :] = region

    # NOTE(review): the split count is derived from the width only and is
    # applied to both axes by split_feature.
    windows = split_feature(region_ids, num_splits=input_resolution[-1] // window_size_w, channel_last=True)
    windows = windows.view(-1, window_size_h * window_size_w)

    id_diff = windows.unsqueeze(1) - windows.unsqueeze(2)
    differs = id_diff != 0
    attn_mask = id_diff.masked_fill(differs, float(-100.0)).masked_fill(id_diff == 0, float(0.0))

    return attn_mask


def feature_add_position(feature0, feature1, attn_splits, feature_channels):
    """Add a sine positional encoding to both feature maps.

    With ``attn_splits > 1`` the encoding is computed for, and added inside,
    each of the split windows and the windows are merged back afterwards;
    otherwise it is computed once for the whole map. The encoding derived
    from ``feature0`` is applied to both inputs.

    Args:
        feature0, feature1: feature maps to augment.
        attn_splits: number of windows along each spatial axis.
        feature_channels: channel count; half of it is passed to the
            encoder as ``num_pos_feats``.

    Returns:
        Tuple ``(feature0, feature1)`` with the encoding added.
    """
    pos_enc = PositionEmbeddingSine(num_pos_feats=feature_channels // 2)

    if attn_splits > 1:  # add position in split window
        windows0 = split_feature(feature0, num_splits=attn_splits)
        windows1 = split_feature(feature1, num_splits=attn_splits)

        encoding = pos_enc(windows0)
        windows0 = windows0 + encoding
        windows1 = windows1 + encoding

        return (
            merge_splits(windows0, num_splits=attn_splits),
            merge_splits(windows1, num_splits=attn_splits),
        )

    encoding = pos_enc(feature0)
    return feature0 + encoding, feature1 + encoding


def upsample_flow_with_mask(flow, up_mask, upsample_factor, is_depth=False):
    """Convex upsampling following RAFT.

    Every output pixel is a softmax-weighted combination of the 3 x 3
    neighbourhood (zero padded) of its low-resolution source pixel.

    Args:
        flow: ``[B, C, H, W]`` low-resolution prediction.
        up_mask: logits viewable as ``[B, 1, 9, K, K, H, W]``.
        upsample_factor: K, the integer upsampling factor.
        is_depth: when false the values are multiplied by K before blending.

    Returns:
        ``[B, C, K*H, K*W]`` upsampled prediction.
    """
    batch, channels, height, width = flow.shape
    k = upsample_factor

    weights = up_mask.view(batch, 1, 9, k, k, height, width)  # [B, 1, 9, K, K, H, W]
    weights = torch.softmax(weights, dim=2)

    scale = 1 if is_depth else k
    patches = F.unfold(scale * flow, [3, 3], padding=1)
    patches = patches.view(batch, channels, 9, 1, 1, height, width)  # [B, C, 9, 1, 1, H, W]

    blended = torch.sum(weights * patches, dim=2)  # [B, C, K, K, H, W]
    blended = blended.permute(0, 1, 4, 2, 5, 3)  # [B, C, H, K, W, K]
    return blended.reshape(batch, channels, k * height, k * width)  # [B, C, K*H, K*W]


def split_feature_1d(
    feature,
    num_splits=2,
):
    """Cut a ``[B, W, C]`` tensor into ``num_splits`` chunks along W.

    W must be divisible by ``num_splits``.

    Returns:
        ``[B*K, W/K, C]`` tensor with the chunks stacked along the batch
        dimension.
    """
    batch, width, channels = feature.size()
    assert width % num_splits == 0

    chunk_w = width // num_splits
    chunks = feature.view(batch, num_splits, chunk_w, channels)
    return chunks.view(batch * num_splits, chunk_w, channels)  # [B*K, W/K, C]


def merge_splits_1d(
    splits,
    h,
    num_splits=2,
):
    """Join width-wise chunks back into rows.

    Args:
        splits: ``[B*H*K, W/K, C]`` tensor of chunks.
        h: number of rows (H) folded into the leading dimension.
        num_splits: number of chunks per row (K).

    Returns:
        ``[B, H, W, C]`` tensor.
    """
    stacked, chunk_w, channels = splits.size()
    batch = stacked // num_splits // h

    grouped = splits.view(batch, h, num_splits, chunk_w, channels)
    return grouped.view(batch, h, num_splits * chunk_w, channels)  # [B, H, W, C]


def window_partition_1d(x, window_size_w):
    """Partition a sequence into non-overlapping windows along W.

    Args:
        x: (B, W, C)
        window_size_w (int): window size along W

    Returns:
        windows: (num_windows*B, window_size_w, C)
    """
    B, W, C = x.shape
    windows_per_row = W // window_size_w
    grouped = x.view(B, windows_per_row, window_size_w, C)
    return grouped.view(-1, window_size_w, C)


def generate_shift_window_attn_mask_1d(input_w, window_size_w, shift_size_w, device=torch.device("cuda")):
    """Build the additive attention mask for 1D shifted-window attention (SW-MSA).

    Positions along W are labelled by region (body, the stretch between the
    last window start and the shift boundary, and the final ``shift_size_w``
    positions). Inside each window, pairs of positions with different labels
    get -100.0 and pairs with the same label get 0.0.

    Args:
        input_w: length of the W axis.
        window_size_w: window length along W.
        shift_size_w: shift length along W.
        device: device on which the mask is created.

    Returns:
        Tensor of shape (num_windows, window_size_w, window_size_w).
    """
    region_ids = torch.zeros((1, input_w, 1), device=device)  # 1 W 1
    regions = (
        slice(0, -window_size_w),
        slice(-window_size_w, -shift_size_w),
        slice(-shift_size_w, None),
    )
    # Later regions overwrite earlier ones wherever the slices overlap.
    for label, region in enumerate(regions):
        region_ids[:, region, :] = label

    windows = window_partition_1d(region_ids, window_size_w)  # nW, window_size, 1
    windows = windows.view(-1, window_size_w)

    # Pairwise label difference inside each window: nW, window_size, window_size
    diff = windows.unsqueeze(1) - windows.unsqueeze(2)

    return torch.zeros_like(diff).masked_fill(diff != 0, float(-100.0))


================================================
FILE: PixArt-alpha-ToCa/Dockerfile
================================================
# This is a sample Dockerfile that builds a runtime container and runs the sample Gradio app.
# Note, you must pass in the pretrained models when you run the container.

# Base image: CUDA 12.2.0 runtime on Ubuntu 22.04.
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04

WORKDIR /workspace

# Install git, Python 3 (with pip and venv) and the libgl1/libglib2.0 system libraries,
# then remove the apt package lists in the same layer.
RUN apt-get update && \
    apt-get install -y \
        git \
        python3 \
        python-is-python3 \
        python3-pip \
        python3.10-venv \
        libgl1 \
        libgl1-mesa-glx \ 
        libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# requirements.txt is added and installed before the rest of the sources are added.
ADD requirements.txt .

RUN pip install -r requirements.txt

# Add the rest of the build context into /workspace.
ADD . .

RUN chmod a+x docker-entrypoint.sh

# Default demo port; presumably read by docker-entrypoint.sh / the Gradio app - confirm.
ENV DEMO_PORT=12345
ENTRYPOINT [ "/workspace/docker-entrypoint.sh" ]

================================================
FILE: PixArt-alpha-ToCa/README(PixArt-alpha).md
================================================
<p align="center">
  <img src="asset/logo.png"  height=120>
</p>


### <div align="center">👉 PixArt-α: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis</div>
### <div align="center"> ICLR 2024 Spotlight </div>

<div align="center">
  <a href="https://github.com/PixArt-alpha/PixArt-sigma/"><img src="https://img.shields.io/static/v1?label=PixArt-Sigma Code&message=Github&color=blue&logo=github-pages"></a> &ensp;

  <a href="https://pixart-alpha.github.io/"><img src="https://img.shields.io/static/v1?label=Project%20Page&message=Github&color=blue&logo=github-pages"></a> &ensp;
  <a href="https://huggingface.co/datasets/PixArt-alpha/SAM-LLaVA-Captions10M"><img src="https://img.shields.io/static/v1?label=SAM-LLaVA&message=HF&color=yellow"></a> &ensp;
  <a href="https://arxiv.org/abs/2310.00426"><img src="https://img.shields.io/static/v1?label=Paper&message=Arxiv:Alpha&color=red&logo=arxiv"></a> &ensp;
  <a href="https://arxiv.org/abs/2401.05252"><img src="https://img.shields.io/static/v1?label=Paper&message=Arxiv:Delta&color=red&logo=arxiv"></a> &ensp;
  <a href="https://discord.gg/rde6eaE5Ta"><img src="https://img.shields.io/static/v1?label=Discuss&message=Discord&color=purple&logo=discord"></a> &ensp;
  <a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/pixart"><img src="https://img.shields.io/static/v1?label=Usage&message=Diffusers&color=green&"></a> &ensp;
  <a href="https://github.com/city96/ComfyUI_ExtraModels"><img src="https://img.shields.io/static/v1?label=App&message=ComfyUI&&color=green"></a> &ensp;

  <a href="https://huggingface.co/spaces/PixArt-alpha/PixArt-alpha"><img src="https://img.shields.io/static/v1?label=Demo PixArt&message=HuggingFace&color=yellow"></a> &ensp;
  <a href="https://huggingface.co/spaces/PixArt-alpha/PixArt-LCM"><img src="https://img.shields.io/static/v1?label=Demo PixArt-LCM&message=HuggingFace&color=yellow"></a> &ensp;
  <a href="https://openxlab.org.cn/apps/detail/PixArt-alpha/PixArt-alpha"><img src="https://img.shields.io/static/v1?label=Demo PixArt&message=OpenXLab&color=purple"></a> &ensp;
  <a href="https://openxlab.org.cn/apps/detail/houshaowei/PixArt-LCM"><img src="https://img.shields.io/static/v1?label=Demo PixArt-LCM&message=OpenXLab&color=purple"></a> &ensp;
  <a href="https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing"><img src="https://img.shields.io/static/v1?label=Free%20Trial&message=Google%20Colab&logo=google&color=orange"></a> &ensp;
</div>

---

This repo contains PyTorch model definitions, pre-trained weights and inference/sampling code for our paper exploring
fast training of diffusion models with transformers. You can find more visualizations on our [project page](https://pixart-alpha.github.io/).

<img src="asset/logo.png" width="10%" alt="" /> **PixArt-α Community**: Join our PixArt-α discord channels <a href="https://discord.gg/rde6eaE5Ta" style="text-decoration:none;">
<img src="https://user-images.githubusercontent.com/25839884/218347213-c080267f-cbb6-443e-8532-8e1ed9a58ea9.png" width="3%" alt="" /></a> for discussions. Coders are welcome to contribute.

> [**PixArt-α: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis**](https://pixart-alpha.github.io/)<br>
> [Junsong Chen*](https://lawrence-cj.github.io/), [Jincheng Yu*](https://lovesykun.cn/about.html), 
> [Chongjian Ge*](https://chongjiange.github.io/), [Lewei Yao*](https://scholar.google.com/citations?user=hqDyTg8AAAAJ&hl=zh-CN&oi=ao),
> [Enze Xie](https://xieenze.github.io/)&#8224;,
> [Yue Wu](https://yuewuhkust.github.io/), [Zhongdao Wang](https://zhongdao.github.io/), 
> [James Kwok](https://www.cse.ust.hk/~jamesk/), [Ping Luo](http://luoping.me/), 
> [Huchuan Lu](https://scholar.google.com/citations?hl=en&user=D3nE0agAAAAJ), 
> [Zhenguo Li](https://scholar.google.com/citations?user=XboZC1AAAAAJ)
> <br>Huawei Noah’s Ark Lab, Dalian University of Technology, HKU, HKUST<br>

> [**PIXART-δ: Fast and Controllable Image Generation with Latent Consistency Models**](https://pixart-alpha.github.io/)<br>
> [Junsong Chen](https://lawrence-cj.github.io/), [Yue Wu](https://yuewuhkust.github.io/), [Simian Luo](https://luosiallen.github.io/),  [Enze Xie](https://xieenze.github.io/)&#8224;,
> [Sayak Paul](https://sayak.dev/), [Ping Luo](http://luoping.me/), [Hang Zhao](), [Zhenguo Li](https://scholar.google.com/citations?user=XboZC1AAAAAJ)
> <br>Huawei Noah’s Ark Lab, DLUT, Tsinghua University, HKU, Hugging Face<br>

---
## Breaking News 🔥🔥!!
- (🔥 New) Apr. 12, 2024. 💥 A better version of [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma) training & inference code, checkpoints are all released!!!
Welcome to collaborate and contribute. Star 🌟us if you think it is helpful!!


- (🔥 New) Jan. 19, 2024. 💥 [PixArt-δ](https://arxiv.org/abs/2401.05252) ControlNet [app_controlnet.py](app/app_controlnet.py) and [Checkpoint](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) are released!!!
- (🔥 New) Jan. 16, 2024. 💥 Glad to announce that [PixArt-α](https://arxiv.org/abs/2310.00426) is accepted by ICLR 2024 (Spotlight).
- (🔥 New) Dec. 17, 2023. 💥 PixArt supports [ComfyUI](https://github.com/comfyanonymous/ComfyUI#manual-install-windows-linux). Thanks to [@city96](https://github.com/city96/ComfyUI_ExtraModels) with his great work.
- (🔥 New) Nov. 30, 2023. 💥 PixArt collaborates with [LCMs](https://github.com/luosiallen/latent-consistency-model) team to make the **fastest** [Training & Inference Text-to-Image Generation System](https://github.com/PixArt-alpha/PixArt-alpha).
Here, [Training code](train_scripts/train_pixart_lcm.py) & [Inference code](scripts/inference_lcm.py) & [Weights](https://huggingface.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS) & [HF Demo](https://huggingface.co/spaces/PixArt-alpha/PixArt-LCM) [OpenXLab Demo](https://openxlab.org.cn/apps/detail/houshaowei/PixArt-LCM) are all released, we hope users will enjoy them. 
Detailed **inference speed** and **code guidance** can be found in [docs](asset/docs/pixart_lcm.md). At the same time, we update the codebase for better user experience and fix some bugs in the newest version.

---
## 🚩 **New Features/Updates**
- ✅ Jan. 11, 2024. 💥 [PixArt-δ](https://arxiv.org/abs/2401.05252): We are excited to announce the release of the [PixArt-δ](https://arxiv.org/abs/2401.05252) technical report!!!
This report offers valuable insights into the training of LCM and ControlNet-like modules in Transformer Models. Along with the report, we have also released all the training and inference code for LCM & ControlNet [in this repository](https://github.com/PixArt-alpha/PixArt-alpha). 
We encourage you to try them out and warmly welcome any Pull Requests from our users. Your contributions and feedback are highly appreciated!
- ✅ Feb. 07, 2024. [train_diffusers.py](train_scripts/train_diffusers.py) can directly train with diffusers model and visualize during training.
- ✅ Jan. 26, 2024. 💥 All checkpoints of [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha), including 256px checkpoints are all available here [Download Models](#-download-models).
- ✅ Jan. 19, 2024. 💥 [PixArt-δ](https://arxiv.org/abs/2401.05252) ControlNet [app_controlnet.py](app/app_controlnet.py) and [Checkpoint](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) are released!!!
- ✅ Jan. 12, 2024. 💥 We release the [SAM-LLaVA-Captions](https://huggingface.co/datasets/PixArt-alpha/SAM-LLaVA-Captions10M) used in PixArt-α training.
- ✅ Dec. 27, 2023. [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) incorporates into [ControlLLM](https://github.com/OpenGVLab/ControlLLM)!
- ✅ Dec. 17, 2023. [PixArt-LCM-Lora](train_scripts/train_pixart_lcm_lora.py) & [PixArt-Lora](train_scripts/train_pixart_lora_hf.py) training scripts in Hugging Face style are released.
- ✅ Dec. 13, 2023. Add multi-scale vae feature extraction in [tools/extract_features.py](https://github.com/PixArt-alpha/PixArt-alpha/blob/3b4f0afdbe39def80b41ab05c664c963edeebbcd/tools/extract_features.py#L276).
- ✅ Dec. 01, 2023. Add a [Notebook folder](./notebooks) to help users get started with PixArt quickly! Thanks to [@kopyl](https://github.com/kopyl) for his contribution!
- ✅ Nov. 27, 2023. 💥 **PixArt-α Community**: Join our PixArt-α discord channels <a href="https://discord.gg/rde6eaE5Ta" style="text-decoration:none;">
<img src="https://user-images.githubusercontent.com/25839884/218347213-c080267f-cbb6-443e-8532-8e1ed9a58ea9.png" width="3%" alt="" /></a> for discussions. Coders are welcome to contribute.
- ✅ Nov. 21, 2023. 💥 [SA-Solver](https://arxiv.org/abs/2309.05019) official code first release [here](asset/docs/sasolver.md).
- ✅ Nov. 19, 2023. Release `PixArt + Dreambooth` training scripts.
- ✅ Nov. 16, 2023. Diffusers support `random resolution` and `batch images` generation now. Besides, 
running `Pixart` in under 8GB GPU VRAM is available in 🧨 [diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pixart).
- ✅ Nov. 10, 2023. Support DALL-E 3 Consistency Decoder in 🧨 diffusers.
- ✅ Nov. 06, 2023. Release pretrained weights with 🧨 diffusers integration, Hugging Face demo, and Google Colab example.
- ✅ Nov. 03, 2023. Release the LLaVA-captioning inference code.
- ✅ Oct. 27, 2023. Release the training & feature extraction code.
- ✅ Oct. 20, 2023. Collaborate with Hugging Face & Diffusers team to co-release the code and weights. (plz stay tuned.)
- ✅ Oct. 15, 2023. Release the inference code.

---

## Contents
* [Training](#-how-to-train)
* [Inference](#-how-to-test)
* [Download Models](#-download-models)
* [Use diffusers](#1---using-in--diffusers)
* [Data Processing](#-how-to-extract-t5-and-vae-features)
* [PixArt-**α** Demo](#3---gradio-with-diffusers--faster-)
* [PixArt-**α** 8GB VRAM](asset/docs/pixart.md)
* [PixArt-**δ** (LCM)](asset/docs/pixart_lcm.md)
* [PixArt-**δ** (ControlNet)](asset/docs/pixart_controlnet.md)
* [PixArt-**δ** (Dreambooth)](asset/docs/pixart-dreambooth.md)
* [Acknowledgement](#acknowledgements)
* [Citation](#bibtex)


* [PixArt-**Σ** Releasing](https://github.com/PixArt-alpha/PixArt-sigma)

---

## 🐱 Abstract
<b>TL; DR: <font color="red">PixArt-α</font> is a Transformer-based T2I diffusion model whose image generation quality is competitive with state-of-the-art image generators (e.g., Imagen, SDXL, and even Midjourney), and the training speed markedly surpasses existing large-scale T2I models, e.g., PixArt-α only takes 10.8% of Stable Diffusion v1.5's training time (675 vs. 6,250 A100 GPU days).</b>

<details><summary>CLICK for the full abstract</summary>
The most advanced text-to-image (T2I) models require significant training costs (e.g., millions of GPU hours), 
seriously hindering the fundamental innovation for the AIGC community while increasing CO2 emissions. 
This paper introduces PixArt-α, a Transformer-based T2I diffusion model whose image generation quality is competitive with state-of-the-art image generators (e.g., Imagen, SDXL, and even Midjourney), 
reaching near-commercial application standards. Additionally, it supports high-resolution image synthesis up to 1024px resolution with low training cost. 
To achieve this goal, three core designs are proposed: 
(1) Training strategy decomposition: We devise three distinct training steps that separately optimize pixel dependency, text-image alignment, and image aesthetic quality; 
(2) Efficient T2I Transformer: We incorporate cross-attention modules into Diffusion Transformer (DiT) to inject text conditions and streamline the computation-intensive class-condition branch; 
(3) High-informative data: We emphasize the significance of concept density in text-image pairs and leverage a large Vision-Language model to auto-label dense pseudo-captions to assist text-image alignment learning. 
As a result, PixArt-α's training speed markedly surpasses existing large-scale T2I models, 
e.g., PixArt-α only takes 10.8% of Stable Diffusion v1.5's training time (675 vs. 6,250 A100 GPU days), 
saving nearly $300,000 ($26,000 vs. $320,000) and reducing 90% CO2 emissions. Moreover, compared with a larger SOTA model, RAPHAEL, 
our training cost is merely 1%. Extensive experiments demonstrate that PixArt-α excels in image quality, artistry, and semantic control. 
We hope PixArt-α will provide new insights to the AIGC community and startups to accelerate building their own high-quality yet low-cost generative models from scratch.
</details>

---

![A small cactus with a happy face in the Sahara desert.](asset/images/teaser.png)

---

# 🔥🔥🔥 Why PixArt-α? 
## Training Efficiency
PixArt-α only takes 12% of Stable Diffusion v1.5's training time (753 vs. 6,250 A100 GPU days), saving nearly $300,000 ($28,000 vs. $320,000) and reducing 90% CO2 emissions. Moreover, compared with a larger SOTA model, RAPHAEL, our training cost is merely 1%.
![Training Efficiency.](asset/images/efficiency.png)

| Method    | Type | #Params | #Images| FID-30K ↓        | A100 GPU days |
|-----------|------|---------|--------|------------------|---------------|
| DALL·E    | Diff | 12.0B   | 250M   | 27.50            |               |
| GLIDE     | Diff | 5.0B    | 250M   | 12.24            |               |
| LDM       | Diff | 1.4B    | 400M   | 12.64            |               |
| DALL·E 2  | Diff | 6.5B    | 650M   | 10.39            | 41,667        |
| SDv1.5    | Diff | 0.9B    | 2000M  | 9.62             | 6,250         |
| GigaGAN   | GAN  | 0.9B    | 2700M  | 9.09             | 4,783         |
| Imagen    | Diff | 3.0B    | 860M   | 7.27             | 7,132         |
| RAPHAEL   | Diff | 3.0B    | 5000M+ | 6.61             | 60,000        |
| PixArt-α  | Diff | 0.6B    | 25M    | 7.32 (zero-shot) | 753           |
| PixArt-α  | Diff | 0.6B    | 25M    | 5.51 (COCO FT)   | 753           |

## Inference Efficiency
PIXART-δ successfully generates **1024x1024 high resolution** images within **0.5 seconds** on an A100. With the implementation
of 8-bit inference technology, PIXART-δ requires **less than 8GB of GPU VRAM**. 

Let us stress again how liberating it is to explore image generation so easily with PixArt-LCM.

| Hardware                    | PIXART-δ (4 steps) | SDXL LoRA LCM (4 steps) | PixArt-α (14 steps) | SDXL standard (25 steps) |
|-----------------------------|--------------------|-------------------------|---------------------|---------------------------|
| T4 (Google Colab Free Tier) | 3.3s               | 8.4s                    | 16.0s               | 26.5s                     |
| V100 (32 GB)                | 0.8s               | 1.2s                    | 5.5s                | 7.7s                      |
| A100 (80 GB)                | 0.51s              | 1.2s                    | 2.2s                | 3.8s                      |

These tests were run with a batch size of 1 in all cases.

For cards with a lot of capacity, such as A100, performance increases significantly when generating multiple images at once, which is usually the case for production workloads.

## High-quality Generation from PixArt-α

- More samples
<div id="more-samples" style="display: flex; justify-content: center;">
  <img src="asset/images/more-samples1.png" style="width: 50%; height: auto; object-fit: contain; margin: 5px;">
  <img src="asset/images/more-samples.png" style="width: 43%; height: auto; object-fit: contain; margin: 5px;">
</div>

- PixArt + [Dreambooth](https://dreambooth.github.io/)
<div id="dreambooth" style="display: flex; justify-content: center;">
  <img src="asset/images/dreambooth/dreambooth_dog.svg" width="46%" style="margin: 5px;">
  <img src="asset/images/dreambooth/dreambooth_m5.svg" width="46%" style="margin: 5px;">
</div>

- PixArt + [ControlNet](https://github.com/lllyasviel/ControlNet)
<div id="ControlNet" style="display: flex; justify-content: center;">
  <img src="asset/images/controlnet/controlnet_huawei.svg" width="46%" style="margin: 5px;">
  <img src="asset/images/controlnet/controlnet_lenna.svg" width="46%" style="margin: 5px;">
</div>

# 🔧 Dependencies and Installation

- Python >= 3.9 (Recommend to use [Anaconda](https://www.anaconda.com/download/#linux) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html))
- [PyTorch >= 1.13.0+cu11.7](https://pytorch.org/)
```bash
conda create -n pixart python=3.9
conda activate pixart
pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu118

git clone https://github.com/PixArt-alpha/PixArt-alpha.git
cd PixArt-alpha
pip install -r requirements.txt
```

# ⏬ Download Models
All models will be automatically downloaded. You can also choose to download manually from this [url](https://huggingface.co/PixArt-alpha/PixArt-alpha).

| Model                       | #Params | url                                                                                                                                                                                                          | Download in OpenXLab                                                                                            |
|:----------------------------|:--------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------|
| T5                          | 4.3B    | [T5](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl)                                                                                                                                 | [T5](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/t5-v1_1-xxl.zip)                  |
| VAE                         | 80M     | [VAE](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/sd-vae-ft-ema)                                                                                                                              | [VAE](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/sd-vae-ft-ema.zip)               |
| PixArt-α-SAM-256            | 0.6B    | [PixArt-XL-2-SAM-256x256.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-SAM-256x256.pth) or [diffusers version](https://huggingface.co/PixArt-alpha/PixArt-XL-2-SAM-256x256) | [256-SAM](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/PixArt-XL-2-SAM-256x256.pth) |
| PixArt-α-256                | 0.6B    | [PixArt-XL-2-256x256.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-256x256.pth) or [diffusers version](https://huggingface.co/PixArt-alpha/PixArt-XL-2-256x256)             | [256](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/PixArt-XL-2-256x256.pth)         |
| PixArt-α-256-MSCOCO-FID7.32 | 0.6B    | [PixArt-XL-2-256x256.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-256x256-MSCOCO-FID732.pth)                                                                               | [256]()                                                                                                         |
| PixArt-α-512                | 0.6B    | [PixArt-XL-2-512x512.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-512x512.pth) or [diffusers version](https://huggingface.co/PixArt-alpha/PixArt-XL-2-512x512)             | [512](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/PixArt-XL-2-512x512.pth)         |
| PixArt-α-1024               | 0.6B    | [PixArt-XL-2-1024-MS.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-1024-MS.pth) or [diffusers version](https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS)             | [1024](https://download.openxlab.org.cn/models/PixArt-alpha/PixArt-alpha/weight/PixArt-XL-2-1024-MS.pth)        |
| PixArt-δ-1024-LCM           | 0.6B    | [diffusers version](https://huggingface.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS)                                                                                                                             |                                                                                                                 |
| ControlNet-HED-Encoder      | 30M     | [ControlNetHED.pth](https://huggingface.co/PixArt-alpha/PixArt-alpha/blob/main/ControlNetHED.pth)                                                                                                            |                                                                                                                 |
| PixArt-δ-512-ControlNet     | 0.9B    | [PixArt-XL-2-512-ControlNet.pth](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main)                                                                                                            | [512](https://openxlab.org.cn/models/detail/PixArt-alpha/PixArt-ControlNet)                                     |
| PixArt-δ-1024-ControlNet    | 0.9B    | [PixArt-XL-2-1024-ControlNet.pth](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main)                                                                                                           | [1024](https://openxlab.org.cn/models/detail/PixArt-alpha/PixArt-ControlNet)                                    |

ALSO find all models in [OpenXLab_PixArt-alpha](https://openxlab.org.cn/models/detail/PixArt-alpha/PixArt-alpha)

# 🔥 How to Train
## 1. PixArt Training

**First of all.**

Thanks to [@kopyl](https://github.com/kopyl), you can reproduce the full fine-tune training flow on [Pokemon dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) from Hugging Face with notebooks:
1. Train with [notebooks/train.ipynb](https://github.com/PixArt-alpha/PixArt-alpha/blob/53dac066f60fe5fdbdde4f0360145ca96d4cc38c/notebooks/train.ipynb).
2. Convert to Diffusers with [notebooks/convert-checkpoint-to-diffusers.ipynb](https://github.com/PixArt-alpha/PixArt-alpha/blob/master/notebooks/convert-checkpoint-to-diffusers.ipynb).
3. Run the inference with converted checkpoint in step 2 with [notebooks/infer.ipynb](https://github.com/PixArt-alpha/PixArt-alpha/blob/master/notebooks/infer.ipynb).

**Then, for more details.**

Here we take SAM dataset training config as an example, but of course, you can also prepare your own dataset following this method.

You **ONLY** need to change the **config** file in [config](./configs/pixart_config) and **dataloader** in [dataset](./diffusion/data/datasets).
```bash
python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train.py configs/pixart_config/PixArt_xl2_img256_SAM.py --work-dir output/train_SAM_256
```

The directory structure for SAM dataset is:
```
cd ./data

SA1B
├──images/  (images are saved here)
│  ├──sa_xxxxx.jpg
│  ├──sa_xxxxx.jpg
│  ├──......
├──captions/    (corresponding captions are saved here, same name as images)
│  ├──sa_xxxxx.txt
│  ├──sa_xxxxx.txt
├──partition/   (all image names are stored in txt files, where each line is an image name)
│  ├──part0.txt
│  ├──part1.txt
│  ├──......
├──caption_feature_wmask/   (run tools/extract_caption_feature.py to generate caption T5 features, same name as images except .npz extension)
│  ├──sa_xxxxx.npz
│  ├──sa_xxxxx.npz
│  ├──......
├──img_vae_feature/  (run tools/extract_img_vae_feature.py to generate image VAE features, same name as images except .npy extension)
│  ├──train_vae_256/
│  │  ├──noflip/
│  │  │  ├──sa_xxxxx.npy
│  │  │  ├──sa_xxxxx.npy
│  │  │  ├──......

```

**Here we prepare data_toy for better understanding**
```bash
cd ./data

git lfs install
git clone https://huggingface.co/datasets/PixArt-alpha/data_toy
```
Then, 
[Here](https://huggingface.co/datasets/PixArt-alpha/data_toy/blob/main/part0.txt) is an example of partition/part0.txt file.

---

Besides, for json file guided [training](https://github.com/PixArt-alpha/PixArt-alpha/blob/fe0cb78065d64c18ecd8955a04e4f29138d47946/configs/pixart_config/PixArt_xl2_img1024_internalms.py#L3C2-L3C2),
[here](https://huggingface.co/datasets/PixArt-alpha/data_toy/blob/main/data_info.json) is a toy json file for better understanding.

---

## 2. PixArt + DreamBooth Training

Following the `Pixart + DreamBooth` [training guidance](asset/docs/pixart-dreambooth.md)

## 3. PixArt + LCM / LCM-LoRA Training

Following the `PixArt + LCM` [training guidance](asset/docs/pixart_lcm.md)

## 4. PixArt + ControlNet Training

Following the `PixArt + ControlNet` [training guidance](asset/docs/pixart_controlnet.md)

## 5. PixArt + LoRA Training

```bash
pip install peft==0.6.2

accelerate launch --num_processes=1 --main_process_port=36667  train_scripts/train_pixart_lora_hf.py --mixed_precision="fp16" \
  --pretrained_model_name_or_path=PixArt-alpha/PixArt-XL-2-1024-MS \
  --dataset_name=lambdalabs/pokemon-blip-captions --caption_column="text" \
  --resolution=1024 --random_flip \
  --train_batch_size=16 \
  --num_train_epochs=200 --checkpointing_steps=100 \
  --learning_rate=1e-06 --lr_scheduler="constant" --lr_warmup_steps=0 \
  --seed=42 \
  --output_dir="pixart-pokemon-model" \
  --validation_prompt="cute dragon creature" --report_to="tensorboard" \
  --gradient_checkpointing --checkpoints_total_limit=10 --validation_epochs=5 \
  --rank=16
```

# 💻 How to Test
Inference requires at least `23GB` of GPU memory using this repo, while only `11GB` or `8GB` is needed when using 🧨 [diffusers](#using-in--diffusers).

Currently support:
- [x] [IDDPM](https://arxiv.org/abs/2102.09672)
- [x] [DPM-Solver](https://arxiv.org/abs/2206.00927)
- [x] [SA-Solver](https://arxiv.org/abs/2309.05019)
- [ ] [DPM-Solver-v3](https://arxiv.org/abs/2310.13268v2)

## 1. Quick start with [Gradio](https://www.gradio.app/guides/quickstart)

To get started, first install the required dependencies. Make sure you've downloaded the [models](https://huggingface.co/PixArt-alpha/PixArt-alpha) to the output/pretrained_models folder, and then run on your local machine:

```bash
DEMO_PORT=12345 python app/app.py
```

As an alternative, a sample [Dockerfile](Dockerfile) is provided to make a runtime container that starts the Gradio app.

```bash
docker build . -t pixart
docker run --gpus all -it -p 12345:12345 -v <path_to_huggingface_cache>:/root/.cache/huggingface pixart
```

Or use docker-compose. Note: if you want to change the context from the 1024 to the 512 or LCM version of the app, just change the APP_CONTEXT env variable in the docker-compose.yml file. The default is 1024.

```bash
docker compose build
docker compose up
```

Let's have a look at a simple example using the `http://your-server-ip:12345`.


## 2. Integration in diffusers
### 1). Using in 🧨 diffusers

Make sure you have the updated versions of the following libraries:

```bash
pip install -U transformers accelerate diffusers SentencePiece ftfy beautifulsoup4
```

And then:

```python
import torch
from diffusers import PixArtAlphaPipeline, ConsistencyDecoderVAE, AutoencoderKL
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too.
pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16, use_safetensors=True)

# If use DALL-E 3 Consistency Decoder
# pipe.vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)

# If use SA-Solver sampler
# from diffusion.sa_solver_diffusers import SASolverScheduler
# pipe.scheduler = SASolverScheduler.from_config(pipe.scheduler.config, algorithm_type='data_prediction')

# If loading a LoRA model
# transformer = Transformer2DModel.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", subfolder="transformer", torch_dtype=torch.float16)
# transformer = PeftModel.from_pretrained(transformer, "Your-LoRA-Model-Path")
# pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", transformer=transformer, torch_dtype=torch.float16, use_safetensors=True)
# del transformer

# Enable memory optimizations.
# pipe.enable_model_cpu_offload()

pipe.to(device)

prompt = "A small cactus with a happy face in the Sahara desert."
image = pipe(prompt).images[0]
image.save("./catcus.png")
```
Check out the [documentation](./asset/docs/sasolver.md) for more information about SA-Solver Sampler.

This integration allows running the pipeline with a batch size of 4 under 11 GBs of GPU VRAM. 
Check out the [documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pixart) to learn more.

### 2). Running the `PixArtAlphaPipeline` in under 8GB GPU VRAM

GPU VRAM consumption under 8 GB is supported now, please refer to [documentation](asset/docs/pixart.md) for more information.

### 3). Gradio with diffusers (Faster)

To get started, first install the required dependencies, then run on your local machine:

```bash
# diffusers version
DEMO_PORT=12345 python app/app.py
```
Then open `http://your-server-ip:12345` in your browser to try a simple example.

You can also click [here](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing) to have a free trial on Google Colab.

### 4). Convert .pth checkpoint into diffusers version

```bash
python tools/convert_pixart_alpha_to_diffusers.py --image_size your_img_size --multi_scale_train (True if you use PixArtMS else False) --orig_ckpt_path path/to/pth --dump_path path/to/diffusers --only_transformer=True
```


## 3. Online Demo [![Hugging Face PixArt](https://img.shields.io/static/v1?label=Demo&message=HuggingFace%20Gradio&color=orange)](https://huggingface.co/spaces/PixArt-alpha/PixArt-alpha) 
![Online Demo sample](asset/images/sample.png)

# ✏️ How to LLaVA captioning
Thanks to the code base of [LLaVA-Lightning-MPT](https://huggingface.co/liuhaotian/LLaVA-Lightning-MPT-7B-preview), 
we can caption the LAION and SAM dataset with the following launching code:
```bash
python tools/VLM_caption_lightning.py --output output/dir/ --data-root data/root/path --index path/to/data.json
```
We present auto-labeling with custom prompts for LAION (left) and SAM (right). The words highlighted in green represent the original caption in LAION, while those marked in red indicate the detailed captions labeled by LLaVA.

![Dialog with LLaVA.](asset/images/LLaVA-dialog.png)

# ✏️ How to extract T5 and VAE features

Preparing the T5 text features and VAE image features in advance speeds up the training process and saves GPU memory.
```bash
python tools/extract_features.py --img_size=1024 \
    --json_path "data/data_info.json" \
    --t5_save_root "data/SA1B/caption_feature_wmask" \
    --vae_save_root "data/SA1B/img_vae_features" \
    --pretrained_models_dir "output/pretrained_models" \
    --dataset_root "data/SA1B/Images/"
```

## 💪To-Do List (Congratulations🎉)

- [x] Inference code
- [x] Training code
- [x] T5 & VAE feature extraction code
- [x] LLaVA captioning code
- [x] Model zoo 
- [x] Diffusers version & Hugging Face demo
- [x] Google Colab example
- [x] DALLE3 VAE integration
- [x] Inference under 8GB GPU VRAM with diffusers
- [x] Dreambooth Training code
- [x] SA-Solver code
- [x] PixArt-α-LCM will release soon
- [x] Multi-scale vae feature extraction code
- [x] PixArt-α-LCM-LoRA scripts will release soon
- [x] PixArt-α-LoRA training scripts will release soon
- [x] ControlNet code will be released
- [x] SAM-LLaVA caption dataset
- [x] ControlNet checkpoint
- [x] 256px pre-trained models
- [x] PixArt-Σ: Next version model with much better ability is training!

# Other Source
We made a video comparing PixArt with the most powerful current text-to-image models.

[![Watch the video](https://img.youtube.com/vi/7_6KsIITgWY/maxresdefault.jpg)](https://www.youtube.com/watch?v=7_6KsIITgWY)

# 📖BibTeX
    @misc{chen2023pixartalpha,
          title={PixArt-$\alpha$: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis}, 
          author={Junsong Chen and Jincheng Yu and Chongjian Ge and Lewei Yao and Enze Xie and Yue Wu and Zhongdao Wang and James Kwok and Ping Luo and Huchuan Lu and Zhenguo Li},
          year={2023},
          eprint={2310.00426},
          archivePrefix={arXiv},
          primaryClass={cs.CV}
    }
    @misc{chen2024pixartdelta,
          title={PIXART-{\delta}: Fast and Controllable Image Generation with Latent Consistency Models}, 
          author={Junsong Chen and Yue Wu and Simian Luo and Enze Xie and Sayak Paul and Ping Luo and Hang Zhao and Zhenguo Li},
          year={2024},
          eprint={2401.05252},
          archivePrefix={arXiv},
          primaryClass={cs.CV}
    }
    
# 🤗Acknowledgements
- Thanks to [Diffusers](https://github.com/huggingface/diffusers) for their wonderful technical support and awesome collaboration!
- Thanks to [Hugging Face](https://github.com/huggingface) for sponsoring the nice demo!
- Thanks to [DiT](https://github.com/facebookresearch/DiT) for their wonderful work and codebase!

## Star History

[![Star History Chart](https://api.star-history.com/svg?repos=PixArt-alpha/PixArt-alpha&type=Date)](https://star-history.com/#PixArt-alpha/PixArt-alpha&Date)


================================================
FILE: PixArt-alpha-ToCa/app/app.py
================================================
#!/usr/bin/env python
from __future__ import annotations
import os
import sys
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import random
import gradio as gr
import numpy as np
import uuid
from diffusers import ConsistencyDecoderVAE, PixArtAlphaPipeline, DPMSolverMultistepScheduler
import torch
from typing import Tuple
from datetime import datetime
from diffusion.sa_solver_diffusers import SASolverScheduler


# Markdown banner rendered at the top of the demo page (passed to gr.Markdown below).
DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/logo.png)
        # PixArt-Alpha 1024px
        #### [PixArt-Alpha 1024px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5. This demo uses the [PixArt-alpha/PixArt-XL-2-1024-MS](https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS) checkpoint.
        #### English prompts ONLY; 提示词仅限英文
        Don't want to queue? Try [OpenXLab](https://openxlab.org.cn/apps/detail/PixArt-alpha/PixArt-alpha) or [Google Colab Demo](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing).
        ### <span style='color: red;'>You may change the DPM-Solver inference steps from 14 to 20, if you didn't get satisfied results.
        """
# The pipeline is only created when CUDA is available, so warn CPU-only users up front.
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

# Largest seed offered by the seed slider and drawn by randomize_seed_fn (int32 max).
MAX_SEED = np.iinfo(np.int32).max
# Example caching needs CUDA and is on unless the CACHE_EXAMPLES env var is set to something other than "1".
CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1"
# Upper bound of the width/height sliders, overridable through the MAX_IMAGE_SIZE env var.
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048"))
# Opt-in switches: enabled only when the corresponding env var is exactly "1".
USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
# Port the Gradio server listens on (DEMO_PORT env var, default 15432).
PORT = int(os.getenv("DEMO_PORT", "15432"))

# Target device for the pipeline: first CUDA GPU when available, otherwise CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# Prompt templates offered in the "Image Style" selector.
# Each entry has a display name, a positive template in which "{prompt}" is
# replaced by the user's prompt (see apply_style), and a style-specific
# negative prompt.
style_list = [
    {
        "name": "(No style)",
        "prompt": "{prompt}",
        "negative_prompt": "",
    },
    {
        "name": "Cinematic",
        "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
        "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
    },
    {
        "name": "Photographic",
        "prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
        "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
    },
    {
        "name": "Anime",
        "prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime,  highly detailed",
        "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
    },
    {
        "name": "Manga",
        "prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
        "negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
    },
    {
        "name": "Digital Art",
        "prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
        "negative_prompt": "photo, photorealistic, realism, ugly",
    },
    {
        "name": "Pixel art",
        "prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
        "negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
    },
    {
        "name": "Fantasy art",
        "prompt": "ethereal fantasy concept art of  {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
        "negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white",
    },
    {
        "name": "Neonpunk",
        "prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
        "negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
    },
    {
        "name": "3D Model",
        "prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
        "negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
    },
]


# Lookup table: style name -> (positive template, negative prompt).
styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
# Style names in declaration order, used as the radio-button choices.
STYLE_NAMES = list(styles.keys())
# Fallback style used when an unknown style name is requested.
DEFAULT_STYLE_NAME = "(No style)"
# Sampler choices exposed in the UI and their default.
SCHEDULE_NAME = ["DPM-Solver", "SA-Solver"]
DEFAULT_SCHEDULE_NAME = "DPM-Solver"
# Number of images produced per request; also the column count of the result gallery.
NUM_IMAGES_PER_PROMPT = 1

def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]:
    """Expand a prompt pair with the template of the selected style.

    Args:
        style_name: Key into ``styles``. Unknown names fall back to the
            ``DEFAULT_STYLE_NAME`` entry.
        positive: User prompt substituted for ``{prompt}`` in the style's
            positive template.
        negative: Optional user negative prompt. ``None`` and ``""`` both
            mean "no user negative prompt".

    Returns:
        A ``(prompt, negative_prompt)`` tuple. The negative prompt is the
        style's negative prompt followed by the user's, joined with ``", "``
        when both are non-empty.
    """
    p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
    # Join with an explicit separator: plain concatenation fused the last
    # style term with the first user term (e.g. "disfigured" + "blurry"
    # became "disfiguredblurry").
    combined_negative = ", ".join(part for part in (n, negative) if part)
    return p.replace("{prompt}", positive), combined_negative


def env_flag_enabled(name: str) -> bool:
    """Return True when environment variable *name* holds an affirmative value.

    Unset, empty, ``0``, ``false``, ``no`` and ``off`` (case-insensitive,
    surrounding whitespace ignored) count as disabled; any other value
    counts as enabled.
    """
    return os.getenv(name, "").strip().lower() not in ("", "0", "false", "no", "off")


# Build the shared inference pipeline once at import time. It is only created
# when CUDA is available, so `pipe` stays undefined on CPU-only machines.
if torch.cuda.is_available():
    pipe = PixArtAlphaPipeline.from_pretrained(
        "PixArt-alpha/PixArt-XL-2-1024-MS",
        torch_dtype=torch.float16,
        use_safetensors=True,
    )

    # Optional VAE swap. The flag used to be tested by truthiness of the raw
    # string, so CONSISTENCY_DECODER=0 (or "false") still enabled it.
    if env_flag_enabled('CONSISTENCY_DECODER'):
        print("Using DALL-E 3 Consistency Decoder")
        pipe.vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)

    # Either let diffusers move sub-models between CPU and GPU on demand, or
    # place the whole pipeline on the selected device.
    if ENABLE_CPU_OFFLOAD:
        pipe.enable_model_cpu_offload()
    else:
        pipe.to(device)
        print("Loaded on Device!")

    # speed-up T5
    pipe.text_encoder.to_bettertransformer()

    if USE_TORCH_COMPILE:
        pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True)
        print("Model Compiled!")


def save_image(img):
    """Save *img* as a uniquely named PNG in today's output folder.

    The file goes to ``output/online_demo_img/<YYYY-MM-DD>/<uuid4>.png``
    (relative to the working directory); the folder is created on demand.

    Args:
        img: Object exposing ``save(path)``, e.g. a PIL image.

    Returns:
        The path the image was written to.
    """
    day_dir = f'output/online_demo_img/{datetime.now().date()}'
    os.makedirs(day_dir, exist_ok=True)
    target = os.path.join(day_dir, f'{uuid.uuid4()}.png')
    img.save(target)
    return target


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    """Return *seed* unchanged, or a fresh random seed when requested.

    Args:
        seed: Seed supplied by the caller.
        randomize_seed: When true, ignore *seed* and draw a value uniformly
            from ``[0, MAX_SEED]``.

    Returns:
        The seed to use for generation.
    """
    return random.randint(0, MAX_SEED) if randomize_seed else seed


def generate(
        prompt: str,
        negative_prompt: str = "",
        style: str = DEFAULT_STYLE_NAME,
        use_negative_prompt: bool = False,
        seed: int = 0,
        width: int = 1024,
        height: int = 1024,
        schedule: str = 'DPM-Solver',
        dpms_guidance_scale: float = 4.5,
        sas_guidance_scale: float = 3,
        dpms_inference_steps: int = 20,
        sas_inference_steps: int = 25,
        randomize_seed: bool = False,
        use_resolution_binning: bool = True,
        progress=gr.Progress(track_tqdm=True),
):
    """Run the text-to-image pipeline for one request and save the results.

    Args:
        prompt: User prompt; expanded with the selected style template.
        negative_prompt: User negative prompt; ignored unless
            ``use_negative_prompt`` is true.
        style: Style name handed to ``apply_style``.
        use_negative_prompt: Whether ``negative_prompt`` is honoured.
        seed: Seed for the random generator (may be replaced, see
            ``randomize_seed``).
        width, height: Requested image size, forwarded to the pipeline.
        schedule: ``'DPM-Solver'`` or ``'SA-Solver'``.
        dpms_guidance_scale, dpms_inference_steps: Settings used with DPM-Solver.
        sas_guidance_scale, sas_inference_steps: Settings used with SA-Solver.
        randomize_seed: Draw a random seed instead of using ``seed``.
        use_resolution_binning: Forwarded to the pipeline unchanged.
        progress: Not referenced in the body; presumably declared so Gradio
            can mirror tqdm progress -- confirm against the Gradio docs.

    Returns:
        ``(image_paths, seed)``: the saved image paths and the seed actually used.

    Raises:
        ValueError: If ``schedule`` is neither of the two supported names.

    Side effects:
        May replace ``pipe.scheduler`` on the shared module-level pipeline,
        writes the images to disk via ``save_image`` and prints their paths.
    """
    seed = int(randomize_seed_fn(seed, randomize_seed))
    rng = torch.Generator().manual_seed(seed)

    if schedule not in ('DPM-Solver', "SA-Solver"):
        raise ValueError(f"Unknown schedule: {schedule}")

    # Choose the sampler-specific settings and swap the scheduler only when
    # the pipeline is not already using the requested sampler class.
    if schedule == 'DPM-Solver':
        steps, cfg_scale = dpms_inference_steps, dpms_guidance_scale
        if not isinstance(pipe.scheduler, DPMSolverMultistepScheduler):
            # NOTE(review): built with library defaults rather than from
            # pipe.scheduler.config (unlike the SA-Solver branch) -- confirm intended.
            pipe.scheduler = DPMSolverMultistepScheduler()
    else:
        steps, cfg_scale = sas_inference_steps, sas_guidance_scale
        if not isinstance(pipe.scheduler, SASolverScheduler):
            pipe.scheduler = SASolverScheduler.from_config(
                pipe.scheduler.config,
                algorithm_type='data_prediction',
                tau_func=lambda t: 1 if 200 <= t <= 800 else 0,
                predictor_order=2,
                corrector_order=2,
            )

    # Drop the user's negative prompt when the checkbox is off; the style's
    # own negative prompt is still applied by apply_style.
    user_negative = negative_prompt if use_negative_prompt else None  # type: ignore
    prompt, negative_prompt = apply_style(style, prompt, user_negative)

    output = pipe(
        prompt=prompt,
        width=width,
        height=height,
        negative_prompt=negative_prompt,
        guidance_scale=cfg_scale,
        num_inference_steps=steps,
        generator=rng,
        num_images_per_prompt=NUM_IMAGES_PER_PROMPT,
        use_resolution_binning=use_resolution_binning,
        output_type="pil",
    )

    image_paths = []
    for pil_image in output.images:
        image_paths.append(save_image(pil_image))
    print(image_paths)
    return image_paths, seed


# Sample prompts shown under the prompt box (fed to gr.Examples below).
examples = [
    "A small cactus with a happy face in the Sahara desert.",
    "an astronaut sitting in a diner, eating fries, cinematic, analog film",
    "Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.",
    "stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, blue and pink, brilliantly illuminated in the background.",
    "professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.",
    "beautiful lady, freckles, big smile, blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light, dark grey background",
    "Spectacular Tiny World in the Transparent Jar On the Table, interior of the Great Hall, Elaborate, Carved Architecture, Anatomy, Symetrical, Geometric and Parameteric Details, Precision Flat line Details, Pattern, Dark fantasy, Dark errie mood and ineffably mysterious mood, Technical design, Intricate Ultra Detail, Ornate Detail, Stylized and Futuristic and Biomorphic Details, Architectural Concept, Low contrast Details, Cinematic Lighting, 8k, by moebius, Fullshot, Epic, Fullshot, Octane render, Unreal ,Photorealistic, Hyperrealism",
    "anthropomorphic profile of the white snow owl Crystal priestess , art deco painting, pretty and expressive eyes, ornate costume, mythical, ethereal, intricate, elaborate, hyperrealism, hyper detailed, 3D, 8K, Ultra Realistic, high octane, ultra resolution, amazing detail, perfection, In frame, photorealistic, cinematic lighting, visual clarity, shading , Lumen Reflections, Super-Resolution, gigapixel, color grading, retouch, enhanced, PBR, Blender, V-ray, Procreate, zBrush, Unreal Engine 5, cinematic, volumetric, dramatic, neon lighting, wide angle lens ,no digital painting blur",
    "The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8",
]

# Gradio UI definition. Component creation order and `with` nesting determine
# the page layout, so statements here are order-sensitive.
with gr.Blocks(css="app/style.css") as demo:
    gr.Markdown(DESCRIPTION)
    # Only shown when the SHOW_DUPLICATE_BUTTON env var is exactly "1".
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
    )
    # Main area: prompt box + run button, with the result gallery underneath.
    with gr.Group():
        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                max_lines=1,
                placeholder="Enter your prompt",
                container=False,
            )
            run_button = gr.Button("Run", scale=0)
        result = gr.Gallery(label="Result", columns=NUM_IMAGES_PER_PROMPT, show_label=False)
    # Collapsed panel holding every tunable generation parameter.
    with gr.Accordion("Advanced options", open=False):
        with gr.Row():
            use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True)
        schedule = gr.Radio(
            show_label=True,
            container=True,
            interactive=True,
            choices=SCHEDULE_NAME,
            value=DEFAULT_SCHEDULE_NAME,
            label="Sampler Schedule",
            visible=True,
        )
        style_selection = gr.Radio(
            show_label=True,
            container=True,
            interactive=True,
            choices=STYLE_NAMES,
            value=DEFAULT_STYLE_NAME,
            label="Image Style",
        )
        # NOTE(review): starts visible although the "Use negative prompt"
        # checkbox starts unchecked; visibility only syncs after the first toggle.
        negative_prompt = gr.Text(
            label="Negative prompt",
            max_lines=1,
            placeholder="Enter a negative prompt",
            visible=True,
        )
        seed = gr.Slider(
            label="Seed",
            minimum=0,
            maximum=MAX_SEED,
            step=1,
            value=0,
        )
        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
        with gr.Row(visible=True):
            width = gr.Slider(
                label="Width",
                minimum=256,
                maximum=MAX_IMAGE_SIZE,
                step=32,
                value=1024,
            )
            height = gr.Slider(
                label="Height",
                minimum=256,
                maximum=MAX_IMAGE_SIZE,
                step=32,
                value=1024,
            )
        # Each sampler has its own guidance-scale and step-count sliders;
        # generate() picks the pair matching the selected schedule.
        with gr.Row():
            dpms_guidance_scale = gr.Slider(
                label="DPM-Solver Guidance scale",
                minimum=1,
                maximum=10,
                step=0.1,
                value=4.5,
            )
            # UI default is 14 steps; generate()'s own keyword default is 20.
            dpms_inference_steps = gr.Slider(
                label="DPM-Solver inference steps",
                minimum=5,
                maximum=40,
                step=1,
                value=14,
            )
        with gr.Row():
            sas_guidance_scale = gr.Slider(
                label="SA-Solver Guidance scale",
                minimum=1,
                maximum=10,
                step=0.1,
                value=3,
            )
            sas_inference_steps = gr.Slider(
                label="SA-Solver inference steps",
                minimum=10,
                maximum=40,
                step=1,
                value=25,
            )

    # Example prompts: only `prompt` is supplied as input, so every other
    # generate() parameter takes its keyword default.
    gr.Examples(
        examples=examples,
        inputs=prompt,
        outputs=[result, seed],
        fn=generate,
        cache_examples=CACHE_EXAMPLES,
    )

    # Show or hide the negative-prompt box to follow the checkbox state.
    use_negative_prompt.change(
        fn=lambda x: gr.update(visible=x),
        inputs=use_negative_prompt,
        outputs=negative_prompt,
        api_name=False,
    )

    # Run generation on Enter in either text box or on the Run button.
    # `inputs` is listed in the same order as generate()'s leading parameters;
    # use_resolution_binning is not wired, so it keeps its default.
    gr.on(
        triggers=[
            prompt.submit,
            negative_prompt.submit,
            run_button.click,
        ],
        fn=generate,
        inputs=[
            prompt,
            negative_prompt,
            style_selection,
            use_negative_prompt,
            seed,
            width,
            height,
            schedule,
            dpms_guidance_scale,
            sas_guidance_scale,
            dpms_inference_steps,
            sas_inference_steps,
            randomize_seed,
        ],
        outputs=[result, seed],
        api_name="run",
    )

if __name__ == "__main__":
    # Cap the request queue at 20 entries, then serve on all network
    # interfaces at the configured port.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch(server_name="0.0.0.0", server_port=PORT, debug=True)


================================================
FILE: PixArt-alpha-ToCa/app/app_512.py
================================================
#!/usr/bin/env python
from __future__ import annotations
import os
import sys
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import random
import gradio as gr
import numpy as np
import uuid
from diffusers import PixArtAlphaPipeline, ConsistencyDecoderVAE, DPMSolverMultistepScheduler
import torch
from typing import Tuple
from datetime import datetime
from diffusion.data.datasets import ASPECT_RATIO_512_TEST
from diffusion.model.utils import resize_and_crop_img
from diffusion.sa_solver_diffusers import SASolverScheduler


# Markdown banner text for the 512px demo.
DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/logo.png)
        # PixArt-Alpha 512px
        #### [PixArt-Alpha 512px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5. This demo uses the [PixArt-alpha/PixArt-XL-2-512x512](https://huggingface.co/PixArt-alpha/PixArt-XL-2-512x512) checkpoint.
        #### English prompts ONLY; 提示词仅限英文
        Don't want to queue? Try [OpenXLab](https://openxlab.org.cn/apps/detail/PixArt-alpha/PixArt-alpha) or [Google Colab Demo](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing).
        """
# The pipeline is only created when CUDA is available, so warn CPU-only users up front.
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

# Largest seed drawn by randomize_seed_fn (int32 max).
MAX_SEED = np.iinfo(np.int32).max
# True only when CUDA is available and the CACHE_EXAMPLES env var is "1" (the default).
CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1"
# Image-size limit read from the MAX_IMAGE_SIZE env var (default 1024).
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1024"))
# Opt-in switches: enabled only when the corresponding env var is exactly "1".
USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
# Server port read from the DEMO_PORT env var (default 15432).
PORT = int(os.getenv("DEMO_PORT", "15432"))

# Target device for the pipeline: first CUDA GPU when available, otherwise CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# Prompt style templates.
# Each entry has a display name, a positive template in which "{prompt}" is
# replaced by the user's prompt (see apply_style), and a style-specific
# negative prompt.
style_list = [
    {
        "name": "(No style)",
        "prompt": "{prompt}",
        "negative_prompt": "",
    },
    {
        "name": "Cinematic",
        "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
        "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
    },
    {
        "name": "Photographic",
        "prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
        "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
    },
    {
        "name": "Anime",
        "prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime,  highly detailed",
        "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
    },
    {
        "name": "Manga",
        "prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
        "negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
    },
    {
        "name": "Digital Art",
        "prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
        "negative_prompt": "photo, photorealistic, realism, ugly",
    },
    {
        "name": "Pixel art",
        "prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
        "negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
    },
    {
        "name": "Fantasy art",
        "prompt": "ethereal fantasy concept art of  {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
        "negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white",
    },
    {
        "name": "Neonpunk",
        "prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
        "negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
    },
    {
        "name": "3D Model",
        "prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
        "negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
    },
]


# Lookup table: style name -> (positive template, negative prompt).
styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
# Style names in declaration order.
STYLE_NAMES = list(styles.keys())
# Fallback style used when an unknown style name is requested.
DEFAULT_STYLE_NAME = "(No style)"
# Supported sampler names and the default one.
SCHEDULE_NAME = ["DPM-Solver", "SA-Solver"]
DEFAULT_SCHEDULE_NAME = "DPM-Solver"
# Number of images per prompt for the 512px demo.
NUM_IMAGES_PER_PROMPT = 2


def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]:
    """Expand a prompt pair with the template of the selected style.

    Args:
        style_name: Key into ``styles``. Unknown names fall back to the
            ``DEFAULT_STYLE_NAME`` entry.
        positive: User prompt substituted for ``{prompt}`` in the style's
            positive template.
        negative: Optional user negative prompt. ``None`` and ``""`` both
            mean "no user negative prompt".

    Returns:
        A ``(prompt, negative_prompt)`` tuple. The negative prompt is the
        style's negative prompt followed by the user's, joined with ``", "``
        when both are non-empty.
    """
    p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
    # Join with an explicit separator: plain concatenation fused the last
    # style term with the first user term (e.g. "ugly" + "blurry" became
    # "uglyblurry").
    combined_negative = ", ".join(part for part in (n, negative) if part)
    return p.replace("{prompt}", positive), combined_negative


def env_flag_enabled(name: str) -> bool:
    """Return True when environment variable *name* holds an affirmative value.

    Unset, empty, ``0``, ``false``, ``no`` and ``off`` (case-insensitive,
    surrounding whitespace ignored) count as disabled; any other value
    counts as enabled.
    """
    return os.getenv(name, "").strip().lower() not in ("", "0", "false", "no", "off")


# Build the shared 512px inference pipeline once at import time. It is only
# created when CUDA is available, so `pipe` stays undefined on CPU-only machines.
if torch.cuda.is_available():
    pipe = PixArtAlphaPipeline.from_pretrained(
        "PixArt-alpha/PixArt-XL-2-512x512",
        torch_dtype=torch.float16,
        variant="fp16",
        use_safetensors=True,
    )

    # Optional VAE swap. The flag used to be tested by truthiness of the raw
    # string, so CONSISTENCY_DECODER=0 (or "false") still enabled it.
    if env_flag_enabled('CONSISTENCY_DECODER'):
        print("Using DALL-E 3 Consistency Decoder")
        pipe.vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)

    # Either let diffusers move sub-models between CPU and GPU on demand, or
    # place the whole pipeline on the selected device.
    if ENABLE_CPU_OFFLOAD:
        pipe.enable_model_cpu_offload()
    else:
        pipe.to(device)
        print("Loaded on Device!")

    # speed-up T5
    pipe.text_encoder.to_bettertransformer()

    if USE_TORCH_COMPILE:
        pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True)
        print("Model Compiled!")


def prepare_prompt_hw(height, width, ratios):
    """Snap a requested size to the closest predefined aspect-ratio bucket.

    Args:
        height: Requested image height.
        width: Requested image width.
        ratios: Mapping whose keys convert to float aspect ratios
            (height / width) and whose values are ``(height, width)`` pairs.

    Returns:
        ``(height, width)`` of the nearest bucket, as ints. On a tie the
        first matching key in iteration order wins.
    """
    target = float(height/width)

    def distance(candidate):
        # Absolute difference between a bucket's ratio and the requested one.
        return abs(float(candidate) - target)

    nearest = min(ratios.keys(), key=distance)
    bucket = ratios[nearest]
    return int(bucket[0]), int(bucket[1])


def save_image(img):
    """Save *img* as a uniquely named PNG in today's output folder.

    The file goes to ``output/online_demo_img512/<YYYY-MM-DD>/<uuid4>.png``
    (relative to the working directory); the folder is created on demand.

    Args:
        img: Object exposing ``save(path)``, e.g. a PIL image.

    Returns:
        The path the image was written to.
    """
    day_dir = f'output/online_demo_img512/{datetime.now().date()}'
    os.makedirs(day_dir, exist_ok=True)
    target = os.path.join(day_dir, f'{uuid.uuid4()}.png')
    img.save(target)
    return target


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    """Return *seed* unchanged, or a fresh random seed when requested.

    Args:
        seed: Seed supplied by the caller.
        randomize_seed: When true, ignore *seed* and draw a value uniformly
            from ``[0, MAX_SEED]``.

    Returns:
        The seed to use for generation.
    """
    if not randomize_seed:
        return seed
    return random.randint(0, MAX_SEED)


def classify_height_width_bin(height: int, width: int, ratios: dict):
    """Map a requested size onto the nearest predefined resolution bin.

    Args:
        height: Requested image height.
        width: Requested image width.
        ratios: Dict whose keys convert to float aspect ratios
            (height / width) and whose values are ``(height, width)`` pairs.

    Returns:
        ``(height, width)`` of the closest bin, as ints. On a tie the first
        matching key in dict order wins.
    """
    wanted_ratio = float(height / width)
    # Distance of every bin's ratio from the requested one, in dict order.
    gaps = {key: abs(float(key) - wanted_ratio) for key in ratios.keys()}
    best_key = min(gaps, key=gaps.get)
    bin_hw = ratios[best_key]
    return int(bin_hw[0]), int(bin_hw[1])


def generate(
        prompt: str,
        negative_prompt: str = "",
        style: str = DEFAULT_STYLE_NAME,
        use_negative_prompt: bool = False,
        seed: int = 0,
        width: int = 512,
        height: int = 512,
        schedule: str = 'DPM-Solver',
        dpms_guidance_scale: float = 4.5,
        sas_guidance_scale: float = 3,
        dpms_inference_steps: int = 20,
        sas_inference_steps: int = 25,
        randomize_seed: bool = False,
        use_resolution_binning: bool = True,
        progress=gr.Progress(track_tqdm=True),
):
    """Run the module-level ``pipe`` for one prompt and save the resulting images.

    Args:
        prompt: User prompt; it is expanded through the selected ``style`` preset.
        negative_prompt: User negative prompt, ignored unless
            ``use_negative_prompt`` is true.
        style: Name of the style preset passed to ``apply_style``.
        use_negative_prompt: Whether ``negative_prompt`` is forwarded.
        seed: Seed for the torch generator (replaced when ``randomize_seed``).
        width, height: Requested output size.
        schedule: ``'DPM-Solver'`` or ``"SA-Solver"``; selects the scheduler and
            which pair of guidance-scale / step-count arguments is used.
        dpms_guidance_scale, dpms_inference_steps: Settings for DPM-Solver.
        sas_guidance_scale, sas_inference_steps: Settings for SA-Solver.
        randomize_seed: Draw a fresh random seed instead of using ``seed``.
        use_resolution_binning: Generate at the closest ``ASPECT_RATIO_512_TEST``
            bin and resize/crop the result back to the requested size.
        progress: Gradio progress tracker (not referenced in the body).

    Returns:
        ``(image_paths, seed)``: paths of the saved images and the seed used.

    Raises:
        ValueError: If ``schedule`` is not one of the two supported names.
    """
    seed = int(randomize_seed_fn(seed, randomize_seed))
    generator = torch.Generator().manual_seed(seed)

    # Pick the sampler; the scheduler object is only swapped when its type changes.
    if schedule == 'DPM-Solver':
        if not isinstance(pipe.scheduler, DPMSolverMultistepScheduler):
            pipe.scheduler = DPMSolverMultistepScheduler()
        num_inference_steps, guidance_scale = dpms_inference_steps, dpms_guidance_scale
    elif schedule == "SA-Solver":
        if not isinstance(pipe.scheduler, SASolverScheduler):
            pipe.scheduler = SASolverScheduler.from_config(
                pipe.scheduler.config,
                algorithm_type='data_prediction',
                tau_func=lambda t: 1 if 200 <= t <= 800 else 0,
                predictor_order=2,
                corrector_order=2,
            )
        num_inference_steps, guidance_scale = sas_inference_steps, sas_guidance_scale
    else:
        raise ValueError(f"Unknown schedule: {schedule}")

    # A disabled negative prompt is passed on as None; apply_style treats it as empty.
    prompt, negative_prompt = apply_style(
        style,
        prompt,
        negative_prompt if use_negative_prompt else None,
    )

    if use_resolution_binning:
        # Remember the requested size so the output can be mapped back to it.
        orig_height, orig_width = height, width
        height, width = classify_height_width_bin(height, width, ratios=ASPECT_RATIO_512_TEST)

    pipe_output = pipe(
        prompt=prompt,
        width=width,
        height=height,
        negative_prompt=negative_prompt,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        generator=generator,
        # Binning is handled above, so it is switched off inside the pipeline.
        use_resolution_binning=False,
        num_images_per_prompt=NUM_IMAGES_PER_PROMPT,
        output_type="pil",
    )
    images = pipe_output.images

    if use_resolution_binning:
        images = [resize_and_crop_img(image, orig_width, orig_height) for image in images]

    image_paths = []
    for image in images:
        image_paths.append(save_image(image))
    print(image_paths)
    return image_paths, seed


# Example prompts offered in the UI through gr.Examples; each entry is fed to
# the prompt text box only.
examples = [
    "A small cactus with a happy face in the Sahara desert.",
    "an astronaut sitting in a diner, eating fries, cinematic, analog film",
    "Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.",
    "stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, blue and pink, brilliantly illuminated in the background.",
    "professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.",
    "beautiful lady, freckles, big smile, blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light, dark grey background",
    "Spectacular Tiny World in the Transparent Jar On the Table, interior of the Great Hall, Elaborate, Carved Architecture, Anatomy, Symetrical, Geometric and Parameteric Details, Precision Flat line Details, Pattern, Dark fantasy, Dark errie mood and ineffably mysterious mood, Technical design, Intricate Ultra Detail, Ornate Detail, Stylized and Futuristic and Biomorphic Details, Architectural Concept, Low contrast Details, Cinematic Lighting, 8k, by moebius, Fullshot, Epic, Fullshot, Octane render, Unreal ,Photorealistic, Hyperrealism",
    "anthropomorphic profile of the white snow owl Crystal priestess , art deco painting, pretty and expressive eyes, ornate costume, mythical, ethereal, intricate, elaborate, hyperrealism, hyper detailed, 3D, 8K, Ultra Realistic, high octane, ultra resolution, amazing detail, perfection, In frame, photorealistic, cinematic lighting, visual clarity, shading , Lumen Reflections, Super-Resolution, gigapixel, color grading, retouch, enhanced, PBR, Blender, V-ray, Procreate, zBrush, Unreal Engine 5, cinematic, volumetric, dramatic, neon lighting, wide angle lens ,no digital painting blur",
    "The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8",
]

# Gradio UI definition: prompt row, result gallery, advanced options, example
# prompts, and the event wiring that calls `generate`.
with gr.Blocks(css="scripts/style.css") as demo:
    gr.Markdown(DESCRIPTION)
    # Only shown when the SHOW_DUPLICATE_BUTTON environment variable is "1".
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
    )
    with gr.Group():
        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                max_lines=1,
                placeholder="Enter your prompt",
                container=False,
            )
            run_button = gr.Button("Run", scale=0)
        result = gr.Gallery(label="Result", columns=NUM_IMAGES_PER_PROMPT, show_label=False)
    with gr.Accordion("Advanced options", open=False):
        with gr.Row():
            # Created hidden and unchecked, so the negative prompt box below
            # also stays hidden.
            use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=False)
        schedule = gr.Radio(
            show_label=True,
            container=True,
            interactive=True,
            choices=SCHEDULE_NAME,
            value=DEFAULT_SCHEDULE_NAME,
            label="Sampler Schedule",
            visible=True,
        )
        style_selection = gr.Radio(
            show_label=True,
            container=True,
            interactive=True,
            choices=STYLE_NAMES,
            value=DEFAULT_STYLE_NAME,
            label="Image Style",
        )
        negative_prompt = gr.Text(
            label="Negative prompt (no use now)",
            max_lines=1,
            placeholder="Enter a negative prompt",
            visible=False,
        )
        seed = gr.Slider(
            label="Seed",
            minimum=0,
            maximum=MAX_SEED,
            step=1,
            value=0,
        )
        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
        with gr.Row(visible=True):
            width = gr.Slider(
                label="Width",
                minimum=256,
                maximum=MAX_IMAGE_SIZE,
                step=32,
                value=512,
            )
            height = gr.Slider(
                label="Height",
                minimum=256,
                maximum=MAX_IMAGE_SIZE,
                step=32,
                value=512,
            )
        # Each sampler has its own guidance-scale / step-count sliders;
        # `generate` picks the pair that matches the selected schedule.
        with gr.Row():
            dpms_guidance_scale = gr.Slider(
                label="DPM-Solver Guidance scale",
                minimum=1,
                maximum=10,
                step=0.1,
                value=4.5,
            )
            dpms_inference_steps = gr.Slider(
                label="DPM-Solver inference steps",
                minimum=5,
                maximum=40,
                step=1,
                value=20,
            )
        with gr.Row():
            sas_guidance_scale = gr.Slider(
                label="SA-Solver Guidance scale",
                minimum=1,
                maximum=10,
                step=0.1,
                value=3,
            )
            sas_inference_steps = gr.Slider(
                label="SA-Solver inference steps",
                minimum=10,
                maximum=40,
                step=1,
                value=25,
            )

    # Examples supply only the prompt; every other `generate` argument keeps
    # its default value.
    gr.Examples(
        examples=examples,
        inputs=prompt,
        outputs=[result, seed],
        fn=generate,
        cache_examples=CACHE_EXAMPLES,
    )

    # Show or hide the negative prompt box together with its checkbox.
    use_negative_prompt.change(
        fn=lambda x: gr.update(visible=x),
        inputs=use_negative_prompt,
        outputs=negative_prompt,
        api_name=False,
    )

    # The inputs are passed positionally, so their order must match the
    # parameter order of `generate`.
    gr.on(
        triggers=[
            prompt.submit,
            negative_prompt.submit,
            run_button.click,
        ],
        fn=generate,
        inputs=[
            prompt,
            negative_prompt,
            style_selection,
            use_negative_prompt,
            seed,
            width,
            height,
            schedule,
            dpms_guidance_scale,
            sas_guidance_scale,
            dpms_inference_steps,
            sas_inference_steps,
            randomize_seed,
        ],
        outputs=[result, seed],
        api_name="run",
    )

# Launch the demo on all interfaces at PORT when run as a script.
if __name__ == "__main__":
    demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=PORT, debug=True)


================================================
FILE: PixArt-alpha-ToCa/app/app_controlnet.py
================================================
#!/usr/bin/env python
from __future__ import annotations

import argparse
import os
import random
import sys

import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Tuple, Union

current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))

import gradio as gr
import numpy as np
import torch
from PIL import Image as PILImage
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from torchvision.utils import _log_api_usage_once, make_grid, save_image

from diffusers import PixArtAlphaPipeline
from diffusion import DPMS, SASolverSampler
from diffusion.data.datasets import *
from diffusion.model.hed import HEDdetector
from diffusion.model.nets import PixArt_XL_2, PixArtMS_XL_2, ControlPixArtHalf, ControlPixArtMSHalf
from diffusion.model.utils import resize_and_crop_tensor
from diffusion.utils.misc import read_config
from tools.download import find_model


# Markdown shown at the top of the demo page.
DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/logo.png)
        # PixArt-Delta (ControlNet)
        #### [PixArt-Alpha 1024px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5. 
        #### This demo uses the [PixArt-alpha/PixArt-XL-2-1024-ControlNet](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) checkpoint.
        #### This demo uses the [PixArt-alpha/PixArt-XL-2-512-ControlNet](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) checkpoint.
        #### English prompts ONLY; 提示词仅限英文
        ### <span style='color: red;'>Please use the image size corresponding to the model as input to get the best performance. (eg. 1024px for PixArt-XL-2-1024-ControlNet.pth)
        """
if not torch.cuda.is_available():
    # The emoji was stored as mis-encoded replacement characters; restored to
    # the same "cold face" emoji the LCM demo uses for this warning.
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

# Runtime configuration, overridable through environment variables.
MAX_SEED = np.iinfo(np.int32).max  # upper bound of the seed slider / random seeds
CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1"
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048"))
USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
PORT = int(os.getenv("DEMO_PORT", "15432"))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


@torch.no_grad()
def ndarr_image(tensor: Union[torch.Tensor, List[torch.Tensor]], **kwargs, ) -> np.ndarray:
    """Tile ``tensor`` with ``make_grid`` and return the grid as a uint8 numpy array.

    Args:
        tensor: A tensor or list of tensors, forwarded to ``make_grid``.
        **kwargs: Extra keyword arguments forwarded unchanged to ``make_grid``
            (the caller in this file passes ``normalize`` and ``value_range``).

    Returns:
        The grid scaled by 255, rounded by adding 0.5, clamped to ``[0, 255]``,
        with its first axis moved last, as a CPU ``uint8`` array.
        Presumably channels-last (H, W, C), which is what ``PIL.Image.fromarray``
        expects — TODO confirm against ``make_grid``'s output layout.
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        # NOTE(review): `save_image` is resolved at call time, i.e. it is this
        # module's own `save_image` helper, which shadows the torchvision import.
        _log_api_usage_once(save_image)
    grid = make_grid(tensor, **kwargs)
    ndarr = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()
    return ndarr


# Style presets as (name, prompt template, negative prompt). "{prompt}" in the
# template marks where the user's prompt is inserted.
style_list = [
    {"name": preset_name, "prompt": prompt_template, "negative_prompt": preset_negative}
    for preset_name, prompt_template, preset_negative in (
        (
            "(No style)",
            "{prompt}",
            "",
        ),
        (
            "Cinematic",
            "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
            "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
        ),
        (
            "Photographic",
            "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
            "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
        ),
        (
            "Anime",
            "anime artwork {prompt} . anime style, key visual, vibrant, studio anime,  highly detailed",
            "photo, deformed, black and white, realism, disfigured, low contrast",
        ),
        (
            "Manga",
            "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
            "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
        ),
        (
            "Digital Art",
            "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
            "photo, photorealistic, realism, ugly",
        ),
        (
            "Pixel art",
            "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
            "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
        ),
        (
            "Fantasy art",
            "ethereal fantasy concept art of  {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
            "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white",
        ),
        (
            "Neonpunk",
            "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
            "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
        ),
        (
            "3D Model",
            "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
            "ugly, deformed, noisy, low poly, blurry, painting",
        ),
    )
]


# Lookup table: style name -> (prompt template, negative prompt).
styles = {
    preset["name"]: (preset["prompt"], preset["negative_prompt"])
    for preset in style_list
}
STYLE_NAMES = list(styles)
DEFAULT_STYLE_NAME = "(No style)"
# Sampler choices offered in the UI.
SCHEDULE_NAME = ["DPM-Solver", "SA-Solver"]
DEFAULT_SCHEDULE_NAME = "DPM-Solver"

def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]:
    """Expand a style preset into a (prompt, negative_prompt) pair.

    Args:
        style_name: Key into ``styles``. Unknown names fall back to
            ``DEFAULT_STYLE_NAME``.
        positive: User prompt substituted for the ``{prompt}`` placeholder of
            the preset's prompt template.
        negative: Optional user negative prompt. ``None`` or an empty string
            means "no user negative prompt".

    Returns:
        The styled prompt and the combined negative prompt. When both the
        preset and the user supply negative keywords they are joined with
        ``", "`` so the last preset keyword and the first user keyword do not
        run together into a single word.
    """
    template, style_negative = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
    # Drop empty/None parts so a lone negative prompt is returned unchanged.
    negative_parts = [part for part in (style_negative, negative) if part]
    return template.replace("{prompt}", positive), ", ".join(negative_parts)


def save_image(img):
    """Save ``img`` as a uniquely named PNG and return the file path.

    The file goes to ``output/online_demo_img/<today's date>/`` (relative to
    the current working directory); the directory is created when missing and
    the file name is a random UUID4. ``img`` only needs a ``save(path)`` method.

    Note: this definition replaces the ``save_image`` name imported from
    ``torchvision.utils`` at the top of the file.
    """
    file_name = f'{uuid.uuid4()}.png'
    day_dir = f'output/online_demo_img/{datetime.now().date()}'
    os.makedirs(day_dir, exist_ok=True)
    target = os.path.join(day_dir, file_name)
    img.save(target)
    return target


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    """Return a random seed in ``[0, MAX_SEED]`` when asked to, otherwise ``seed`` itself."""
    if not randomize_seed:
        return seed
    return random.randint(0, MAX_SEED)


@torch.inference_mode()
def generate(
        prompt: str,
        given_image = None,
        negative_prompt: str = "",
        style: str = DEFAULT_STYLE_NAME,
        use_negative_prompt: bool = False,
        seed: int = 0,
        width: int = 1024,
        height: int = 1024,
        schedule: str = 'DPM-Solver',
        dpms_guidance_scale: float = 4.5,
        sas_guidance_scale: float = 3,
        dpms_inference_steps: int = 14,
        sas_inference_steps: int = 25,
        randomize_seed: bool = False,
):
    """Generate one image, optionally conditioned on the HED edges of ``given_image``.

    Uses the module-level ``pipe`` (prompt encoding), ``hed``, ``vae``,
    ``model``, ``config``, ``base_ratios``, ``weight_dtype`` and ``device``.

    Args:
        prompt: User prompt; it is expanded through the selected ``style`` preset.
        given_image: Optional reference image (the UI supplies a PIL image).
            When given, its size selects the resolution bin and its HED edge
            map becomes the control condition; ``width``/``height`` are ignored.
        negative_prompt: User negative prompt, ignored unless
            ``use_negative_prompt`` is true.
        style: Name of the style preset passed to ``apply_style``.
        use_negative_prompt: Whether ``negative_prompt`` is forwarded.
        seed: Seed passed to ``torch.manual_seed`` (replaced when
            ``randomize_seed`` is true).
        width, height: Requested output size, used only without ``given_image``.
        schedule: ``'DPM-Solver'`` or ``"SA-Solver"``.
        dpms_guidance_scale, dpms_inference_steps: Settings for DPM-Solver.
        sas_guidance_scale, sas_inference_steps: Settings for SA-Solver.
        randomize_seed: Draw a fresh random seed instead of using ``seed``.

    Returns:
        ``(image_paths, c_paths, seed)``: a one-element list with the saved
        result, a one-element list with the saved condition image (the result
        itself is saved a second time when no reference image was given), and
        the seed that was used.

    Raises:
        ValueError: If ``schedule`` is not one of the two supported names.
    """
    seed = int(randomize_seed_fn(seed, randomize_seed))
    torch.manual_seed(seed)
    torch.cuda.empty_cache()
    strength = 1.0
    # Stays None when no reference image is given; replaced by the decoded
    # condition image below otherwise.
    c_vis = given_image

    if not use_negative_prompt:
        negative_prompt = None  # type: ignore
    prompt, negative_prompt = apply_style(style, prompt, negative_prompt)

    prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask\
        = pipe.encode_prompt(prompt=prompt, negative_prompt=negative_prompt)
    # Insert an extra axis after the batch axis of both embeddings.
    prompt_embeds, negative_prompt_embeds = prompt_embeds[:, None], negative_prompt_embeds[:, None]
    torch.cuda.empty_cache()

    # condition process
    if given_image is not None:
        # PIL's `size` is (width, height), hence index 1 for the height.
        ar = torch.tensor([given_image.size[1] / given_image.size[0]], device=device)[None]
        custom_hw = torch.tensor([given_image.size[1], given_image.size[0]], device=device)[None]
        closest_hw = base_ratios[min(base_ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))]
        hw = torch.tensor(closest_hw, device=device)[None]
        condition_transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB')),
            T.Resize(int(min(closest_hw))),
            T.CenterCrop([int(closest_hw[0]), int(closest_hw[1])]),
            T.ToTensor(),
        ])

        given_image = condition_transform(given_image).unsqueeze(0).to(device)
        # Edge map -> normalised -> repeated to 3 channels -> VAE latent used
        # as the control signal `c`.
        hed_edge = hed(given_image) * strength
        hed_edge = TF.normalize(hed_edge, [.5], [.5])
        hed_edge = hed_edge.repeat(1, 3, 1, 1).to(weight_dtype)
        posterior = vae.encode(hed_edge).latent_dist
        condition = posterior.sample()
        c = condition * config.scale_factor
        # Decode the condition latent again so the UI can display it.
        c_vis = vae.decode(condition)['sample']
        c_vis = torch.clamp(127.5 * c_vis + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()[0]
    else:
        c = None
        ar = torch.tensor([int(height) / int(width)], device=device)[None]
        custom_hw = torch.tensor([int(height), int(width)], device=device)[None]
        closest_hw = base_ratios[min(base_ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))]
        hw = torch.tensor(closest_hw, device=device)[None]

    # Latent resolution is the binned pixel resolution divided by 8.
    latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8)

    # Sample images:
    if schedule == 'DPM-Solver':
        # Create sampling noise:
        n = prompt_embeds.shape[0]
        z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device)
        model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=prompt_attention_mask, c=c)
        dpm_solver = DPMS(model.forward_with_dpmsolver,
                          condition=prompt_embeds,
                          uncondition=negative_prompt_embeds,
                          cfg_scale=dpms_guidance_scale,
                          model_kwargs=model_kwargs)
        samples = dpm_solver.sample(
            z,
            steps=dpms_inference_steps,
            order=2,
            skip_type="time_uniform",
            method="multistep",
        ).to(weight_dtype)
    elif schedule == "SA-Solver":
        # Create sampling noise:
        n = prompt_embeds.shape[0]
        model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=prompt_attention_mask, c=c)
        sas_solver = SASolverSampler(model.forward_with_dpmsolver, device=device)
        samples = sas_solver.sample(
            S=sas_inference_steps,
            batch_size=n,
            shape=(4, latent_size_h, latent_size_w),
            eta=1,
            conditioning=prompt_embeds,
            unconditional_conditioning=negative_prompt_embeds,
            unconditional_guidance_scale=sas_guidance_scale,
            model_kwargs=model_kwargs,
        )[0].to(weight_dtype)
    else:
        # Without this branch an unknown name would only fail further down,
        # where `samples` is used without ever having been assigned.
        raise ValueError(f"Unknown schedule: {schedule}")

    samples = vae.decode(samples / config.scale_factor).sample
    torch.cuda.empty_cache()
    # Map the binned output back to the requested (or reference image) size.
    samples = resize_and_crop_tensor(samples, custom_hw[0, 1], custom_hw[0, 0])
    samples = PILImage.fromarray(ndarr_image(samples, normalize=True, value_range=(-1, 1)))
    image_paths = [save_image(samples)]
    c_vis = PILImage.fromarray(c_vis) if c_vis is not None else samples
    c_paths = [save_image(c_vis)]
    print(image_paths)
    return image_paths, c_paths, seed


def get_args():
    """Parse the command line of the ControlNet demo.

    Arguments: positional ``config`` (str), ``--image_size`` (int, default
    1024) and ``--model_path`` (str, default ``None``).

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    argument_specs = (
        (("config",), dict(type=str, help="config")),
        (('--image_size',), dict(default=1024, type=int)),
        (('--model_path',), dict(type=str)),
    )
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)
    return parser.parse_args()


# Import-time setup: parse the command line, read the config and, when CUDA is
# available, load the HED detector, the pipeline components and the ControlNet
# model that `generate` reads as module-level globals.
args = get_args()
config = read_config(args.config)
# Rebinds `device` (a torch.device earlier in this file) to a plain string.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): assert-based validation is skipped when Python runs with -O.
assert args.image_size in [512, 1024], "We only provide pre-trained models for 512x512 and 1024x1024 resolutions."
# Per-resolution `lewei_scale` value handed to the model constructors below.
lewei_scale = {512: 1, 1024: 2}
latent_size = args.image_size // 8
weight_dtype = torch.float16
print(f"Inference with {weight_dtype}")

# Nothing below is defined on CPU-only machines (hed, pipe, vae, model, base_ratios).
if torch.cuda.is_available():
    hed = HEDdetector(False).to(device)
    # transformer=None: the pipeline is loaded without its transformer; the
    # denoising model is built separately below.
    pipe = PixArtAlphaPipeline.from_pretrained(
        "PixArt-alpha/PixArt-XL-2-1024-MS",
        transformer=None,
        torch_dtype=weight_dtype,
        use_safetensors=True,
    )
    pipe.to(device)
    print("Loaded on Device!")
    vae = pipe.vae
    text_encoder = pipe.text_encoder
    tokenizer = pipe.tokenizer

    # The CLI image size must agree with the one in the config file.
    assert args.image_size == config.image_size
    if config.image_size == 512:
        model = PixArt_XL_2(input_size=latent_size, lewei_scale=lewei_scale[config.image_size])
        print('model architecture ControlPixArtHalf and image size is 512')
        model = ControlPixArtHalf(model).to(device)
    elif config.image_size == 1024:
        model = PixArtMS_XL_2(input_size=latent_size, lewei_scale=lewei_scale[config.image_size])
        print('model architecture ControlPixArtMSHalf and image size is 1024')
        model = ControlPixArtMSHalf(model).to(device)

    # Load the checkpoint non-strictly after dropping its positional embedding
    # entry, so a missing pos_embed key is expected in the report below.
    state_dict = find_model(args.model_path)['state_dict']
    if 'pos_embed' in state_dict:
        del state_dict['pos_embed']
    elif 'base_model.pos_embed' in state_dict:
        del state_dict['base_model.pos_embed']
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    print('Missing keys (missing pos_embed is normal): ', missing)
    print('Unexpected keys', unexpected)
    model.eval()
    model.to(weight_dtype)
    # NOTE(review): eval() looks up ASPECT_RATIO_512_TEST / ASPECT_RATIO_1024_TEST
    # by name (presumably provided by the star import of diffusion.data.datasets —
    # TODO confirm). image_size is an int restricted by the assert above, but a
    # direct mapping lookup would avoid eval altogether.
    base_ratios = eval(f'ASPECT_RATIO_{args.image_size}_TEST')

# Gradio UI definition: reference image upload, prompt row, two galleries
# (condition image and result), advanced options, examples and event wiring.
with gr.Blocks(css="app/style_controlnet.css") as demo:
    gr.Markdown(DESCRIPTION)
    # Only shown when the SHOW_DUPLICATE_BUTTON environment variable is "1".
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
    )
    # Reference image; delivered to `generate` as a PIL image (type="pil").
    image_input = gr.Image(
        label="Image",
        height=360,
        width=360,
        show_label=False,
        sources="upload",
        type="pil",
    )
    with gr.Group():
        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                max_lines=1,
                placeholder="Enter your prompt",
                container=False,
            )
            run_button = gr.Button("Run", scale=0)
    with gr.Group():
        with gr.Row():
            hed_result = gr.Gallery(label="Hed Result", show_label=False)
            result = gr.Gallery(label="Result", show_label=False)
    with gr.Accordion("Advanced options", open=False):
        with gr.Row():
            use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True)
        schedule = gr.Radio(
            show_label=True,
            container=True,
            interactive=True,
            choices=SCHEDULE_NAME,
            value=DEFAULT_SCHEDULE_NAME,
            label="Sampler Schedule",
            visible=True,
        )
        style_selection = gr.Radio(
            show_label=True,
            container=True,
            interactive=True,
            choices=STYLE_NAMES,
            value=DEFAULT_STYLE_NAME,
            label="Image Style",
        )
        negative_prompt = gr.Text(
            label="Negative prompt",
            max_lines=1,
            placeholder="Enter a negative prompt",
            visible=True,
        )
        seed = gr.Slider(
            label="Seed",
            minimum=0,
            maximum=MAX_SEED,
            step=1,
            value=0,
        )
        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
        # Width/height default to the configured model resolution; `generate`
        # uses them only when no reference image is supplied.
        with gr.Row(visible=True):
            width = gr.Slider(
                label="Width",
                minimum=256,
                maximum=MAX_IMAGE_SIZE,
                step=32,
                value=config.image_size,
            )
            height = gr.Slider(
                label="Height",
                minimum=256,
                maximum=MAX_IMAGE_SIZE,
                step=32,
                value=config.image_size,
            )
        # Each sampler has its own guidance-scale / step-count sliders.
        with gr.Row():
            dpms_guidance_scale = gr.Slider(
                label="DPM-Solver Guidance scale",
                minimum=1,
                maximum=10,
                step=0.1,
                value=4.5,
            )
            dpms_inference_steps = gr.Slider(
                label="DPM-Solver inference steps",
                minimum=5,
                maximum=40,
                step=1,
                value=14,
            )
        with gr.Row():
            sas_guidance_scale = gr.Slider(
                label="SA-Solver Guidance scale",
                minimum=1,
                maximum=10,
                step=0.1,
                value=3,
            )
            sas_inference_steps = gr.Slider(
                label="SA-Solver inference steps",
                minimum=10,
                maximum=40,
                step=1,
                value=25,
            )

    # Each example is a [prompt, reference image path] pair; every other
    # `generate` argument keeps its default value.
    gr.Examples(
        examples=[
            [
                "anime superman in action",
                "asset/images/controlnet/0_0.png",
            ],
            [
                "illustration of A loving couple standing in the open kitchen of the living room, cooking ,Couples have a full body, with characters accounting for a quarter of the screen, and the composition of the living room has a large perspective, resulting in a larger space.",
                "asset/images/controlnet/0_3.png",
            ],
            [
                "A Electric 4 seats mini VAN,simple design stylel,led headlight,front 45 angle view,sunlight,clear sky.",
                "asset/images/controlnet/0_2.png",
            ],
        ],
        inputs=[prompt, image_input],
        outputs=[result, hed_result, seed],
        fn=generate,
        cache_examples=CACHE_EXAMPLES,

    )

    # Show or hide the negative prompt box together with its checkbox.
    use_negative_prompt.change(
        fn=lambda x: gr.update(visible=x),
        inputs=use_negative_prompt,
        outputs=negative_prompt,
        api_name=False,
    )

    # The inputs are passed positionally, so their order must match the
    # parameter order of `generate`.
    gr.on(
        triggers=[
            prompt.submit,
            negative_prompt.submit,
            run_button.click,
        ],
        fn=generate,
        inputs=[
            prompt,
            image_input,
            negative_prompt,
            style_selection,
            use_negative_prompt,
            seed,
            width,
            height,
            schedule,
            dpms_guidance_scale,
            sas_guidance_scale,
            dpms_inference_steps,
            sas_inference_steps,
            randomize_seed,
        ],
        outputs=[result, hed_result, seed],
        api_name="run",
    )

# Launch the demo on all interfaces at PORT when run as a script.
if __name__ == "__main__":
    demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=PORT, debug=True)


================================================
FILE: PixArt-alpha-ToCa/app/app_lcm.py
================================================
#!/usr/bin/env python
from __future__ import annotations
import os
import sys
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import random
import gradio as gr
import numpy as np
import uuid
from diffusers import PixArtAlphaPipeline, Transformer2DModel
from peft import PeftModel
import torch
from typing import Tuple
from datetime import datetime
import argparse

# Markdown shown at the top of the demo page.
DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/pixart-lcm.png)
        # PixArt-LCM 1024px
        #### [PixArt-Alpha 1024px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5. This demo uses the [PixArt-alpha/PixArt-LCM-XL-2-1024-MS](https://huggingface.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS) checkpoint.
        #### [LCMs](https://github.com/luosiallen/latent-consistency-model) is a diffusion distillation method which predict PF-ODE's solution directly in latent space, achieving super fast inference with few steps.
        #### English prompts ONLY; 提示词仅限英文
        Don't want to queue? Try [OpenXLab](https://openxlab.org.cn/apps/detail/PixArt-alpha/PixArt-alpha) or [Google Colab Demo](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing).
        """
# Append a warning to the page when no CUDA device is available.
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

# Runtime configuration, overridable through environment variables.
MAX_SEED = np.iinfo(np.int32).max  # largest int32 value
# Boolean flags are enabled only by the exact string "1".
CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1"
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048"))
USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
PORT = int(os.getenv("DEMO_PORT", "15432"))

# First CUDA device when available, CPU otherwise.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# Style presets as (name, prompt template, negative prompt). "{prompt}" in the
# template marks where the user's prompt is inserted.
style_list = [
    {"name": preset_name, "prompt": prompt_template, "negative_prompt": preset_negative}
    for preset_name, prompt_template, preset_negative in (
        (
            "(No style)",
            "{prompt}",
            "",
        ),
        (
            "Cinematic",
            "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
            "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
        ),
        (
            "Photographic",
            "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
            "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
        ),
        (
            "Anime",
            "anime artwork {prompt} . anime style, key visual, vibrant, studio anime,  highly detailed",
            "photo, deformed, black and white, realism, disfigured, low contrast",
        ),
        (
            "Manga",
            "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
            "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
        ),
        (
            "Digital Art",
            "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
            "photo, photorealistic, realism, ugly",
        ),
        (
            "Pixel art",
            "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
            "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
        ),
        (
            "Fantasy art",
            "ethereal fantasy concept art of  {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
            "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white",
        ),
        (
            "Neonpunk",
            "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
            "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
        ),
        (
            "3D Model",
            "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
            "ugly, deformed, noisy, low poly, blurry, painting",
        ),
    )
]


# Lookup table: style name -> (prompt template, negative prompt).
styles = {
    preset["name"]: (preset["prompt"], preset["negative_prompt"])
    for preset in style_list
}
STYLE_NAMES = list(styles)
DEFAULT_STYLE_NAME = "(No style)"
NUM_IMAGES_PER_PROMPT = 1

def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]:
    """Expand a prompt pair with the selected style preset.

    Args:
        style_name: Key into ``styles``; unknown names fall back to
            ``DEFAULT_STYLE_NAME``.
        positive: User prompt substituted for ``{prompt}`` in the preset's
            prompt template.
        negative: Optional user negative prompt; ``None`` or ``""`` means
            "no extra negative terms".

    Returns:
        ``(styled_prompt, combined_negative_prompt)``. The combined negative
        prompt is the preset's negative terms followed by the user's, joined
        with ``", "`` when both are non-empty.
    """
    template, style_negative = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
    # Join only the non-empty parts: plain concatenation used to fuse the last
    # preset term with the first user term (e.g. "blurry" + "low quality").
    combined_negative = ", ".join(part for part in (style_negative, negative) if part)
    return template.replace("{prompt}", positive), combined_negative


def get_args():
    """Parse the demo's command-line options.

    Returns:
        An ``argparse.Namespace`` with ``is_lora`` (bool flag), ``repo_id``
        and ``lora_repo_id`` (strings with PixArt-LCM defaults).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--is_lora",
        action="store_true",
        help="enable lora ckpt loading",
    )
    cli.add_argument(
        "--repo_id",
        type=str,
        default="PixArt-alpha/PixArt-LCM-XL-2-1024-MS",
    )
    cli.add_argument(
        "--lora_repo_id",
        type=str,
        default="PixArt-alpha/PixArt-LCM-LoRA-XL-2-1024-MS",
    )
    parsed = cli.parse_args()
    return parsed


# Parse CLI options at import time; the pipeline is only built when CUDA is available.
args = get_args()
if torch.cuda.is_available():
    if args.is_lora:
        # LoRA path: load the base transformer, wrap it with the LoRA weights,
        # then pass the wrapped transformer to the pipeline.
        assert args.lora_repo_id is not None
        transformer = Transformer2DModel.from_pretrained(
            args.repo_id,
            subfolder="transformer",
            torch_dtype=torch.float16,
        )
        transformer = PeftModel.from_pretrained(transformer, args.lora_repo_id)
        pipe = PixArtAlphaPipeline.from_pretrained(
            args.repo_id,
            transformer=transformer,
            torch_dtype=torch.float16,
            use_safetensors=True,
        )
        # Drop the module-level name; the pipeline was given the transformer above.
        del transformer
    else:
        # Plain path: load the full pipeline straight from the repo.
        pipe = PixArtAlphaPipeline.from_pretrained(
            args.repo_id,
            torch_dtype=torch.float16,
            use_safetensors=True,
        )

    if not ENABLE_CPU_OFFLOAD:
        pipe.to(device)
        print("Loaded on Device!")
    else:
        pipe.enable_model_cpu_offload()

    # speed-up T5
    pipe.text_encoder.to_bettertransformer()

    if USE_TORCH_COMPILE:
        pipe.transformer = torch.compile(
            pipe.transformer,
            mode="reduce-overhead",
            fullgraph=True,
        )
        print("Model Compiled!")


def save_image(img):
    """Save ``img`` under a per-day output folder with a random file name.

    Args:
        img: Any object exposing ``save(path)``.

    Returns:
        The path the image was written to:
        ``output/online_demo_img/<today>/<uuid4>.png``.
    """
    day_dir = f'output/online_demo_img/{datetime.now().date()}'
    os.makedirs(day_dir, exist_ok=True)
    target = os.path.join(day_dir, f'{uuid.uuid4()}.png')
    img.save(target)
    return target


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    """Return ``seed`` unchanged, or a random seed in ``[0, MAX_SEED]`` when asked."""
    return random.randint(0, MAX_SEED) if randomize_seed else seed


def generate(
        prompt: str,
        negative_prompt: str = "",
        style: str = DEFAULT_STYLE_NAME,
        use_negative_prompt: bool = False,
        seed: int = 0,
        width: int = 1024,
        height: int = 1024,
        inference_steps: int = 4,
        randomize_seed: bool = False,
        use_resolution_binning: bool = True,
        progress=gr.Progress(track_tqdm=True),
):
    """Run the pipeline for one prompt and save the resulting images.

    Args:
        prompt: User prompt; expanded through the selected style preset.
        negative_prompt: User negative prompt; ignored unless
            ``use_negative_prompt`` is true.
        style: Style preset name passed to ``apply_style``.
        use_negative_prompt: Whether ``negative_prompt`` is forwarded.
        seed: Seed for the torch generator (replaced when ``randomize_seed``).
        width, height: Requested image size, forwarded to the pipeline.
        inference_steps: Forwarded as ``num_inference_steps``.
        randomize_seed: Draw a fresh seed via ``randomize_seed_fn``.
        use_resolution_binning: Forwarded to the pipeline unchanged.
        progress: Gradio progress tracker; not referenced in the body.

    Returns:
        ``(image_paths, seed)``: the saved file paths and the seed actually used.
    """
    seed = int(randomize_seed_fn(seed, randomize_seed))
    rng = torch.Generator().manual_seed(seed)

    # A disabled negative prompt is passed as None; apply_style handles it.
    user_negative = negative_prompt if use_negative_prompt else None
    prompt, negative_prompt = apply_style(style, prompt, user_negative)

    # guidance_scale is fixed at 0 for this demo.
    output = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=width,
        height=height,
        num_inference_steps=inference_steps,
        guidance_scale=0.,
        num_images_per_prompt=NUM_IMAGES_PER_PROMPT,
        use_resolution_binning=use_resolution_binning,
        generator=rng,
        output_type="pil",
    )

    image_paths = []
    for picture in output.images:
        image_paths.append(save_image(picture))
    print(image_paths)
    return image_paths, seed


# Sample prompts handed to gr.Examples as one-click inputs for the prompt box.
examples = [
    "A small cactus with a happy face in the Sahara desert.",
    "an astronaut sitting in a diner, eating fries, cinematic, analog film",
    "Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.",
    "stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, blue and pink, brilliantly illuminated in the background.",
    "professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.",
    "beautiful lady, freckles, big smile, blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light, dark grey background",
    "Spectacular Tiny World in the Transparent Jar On the Table, interior of the Great Hall, Elaborate, Carved Architecture, Anatomy, Symetrical, Geometric and Parameteric Details, Precision Flat line Details, Pattern, Dark fantasy, Dark errie mood and ineffably mysterious mood, Technical design, Intricate Ultra Detail, Ornate Detail, Stylized and Futuristic and Biomorphic Details, Architectural Concept, Low contrast Details, Cinematic Lighting, 8k, by moebius, Fullshot, Epic, Fullshot, Octane render, Unreal ,Photorealistic, Hyperrealism",
    "anthropomorphic profile of the white snow owl Crystal priestess , art deco painting, pretty and expressive eyes, ornate costume, mythical, ethereal, intricate, elaborate, hyperrealism, hyper detailed, 3D, 8K, Ultra Realistic, high octane, ultra resolution, amazing detail, perfection, In frame, photorealistic, cinematic lighting, visual clarity, shading , Lumen Reflections, Super-Resolution, gigapixel, color grading, retouch, enhanced, PBR, Blender, V-ray, Procreate, zBrush, Unreal Engine 5, cinematic, volumetric, dramatic, neon lighting, wide angle lens ,no digital painting blur",
    "The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8",
]

# Declare the demo UI inside a gr.Blocks context and wire its events to generate().
with gr.Blocks(css="scripts/style.css") as demo:
    gr.Markdown(DESCRIPTION)
    # Only visible when the SHOW_DUPLICATE_BUTTON environment variable equals "1".
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
    )
    # Main area: single-line prompt box with a Run button, and the result gallery.
    with gr.Group():
        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                max_lines=1,
                placeholder="Enter your prompt",
                container=False,
            )
            run_button = gr.Button("Run", scale=0)
        result = gr.Gallery(label="Result", columns=NUM_IMAGES_PER_PROMPT, show_label=False)
    # Advanced options, collapsed by default (open=False).
    with gr.Accordion("Advanced options", open=False):
        with gr.Row():
            use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True)
            negative_prompt = gr.Text(
                label="Negative prompt",
                max_lines=1,
                placeholder="Enter a negative prompt",
                visible=True,
            )
        style_selection = gr.Radio(
            show_label=True,
            container=True,
            interactive=True,
            choices=STYLE_NAMES,
            value=DEFAULT_STYLE_NAME,
            label="Image Style",
        )
        seed = gr.Slider(
            label="Seed",
            minimum=0,
            maximum=MAX_SEED,
            step=1,
            value=0,
        )
        # Checked by default, so generate() draws a random seed unless the user unchecks it.
        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
        with gr.Row(visible=True):
            width = gr.Slider(
                label="Width",
                minimum=256,
                maximum=MAX_IMAGE_SIZE,
                step=32,
                value=1024,
            )
            height = gr.Slider(
                label="Height",
                minimum=256,
                maximum=MAX_IMAGE_SIZE,
                step=32,
                value=1024,
            )
        with gr.Row():
            inference_steps = gr.Slider(
                label="LCM inference steps",
                minimum=1,
                maximum=30,
                step=1,
                value=4,
            )
    # Examples supply only the prompt input, so generate()'s other parameters
    # take their declared defaults when it is invoked from here.
    gr.Examples(
        examples=examples,
        inputs=prompt,
        outputs=[result, seed],
        fn=generate,
        cache_examples=CACHE_EXAMPLES,
    )

    # Toggle the negative-prompt box's visibility to follow the checkbox value.
    use_negative_prompt.change(
        fn=lambda x: gr.update(visible=x),
        inputs=use_negative_prompt,
        outputs=negative_prompt,
        api_name=False,
    )

    # Submitting either text box or clicking Run calls generate(). The inputs are
    # listed in the same order as generate()'s leading positional parameters; the
    # returned (image_paths, seed) pair updates the gallery and the seed slider.
    gr.on(
        triggers=[
            prompt.submit,
            negative_prompt.submit,
            run_button.click,
        ],
        fn=generate,
        inputs=[
            prompt,
            negative_prompt,
            style_selection,
            use_negative_prompt,
            seed,
            width,
            height,
            inference_steps,
            randomize_seed,
        ],
        outputs=[result, seed],
        api_name="run",
    )

# When run as a script: enable the request queue (max_size=20) and serve the
# demo on all interfaces (0.0.0.0) at PORT with debug output enabled.
if __name__ == "__main__":
    demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=PORT, debug=True)


================================================
FILE: PixArt-alpha-ToCa/app/style.css
================================================
.gradio-container{width:680px!important}

================================================
FILE: PixArt-alpha-ToCa/app/style_controlnet.css
================================================
.gradio-container{width:768px!important}

================================================
FILE: PixArt-alpha-ToCa/asset/docs/pixart-dreambooth.md
================================================
# 🔥 How to Train PixArt + Dreambooth
- PixArt + [Dreambooth](https://dreambooth.github.io/)
<div id="dreambooth" style="display: flex; justify-content: center;">
  <img src="../images/dreambooth/dreambooth_dog.svg" width="46%" style="margin: 5px;">
  <img src="../images/dreambooth/dreambooth_m5.svg" width="46%" style="margin: 5px;">
</div>

You **ONLY** need to change the **config** file in [config](../../configs/pixart_app_config/PixArt_xl2_img1024_dreambooth.py) and **dataloader** in [dataset](../../diffusion/data/datasets/Dreambooth.py).


The directory structure for Dreambooth dataset is:
```
cd ./data/dreambooth

dataset
├──dog6/
│  ├──00.jpg
│  ├──01.jpg
│  ├──......
├──cat/
│  ├──00.jpg
│  ├──01.jpg
│  ├──......

```

To get started, first install the required dependencies, then run on your local machine:

```bash
cd data/
git clone https://github.com/google/dreambooth.git

python -m torch.distributed.launch --nproc_per_node=1 --master_port=26666 train_scripts/train_dreambooth.py configs/pixart_app_config/PixArt_xl2_img1024_dreambooth.py --work-dir output/path
```


================================================
FILE: PixArt-alpha-ToCa/asset/docs/pixart.md
================================================
<!--Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

[//]: # (&#40;reference from [hugging Face]&#40;https://github.com/huggingface/diffusers/blob/docs/8bit-inference-pixart/docs/source/en/api/pipelines/pixart.md&#41;&#41;)

## Running the `PixArtAlphaPipeline` in under 8GB GPU VRAM

It is possible to run the [`PixArtAlphaPipeline`] under 8GB GPU VRAM by loading the text encoder in 8-bit numerical precision. Let's walk through a full-fledged example. 

First, install the `bitsandbytes` library:

```bash
pip install -U bitsandbytes
```

Then load the text encoder in 8-bit:

```python
from transformers import T5EncoderModel
from diffusers import PixArtAlphaPipeline

text_encoder = T5EncoderModel.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    subfolder="text_encoder",
    load_in_8bit=True,
    device_map="auto",

)
pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    text_encoder=text_encoder,
    transformer=None,
    device_map="auto"
)
```

Now, use the `pipe` to encode a prompt:

```python
with torch.no_grad():
    prompt = "cute cat"
    prompt_embeds, prompt_attention_mask, negative_embeds, negative_prompt_attention_mask = pipe.encode_prompt(prompt)

del text_encoder
del pipe
flush()
```

`flush()` is just a utility function to clear the GPU VRAM and is implemented like so:

```python
import gc 

def flush():
    gc.collect()
    torch.cuda.empty_cache()
```

Then compute the latents providing the prompt embeddings as inputs:

```python
pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    text_encoder=None,
    torch_dtype=torch.float16,
).to("cuda")

latents = pipe(
    negative_prompt=None, 
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_embeds,
    prompt_attention_mask=prompt_attention_mask,
    negative_prompt_attention_mask=negative_prompt_attention_mask,
    num_images_per_prompt=1,
    output_type="latent",
).images

del pipe.transformer
flush()
```

Notice that while initializing `pipe`, you're setting `text_encoder` to `None` so that it's not loaded. 

Once the latents are computed, pass them to the VAE to decode into a real image:

```python
with torch.no_grad():
    image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]
image = pipe.image_processor.postprocess(image, output_type="pil")
image.save("cat.png")
```

All of this, put together, should allow you to run [`PixArtAlphaPipeline`] under 8GB GPU VRAM.

![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/pixart/8bits_cat.png)

Find the script [here](https://gist.github.com/sayakpaul/3ae0f847001d342af27018a96f467e4e) that can be run end-to-end to report the memory being used.

<Tip warning={true}>

Text embeddings computed in 8-bit can have an impact on the quality of the generated images because of the information loss in the representation space induced by the reduced precision. It's recommended to compare the outputs with and without 8-bit.

</Tip>

================================================
FILE: PixArt-alpha-ToCa/asset/docs/pixart_comfyui.md
================================================
<!--Copyright 2023 The Huawei Noah’s Ark Lab Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

## 🔥 How to use PixArt in ComfyUI

### 1. Preparation for PixArt running environment

```bash
cd /workspace

conda create -n pixart python==3.9.0
conda activate pixart
pip install torch==2.0.0+cu117 torchvision==0.15.1+cu117 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu117

git clone https://github.com/PixArt-alpha/PixArt-alpha.git
cd PixArt-alpha
pip install -r requirements.txt
```

### 2. Install ComfyUI related dependencies

```bash
cd /workspace
git clone https://github.com/comfyanonymous/ComfyUI.git

cd ComfyUI
git clone https://github.com/city96/ComfyUI_ExtraModels custom_nodes/ComfyUI_ExtraModels
```

### 3. Download all the checkpoints: PixArt, VAE, T5 with script

```bash
cd /workspace/PixArt-alpha
python tools/download.py --model_names "PixArt-XL-2-1024-MS.pth"
```
or download with urls:[PixArt ckpt](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-1024-MS.pth), [VAE ckpt](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/sd-vae-ft-ema), 
[T5 ckpt](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl).

### 4. Put Checkpoints into corresponding folders
```bash
cd /workspace/ComfyUI

mv /path/to/PixArt-XL-2-1024-MS.pth ./models/checkpoints/
mv /path/to/sd-vae-ft-ema ./models/VAE/
mv /path/to/t5-v1_1-xxl ./models/t5/
```
### 5. run the ComfyUI website
```bash
cd /workspace/ComfyUI

python main.py --port 11111 --listen 0.0.0.0
```
Open http://your-server-ip:11111 to play with PixArt.

### 6. Create your own custom nodes
Here we prepare two examples for better understanding:

1) [PixArt Text-to-Image workflow](https://huggingface.co/PixArt-alpha/PixArt-alpha/blob/main/PixArt-image-to-image-workflow.json)

2) [PixArt Image-to-Image workflow](https://huggingface.co/PixArt-alpha/PixArt-alpha/blob/main/PixArt-image-to-image-workflow.json)

Once you download these json files, you can open your server website which is `http://your-server-ip:11111` and drop the json file into the website window to begin the PixArt-ComfyUI playground.

================================================
FILE: PixArt-alpha-ToCa/asset/docs/pixart_controlnet.md
================================================
<!--Copyright 2023 The Huawei Noah’s Ark Lab Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->


## 🔥 ControlNet
We incorporate a [ControlNet](https://github.com/lllyasviel/ControlNet)-like module that enables fine-grained control over text-to-image diffusion models. We introduce a novel ControlNet-Transformer architecture, specifically tailored for Transformers, achieving explicit controllability alongside high-quality image generation.

For more details about PixArt-ControlNet, please check the technical report [PixArt-δ](https://arxiv.org/abs/2401.05252).

<p align="center">
  <img src="../images/controlnet.PNG"  height=480>
</p>


## Training the `PixArt + ControlNet` on your machine

```bash
# Train on 1024px
python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train_controlnet.py configs/pixart_app_config/PixArt_xl2_img1024_controlHed.py --work-dir output/pixartcontrolnet-xl2-img1024

# Train on 512px
python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train_controlnet.py configs/pixart_app_config/PixArt_xl2_img512_controlHed.py --work-dir output/pixartcontrolnet-xl2-img512
```

## Testing the `PixArt + ControlNet`
```bash
# Test on 1024px
DEMO_PORT=12345 python app/app_controlnet.py configs/pixart_app_config/PixArt_xl2_img1024_controlHed.py --model_path path/to/1024px/PixArt-XL-2-1024-ControlNet.pth

# Test on 512px
DEMO_PORT=12345 python app/app_controlnet.py configs/pixart_app_config/PixArt_xl2_img512_controlHed.py --model_path path/to/512px/pixart_controlnet_ckpt
```
Then have a look at a simple example using the http://your-server-ip:12345


================================================
FILE: PixArt-alpha-ToCa/asset/docs/pixart_inpaint.md
================================================

```python
import torch
from scripts.pipeline_pixart_inpaint import PixArtAlphaInpaintPipeline
from PIL import Image

pipe = PixArtAlphaInpaintPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)

prompt = ""
image = Image.open('')
mask_image = Image.open('')
out = pipe(prompt, image=image, mask_image=mask_image, strength=1.0).images[0]
out.save('./cactus_removed.png')
```

================================================
FILE: PixArt-alpha-ToCa/asset/docs/pixart_lcm.md
================================================
<!--Copyright 2023 The Huawei Noah’s Ark Lab Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

<p align="center">
  <img src="https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/pixart-lcm2.png"  height=120>
</p>

## 🔥 Why Need PixArt-LCM
Following [LCM LoRA](https://huggingface.co/blog/lcm_lora), we illustrate the generation speed we achieve on various hardware. Let us stress again how liberating it is to explore image generation so easily with PixArt-LCM.

| Hardware                    | PixArt-LCM (4 steps) | SDXL LoRA LCM (4 steps) | PixArt standard (14 steps) | SDXL standard (25 steps) |
|-----------------------------|----------------------|-------------------------|----------------------------|---------------------------|
| T4 (Google Colab Free Tier) | 3.3s                 | 8.4s                    | 16.0s                      | 26.5s                     |
| A100 (80 GB)                | 0.51s                | 1.2s                    | 2.2s                       | 3.8s                      |
| V100 (32 GB)                | 0.8s                 | 1.2s                    | 5.5s                       | 7.7s                      |

These tests were run with a batch size of 1 in all cases.

For cards with a lot of capacity, such as A100, performance increases significantly when generating multiple images at once, which is usually the case for production workloads.

## Training the `PixArt + LCM` on your machine

```bash
python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train_pixart_lcm.py configs/pixart_config/PixArt_xl2_img1024_lcm.py --work-dir output/pixartlcm-xl2-img1024_ft
```

## Training the `PixArt + LCM-LoRA`

```bash
python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train_pixart_lcm_lora.py configs/pixart_config/PixArt_xl2_img1024_lcm.py --work-dir output/pixartlcm-lora-xl2-img1024_ft
```

## Testing the `PixArt + LCM` on your machine

```bash
DEMO_PORT=12345 python app/app_lcm.py

Then have a look at a simple example using the http://your-server-ip:12345
```

## Testing the `PixArt + LCM-LoRA`

```bash
DEMO_PORT=12345 python app/app_lcm.py --is_lora --lora_repo_id output/pixartlcm-lora-xl2-img1024_ft/checkpoint-xxx

Then have a look at a simple example using the http://your-server-ip:12345
```

## Integration in diffusers
### Using in 🧨 diffusers

Make sure you have the updated versions of the following libraries:

```bash
pip install -U transformers accelerate diffusers
```

And then:

```python
import torch
from diffusers import PixArtAlphaPipeline, AutoencoderKL

# for PixArt-LCM
pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", torch_dtype=torch.float16, use_safetensors=True)

# for PixArt-LCM-LoRA (additionally requires the two imports below)
# from diffusers import Transformer2DModel
# from peft import PeftModel
# transformer = Transformer2DModel.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", subfolder="transformer", torch_dtype=torch.float16)
# transformer = PeftModel.from_pretrained(transformer, "PixArt-alpha/PixArt-LCM-LoRA-XL-2-1024-MS")
# pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", transformer=transformer, torch_dtype=torch.float16, use_safetensors=True)
# del transformer

# Enable memory optimizations.
pipe.enable_model_cpu_offload()

prompt = "A small cactus with a happy face in the Sahara desert."
image = pipe(prompt, guidance_scale=0., num_inference_steps=4).images[0]
```

This integration allows running the pipeline with a batch size of 4 in under 11 GB of GPU VRAM.
Check out the [documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pixart) to learn more.

# Keeping updating

================================================
FILE: PixArt-alpha-ToCa/asset/docs/sasolver.md
================================================
## SA-Solver: Stochastic Adams Solver for Fast Sampling of Diffusion Models (Neurips 2023)
<div align="center">
  <a href="https://arxiv.org/pdf/2309.05019.pdf"><img src="https://img.shields.io/static/v1?label=Paper&message=Arxiv&color=red&logo=arxiv"></a> &ensp;
  <a href="https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/sa_solver_diffusers.py"><img src="https://img.shields.io/static/v1?label=Project%20Page&message=Github&color=blue&logo=github-pages"></a> &ensp;
</div>

> [**SA-Solver: Stochastic Adams Solver for Fast Sampling of Diffusion Models (Neurips 2023)**](https://arxiv.org/pdf/2309.05019.pdf)<br>
> [Shuchen Xue*](https://github.com/scxue), [Mingyang Yi]()&#8224;, 
> [Weijian Luo](), [Shifeng Zhang](), [Jiacheng Sun](),
> [Zhenguo Li](https://scholar.google.com/citations?user=XboZC1AAAAAJ),
> [Zhi-Ming Ma]()
> <br>University of Chinese Academy of Sciences, Huawei Noah’s Ark Lab, Peking University<br>
---

## 🐱 Abstract
SA-Solver is a stochastic diffusion sampler based on the Stochastic Adams Method. It is training-free and can be applied to pretrained diffusion models. It is a multistep SDE solver that can do fast stochastic sampling.

1. The parameter 'tau function' controls the stochasticity in the sampling process. Inspired by EDM, we choose the 'tau function' to be a piecewise constant function that is greater than 0 in the middle stage of the sampling process and equal to zero in the start and end stages. Specifically, we choose the default value of this parameter to be

```python
tau_func = lambda t: 1 if t >= 200 and t <= 800 else 0
```

in the diffusers library and

```python
tau_t = lambda t: eta if 0.2 <= t <= 0.8 else 0
```

in the ldm library. (The two ranges differ only because the timesteps in diffusers are scaled by 1000.)

The value '1' represents the magnitude of stochasticity. Higher values are recommended with more NFEs.

If you want to employ deterministic sampling (solving diffusion ODE) in SA-Solver, please set

```python
tau_func = lambda t: 0
```

If you want to employ original stochastic sampling (solving original diffusion SDE) in SA-Solver, please set

```python
tau_func = lambda t: 1
```


2. The parameters 'predictor_order' and 'corrector_order' control the orders of 'SA-Predictor' and 'SA-Corrector', respectively. For unconditional generation and conditional generation with a small classifier-free guidance scale, the recommended orders are 'predictor_order = 3' and 'corrector_order = 4'; for conditional generation with a large classifier-free guidance scale (e.g. t2i), the recommended orders are 'predictor_order = 2' and 'corrector_order = 2'.


================================================
FILE: PixArt-alpha-ToCa/asset/examples.py
================================================
# Each entry of `examples` is a 7-item row:
#   [prompt, sampler name, 20, 4.5, reference image URL, caption text, model-info text]
# NOTE(review): 20 and 4.5 are presumably the sampling step count and the guidance
# scale -- confirm against the code that consumes `examples`.
_COMMON_SETTINGS = ("dpm-solver", 20, 4.5)
_IMAGE_URL_ROOT = "https://github.com/PixArt-alpha/PixArt-alpha.github.io/blob/master/static/images/"
_MODEL_INFO = "Model path: PixArt-XL-2-1024x1024.pt.\nBase image size: 1024, \nSampling Algo: dpm-solver"

_CACTUS = "A small cactus with a happy face in the Sahara desert."
_PIRATE_SHIP = (
    "Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, "
    "spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, "
    "intricate detail."
)
_LITTLE_GIRL = (
    "stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, "
    "blue and pink, brilliantly illuminated in the background."
)
_NATURE = "nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph."


def _example(prompt, image_path, caption):
    """Assemble one row, filling in the settings and model info shared by all rows."""
    return [prompt, *_COMMON_SETTINGS, _IMAGE_URL_ROOT + image_path, caption, _MODEL_INFO]


examples = [
    _example(
        _CACTUS,
        "carousel/carousel1.png",
        "Prompt: " + _CACTUS + " \nSize: --ar 1:1.",
    ),
    _example(
        _PIRATE_SHIP + " --ar 6144:4096.",
        "samples/15.png",
        "Prompt: " + _PIRATE_SHIP + "\nSize: --ar 6144:4096.",
    ),
    # The last two rows reuse the bare prompt as their caption text.
    _example(_LITTLE_GIRL, "samples/13.png", _LITTLE_GIRL),
    _example(_NATURE, "samples/14.png", _NATURE),
]

================================================
FILE: PixArt-alpha-ToCa/asset/samples.txt
================================================
A small cactus with a happy face in the Sahara desert.
Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.
beautiful lady, freckles, big smile, blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light, dark grey background
stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, blue and pink, brilliantly illuminated in the background.
nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph.
Spectacular Tiny World in the Transparent Jar On the Table, interior of the Great Hall, Elaborate, Carved Architecture, Anatomy, Symetrical, Geometric and Parameteric Details, Precision Flat line Details, Pattern, Dark fantasy, Dark errie mood and ineffably mysterious mood, Technical design, Intricate Ultra Detail, Ornate Detail, Stylized and Futuristic and Biomorphic Details, Architectural Concept, Low contrast Details, Cinematic Lighting, 8k, by moebius, Fullshot, Epic, Fullshot, Octane render, Unreal ,Photorealistic, Hyperrealism
anthropomorphic profile of the white snow owl Crystal priestess , art deco painting, pretty and expressive eyes, ornate costume, mythical, ethereal, intricate, elaborate, hyperrealism, hyper detailed, 3D, 8K, Ultra Realistic, high octane, ultra resolution, amazing detail, perfection, In frame, photorealistic, cinematic lighting, visual clarity, shading , Lumen Reflections, Super-Resolution, gigapixel, color grading, retouch, enhanced, PBR, Blender, V-ray, Procreate, zBrush, Unreal Engine 5, cinematic, volumetric, dramatic, neon lighting, wide angle lens ,no digital painting blur
The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8
Bright scene, aerial view, ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens.
8k uhd A man looks up at the starry sky, lonely and ethereal, Minimalism, Chaotic composition Op Art
A middle-aged woman of Asian descent, her dark hair streaked with silver, appears fractured and splintered, intricately embedded within a sea of broken porcelain. The porcelain glistens with splatter paint patterns in a harmonious blend of glossy and matte blues, greens, oranges, and reds, capturing her dance in a surreal juxtaposition of movement and stillness. Her skin tone, a light hue like the porcelain, adds an almost mystical quality to her form.
A 4k dslr image of a lemur wearing a red magician hat and a blue coat performing magic tricks with cards in a garden.
A alpaca made of colorful building blocks, cyberpunk
A baby painter trying to draw very simple picture, white background
A boy and a girl fall in love
A dog that has been meditating all the time
A man is sitting in a chair with his chin resting on his hand. The chair, along with the man's feet, are submerged in the sea. Strikingly, the man's back is on fire.
A painter study hard to learn how to draw with many concepts in the air, white background
A painter with low quality, white background, pixel art
A person standing on the desert, desert waves, gossip illustration, half red, half blue, abstract image of sand, clear style, trendy illustration, outdoor, top view, clear style, precision art, ultra high definition image
A silhouette of a grand piano overlooking a dusky cityscape viewed from a top-floor penthouse, rendered in the bold and vivid sytle of a vintage travel poster.
A sureal parallel world where mankind avoid extinction by preserving nature, epic trees, water streams, various flowers, intricate details, rich colors, rich vegetation, cinematic, symmetrical, beautiful lighting, V-Ray render, sun rays, magical lights, photography
A woman is shopping for fresh produce at the farmer's market.
A worker that looks like a mixture of cow and horse is working hard to type code
A young man dressed in ancient Chinese clothing, Asian people, White robe, Handsome, Hand gestures forming a spell, Martial arts and fairy-like vibe, Carrying a legendary-level giant sword on the back, Game character, Surrounded by runes, Cyberpunk style, neon lights, best quality, masterpiece, cg, hdr, high-definition, extremely detailed, photorealistic, epic, character design, detailed face, superhero, hero, detailed UHD, real-time, vfx, 3D rendering, 8k
An alien octopus floats through a protal reading a newspaper
An epressive oil painting of a basketbal player dunking, depicted as an explosion of  a nebula
art collection style and fashion shoot, in the style of made of glass, dark blue and light pink, paul rand, solarpunk, camille vivier, beth didonato hair, barbiecore, hyper-realistic
artistic
beautiful secen
Crocodile in a sweater
Design a letter A, 3D stereoscopic Ice material Interior light blue Conceptual product design Futuristic Blind box toy Handcrafted Exquisite 3D effect Full body display Ultra-high precision Ultra-detailed Perfect lighting OC Renderer Blender 8k Ultra-sharp Ultra-noise reduction
Floating,colossal,futuristic statue in the sky, awe-inspiring and serenein the style of Stuart Lippincott:2with detailed composition and subtle geometric elements.This sanctuary-ike atmosphere features crisp clarity and soft amber tones.In contrasttiny human figures surround the statueThe pieceincorporates flowing draperiesreminiscent of Shwedoff and Philip McKay's stylesemphasizing thejuxtaposition between the powerful presence of the statue and thevulnerability of the minuscule human figuresshwedoff
knolling of a drawing tools for painter
Leonardo da Vinci's Last Supper content, Van Goph's Starry Night Style
Luffy from ONEPIECE, handsome face, fantasy
photography shot through an outdoor window of a coffee shop with neon sign lighting, window glares and reflections, depth of field, {little girl with red hair sitting at a table, portrait, kodak portra 800,105 mm f1.8
poster of a mechanical cat, techical Schematics viewed from front and side view on light white blueprint paper, illustartion drafting style, illustation, typography, conceptual art, dark fantasy steampunk, cinematic, dark fantasy
The girl in the car is filled with goldfish and flowers, goldfish can fly, Kawaguchi Renko's art, natural posture, holiday dadcore, youthful energy and pressure, body stretching, goldfish simulation movies in the sky, super details, and dreamy high photography. Colorful. Covered by water and goldfish, indoor scene, close-up shot in XT4 movie
The image features a woman wearing a red shirt with an icon. She appears to be posing for the camera, and her outfit includes a pair of jeans. The woman seems to be in a good mood, as she is smiling. The background of the image is blurry, focusing more on the woman and her attire.
The towel was on top of the hard counter.
A vast landscape made entirely of various meats spreads out before the viewer. tender, succulent hills of roast beef, chicken drumstick trees, bacon rivers, and ham boulders create a surreal, yet appetizing scene. the sky is adorned with pepperoni sun and salami clouds.
I want to supplement vitamin c, please help me paint related food.
A vibrant yellow banana-shaped couch sits in a cozy living room, its curve cradling a pile of colorful cushions. on the wooden floor, a patterned rug adds a touch of eclectic charm, and a potted plant sits in the corner, reaching towards the sunlight filtering through the window.
A transparent sculpture of a duck made out of glass. The sculpture is in front of a painting of a landscape.
A blue jay standing on a large basket of rainbow macarons.
A bucket bag made of blue suede. The bag is decorated with intricate golden paisley patterns. The handle of the bag is made of rubies and pearls.
An alien octopus floats through a portal reading a newspaper.
bird's eye view of a city.
beautiful scene
A 2D animation of a folk music band composed of anthropomorphic autumn leaves, each playing traditional bluegrass instruments, amidst a rustic forest setting dappled with the soft light of a harvest moon.
In front of a deep black backdrop, a figure of middle years, her Tongan skin rich and glowing, is captured mid-twirl, her curly hair flowing like a storm behind her. Her attire resembles a whirlwind of marble and porcelain fragments. Illuminated by the gleam of scattered porcelain shards, creating a dreamlike atmosphere, the dancer manages to appear fragmented, yet maintains a harmonious and fluid form.
Digital illustration of a beach scene crafted from yarn. The sandy beach is depicted with beige yarn, waves are made of blue and white yarn crashing onto the shore. A yarn sun sets on the horizon, casting a warm glow. Yarn palm trees sway gently, and little yarn seashells dot the shoreline.
Illustration of a chic chair with a design reminiscent of a pumpkin’s form, with deep orange cushioning, in a stylish loft setting.
A detailed oil painting of an old sea captain, steering his ship through a storm. Saltwater is splashing against his weathered face, determination in his eyes. Twirling malevolent clouds are seen above and stern waves threaten to submerge the ship while seagulls dive and twirl through the chaotic landscape. Thunder and lights embark in the distance, illuminating the scene with an eerie green glow.
An illustration of a human heart made of translucent glass, standing on a pedestal amidst a stormy sea. Rays of sunlight pierce the clouds, illuminating the heart, revealing a tiny universe within. The quote 'Find the universe within you' is etched in bold letters across the horizon.
A modern architectural building with large glass windows, situated on a cliff overlooking a serene ocean at sunset
photo of an ancient shipwreck nestled on the ocean floor. Marine plants have claimed the wooden structure, and fish swim in and out of its hollow spaces. Sunken treasures and old cannons are scattered around, providing a glimpse into the past
A 3D render of a coffee mug placed on a window sill during a stormy day. The storm outside the window is reflected in the coffee, with miniature lightning bolts and turbulent waves seen inside the mug. The room is dimly lit, adding to the dramatic atmosphere.A minimap diorama of a cafe adorned with indoor plants. Wooden beams crisscross above, and a cold brew station stands out with tiny bottles and glasses.
An antique botanical illustration drawn with fine lines and a touch of watercolour whimsy, depicting a strange lily crossed with a Venus flytrap, its petals poised as if ready to snap shut on any unsuspecting insects.An illustration inspired by old-world botanical sketches blends a cactus with lilac blooms into a Möbius strip, using detailed lines and subtle watercolor touches to capture nature's diverse beauty and mathematical intrigue.
An ink sketch style illustration of a small hedgehog holding a piece of watermelon with its tiny paws, taking little bites with its eyes closed in delight.Photo of a lychee-inspired spherical chair, with a bumpy white exterior and plush interior, set against a tropical wallpaper.
3d digital art of an adorable ghost, glowing within, holding a heart shaped pumpkin, Halloween, super cute, spooky haunted house background
professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.
an astronaut sitting in a diner, eating fries, cinematic, analog film

================================================
FILE: PixArt-alpha-ToCa/configs/PixArt_xl2_internal.py
================================================
# Base configuration for the 'InternalData' dataset type.
# Other config files pull these defaults in via `_base_ = ['../PixArt_xl2_internal.py']`.
data_root = '/data/data'
data = dict(type='InternalData', root='images', image_list_json=['data_info.json'], transform='default_train', load_vae_feat=True)
image_size = 256  # the generated image resolution
train_batch_size = 32
eval_batch_size = 16
use_fsdp=False   # whether to use FSDP mode
valid_num=0      # an aspect ratio is treated as valid when its sample number >= valid_num

# model setting
model = 'PixArt_XL_2'
aspect_ratio_type = None         # base aspect ratio [ASPECT_RATIO_256, ASPECT_RATIO_512 or ASPECT_RATIO_1024]
multi_scale = False     # whether to train with the multi-scale dataset
lewei_scale = 1.0    # lewei_scale for positional embedding interpolation
# training setting
num_workers=4
train_sampling_steps = 1000
eval_sampling_steps = 250
model_max_length = 120
lora_rank = 4

num_epochs = 80
gradient_accumulation_steps = 1
grad_checkpointing = False
gradient_clip = 1.0
gc_step = 1
auto_lr = dict(rule='sqrt')

# we use a different weight decay from the official implementation since it gives better results
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=3e-2, eps=1e-10)
lr_schedule = 'constant'
lr_schedule_args = dict(num_warmup_steps=500)

save_image_epochs = 1
save_model_epochs = 1
save_model_steps=1000000

sample_posterior = True
mixed_precision = 'fp16'
scale_factor = 0.18215
ema_rate = 0.9999
tensorboard_mox_interval = 50
log_interval = 50
cfg_scale = 4
mask_type='null'
num_group_tokens=0
mask_loss_coef=0.
load_mask_index=False    # load prepared mask_type index
# load model settings
vae_pretrained = "/cache/pretrained_models/sd-vae-ft-ema"
load_from = None
resume_from = dict(checkpoint=None, load_ema=False, resume_optimizer=True, resume_lr_scheduler=True)
snr_loss=False

# work dir settings
work_dir = '/cache/exps/'
s3_work_dir = None

seed = 43


================================================
FILE: PixArt-alpha-ToCa/configs/PixArt_xl2_sam.py
================================================
# Base configuration for the 'SAM' dataset type.
# configs/pixart_config/PixArt_xl2_img256_SAM.py pulls these defaults in via `_base_`.
data_root = '/data/data'
data = dict(type='SAM', root='images', image_list_txt='part0.txt', transform='default_train', load_vae_feat=True)
image_size = 256  # the generated image resolution
train_batch_size = 32
eval_batch_size = 16
use_fsdp=False   # whether to use FSDP mode

# model setting
model = 'PixArt_XL_2'
aspect_ratio_type = None         # base aspect ratio [ASPECT_RATIO_512 or ASPECT_RATIO_1024]
multi_scale = False     # whether to train with the multi-scale dataset
lewei_scale = 1.0
model_max_length = 120
lora_rank = 4
# training setting
num_workers=4
train_sampling_steps = 1000
eval_sampling_steps = 250

num_epochs = 80
gradient_accumulation_steps = 1
grad_checkpointing = False
gc_step = 1
gradient_clip = 1.0
auto_lr = dict(rule='sqrt')

# we use a different weight decay from the official implementation since it gives better results
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=3e-2, eps=1e-10)
lr_schedule = 'constant'
lr_schedule_args = dict(num_warmup_steps=500)

save_image_epochs = 1
save_model_epochs = 1
save_model_steps=1000000

sample_posterior = True
mixed_precision = 'fp16'
scale_factor = 0.18215
ema_rate = 0.9999
tensorboard_mox_interval = 50
log_interval = 50
cfg_scale = 4
mask_type='null'
num_group_tokens=0
mask_loss_coef=0.
load_mask_index=False    # load prepared mask_type index
# load model settings
vae_pretrained = "/cache/pretrained_models/sd-vae-ft-ema"
load_from = None
resume_from = dict(checkpoint=None, load_ema=False, resume_optimizer=True, resume_lr_scheduler=True)
snr_loss=False

# work dir settings
work_dir = '/cache/exps/'
s3_work_dir = None

seed = 43


================================================
FILE: PixArt-alpha-ToCa/configs/pixart_app_config/PixArt_xl2_img1024_controlHed.py
================================================
# 1024px config for the 'InternalDataHed' dataset with ControlNet-related parameters.
# Lists ../PixArt_xl2_internal.py as its `_base_`; the assignments below are this config's own values.
_base_ = ['../PixArt_xl2_internal.py']
data_root = 'data'
image_list_json = ['data_info.json',]

data = dict(type='InternalDataHed', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True)
image_size = 1024

# model setting
model = 'PixArtMS_XL_2'
fp32_attention = False  # Set to True if you got NaN loss
load_from = 'path-to-pixart-checkpoints'  # placeholder: replace with a real checkpoint path
vae_pretrained = "output/pretrained_models/sd-vae-ft-ema"
window_block_indexes = []
window_size=0
use_rel_pos=False
lewei_scale = 2.0

# training setting
num_workers=10
train_batch_size = 4 #  set the batch size according to your VRAM
num_epochs = 10 # 3
gradient_accumulation_steps = 4
grad_checkpointing = True
gradient_clip = 0.01
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10)
lr_schedule_args = dict(num_warmup_steps=0)
save_model_epochs=5
save_model_steps=1000

log_interval = 20
eval_sampling_steps = 200
work_dir = 'output_debug/debug'

# controlnet related params
copy_blocks_num = 13
class_dropout_prob = 0.5
train_ratio = 1


================================================
FILE: PixArt-alpha-ToCa/configs/pixart_app_config/PixArt_xl2_img1024_dreambooth.py
================================================
# 1024px config for the 'DreamBooth' dataset type (single prompt 'a photo of sks dog').
# Lists ../PixArt_xl2_internal.py as its `_base_`; the assignments below are this config's own values.
_base_ = ['../PixArt_xl2_internal.py']
data_root = 'data/dreambooth/dataset'

data = dict(type='DreamBooth', root='dog6', prompt=['a photo of sks dog'], transform='default_train', load_vae_feat=True)
image_size = 1024

# model setting
model = 'PixArtMS_XL_2'     # model for multi-scale training
fp32_attention = True
load_from = 'Path/to/PixArt-XL-2-1024-MS.pth'  # placeholder: replace with a real checkpoint path
vae_pretrained = "output/pretrained_models/sd-vae-ft-ema"
window_block_indexes = []
window_size=0
use_rel_pos=False
aspect_ratio_type = 'ASPECT_RATIO_1024'         # base aspect ratio [ASPECT_RATIO_256, ASPECT_RATIO_512 or ASPECT_RATIO_1024]
multi_scale = True     # whether to train with the multi-scale dataset
lewei_scale = 2.0

# training setting
num_workers=1
train_batch_size = 1
num_epochs = 200
gradient_accumulation_steps = 1
grad_checkpointing = True
gradient_clip = 0.01
optimizer = dict(type='AdamW', lr=5e-6, weight_decay=3e-2, eps=1e-10)
lr_schedule_args = dict(num_warmup_steps=0)
auto_lr = None

log_interval = 1
# NOTE(review): save_model_epochs exceeds num_epochs, so checkpointing presumably
# relies on save_model_steps -- confirm against the training script.
save_model_epochs=10000
save_model_steps=100
work_dir = 'output/debug'


================================================
FILE: PixArt-alpha-ToCa/configs/pixart_app_config/PixArt_xl2_img512_controlHed.py
================================================
# 512px config for the 'InternalDataHed' dataset with ControlNet-related parameters.
# Lists ../PixArt_xl2_internal.py as its `_base_`; the assignments below are this config's own values.
_base_ = ['../PixArt_xl2_internal.py']
data_root = 'data'
image_list_json = ['data_info.json',]

data = dict(type='InternalDataHed', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True)
image_size = 512

# model setting
model = 'PixArt_XL_2'
fp32_attention = False  # Set to True if you got NaN loss
load_from = 'path-to-pixart-checkpoints'  # placeholder: replace with a real checkpoint path
vae_pretrained = "output/pretrained_models/sd-vae-ft-ema"
window_block_indexes = []
window_size=0
use_rel_pos=False
lewei_scale = 1.0

# training setting
num_workers=10
train_batch_size = 12 # 32  # max 96 for DiT-L/4 when grad_checkpoint
num_epochs = 1000 # 3
gradient_accumulation_steps = 4
grad_checkpointing = True
gradient_clip = 0.01
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10)
lr_schedule_args = dict(num_warmup_steps=0)
save_model_epochs=5
save_model_steps=1000

log_interval = 20
eval_sampling_steps = 200
work_dir = 'output_debug/debug'

# controlnet related params
copy_blocks_num = 13
class_dropout_prob = 0.5
train_ratio = 0.1


================================================
FILE: PixArt-alpha-ToCa/configs/pixart_config/PixArt_xl2_img1024_internal.py
================================================
# 1024px config for the 'InternalData' dataset type with the 'PixArt_XL_2' model.
# Lists ../PixArt_xl2_internal.py as its `_base_`; the assignments below are this config's own values.
_base_ = ['../PixArt_xl2_internal.py']
data_root = 'data'
image_list_json = ['data_info.json',]

data = dict(type='InternalData', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True)
image_size = 1024

# model setting
window_block_indexes = []
window_size=0
use_rel_pos=False
model = 'PixArt_XL_2'
fp32_attention = True
load_from = None
vae_pretrained = "output/pretrained_models/sd-vae-ft-ema"
lewei_scale = 2.0

# training setting
num_workers=10
train_batch_size = 2 # 32
num_epochs = 200 # 3
gradient_accumulation_steps = 1
grad_checkpointing = True
gradient_clip = 0.01
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10)
lr_schedule_args = dict(num_warmup_steps=1000)

eval_sampling_steps = 200
log_interval = 20
save_model_epochs=1
save_model_steps=2000
work_dir = 'output/debug'


================================================
FILE: PixArt-alpha-ToCa/configs/pixart_config/PixArt_xl2_img1024_internalms.py
================================================
# 1024px multi-scale config ('InternalDataMS' dataset, 'PixArtMS_XL_2' model).
# Lists ../PixArt_xl2_internal.py as its `_base_`; the assignments below are this config's own values.
_base_ = ['../PixArt_xl2_internal.py']
data_root = 'data'
image_list_json = ['data_info.json',]

data = dict(type='InternalDataMS', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True)
image_size = 1024

# model setting
model = 'PixArtMS_XL_2'     # model for multi-scale training
fp32_attention = True
load_from = None
vae_pretrained = "output/pretrained_models/sd-vae-ft-ema"
window_block_indexes = []
window_size=0
use_rel_pos=False
aspect_ratio_type = 'ASPECT_RATIO_1024'         # base aspect ratio [ASPECT_RATIO_256, ASPECT_RATIO_512 or ASPECT_RATIO_1024]
multi_scale = True     # whether to train with the multi-scale dataset
lewei_scale = 2.0

# training setting
num_workers=10
train_batch_size = 12   # max 14 for PixArt-xL/2 when grad_checkpoint
num_epochs = 10 # 3
gradient_accumulation_steps = 1
grad_checkpointing = True
gradient_clip = 0.01
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10)
lr_schedule_args = dict(num_warmup_steps=1000)
save_model_epochs=1
save_model_steps=2000

log_interval = 20
eval_sampling_steps = 200
work_dir = 'output/debug'


================================================
FILE: PixArt-alpha-ToCa/configs/pixart_config/PixArt_xl2_img1024_lcm.py
================================================
# 1024px multi-scale config with the LCM-specific settings grouped under "# LCM" below.
# Lists ../PixArt_xl2_internal.py as its `_base_`; the assignments below are this config's own values.
_base_ = ['../PixArt_xl2_internal.py']
data_root = 'data'
image_list_json = ['data_info.json',]

data = dict(type='InternalDataMS', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True)
image_size = 1024

# model setting
model = 'PixArtMS_XL_2'     # model for multi-scale training
fp32_attention = False  # Set to True if you got NaN loss
load_from = None
vae_pretrained = "output/pretrained_models/sd-vae-ft-ema"
window_block_indexes = []
window_size=0
use_rel_pos=False
aspect_ratio_type = 'ASPECT_RATIO_1024'         # base aspect ratio [ASPECT_RATIO_256, ASPECT_RATIO_512 or ASPECT_RATIO_1024]
multi_scale = True     # whether to train with the multi-scale dataset
lewei_scale = 2.0

# training setting
num_workers=4
train_batch_size = 16   # max 12 for PixArt-xL/2 when grad_checkpoint; 16 for LCM-LoRA
num_epochs = 10 # 3
gradient_accumulation_steps = 1
grad_checkpointing = True
gradient_clip = 0.01
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=0.0, eps=1e-10)
# alternative optimizer setting, kept for reference:
# optimizer = dict(type='CAMEWrapper', lr=1e-7, weight_decay=0.0, betas=(0.9, 0.999, 0.9999), eps=(1e-30, 1e-16))
lr_schedule_args = dict(num_warmup_steps=100)
save_model_epochs=1
save_model_steps=200
valid_num=0      # an aspect ratio is treated as valid when its sample number >= valid_num

log_interval = 10
eval_sampling_steps = 200
work_dir = 'output/debug'

# LCM
loss_type = 'huber'
huber_c = 0.001
num_ddim_timesteps=50
w_max = 15.0
w_min = 3.0
ema_decay = 0.95
cfg_scale = 4.5
class_dropout_prob = 0.
lora_rank = 32

================================================
FILE: PixArt-alpha-ToCa/configs/pixart_config/PixArt_xl2_img256_SAM.py
================================================
# 256px config for the 'SAM' dataset type, reading the list files part0.txt ... part31.txt.
# Lists ../PixArt_xl2_sam.py as its `_base_`; the assignments below are this config's own values.
_base_ = ['../PixArt_xl2_sam.py']
data_root = 'data'
# 32 list files: part0.txt through part31.txt
image_list_txt = ['part0.txt', 'part1.txt', 'part2.txt', 'part3.txt', 'part4.txt', 'part5.txt', 'part6.txt', 'part7.txt', 'part8.txt',
                  'part9.txt', 'part10.txt', 'part11.txt', 'part12.txt', 'part13.txt', 'part14.txt','part15.txt','part16.txt',
                  'part17.txt','part18.txt','part19.txt','part20.txt','part21.txt', 'part22.txt', 'part23.txt', 'part24.txt',
                  'part25.txt', 'part26.txt', 'part27.txt', 'part28.txt', 'part29.txt', 'part30.txt', 'part31.txt']
data = dict(type='SAM', root='SA1B', image_list_txt=image_list_txt, transform='default_train', load_vae_feat=True)
image_size = 256

# model setting
window_block_indexes=[]
window_size=0
use_rel_pos=False
model = 'PixArt_XL_2'
fp32_attention = True
load_from = None
vae_pretrained = "output/pretrained_models/sd-vae-ft-ema"

# training setting
use_fsdp=False   # whether to use FSDP mode
num_workers=10
train_batch_size = 176 # 32
num_epochs = 200 # 3
gradient_accumulation_steps = 1
grad_checkpointing = True
gradient_clip = 0.01
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10)
lr_schedule_args = dict(num_warmup_steps=1000)

eval_sampling_steps = 200
log_interval = 20
save_model_epochs=2
save_model_steps=20000
work_dir = 'output/debug'


================================================
FILE: PixArt-alpha-ToCa/configs/pixart_config/PixArt_xl2_img256_internal.py
================================================
# 256px config for the 'InternalData' dataset type with the 'PixArt_XL_2' model.
# Lists ../PixArt_xl2_internal.py as its `_base_`; the assignments below are this config's own values.
_base_ = ['../PixArt_xl2_internal.py']
data_root = 'data'
image_list_json = ['data_info.json',]

data = dict(type='InternalData', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True)
image_size = 256

# model setting
window_block_indexes=[]
window_size=0
use_rel_pos=False
model = 'PixArt_XL_2'
fp32_attention = True
load_from = None
vae_pretrained = "output/pretrained_models/sd-vae-ft-ema"
# training setting
eval_sampling_steps = 200

num_workers=10
train_batch_size = 176 # 32  # max 96 for PixArt-L/4 when grad_checkpoint
num_epochs = 200 # 3
gradient_accumulation_steps = 1
grad_checkpointing = True
gradient_clip = 0.01
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10)
lr_schedule_args = dict(num_warmup_steps=1000)

log_interval = 20
save_model_epochs=5
work_dir = 'output/debug'


================================================
FILE: PixArt-alpha-ToCa/configs/pixart_config/PixArt_xl2_img512_internal.py
================================================
# 512px config for the 'InternalData' dataset type with the 'PixArt_XL_2' model.
# Lists ../PixArt_xl2_internal.py as its `_base_`; the assignments below are this config's own values.
_base_ = ['../PixArt_xl2_internal.py']
data_root = 'data'
image_list_json = ['data_info.json',]

data = dict(type='InternalData', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True)
image_size = 512

# model setting
window_block_indexes = []
window_size=0
use_rel_pos=False
model = 'PixArt_XL_2'
fp32_attention = True
load_from = None
vae_pretrained = "output/pretrained_models/sd-vae-ft-ema"
lewei_scale = 1.0

# training setting
use_fsdp=False   # whether to use FSDP mode
num_workers=10
train_batch_size = 38 # 32
num_epochs = 200 # 3
gradient_accumulation_steps = 1
grad_checkpointing = True
gradient_clip = 0.01
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10)
lr_schedule_args = dict(num_warmup_steps=1000)

eval_sampling_steps = 200
log_interval = 20
save_model_epochs=1
work_dir = 'output/debug'


================================================
FILE: PixArt-alpha-ToCa/configs/pixart_config/PixArt_xl2_img512_internalms.py
================================================
# 512px multi-scale config ('InternalDataMS' dataset, 'PixArtMS_XL_2' model).
# Lists ../PixArt_xl2_internal.py as its `_base_`; the assignments below are this config's own values.
_base_ = ['../PixArt_xl2_internal.py']
data_root = 'data'
image_list_json = ['data_info.json',]

data = dict(type='InternalDataMS', root='InternData', image_list_json=image_list_json, transform='default_train', load_vae_feat=True)
image_size = 512

# model setting
model = 'PixArtMS_XL_2'     # model for multi-scale training
fp32_attention = True
load_from = None
vae_pretrained = "output/pretrained_models/sd-vae-ft-ema"
window_block_indexes = []
window_size=0
use_rel_pos=False
aspect_ratio_type = 'ASPECT_RATIO_512'         # base aspect ratio [ASPECT_RATIO_256, ASPECT_RATIO_512 or ASPECT_RATIO_1024]
multi_scale = True     # whether to train with the multi-scale dataset
lewei_scale = 1.0

# training setting
num_workers=10
train_batch_size = 40   # max 40 for PixArt-xL/2 when grad_checkpoint
num_epochs = 20 # 3
gradient_accumulation_steps = 1
grad_checkpointing = True
gradient_clip = 0.01
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10)
lr_schedule_args = dict(num_warmup_steps=1000)
save_model_epochs=1
save_model_steps=2000

log_interval = 20
eval_sampling_steps = 200
work_dir = 'output/debug'


================================================
FILE: PixArt-alpha-ToCa/diffusion/__init__.py
================================================
# Modified from OpenAI's diffusion repos
#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py

from .iddpm import IDDPM
from .dpm_solver import DPMS
from .sa_sampler import SASolverSampler


================================================
FILE: PixArt-alpha-ToCa/diffusion/data/__init__.py
================================================
from .datasets import *
from .transforms import get_transform


================================================
FILE: PixArt-alpha-ToCa/diffusion/data/builder.py
================================================
import os
import time

from mmcv import Registry, build_from_cfg
from torch.utils.data import DataLoader

from diffusion.data.transforms import get_transform
from diffusion.utils.logger import get_root_logger

# Registry that dataset classes add themselves to via
# `@DATASETS.register_module()`; `build_dataset` instantiates from it.
DATASETS = Registry('datasets')

# Root directory that `get_data_path` joins relative dataset paths onto.
# Change it at runtime with `set_data_root`.
DATA_ROOT = '/cache/data'


def set_data_root(data_root):
    """Replace the module-level ``DATA_ROOT`` used to resolve relative dataset paths.

    Args:
        data_root: New root directory; stored as given, without validation.
    """
    globals()['DATA_ROOT'] = data_root


def get_data_path(data_dir):
    """Resolve a dataset directory against the module-level ``DATA_ROOT``.

    Args:
        data_dir: Absolute or relative directory path.

    Returns:
        ``data_dir`` unchanged when it is absolute, otherwise
        ``DATA_ROOT`` joined with ``data_dir``.
    """
    return data_dir if os.path.isabs(data_dir) else os.path.join(DATA_ROOT, data_dir)


def build_dataset(cfg, resolution=224, **kwargs):
    """Instantiate the dataset described by ``cfg`` from the ``DATASETS`` registry.

    Args:
        cfg: Dataset config mapping. Its ``'transform'`` entry (default
            ``'default_train'``) is popped from ``cfg`` and resolved with
            ``get_transform``; ``cfg`` is therefore modified in place.
        resolution: Passed both to ``get_transform`` and to the dataset.
        **kwargs: Extra default constructor arguments for the dataset.

    Returns:
        The constructed dataset object.
    """
    logger = get_root_logger()

    dataset_type = cfg.get('type')
    logger.info(f"Constructing dataset {dataset_type}...")
    started_at = time.time()

    # The config carries the transform by name; swap it for the real pipeline.
    transform_name = cfg.pop('transform', 'default_train')
    default_args = dict(transform=get_transform(transform_name, resolution), resolution=resolution, **kwargs)
    dataset = build_from_cfg(cfg, DATASETS, default_args=default_args)

    elapsed = time.time() - started_at
    logger.info(f"Dataset {dataset_type} constructed. time: {elapsed:.2f} s, length (use/ori): {len(dataset)}/{dataset.ori_imgs_nums}")
    return dataset


def build_dataloader(dataset, batch_size=256, num_workers=4, shuffle=True, **kwargs):
    """Wrap ``dataset`` in a ``DataLoader`` with pinned memory.

    Args:
        dataset: Dataset to iterate.
        batch_size: Batch size; ignored when a ``batch_sampler`` is given.
        num_workers: Number of loader worker processes.
        shuffle: Shuffle flag; ignored when a ``batch_sampler`` is given.
        **kwargs: If it contains ``'batch_sampler'``, only that sampler is
            used and every other extra keyword is dropped; otherwise all
            extras are forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    if 'batch_sampler' in kwargs:
        # A batch sampler dictates batching and ordering on its own.
        return DataLoader(
            dataset,
            batch_sampler=kwargs['batch_sampler'],
            num_workers=num_workers,
            pin_memory=True,
        )

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=True,
        **kwargs
    )


================================================
FILE: PixArt-alpha-ToCa/diffusion/data/datasets/Dreambooth.py
================================================
from PIL import Image
import numpy as np
import torch
from torchvision.datasets.folder import default_loader, IMG_EXTENSIONS
from torch.utils.data import Dataset
from diffusers.utils.torch_utils import randn_tensor
from torchvision import transforms as T
import pathlib
from diffusers.models import AutoencoderKL

from diffusion.data.builder import get_data_path, DATASETS
from diffusion.data.datasets.utils import *

# File suffixes (without the leading dot) that DreamBooth globs for in its image folder.
IMAGE_EXTENSIONS = {'bmp', 'jpg', 'jpeg', 'pgm', 'png', 'ppm', 'tif', 'tiff', 'webp', 'JPEG'}


@DATASETS.register_module()
class DreamBooth(Dataset):
    """Small image-folder dataset whose images are VAE-encoded once at construction.

    All files in ``root`` with a suffix from ``IMAGE_EXTENSIONS`` are loaded,
    transformed, stacked and encoded; ``__getitem__`` then serves the cached
    latents together with a fixed ``data_info`` dict.

    Args:
        root: Image folder, resolved through ``get_data_path``.
        transform: Callable turning a loaded image into a tensor. Required,
            because the images must be stacked into a batch for the VAE.
        resolution: Value reported in ``data_info['img_hw']`` and
            ``get_data_info``.
        **kwargs: Must contain ``aspect_ratio_type``, the name of an
            aspect-ratio table such as ``'ASPECT_RATIO_1024'``.

    Raises:
        KeyError: If ``aspect_ratio_type`` is missing.
        RuntimeError: If ``transform`` is ``None`` or no image is found.
    """

    def __init__(self,
                 root,
                 transform=None,
                 resolution=1024,
                 **kwargs):
        self.root = get_data_path(root)
        path = pathlib.Path(self.root)
        self.transform = transform
        self.resolution = resolution
        self.img_samples = sorted(
            [file for ext in IMAGE_EXTENSIONS for file in path.glob(f'*.{ext}')]
        )
        self.ori_imgs_nums = len(self)
        self.loader = default_loader
        self.base_size = int(kwargs['aspect_ratio_type'].split('_')[-1])
        # NOTE(review): eval() on a config-provided string executes arbitrary
        # code; only pass trusted table names such as 'ASPECT_RATIO_1024'.
        self.aspect_ratio = eval(kwargs.pop('aspect_ratio_type'))       # base aspect ratio
        self.ratio_nums = {float(k): 0 for k in self.aspect_ratio}      # used for batch-sampler
        self.data_info = {'img_hw': torch.tensor([resolution, resolution], dtype=torch.float32), 'aspect_ratio': 1.}

        # Fail early with an explicit message instead of letting torch.stack
        # choke on an empty list after the VAE has already been loaded.
        if self.transform is None:
            raise RuntimeError('DreamBooth requires a transform that converts images to tensors.')
        if not self.img_samples:
            raise RuntimeError(f'No images with a supported extension found in {self.root}.')

        # image related
        with torch.inference_mode():
            vae = AutoencoderKL.from_pretrained("output/pretrained_models/sd-vae-ft-ema")
            imgs = [self.transform(self.loader(img_path)) for img_path in self.img_samples]
            # Every image is counted in the square (1.0) bucket.
            self.ratio_nums[1.0] += len(imgs)
            imgs = torch.stack(imgs, dim=0)
            self.img_vae = vae.encode(imgs).latent_dist.sample()
            del vae

    def __getitem__(self, index):
        """Return ``(latent, data_info)`` for the image at ``index``."""
        return self.img_vae[index], self.data_info

    @staticmethod
    def vae_feat_loader(path):
        """Load a saved ``[mean, std]`` array and draw ``mean + std * noise``."""
        # [mean, std]
        mean, std = torch.from_numpy(np.load(path)).chunk(2)
        sample = randn_tensor(mean.shape, generator=None, device=mean.device, dtype=mean.dtype)
        return mean + std * sample

    def load_ori_img(self, img_path):
        """Load an image, resize and center-crop it to 256 and return a tensor."""
        transform = T.Compose([
            T.Resize(256),
            T.CenterCrop(256),
            T.ToTensor(),
        ])
        # Use a context manager so the underlying file handle is released.
        with Image.open(img_path) as img:
            return transform(img)

    def __len__(self):
        return len(self.img_samples)

    def __getattr__(self, name):
        # Only called for attributes that normal lookup did not find:
        # expose a no-op `set_epoch` and reject everything else.
        if name == "set_epoch":
            return lambda epoch: None
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

    def get_data_info(self, idx):
        """Return the (constant) height/width of every sample."""
        return {'height': self.resolution, 'width': self.resolution}


================================================
FILE: PixArt-alpha-ToCa/diffusion/data/datasets/InternalData.py
================================================
import os
import random
from PIL import Image
import numpy as np
import torch
from torchvision.datasets.folder import default_loader, IMG_EXTENSIONS
from torch.utils.data import Dataset
from diffusers.utils.torch_utils import randn_tensor
from torchvision import transforms as T
from diffusion.data.builder import get_data_path, DATASETS
from diffusion.utils.logger import get_root_logger

import json


@DATASETS.register_module()
class InternalData(Dataset):
    """Single-resolution image/caption dataset described by JSON partition files.

    Each JSON file under ``<root>/partition`` is a list of records; records
    with ``ratio`` greater than 4 are dropped. For every remaining record the
    constructor derives the paths of the image, the caption feature (``.npz``)
    and the VAE feature (``.npy``) and keeps them in index-aligned lists.

    Args:
        root: Dataset root, resolved through ``get_data_path``.
        image_list_json: One JSON file name or a list of them.
        transform: Image transform; forced to ``None`` when
            ``load_vae_feat`` is true.
        resolution: Used in the VAE feature directory name and in
            ``data_info['img_hw']``.
        sample_subset: Optional fraction of the dataset to keep (debugging).
        load_vae_feat: Load pre-computed VAE features instead of images.
        input_size, patch_size: Only used to derive ``self.N``.
        mask_ratio, load_mask_index: Stored on the instance.
        max_length: Caption features are padded to this token length.
        config: Optional config; when given, its ``work_dir`` receives the log file.
    """

    # Per-sample lists that must stay index-aligned with `img_samples`.
    _PARALLEL_FIELDS = (
        'img_samples', 'txt_feat_samples', 'vae_feat_samples',
        'mask_index_samples', 'prompt_samples', 'meta_data_clean',
    )

    def __init__(self,
                 root,
                 image_list_json='data_info.json',
                 transform=None,
                 resolution=256,
                 sample_subset=None,
                 load_vae_feat=False,
                 input_size=32,
                 patch_size=2,
                 mask_ratio=0.0,
                 load_mask_index=False,
                 max_length=120,
                 config=None,
                 **kwargs):
        self.root = get_data_path(root)
        self.transform = transform
        self.load_vae_feat = load_vae_feat
        self.ori_imgs_nums = 0
        self.resolution = resolution
        self.N = int(resolution // (input_size // patch_size))
        self.mask_ratio = mask_ratio
        self.load_mask_index = load_mask_index
        # The misspelt attribute name is kept: the subclass in this package reads it.
        self.max_lenth = max_length
        self.meta_data_clean = []
        self.img_samples = []
        self.txt_feat_samples = []
        self.vae_feat_samples = []
        self.mask_index_samples = []
        self.prompt_samples = []

        image_list_json = image_list_json if isinstance(image_list_json, list) else [image_list_json]
        for json_file in image_list_json:
            meta_data = self.load_json(os.path.join(self.root, 'partition', json_file))
            self.ori_imgs_nums += len(meta_data)
            meta_data_clean = [item for item in meta_data if item['ratio'] <= 4]
            self.meta_data_clean.extend(meta_data_clean)
            self.img_samples.extend([os.path.join(self.root.replace('InternData', "InternImgs"), item['path']) for item in meta_data_clean])
            self.txt_feat_samples.extend([os.path.join(self.root, 'caption_feature_wmask', '_'.join(item['path'].rsplit('/', 1)).replace('.png', '.npz')) for item in meta_data_clean])
            self.vae_feat_samples.extend([os.path.join(self.root, f'img_vae_features_{resolution}resolution/noflip', '_'.join(item['path'].rsplit('/', 1)).replace('.png', '.npy')) for item in meta_data_clean])
            self.prompt_samples.extend([item['prompt'] for item in meta_data_clean])

        # Set loader and extensions
        if load_vae_feat:
            self.transform = None
            self.loader = self.vae_feat_loader
        else:
            self.loader = default_loader

        if sample_subset is not None:
            self.sample_subset(sample_subset)  # sample dataset for local debug
        logger = get_root_logger() if config is None else get_root_logger(os.path.join(config.work_dir, 'train_log.log'))
        logger.info(f"T5 max token length: {self.max_lenth}")

    def getdata(self, index):
        """Load one sample.

        Returns:
            ``(img, txt_fea, attention_mask, data_info)`` where ``img`` is a
            sampled VAE latent or the (optionally transformed) image.
        """
        img_path = self.img_samples[index]
        npz_path = self.txt_feat_samples[index]
        npy_path = self.vae_feat_samples[index]
        prompt = self.prompt_samples[index]
        data_info = {
            'img_hw': torch.tensor([self.resolution, self.resolution], dtype=torch.float32),
            'aspect_ratio': torch.tensor(1.)
        }

        img = self.loader(npy_path) if self.load_vae_feat else self.loader(img_path)
        txt_info = np.load(npz_path)
        txt_fea = torch.from_numpy(txt_info['caption_feature'])     # 1xTx4096
        attention_mask = torch.ones(1, 1, txt_fea.shape[1])     # 1x1xT
        if 'attention_mask' in txt_info.keys():
            attention_mask = torch.from_numpy(txt_info['attention_mask'])[None]
        if txt_fea.shape[1] != self.max_lenth:
            # Pad by repeating the last token feature and masking the padding.
            # A feature longer than max_lenth makes repeat() raise, so that
            # sample is rejected by the retry loop in __getitem__.
            txt_fea = torch.cat([txt_fea, txt_fea[:, -1:].repeat(1, self.max_lenth-txt_fea.shape[1], 1)], dim=1)
            attention_mask = torch.cat([attention_mask, torch.zeros(1, 1, self.max_lenth-attention_mask.shape[-1])], dim=-1)

        if self.transform:
            img = self.transform(img)

        data_info['prompt'] = prompt
        return img, txt_fea, attention_mask, data_info

    def __getitem__(self, idx):
        """Return sample ``idx``; on failure retry with random indices (20 attempts in total)."""
        for _ in range(20):
            try:
                return self.getdata(idx)
            except Exception as e:
                print(f"Error details: {str(e)}")
                idx = np.random.randint(len(self))
        raise RuntimeError('Too many bad data.')

    def get_data_info(self, idx):
        """Return the original height/width recorded in the metadata for ``idx``."""
        data_info = self.meta_data_clean[idx]
        return {'height': data_info['height'], 'width': data_info['width']}

    @staticmethod
    def vae_feat_loader(path):
        """Load a saved ``[mean, std]`` array and draw ``mean + std * noise``."""
        # [mean, std]
        mean, std = torch.from_numpy(np.load(path)).chunk(2)
        sample = randn_tensor(mean.shape, generator=None, device=mean.device, dtype=mean.dtype)
        return mean + std * sample

    def load_ori_img(self, img_path):
        """Load an image, resize and center-crop it to 256 and return a tensor."""
        transform = T.Compose([
            T.Resize(256),
            T.CenterCrop(256),
            T.ToTensor(),
        ])
        # Use a context manager so the underlying file handle is released.
        with Image.open(img_path) as img:
            return transform(img)

    def load_json(self, file_path):
        """Parse and return the JSON document stored at ``file_path``."""
        with open(file_path, 'r') as f:
            meta_data = json.load(f)

        return meta_data

    def sample_subset(self, ratio):
        """Keep a random ``ratio`` fraction of the samples.

        The same indices are applied to every per-sample list, so image,
        caption, VAE feature, prompt and metadata entries stay paired.
        Lists that are absent or not populated per sample are left alone.
        """
        total = len(self)
        sampled_idx = random.sample(range(total), int(total * ratio))
        for name in self._PARALLEL_FIELDS:
            values = getattr(self, name, None)
            if isinstance(values, list) and len(values) == total:
                setattr(self, name, [values[i] for i in sampled_idx])

    def __len__(self):
        return len(self.img_samples)

    def __getattr__(self, name):
        # Only called for attributes that normal lookup did not find:
        # expose a no-op `set_epoch` and reject everything else.
        if name == "set_epoch":
            return lambda epoch: None
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")


================================================
FILE: PixArt-alpha-ToCa/diffusion/data/datasets/InternalData_ms.py
================================================
import os
import numpy as np
import torch
import random
from torchvision.datasets.folder import default_loader
from diffusion.data.datasets.InternalData import InternalData
from diffusion.data.builder import get_data_path, DATASETS
from diffusion.utils.logger import get_root_logger
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from diffusion.data.datasets.utils import *

def get_closest_ratio(height: float, width: float, ratios: dict):
    """Pick the aspect-ratio bucket closest to ``height / width``.

    Args:
        height: Image height.
        width: Image width (must be non-zero).
        ratios: Mapping whose keys are ratios convertible with ``float`` and
            whose values are the bucket sizes.

    Returns:
        ``(ratios[key], float(key))`` for the key with the smallest absolute
        distance to ``height / width``; the first such key wins on ties.
    """
    target = height / width

    def distance(key):
        return abs(float(key) - target)

    best_key = min(ratios, key=distance)
    return ratios[best_key], float(best_key)


@DATASETS.register_module()
class InternalDataMS(InternalData):
    """Multi-scale variant of ``InternalData`` that assigns samples to aspect-ratio buckets.

    Metadata is read from ``<root>/partition_filter``. ``ratio_nums`` counts
    samples per bucket and ``ratio_index`` keeps, per bucket, indices that can
    be used as replacements when a sample fails to load.

    ``kwargs`` must contain ``aspect_ratio_type``, the name of a bucket table
    such as ``'ASPECT_RATIO_512'``; its numeric suffix becomes ``base_size``.
    """

    def __init__(self,
                 root,
                 image_list_json='data_info.json',
                 transform=None,
                 resolution=256,
                 sample_subset=None,
                 load_vae_feat=False,
                 input_size=32,
                 patch_size=2,
                 mask_ratio=0.0,
                 mask_type='null',
                 load_mask_index=False,
                 max_length=120,
                 config=None,
                 **kwargs):
        self.root = get_data_path(root)
        self.transform = transform
        self.load_vae_feat = load_vae_feat
        self.ori_imgs_nums = 0
        self.resolution = resolution
        self.N = int(resolution // (input_size // patch_size))
        self.mask_ratio = mask_ratio
        self.load_mask_index = load_mask_index
        self.mask_type = mask_type
        self.base_size = int(kwargs['aspect_ratio_type'].split('_')[-1])
        self.max_lenth = max_length
        # NOTE(review): eval() runs whatever string the config supplies;
        # only trusted table names should reach this point.
        self.aspect_ratio = eval(kwargs.pop('aspect_ratio_type'))       # base aspect ratio
        self.meta_data_clean = []
        self.img_samples = []
        self.txt_feat_samples = []
        self.vae_feat_samples = []
        self.mask_index_samples = []
        # ratio_index feeds the replacement lookup in getdata/__getitem__,
        # ratio_nums is the per-bucket counter used by the batch sampler.
        self.ratio_index = {float(key): [] for key in self.aspect_ratio.keys()}
        self.ratio_nums = {float(key): 0 for key in self.aspect_ratio.keys()}

        if not isinstance(image_list_json, list):
            image_list_json = [image_list_json]
        for json_file in image_list_json:
            meta_data = self.load_json(os.path.join(self.root, 'partition_filter', json_file))
            self.ori_imgs_nums += len(meta_data)
            kept = [item for item in meta_data if item['ratio'] <= 4]
            self.meta_data_clean.extend(kept)
            img_root = self.root.replace('InternData', "InternImgs")
            for item in kept:
                # Feature files are named after the relative image path with
                # its last '/' replaced by '_'.
                flat_name = '_'.join(item['path'].rsplit('/', 1))
                self.img_samples.append(os.path.join(img_root, item['path']))
                self.txt_feat_samples.append(os.path.join(self.root, 'caption_feature_wmask', flat_name.replace('.png', '.npz')))
                self.vae_feat_samples.append(os.path.join(self.root, f'img_vae_fatures_{resolution}_multiscale/ms', flat_name.replace('.png', '.npy')))

        # Set loader and extensions
        if load_vae_feat:
            self.transform = None
            self.loader = self.vae_feat_loader
        else:
            self.loader = default_loader

        if sample_subset is not None:
            self.sample_subset(sample_subset)  # sample dataset for local debug

        # Bucket statistics are gathered from the first third of the metadata only.
        scan_limit = len(self.meta_data_clean) // 3
        for i, info in enumerate(self.meta_data_clean[:scan_limit]):
            _, bucket = get_closest_ratio(info['height'], info['width'], self.aspect_ratio)
            self.ratio_nums[bucket] += 1
            if not self.ratio_index[bucket]:
                self.ratio_index[bucket].append(i)

        if config is None:
            logger = get_root_logger()
        else:
            logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))
        logger.info(f"T5 max token length: {self.max_lenth}")

    def getdata(self, index):
        """Load one sample and tag it with its aspect-ratio bucket.

        Returns:
            ``(img, txt_fea, attention_mask, data_info)``; ``data_info`` holds
            the original ``img_hw``, the bucket ``aspect_ratio`` and ``mask_type``.
        """
        img_path = self.img_samples[index]
        npz_path = self.txt_feat_samples[index]
        npy_path = self.vae_feat_samples[index]
        meta = self.meta_data_clean[index]
        ori_h, ori_w = meta['height'], meta['width']

        # Find the nearest bucket; its size is the resize/crop target.
        closest_size, closest_ratio = get_closest_ratio(ori_h, ori_w, self.aspect_ratio)
        closest_size = [int(side) for side in closest_size]
        # Remembered so __getitem__ can draw a replacement from the same bucket.
        self.closest_ratio = closest_ratio

        if self.load_vae_feat:
            try:
                img = self.loader(npy_path)
                bucket_members = self.ratio_index[closest_ratio]
                if index not in bucket_members:
                    bucket_members.append(index)
            except Exception:
                # Feature could not be loaded: fall back to a sample already
                # known to belong to the same bucket.
                index = random.choice(self.ratio_index[closest_ratio])
                return self.getdata(index)
            h, w = (img.shape[1], img.shape[2])
            # NOTE(review): parsed as `assert h, <message>`, so only h being
            # truthy is checked; the size comparison is never enforced.
            assert h, w == (ori_h//8, ori_w//8)
        else:
            img = self.loader(img_path)
            h, w = (img.size[1], img.size[0])
            # NOTE(review): same as above -- only h being truthy is checked.
            assert h, w == (ori_h, ori_w)

        data_info = {
            'img_hw': torch.tensor([ori_h, ori_w], dtype=torch.float32),
            'aspect_ratio': closest_ratio,
            "mask_type": self.mask_type,
        }

        txt_info = np.load(npz_path)
        txt_fea = torch.from_numpy(txt_info['caption_feature'])
        if 'attention_mask' in txt_info.keys():
            attention_mask = torch.from_numpy(txt_info['attention_mask'])[None]
        else:
            attention_mask = torch.ones(1, 1, txt_fea.shape[1])

        if not self.load_vae_feat:
            target_h, target_w = closest_size
            # Scale by the larger factor so the resized image covers the
            # bucket in both dimensions before the center crop.
            if target_h / ori_h > target_w / ori_w:
                resize_size = target_h, int(ori_w * target_h / ori_h)
            else:
                resize_size = int(ori_h * target_w / ori_w), target_w
            self.transform = T.Compose([
                T.Lambda(lambda img: img.convert('RGB')),
                T.Resize(resize_size, interpolation=InterpolationMode.BICUBIC),
                T.CenterCrop(closest_size),
                T.ToTensor(),
                T.Normalize([.5], [.5]),
            ])

        if self.transform:
            img = self.transform(img)

        return img, txt_fea, attention_mask, data_info

    def __getitem__(self, idx):
        """Return sample ``idx``; on failure retry with samples from the last seen bucket."""
        attempts_left = 20
        while attempts_left > 0:
            attempts_left -= 1
            try:
                return self.getdata(idx)
            except Exception as e:
                print(f"Error details: {str(e)}")
                idx = random.choice(self.ratio_index[self.closest_ratio])
        raise RuntimeError('Too many bad data.')


================================================
FILE: PixArt-alpha-ToCa/diffusion/data/datasets/SA.py
================================================
import os
import random
import time

import numpy as np
import torch
from torchvision.datasets.folder import default_loader, IMG_EXTENSIONS
from torch.utils.data import Dataset
from diffusers.utils.torch_utils import randn_tensor

from diffusion.data.builder import get_data_path, DATASETS


@DATASETS.register_module()
class SAM(Dataset):
    """Image/caption dataset listed by text partition files under ``<root>/partition``.

    Every line of a partition file is a sample name from which the image,
    caption feature (``.npz``) and VAE feature (``.npy``) paths are derived.

    Args:
        root: Dataset root, resolved through ``get_data_path``.
        image_list_txt: A partition file name, a list of them, or ``'all'``
            to use every file in the partition directory (sorted by name).
        transform: Image transform; forced to ``None`` when
            ``load_vae_feat`` is true.
        resolution: Reported in ``data_info['img_hw']``.
        sample_subset: Optional fraction of the dataset to keep (debugging).
        load_vae_feat: Load pre-computed VAE features instead of images.
        mask_ratio, mask_type: Stored; ``mask_type`` is returned in ``data_info``.
    """

    def __init__(self,
                 root,
                 image_list_txt='part0.txt',
                 transform=None,
                 resolution=256,
                 sample_subset=None,
                 load_vae_feat=False,
                 mask_ratio=0.0,
                 mask_type='null',
                 **kwargs):
        self.root = get_data_path(root)
        self.transform = transform
        self.load_vae_feat = load_vae_feat
        self.mask_type = mask_type
        self.mask_ratio = mask_ratio
        self.resolution = resolution
        self.img_samples = []
        self.txt_feat_samples = []
        self.vae_feat_samples = []

        partition_dir = os.path.join(self.root, 'partition')
        # Resolve 'all' BEFORE wrapping the argument in a list; comparing a
        # list to the string 'all' can never be true.
        if image_list_txt == 'all':
            partition_files = sorted(os.listdir(partition_dir))
        elif isinstance(image_list_txt, list):
            partition_files = image_list_txt
        else:
            partition_files = [image_list_txt]

        for txt in partition_files:
            with open(os.path.join(partition_dir, txt), 'r') as f:
                lines = [line.strip() for line in f]
            # The three lists are filled together so they stay index-aligned.
            self.img_samples.extend([os.path.join(self.root, 'images', i + '.jpg') for i in lines])
            self.txt_feat_samples.extend([os.path.join(self.root, 'caption_feature_wmask', i + '.npz') for i in lines])
            self.vae_feat_samples.extend([os.path.join(self.root, 'img_vae_feature/train_vae_256/noflip', i + '.npy') for i in lines])

        self.ori_imgs_nums = len(self)
        # Set loader and extensions
        if load_vae_feat:
            self.transform = None
            self.loader = self.vae_feat_loader
        else:
            self.loader = default_loader

        if sample_subset is not None:
            self.sample_subset(sample_subset)  # sample dataset for local debug

    def getdata(self, idx):
        """Load one sample and return ``(img, txt_fea, attention_mask, data_info)``."""
        img_path = self.img_samples[idx]
        npz_path = self.txt_feat_samples[idx]
        npy_path = self.vae_feat_samples[idx]
        data_info = {'img_hw': torch.tensor([self.resolution, self.resolution], dtype=torch.float32),
                     'aspect_ratio': torch.tensor(1.)}

        img = self.loader(npy_path) if self.load_vae_feat else self.loader(img_path)
        npz_info = np.load(npz_path)
        txt_fea = torch.from_numpy(npz_info['caption_feature'])
        attention_mask = torch.ones(1, 1, txt_fea.shape[1])
        if 'attention_mask' in npz_info.keys():
            attention_mask = torch.from_numpy(npz_info['attention_mask'])[None]

        if self.transform:
            img = self.transform(img)

        data_info["mask_type"] = self.mask_type

        return img, txt_fea, attention_mask, data_info

    def __getitem__(self, idx):
        """Return sample ``idx``; on failure retry with random indices (20 attempts in total)."""
        for _ in range(20):
            try:
                return self.getdata(idx)
            except Exception:
                print(self.img_samples[idx], ' info is not correct')
                idx = np.random.randint(len(self))
        raise RuntimeError('Too many bad data.')

    @staticmethod
    def vae_feat_loader(path):
        """Load a saved ``[mean, std]`` array and draw ``mean + std * noise``."""
        # [mean, std]
        mean, std = torch.from_numpy(np.load(path)).chunk(2)
        sample = randn_tensor(mean.shape, generator=None, device=mean.device, dtype=mean.dtype)
        return mean + std * sample

    def sample_subset(self, ratio):
        """Keep a random ``ratio`` fraction of the samples.

        The same indices are applied to the image, caption-feature and
        VAE-feature lists so that entries stay paired.
        """
        total = len(self)
        sampled_idx = random.sample(range(total), int(total * ratio))
        self.img_samples = [self.img_samples[i] for i in sampled_idx]
        self.txt_feat_samples = [self.txt_feat_samples[i] for i in sampled_idx]
        self.vae_feat_samples = [self.vae_feat_samples[i] for i in sampled_idx]

    def __len__(self):
        return len(self.img_samples)


================================================
FILE: PixArt-alpha-ToCa/diffusion/data/datasets/__init__.py
================================================
from .SA import SAM
from .InternalData import InternalData
from .InternalData_ms import InternalDataMS
from .Dreambooth import DreamBooth
from .pixart_control import InternalDataHed
from .utils import *


================================================
FILE: PixArt-alpha-ToCa/diffusion/data/datasets/pixart_control.py
================================================
import os
import random
from PIL import Image
import numpy as np
import torch
from torchvision.datasets.folder import default_loader, IMG_EXTENSIONS
from torch.utils.data import Dataset
from diffusers.utils.torch_utils import randn_tensor
from torchvision import transforms as T
from diffusion.data.builder import get_data_path, DATASETS

import json, time


@DATASETS.register_module()
class InternalDataHed(Dataset):
    """Image/caption dataset that additionally yields a HED feature as ``data_info['condition']``.

    Metadata is read from ``<root>/partition_filter``; records with ``ratio``
    greater than 4 are dropped. Depending on ``mode`` the dataset keeps the
    first (``'train'``) or the last (any other mode) ``train_ratio`` share of
    the samples.

    Args:
        root: Dataset root, resolved through ``get_data_path``.
        image_list_json: One JSON file name or a list of them.
        transform: Image transform; forced to ``None`` when
            ``load_vae_feat`` is true.
        resolution: Used in the VAE/HED feature directory names.
        sample_subset: Optional fraction of the dataset to keep (debugging).
        load_vae_feat: Load pre-computed VAE features instead of images.
        input_size, patch_size: Only used to derive ``self.N``.
        mask_ratio, load_mask_index: Stored on the instance.
        train_ratio: Share of samples assigned to the selected split.
        mode: ``'train'`` selects the head of the list, anything else the tail.
    """

    # Per-sample lists that must stay index-aligned with `img_samples`.
    _PARALLEL_FIELDS = (
        'img_samples', 'txt_feat_samples', 'vae_feat_samples',
        'hed_feat_samples', 'prompt_samples', 'meta_data_clean',
    )

    def __init__(self,
                 root,
                 image_list_json='data_info.json',
                 transform=None,
                 resolution=256,
                 sample_subset=None,
                 load_vae_feat=False,
                 input_size=32,
                 patch_size=2,
                 mask_ratio=0.0,
                 load_mask_index=False,
                 train_ratio=1.0,
                 mode='train',
                 **kwargs):
        self.root = get_data_path(root)
        self.transform = transform
        self.load_vae_feat = load_vae_feat
        self.ori_imgs_nums = 0
        self.resolution = resolution
        self.N = int(resolution // (input_size // patch_size))
        self.mask_ratio = mask_ratio
        self.load_mask_index = load_mask_index
        self.meta_data_clean = []
        self.img_samples = []
        self.txt_feat_samples = []
        self.vae_feat_samples = []
        self.hed_feat_samples = []
        self.prompt_samples = []

        image_list_json = image_list_json if isinstance(image_list_json, list) else [image_list_json]
        for json_file in image_list_json:
            meta_data = self.load_json(os.path.join(self.root, 'partition_filter', json_file))
            self.ori_imgs_nums += len(meta_data)
            meta_data_clean = [item for item in meta_data if item['ratio'] <= 4]
            self.meta_data_clean.extend(meta_data_clean)
            self.img_samples.extend([os.path.join(self.root.replace('InternData', "InternImgs"), item['path']) for item in meta_data_clean])
            self.txt_feat_samples.extend([os.path.join(self.root, 'caption_features', '_'.join(item['path'].rsplit('/', 1)).replace('.png', '.npz')) for item in meta_data_clean])
            self.vae_feat_samples.extend([os.path.join(self.root, f'img_vae_features_{resolution}resolution/noflip', '_'.join(item['path'].rsplit('/', 1)).replace('.png', '.npy')) for item in meta_data_clean])
            self.hed_feat_samples.extend([os.path.join(self.root, f'hed_feature_{resolution}', item['path'].replace('.png', '.npz')) for item in meta_data_clean])
            self.prompt_samples.extend([item['prompt'] for item in meta_data_clean])

        total_sample = len(self.img_samples)
        used_sample_num = int(total_sample * train_ratio)
        print("using mode", mode)
        if mode == 'train':
            keep = slice(0, used_sample_num)
        else:
            # An explicit start index avoids the `[-0:]` pitfall, which would
            # select the whole list when used_sample_num is 0.
            keep = slice(max(total_sample - used_sample_num, 0), total_sample)
        # Slice every per-sample list (metadata included) with the same
        # window so get_data_info(idx) matches getdata(idx).
        for name in self._PARALLEL_FIELDS:
            setattr(self, name, getattr(self, name)[keep])

        # Set loader and extensions
        if load_vae_feat:
            self.transform = None
            self.loader = self.vae_feat_loader
        else:
            self.loader = default_loader

        if sample_subset is not None:
            self.sample_subset(sample_subset)  # sample dataset for local debug

    def getdata(self, index):
        """Load one sample.

        Returns:
            ``(img, txt_fea, attention_mask, data_info)``; ``data_info``
            carries ``img_hw``, ``aspect_ratio``, ``condition`` (the sampled
            HED feature) and ``prompt``.
        """
        img_path = self.img_samples[index]
        npz_path = self.txt_feat_samples[index]
        npy_path = self.vae_feat_samples[index]
        hed_npz_path = self.hed_feat_samples[index]
        prompt = self.prompt_samples[index]
        # only trained on single-scale 1024 res data
        data_info = {'img_hw': torch.tensor([1024., 1024.], dtype=torch.float32), 'aspect_ratio': torch.tensor(1.)}

        if self.load_vae_feat:
            img = self.loader(npy_path)
        else:
            img = self.loader(img_path)
        hed_fea = self.vae_feat_loader_npz(hed_npz_path)
        txt_info = np.load(npz_path)
        txt_fea = torch.from_numpy(txt_info['caption_feature'])
        attention_mask = torch.ones(1, 1, txt_fea.shape[1])
        if 'attention_mask' in txt_info.keys():
            attention_mask = torch.from_numpy(txt_info['attention_mask'])[None]

        if self.transform:
            img = self.transform(img)

        data_info['condition'] = hed_fea
        data_info['prompt'] = prompt
        return img, txt_fea, attention_mask, data_info

    def __getitem__(self, idx):
        """Return sample ``idx``; on failure retry with random indices (20 attempts in total)."""
        for _ in range(20):
            try:
                return self.getdata(idx)
            except Exception as e:
                print(f"Error details: {str(e)}")
                idx = np.random.randint(len(self))
        raise RuntimeError('Too many bad data.')

    def get_data_info(self, idx):
        """Return the original height/width recorded in the metadata for ``idx``."""
        data_info = self.meta_data_clean[idx]
        return {'height': data_info['height'], 'width': data_info['width']}

    @staticmethod
    def vae_feat_loader(path):
        """Load a saved ``[mean, std]`` ``.npy`` array and draw ``mean + std * noise``."""
        # [mean, std]
        mean, std = torch.from_numpy(np.load(path)).chunk(2)
        sample = randn_tensor(mean.shape, generator=None, device=mean.device, dtype=mean.dtype)
        return mean + std * sample

    @staticmethod
    def vae_feat_loader_npz(path):
        """Same as ``vae_feat_loader`` but reads the ``'arr_0'`` entry of an ``.npz`` file."""
        # [mean, std]
        mean, std = torch.from_numpy(np.load(path)['arr_0']).chunk(2)
        sample = randn_tensor(mean.shape, generator=None, device=mean.device, dtype=mean.dtype)
        return mean + std * sample

    def load_json(self, file_path):
        """Parse and return the JSON document stored at ``file_path``."""
        with open(file_path, 'r') as f:
            meta_data = json.load(f)

        return meta_data

    def sample_subset(self, ratio):
        """Keep a random ``ratio`` fraction of the samples.

        The same indices are applied to every per-sample list so that image,
        caption, VAE/HED feature, prompt and metadata entries stay paired.
        """
        total = len(self)
        sampled_idx = random.sample(range(total), int(total * ratio))
        for name in self._PARALLEL_FIELDS:
            values = getattr(self, name)
            if len(values) == total:
                setattr(self, name, [values[i] for i in sampled_idx])

    def __len__(self):
        return len(self.img_samples)

    def __getattr__(self, name):
        # Only called for attributes that normal lookup did not find:
        # expose a no-op `set_epoch` and reject everything else.
        if name == "set_epoch":
            return lambda epoch: None
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")


================================================
FILE: PixArt-alpha-ToCa/diffusion/data/datasets/utils.py
================================================


# Aspect-ratio buckets for a 1024px base size.
# Each key is a ratio written as a string and each value is a two-element
# size whose first entry divided by the second is (approximately) the key,
# e.g. '0.25' -> 512 / 2048. Presumably [height, width] in pixels -- confirm
# against the consumers of this table.
ASPECT_RATIO_1024 = {
    '0.25': [512., 2048.], '0.26': [512., 1984.], '0.27': [512., 1920.], '0.28': [512., 1856.],
    '0.32': [576., 1792.], '0.33': [576., 1728.], '0.35': [576., 1664.], '0.4':  [640., 1600.],
    '0.42':  [640., 1536.], '0.48': [704., 1472.], '0.5': [704., 1408.], '0.52': [704., 1344.],
    '0.57': [768., 1344.], '0.6': [768., 1280.], '0.68': [832., 1216.], '0.72': [832., 1152.],
    '0.78': [896., 1152.], '0.82': [896., 1088.], '0.88': [960., 1088.], '0.94': [960., 1024.],
    '1.0':  [1024., 1024.], '1.07': [1024.,  960.], '1.13': [1088.,  960.], '1.21': [1088.,  896.],
    '1.29': [1152.,  896.], '1.38': [1152.,  832.], '1.46': [1216.,  832.], '1.67': [1280.,  768.],
    '1.75': [1344.,  768.], '2.0':  [1408.,  704.], '2.09':  [1472.,  704.], '2.4':  [1536.,  640.],
    '2.5':  [1600.,  640.], '2.89':  [1664.,  576.], '3.0':  [1728.,  576.], '3.11':  [1792.,  576.],
    '3.62':  [1856.,  512.], '3.75':  [1920.,  512.], '3.88':  [1984.,  512.], '4.0':  [2048.,  512.],
}

# Aspect-ratio buckets for a 512px base size.
# Each key is a ratio written as a string and each value is a two-element
# size whose first entry divided by the second is (approximately) the key,
# e.g. '0.25' -> 256 / 1024. Presumably [height, width] in pixels -- confirm
# against the consumers of this table.
ASPECT_RATIO_512 = {
     '0.25': [256.0, 1024.0], '0.26': [256.0, 992.0], '0.27': [256.0, 960.0], '0.28': [256.0, 928.0],
     '0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0],
     '0.42': [320.0, 768.0], '0.48': [352.0, 736.0], '0.5': [352.0, 704.0], '0.52': [352.0, 672.0],
     '0.57': [384.0, 672.0], '0.6': [384.0, 640.0], '0.68': [416.0, 608.0], '0.72': [416.0, 576.0],
     '0.78': [448.0, 576.0], '0.82': [448.0, 544.0], '0.88': [480.0, 544.0], '0.94': [480.0, 512.0],
     '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], '1.13': [544.0, 480.0], '1.21': [544.0, 448.0],
     '1.29': [576.0, 448.0], '1.38': [576.0, 416.0], '1.46': [608.0, 416.0], '1.67': [640.0, 384.0],
     '1.75': [672.0, 384.0], '2.0': [704.0, 352.0], '2.09': [736.0, 352.0], '2.4': [768.0, 320.0],
     '2.5': [800.0, 320.0], '2.89': [832.0, 288.0], '3.0': [864.0, 288.0], '3.11': [896.0, 288.0],
     '3.62': [928.0, 256.0], '3.75': [960.0, 256.0], '3.88': [992.0, 256.0], '4.0': [1024.0, 256.0]
     }

# Aspect-ratio buckets for a 256px base size.
# Each key is a ratio written as a string and each value is a two-element
# size whose first entry divided by the second is (approximately) the key,
# e.g. '0.25' -> 128 / 512. Presumably [height, width] in pixels -- confirm
# against the consumers of this table.
ASPECT_RATIO_256 = {
     '0.25': [128.0, 512.0], '0.26': [128.0, 496.0], '0.27': [128.0, 480.0], '0.28': [128.0, 464.0],
     '0.32': [144.0, 448.0], '0.33': [144.0, 432.0], '0.35': [144.0, 416.0], '0.4': [160.0, 400.0],
     '0.42': [160.0, 384.0], '0.48': [176.0, 368.0], '0.5': [176.0, 352.0], '0.52': [176.0, 336.0],
     '0.57': [192.0, 336.0], '0.6': [192.0, 320.0], '0.68': [208.0, 304.0], '0.72': [208.0, 288.0],
     '0.78': [224.0, 288.0], '0.82': [224.0, 272.0], '0.88': [240.0, 272.0], '0.94': [240.0, 256.0],
     '1.0': [256.0, 256.0], '1.07': [256.0, 240.0], '1.13': [272.0, 240.0], '1.21': [272.0, 224.0],
     '1.29': [288.0, 224.0], '1.38': [288.0, 208.0], '1.46': [304.0, 208.0], '1.67': [320.0, 192.0],
     '1.75': [336.0, 192.0], '2.0': [352.0, 176.0], '2.09': [368.0, 176.0], '2.4': [384.0, 160.0],
     '2.5': [400.0, 160.0], '2.89': [416.0, 144.0], '3.0': [432.0, 144.0], '3.11': [448.0, 144.0],
     '3.62': [464.0, 128.0], '3.75': [480.0, 128.0], '3.88': [496.0, 128.0], '4.0': [512.0, 128.0]
}

# Reduced 256px bucket table: the same entries as ASPECT_RATIO_256 without
# the 0.26, 0.27, 2.89, 3.11, 3.62, 3.75 and 3.88 buckets.
# NOTE(review): the `_TEST` suffix suggests use at test/inference time -- confirm.
ASPECT_RATIO_256_TEST = {
     '0.25': [128.0, 512.0], '0.28': [128.0, 464.0],
     '0.32': [144.0, 448.0], '0.33': [144.0, 432.0], '0.35': [144.0, 416.0], '0.4': [160.0, 400.0],
     '0.42': [160.0, 384.0], '0.48': [176.0, 368.0], '0.5': [176.0, 352.0], '0.52': [176.0, 336.0],
     '0.57': [192.0, 336.0], '0.6': [192.0, 320.0], '0.68': [208.0, 304.0], '0.72': [208.0, 288.0],
     '0.78': [224.0, 288.0], '0.82': [224.0, 272.0], '0.88': [240.0, 272.0], '0.94': [240.0, 256.0],
     '1.0': [256.0, 256.0], '1.07': [256.0, 240.0], '1.13': [272.0, 240.0], '1.21': [272.0, 224.0],
     '1.29': [288.0, 224.0], '1.38': [288.0, 208.0], '1.46': [304.0, 208.0], '1.67': [320.0, 192.0],
     '1.75': [336.0, 192.0], '2.0': [352.0, 176.0], '2.09': [368.0, 176.0], '2.4': [384.0, 160.0],
     '2.5': [400.0, 160.0], '3.0': [432.0, 144.0],
     '4.0': [512.0, 128.0]
}

# Reduced 512px bucket table: the same entries as ASPECT_RATIO_512 without
# the 0.26, 0.27, 2.89, 3.11, 3.62, 3.75 and 3.88 buckets.
# NOTE(review): the `_TEST` suffix suggests use at test/inference time -- confirm.
ASPECT_RATIO_512_TEST = {
     '0.25': [256.0, 1024.0], '0.28': [256.0, 928.0],
     '0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0],
     '0.42': [320.0, 768.0], '0.48': [352.0, 736.0], '0.5': [352.0, 704.0], '0.52': [352.0, 672.0],
     '0.57': [384.0, 672.0], '0.6': [384.0, 640.0], '0.68': [416.0, 608.0], '0.72': [416.0, 576.0],
     '0.78': [448.0, 576.0], '0.82': [448.0, 544.0], '0.88': [480.0, 544.0], '0.94': [480.0, 512.0],
     '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], '1.13': [544.0, 480.0], '1.21': [544.0, 448.0],
     '1.29': [576.0, 448.0], '1.38': [576.0, 416.0], '1.46': [608.0, 416.0], '1.67': [640.0, 384.0],
     '1.75': [672.0, 384.0], '2.0': [704.0, 352.0], '2.09': [736.0, 352.0], '2.4': [768.0, 320.0],
     '2.5': [800.0, 320.0], '3.0': [864.0, 288.0],
     '4.0': [1024.0, 256.0]
     }

# Reduced 1024px bucket table: the same entries as ASPECT_RATIO_1024 without
# the 0.26, 0.27, 2.89, 3.11, 3.62, 3.75 and 3.88 buckets.
# NOTE(review): the `_TEST` suffix suggests use at test/inference time -- confirm.
ASPECT_RATIO_1024_TEST = {
    '0.25': [512., 2048.], '0.28': [512., 1856.],
    '0.32': [576., 1792.], '0.33': [576., 1728.], '0.35': [576., 1664.], '0.4':  [640., 1600.],
    '0.42':  [640., 1536.], '0.48': [704., 1472.], '0.5': [704., 1408.], '0.52': [704., 1344.],
    '0.57': [768., 1344.], '0.6': [768., 1280.], '0.68': [832., 1216.], '0.72': [832., 1152.],
    '0.78': [896., 1152.], '0.82': [896., 1088.], '0.88': [960., 1088.], '0.94': [960., 1024.],
    '1.0':  [1024., 1024.], '1.07': [1024.,  960.], '1.13': [1088.,  960.], '1.21': [1088.,  896.],
    '1.29': [1152.,  896.], '1.38': [1152.,  832.], '1.46': [1216.,  832.], '1.67': [1280.,  768.],
    '1.75': [1344.,  768.], '2.0':  [1408.,  704.], '2.09':  [1472.,  704.], '2.4':  [1536.,  640.],
    '2.5':  [1600.,  640.], '3.0':  [1728.,  576.],
    '4.0':  [2048.,  512.],
}


def get_chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` with at most ``n`` items each.

    Args:
        lst: A sliceable sequence supporting ``len()``.
        n: Slice length; it is used as the ``range`` step.

    Yields:
        ``lst[start:start + n]`` for ``start = 0, n, 2n, ...``; the final slice
        is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))


================================================
FILE: PixArt-alpha-ToCa/diffusion/data/transforms.py
================================================
import torchvision.transforms as T

# Registry mapping a transform factory's ``__name__`` to the factory itself.
TRANSFORMS = {}


def register_transform(transform):
    """Register ``transform`` in ``TRANSFORMS`` under its ``__name__``.

    Intended for use as a decorator.  The callable is returned unchanged so the
    decorated module-level name keeps referring to the function instead of
    being rebound to ``None``.

    Args:
        transform: Callable to register; its ``__name__`` is the registry key.

    Returns:
        The same ``transform`` object.

    Raises:
        RuntimeError: If a transform with the same name is already registered.
    """
    name = transform.__name__
    if name in TRANSFORMS:
        raise RuntimeError(f'Transform {name} has already registered.')
    TRANSFORMS[name] = transform
    return transform


def get_transform(type, resolution):
    """Build the composed transform registered under ``type``.

    The factory stored in ``TRANSFORMS[type]`` is called with ``resolution``
    and its result is wrapped in ``T.Compose``.  The resolution is also
    recorded on the returned object as the ``image_size`` attribute.

    Args:
        type: Registry key (the registered factory's function name).
        resolution: Value forwarded to the factory.

    Returns:
        The ``T.Compose`` pipeline with ``image_size`` set.
    """
    pipeline = T.Compose(TRANSFORMS[type](resolution))
    pipeline.image_size = resolution
    return pipeline


@register_transform
def default_train(n_px):
    """Return the default list of training transforms for size ``n_px``.

    The steps, in order: convert the image to RGB, resize to ``n_px``,
    center-crop to ``n_px``, convert to a tensor, and normalize with mean 0.5
    and std 0.5.  Random horizontal flipping is intentionally not part of the
    list.
    """
    to_rgb = T.Lambda(lambda img: img.convert('RGB'))
    resize = T.Resize(n_px)  # Image.BICUBIC
    crop = T.CenterCrop(n_px)
    normalize = T.Normalize([0.5], [0.5])
    return [to_rgb, resize, crop, T.ToTensor(), normalize]


================================================
FILE: PixArt-alpha-ToCa/diffusion/dpm_solver.py
================================================
import torch
from .model import gaussian_diffusion as gd
from .model.dpm_solver import model_wrapper, DPM_Solver, NoiseScheduleVP


def DPMS(model, condition, uncondition, cfg_scale, model_type='noise', noise_schedule="linear", guidance_type='classifier-free', model_kwargs=None, diffusion_steps=1000):
    """Create a DPM-Solver++ sampler around ``model``.

    Args:
        model: The discrete-time diffusion model to wrap.
        condition: Conditioning passed to ``model_wrapper`` as ``condition``.
        uncondition: Passed to ``model_wrapper`` as ``unconditional_condition``.
        cfg_scale: Passed to ``model_wrapper`` as ``guidance_scale``.
        model_type: Prediction type forwarded to ``model_wrapper``.
        noise_schedule: Name of the beta schedule looked up through
            ``gd.get_named_beta_schedule``.
        guidance_type: Guidance mode forwarded to ``model_wrapper``.
        model_kwargs: Extra keyword arguments for the model; ``None`` is
            treated as an empty dict.
        diffusion_steps: Number of betas requested from the named schedule.

    Returns:
        A ``DPM_Solver`` built with ``algorithm_type="dpmsolver++"``.
    """
    extra_model_kwargs = {} if model_kwargs is None else model_kwargs
    beta_values = torch.tensor(gd.get_named_beta_schedule(noise_schedule, diffusion_steps))

    # Discrete VP noise schedule built from the named betas.
    vp_schedule = NoiseScheduleVP(schedule='discrete', betas=beta_values)

    # Wrap the discrete-time model as the continuous-time function the solver expects.
    wrapped_model = model_wrapper(
        model,
        vp_schedule,
        model_type=model_type,
        model_kwargs=extra_model_kwargs,
        guidance_type=guidance_type,
        condition=condition,
        unconditional_condition=uncondition,
        guidance_scale=cfg_scale,
    )

    return DPM_Solver(wrapped_model, vp_schedule, algorithm_type="dpmsolver++")

================================================
FILE: PixArt-alpha-ToCa/diffusion/iddpm.py
================================================
# Modified from OpenAI's diffusion repos
#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
from diffusion.model.respace import SpacedDiffusion, space_timesteps
from .model import gaussian_diffusion as gd


def IDDPM(
        timestep_respacing,
        noise_schedule="linear",
        use_kl=False,
        sigma_small=False,
        predict_xstart=False,
        learn_sigma=True,
        pred_sigma=True,
        rescale_learned_sigmas=False,
        diffusion_steps=1000,
        snr=False,
        return_startx=False,
):
    """Build a ``SpacedDiffusion`` configured from simple flags.

    Args:
        timestep_respacing: Respacing spec for ``space_timesteps``; ``None``
            or ``""`` selects all ``diffusion_steps`` steps.
        noise_schedule: Name of the beta schedule.
        use_kl: Use the rescaled-KL loss (takes precedence over
            ``rescale_learned_sigmas``).
        sigma_small: With fixed variance, pick FIXED_SMALL instead of
            FIXED_LARGE.
        predict_xstart: Model predicts x_0 (START_X) instead of epsilon.
        learn_sigma: Use the LEARNED_RANGE variance type.
        pred_sigma: When false, ``model_var_type`` is ``None``.
        rescale_learned_sigmas: Use the rescaled-MSE loss.
        diffusion_steps: Number of steps in the base schedule.
        snr: Forwarded to ``SpacedDiffusion``.
        return_startx: Forwarded to ``SpacedDiffusion``.

    Returns:
        The configured ``SpacedDiffusion`` instance.
    """
    betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)

    # Loss: KL wins over rescaled MSE, which wins over plain MSE.
    loss_type = (
        gd.LossType.RESCALED_KL if use_kl
        else gd.LossType.RESCALED_MSE if rescale_learned_sigmas
        else gd.LossType.MSE
    )

    if timestep_respacing is None or timestep_respacing == "":
        timestep_respacing = [diffusion_steps]
    selected_steps = space_timesteps(diffusion_steps, timestep_respacing)

    if predict_xstart:
        mean_type = gd.ModelMeanType.START_X
    else:
        mean_type = gd.ModelMeanType.EPSILON

    if not pred_sigma:
        var_type = None
    elif learn_sigma:
        var_type = gd.ModelVarType.LEARNED_RANGE
    elif sigma_small:
        var_type = gd.ModelVarType.FIXED_SMALL
    else:
        var_type = gd.ModelVarType.FIXED_LARGE

    return SpacedDiffusion(
        use_timesteps=selected_steps,
        betas=betas,
        model_mean_type=mean_type,
        model_var_type=var_type,
        loss_type=loss_type,
        snr=snr,
        return_startx=return_startx,
        # rescale_timesteps=rescale_timesteps,
    )

================================================
FILE: PixArt-alpha-ToCa/diffusion/lcm_scheduler.py
================================================
# Copyright 2023 Stanford University Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
# and https://github.com/hojonathanho/diffusion

import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import numpy as np
import torch

from diffusers import ConfigMixin, SchedulerMixin
from diffusers.configuration_utils import register_to_config
from diffusers.utils import BaseOutput


@dataclass
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
class LCMSchedulerOutput(BaseOutput):
    """
    Output class for the scheduler's `step` function output.
    Args:
        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
            denoising loop.
        denoised (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images, *optional*):
            The denoised estimate for the current timestep, i.e. the value `LCMScheduler.step` computes as
            `c_out * pred_x0 + c_skip * sample`. Defaults to `None`.
    """

    prev_sample: torch.FloatTensor
    denoised: Optional[torch.FloatTensor] = None


# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
        num_diffusion_timesteps,
        max_beta=0.999,
        alpha_transform_type="cosine",
):
    """
    Discretize a continuous alpha_bar(t) function, t in [0, 1], into a beta schedule.

    alpha_bar(t) is the cumulative product of (1 - beta) up to time t. For step `i` of `N` the beta is
    `1 - alpha_bar((i + 1) / N) / alpha_bar(i / N)`, capped at `max_beta`.
    Args:
        num_diffusion_timesteps (`int`): the number of betas to produce.
        max_beta (`float`): upper bound applied to every beta; values below 1 prevent singularities.
        alpha_transform_type (`str`, *optional*, default to `cosine`): which alpha_bar function to use,
            either `cosine` or `exp`.
    Returns:
        betas (`torch.Tensor`): float32 tensor holding `num_diffusion_timesteps` betas.
    Raises:
        ValueError: if `alpha_transform_type` is neither `cosine` nor `exp`.
    """

    def _cosine_alpha_bar(t):
        return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    def _exp_alpha_bar(t):
        return math.exp(t * -12.0)

    if alpha_transform_type == "cosine":
        alpha_bar_fn = _cosine_alpha_bar
    elif alpha_transform_type == "exp":
        alpha_bar_fn = _exp_alpha_bar
    else:
        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    total = num_diffusion_timesteps
    schedule = [
        min(1 - alpha_bar_fn((idx + 1) / total) / alpha_bar_fn(idx / total), max_beta)
        for idx in range(total)
    ]
    return torch.tensor(schedule, dtype=torch.float32)


def rescale_zero_terminal_snr(betas):
    """
    Rescale a beta schedule so the terminal SNR is zero, following Algorithm 1 of
    https://arxiv.org/pdf/2305.08891.pdf
    Args:
        betas (`torch.FloatTensor`):
            the betas that the scheduler is being initialized with. The input tensor is not modified.
    Returns:
        `torch.FloatTensor`: rescaled betas with zero terminal SNR
    """
    # betas -> sqrt(alpha_bar)
    sqrt_alpha_bar = torch.cumprod(1.0 - betas, dim=0).sqrt()
    first_value = sqrt_alpha_bar[0]
    last_value = sqrt_alpha_bar[-1]

    # Shift so the last entry becomes zero, then scale so the first entry keeps its old value.
    rescaled = (sqrt_alpha_bar - last_value) * (first_value / (first_value - last_value))

    # sqrt(alpha_bar) -> alpha_bar -> per-step alphas (undoing the cumprod) -> betas
    alpha_bar = rescaled ** 2
    per_step_alphas = torch.cat([alpha_bar[:1], alpha_bar[1:] / alpha_bar[:-1]])
    return 1 - per_step_alphas


class LCMScheduler(SchedulerMixin, ConfigMixin):
    """
    `LCMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
    non-Markovian guidance.
    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
    methods the library implements for all schedulers such as loading and saving.

    Note: `clip_sample`, `clip_sample_range`, `steps_offset`, `timestep_spacing` and `thresholding` are accepted and
    stored in the config, but `set_timesteps` and `step` of this implementation do not read them.
    Args:
        num_train_timesteps (`int`, defaults to 1000):
            The number of diffusion steps to train the model.
        beta_start (`float`, defaults to 0.0001):
            The starting `beta` value of inference.
        beta_end (`float`, defaults to 0.02):
            The final `beta` value.
        beta_schedule (`str`, defaults to `"linear"`):
            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
        trained_betas (`np.ndarray`, *optional*):
            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
        clip_sample (`bool`, defaults to `True`):
            Clip the predicted sample for numerical stability.
        clip_sample_range (`float`, defaults to 1.0):
            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
        set_alpha_to_one (`bool`, defaults to `True`):
            Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
            otherwise it uses the alpha value at step 0.
        steps_offset (`int`, defaults to 0):
            An offset added to the inference steps. You can use a combination of `offset=1` and
            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
            Diffusion.
        prediction_type (`str`, defaults to `epsilon`, *optional*):
            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
            Video](https://imagen.research.google/video/paper.pdf) paper).
        thresholding (`bool`, defaults to `False`):
            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
            as Stable Diffusion.
        dynamic_thresholding_ratio (`float`, defaults to 0.995):
            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
        sample_max_value (`float`, defaults to 1.0):
            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
        timestep_spacing (`str`, defaults to `"leading"`):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        rescale_betas_zero_snr (`bool`, defaults to `False`):
            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
            dark samples instead of limiting it to samples with medium brightness. Loosely related to
            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
    """

    # _compatibles = [e.name for e in KarrasDiffusionSchedulers]
    order = 1

    @register_to_config
    def __init__(
            self,
            num_train_timesteps: int = 1000,
            beta_start: float = 0.0001,
            beta_end: float = 0.02,
            beta_schedule: str = "linear",
            trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
            clip_sample: bool = True,
            set_alpha_to_one: bool = True,
            steps_offset: int = 0,
            prediction_type: str = "epsilon",
            thresholding: bool = False,
            dynamic_thresholding_ratio: float = 0.995,
            clip_sample_range: float = 1.0,
            sample_max_value: float = 1.0,
            timestep_spacing: str = "leading",
            rescale_betas_zero_snr: bool = False,
    ):
        if trained_betas is not None:
            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
        elif beta_schedule == "linear":
            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
        elif beta_schedule == "scaled_linear":
            # this schedule is very specific to the latent diffusion model.
            self.betas = (
                    torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_train_timesteps, dtype=torch.float32) ** 2
            )
        elif beta_schedule == "squaredcos_cap_v2":
            # Glide cosine schedule
            self.betas = betas_for_alpha_bar(num_train_timesteps)
        else:
            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")

        # Rescale for zero SNR
        if rescale_betas_zero_snr:
            self.betas = rescale_zero_terminal_snr(self.betas)

        self.alphas = 1.0 - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)

        # At every step in ddim, we are looking into the previous alphas_cumprod
        # For the final step, there is no previous alphas_cumprod because we are already at 0
        # `set_alpha_to_one` decides whether we set this parameter simply to one or
        # whether we use the final alpha of the "non-previous" one.
        self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]

        # standard deviation of the initial noise distribution
        self.init_noise_sigma = 1.0

        # setable values
        self.num_inference_steps = None
        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))

    def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
        """
        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
        current timestep. This scheduler needs no scaling, so the sample is returned unchanged.
        Args:
            sample (`torch.FloatTensor`):
                The input sample.
            timestep (`int`, *optional*):
                The current timestep in the diffusion chain.
        Returns:
            `torch.FloatTensor`:
                The input sample, unmodified.
        """
        return sample

    def _get_variance(self, timestep, prev_timestep):
        """Return the DDIM-style variance term for the transition `timestep -> prev_timestep`."""
        alpha_prod_t = self.alphas_cumprod[timestep]
        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        return (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)

    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
    def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
        """
        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
        photorealism as well as better image-text alignment, especially when using very large guidance weights."
        https://arxiv.org/abs/2205.11487
        """
        dtype = sample.dtype
        batch_size, channels, height, width = sample.shape

        if dtype not in (torch.float32, torch.float64):
            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half

        # Flatten sample for doing quantile calculation along each image
        sample = sample.reshape(batch_size, channels * height * width)

        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"

        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
        s = torch.clamp(
            s, min=1, max=self.config.sample_max_value
        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]

        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"

        sample = sample.reshape(batch_size, channels, height, width)
        sample = sample.to(dtype)

        return sample

    def set_timesteps(self, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None):
        """
        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
        Args:
            num_inference_steps (`int`):
                The number of diffusion steps used when generating samples with a pre-trained model. Must be in
                `[1, lcm_origin_steps]`.
            lcm_origin_steps (`int`):
                Number of evenly spaced steps of the LCM training schedule from which the inference steps are
                sub-sampled. Must be in `[1, num_train_timesteps]`.
            device (`str` or `torch.device`, *optional*):
                Device the resulting `timesteps` tensor is moved to.
        Raises:
            ValueError: If either step count is outside its valid range.
        """

        if num_inference_steps > self.config.num_train_timesteps:
            raise ValueError(
                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
                f" maximal {self.config.num_train_timesteps} timesteps."
            )

        # More origin steps than training steps would make the stride `c` zero and every timestep -1.
        if not 0 < lcm_origin_steps <= self.config.num_train_timesteps:
            raise ValueError(
                f"`lcm_origin_steps`: {lcm_origin_steps} must be in the range"
                f" [1, {self.config.num_train_timesteps}] (`self.config.num_train_timesteps`)."
            )

        # More inference steps than origin steps would make `skipping_step` zero, which is an invalid slice step.
        if not 0 < num_inference_steps <= lcm_origin_steps:
            raise ValueError(
                f"`num_inference_steps`: {num_inference_steps} must be in the range"
                f" [1, {lcm_origin_steps}] (`lcm_origin_steps`)."
            )

        self.num_inference_steps = num_inference_steps

        # LCM Timesteps Setting:  # Linear Spacing
        c = self.config.num_train_timesteps // lcm_origin_steps
        lcm_origin_timesteps = np.asarray(list(range(1, lcm_origin_steps + 1))) * c - 1  # LCM Training  Steps Schedule
        skipping_step = len(lcm_origin_timesteps) // num_inference_steps
        timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]  # LCM Inference Steps Schedule

        self.timesteps = torch.from_numpy(timesteps.copy()).to(device)

    def get_scalings_for_boundary_condition_discrete(self, t):
        """Return the `(c_skip, c_out)` boundary-condition scalings for timestep `t`.

        Also (re)sets `self.sigma_data` to 0.5 as a side effect.
        """
        self.sigma_data = 0.5  # Default: 0.5

        # By dividing 0.1: This is almost a delta function at t=0.
        c_skip = self.sigma_data ** 2 / ((t / 0.1) ** 2 + self.sigma_data ** 2)
        c_out = ((t / 0.1) / ((t / 0.1) ** 2 + self.sigma_data ** 2) ** 0.5)
        return c_skip, c_out

    def step(
            self,
            model_output: torch.FloatTensor,
            timeindex: int,
            timestep: int,
            sample: torch.FloatTensor,
            eta: float = 0.0,
            use_clipped_model_output: bool = False,
            generator=None,
            variance_noise: Optional[torch.FloatTensor] = None,
            return_dict: bool = True,
    ) -> Union[LCMSchedulerOutput, Tuple]:
        """
        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
        process from the learned model outputs (most often the predicted noise).
        Args:
            model_output (`torch.FloatTensor`):
                The direct output from learned diffusion model.
            timeindex (`int`):
                Index of the current step inside `self.timesteps`; the next entry (if any) is the step being
                propagated to.
            timestep (`int`):
                The current discrete timestep in the diffusion chain.
            sample (`torch.FloatTensor`):
                A current instance of a sample created by the diffusion process.
            eta (`float`):
                Accepted for interface compatibility; not used by this implementation.
            use_clipped_model_output (`bool`, defaults to `False`):
                Accepted for interface compatibility; not used by this implementation.
            generator (`torch.Generator`, *optional*):
                A random number generator used to draw the noise for multi-step sampling. When omitted the global
                RNG is used.
            variance_noise (`torch.FloatTensor`):
                Alternative to generating noise with `generator` by directly providing the noise itself. Takes
                precedence over `generator`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.
        Returns:
            [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
                If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
                tuple `(prev_sample, denoised)` is returned.
        Raises:
            ValueError: If `set_timesteps` has not been called, or the configured `prediction_type` is unsupported.
        """
        if self.num_inference_steps is None:
            raise ValueError(
                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
            )

        # 1. get previous step value
        prev_timeindex = timeindex + 1
        if prev_timeindex < len(self.timesteps):
            prev_timestep = self.timesteps[prev_timeindex]
        else:
            # Last step: there is no further entry, so the current timestep is reused.
            prev_timestep = timestep

        # 2. compute alphas, betas
        alpha_prod_t = self.alphas_cumprod[timestep]
        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod

        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        # 3. Get scalings for boundary conditions
        c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)

        # 4. Different Parameterization:
        parameterization = self.config.prediction_type

        if parameterization == "epsilon":  # noise-prediction
            pred_x0 = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()

        elif parameterization == "sample":  # x-prediction
            pred_x0 = model_output

        elif parameterization == "v_prediction":  # v-prediction
            pred_x0 = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output

        else:
            # Fail loudly instead of hitting an unbound `pred_x0` below.
            raise ValueError(
                f"prediction_type given as {parameterization} must be one of `epsilon`, `sample`, or `v_prediction`"
            )

        # 5. Denoise model output using boundary conditions
        denoised = c_out * pred_x0 + c_skip * sample

        # 6. Sample z ~ N(0, I), For MultiStep Inference
        # Noise is not used for one-step sampling.
        if len(self.timesteps) > 1:
            if variance_noise is not None:
                noise = variance_noise.to(model_output.device)
            elif generator is not None:
                # Draw on the generator's own device, then move, so CPU generators work with GPU outputs.
                noise = torch.randn(
                    model_output.shape, generator=generator, device=generator.device
                ).to(model_output.device)
            else:
                noise = torch.randn(model_output.shape).to(model_output.device)
            prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
        else:
            prev_sample = denoised

        if not return_dict:
            return (prev_sample, denoised)

        return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)

    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
    def add_noise(
            self,
            original_samples: torch.FloatTensor,
            noise: torch.FloatTensor,
            timesteps: torch.IntTensor,
    ) -> torch.FloatTensor:
        """Return `sqrt(alpha_bar_t) * original_samples + sqrt(1 - alpha_bar_t) * noise` for the given timesteps."""
        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
        timesteps = timesteps.to(original_samples.device)

        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
        # Append trailing singleton dims so the per-sample factor broadcasts over the sample dims.
        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)

        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)

        return sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise

    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
    def get_velocity(
            self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor
    ) -> torch.FloatTensor:
        """Return `sqrt(alpha_bar_t) * noise - sqrt(1 - alpha_bar_t) * sample` for the given timesteps."""
        # Make sure alphas_cumprod and timestep have same device and dtype as sample
        alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype)
        timesteps = timesteps.to(sample.device)

        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
        # Append trailing singleton dims so the per-sample factor broadcasts over the sample dims.
        while len(sqrt_alpha_prod.shape) < len(sample.shape):
            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)

        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)

        return sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample

    def __len__(self):
        """Return the number of training timesteps from the config."""
        return self.config.num_train_timesteps


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/__init__.py
================================================
from .nets import *


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/builder.py
================================================
from mmcv import Registry

from diffusion.model.utils import set_grad_checkpoint

MODELS = Registry('models')


def build_model(cfg, use_grad_checkpoint=False, use_fp32_attention=False, gc_step=1, **kwargs):
    """Instantiate a model through the ``MODELS`` registry.

    Args:
        cfg: Config passed to ``MODELS.build``; a bare string is wrapped as
            ``dict(type=cfg)``.
        use_grad_checkpoint: When true, ``set_grad_checkpoint`` is applied to
            the built model.
        use_fp32_attention: Forwarded to ``set_grad_checkpoint``.
        gc_step: Forwarded to ``set_grad_checkpoint``.
        **kwargs: Passed to ``MODELS.build`` as ``default_args``.

    Returns:
        The built model.
    """
    model_cfg = dict(type=cfg) if isinstance(cfg, str) else cfg
    net = MODELS.build(model_cfg, default_args=kwargs)
    if not use_grad_checkpoint:
        return net
    set_grad_checkpoint(net, use_fp32_attention=use_fp32_attention, gc_step=gc_step)
    return net


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/cache_functions/__init__.py
================================================
from .cache_cutfresh import cache_cutfresh
from .fresh_ratio_scheduler import fresh_ratio_scheduler
from .score_evaluate import score_evaluate
from .global_force_fresh import global_force_fresh
from .cache_cutfresh import cache_cutfresh
from .update_cache import update_cache
from .force_init import force_init
from .attention import cached_attention_forward
from .cache_init import cache_init

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/cache_functions/attention.py
================================================
# Besides, re-arrange the attention module
from torch.jit import Final
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Union
from xformers.ops.fmha.attn_bias import BlockDiagonalMask
def cached_attention_forward(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_bias: Optional[Union[torch.Tensor, "BlockDiagonalMask"]] = None,
    p: float = 0.0,
    scale: Optional[float] = None
) -> "tuple[torch.Tensor, torch.Tensor]":
    """Explicit softmax attention that also returns the attention map.

    Dimensions 1 and 2 of ``query``/``key``/``value`` are swapped before the
    matmul and the output is swapped back.
    NOTE(review): this presumably mirrors the xformers
    ``(batch, tokens, heads, head_dim)`` layout -- confirm against callers.

    Args:
        query: Query tensor; its last dimension sets the default scale.
        key: Key tensor.
        value: Value tensor.
        attn_bias: Optional additive bias on the pre-softmax logits.  A plain
            tensor is added directly; any other object is expected to provide
            ``materialize(shape=..., dtype=..., device=...)``.
        p: Dropout probability applied to the attention weights.
        scale: Multiplier applied to ``query``.  Defaults to
            ``1 / sqrt(query.shape[-1])`` when ``None``.

    Returns:
        A tuple ``(out, attn_map)``: ``out`` is the attention output with
        dimensions 1 and 2 swapped back, and ``attn_map`` is the softmax map
        (before dropout) averaged over dimension 1 of the transposed layout.
    """
    # Honor a caller-supplied scale; only fall back to the default when none is given.
    if scale is None:
        scale = 1.0 / query.shape[-1] ** 0.5
    query = query * scale
    query = query.transpose(1, 2)
    key = key.transpose(1, 2)
    value = value.transpose(1, 2)
    attn = query @ key.transpose(-2, -1)
    if attn_bias is not None:
        # Mask objects must be expanded to a dense tensor; plain tensors are already dense.
        if not isinstance(attn_bias, torch.Tensor):
            attn_bias = attn_bias.materialize(shape=attn.shape, dtype=attn.dtype, device=attn.device)
        attn = attn + attn_bias
    attn_map = attn.softmax(-1)
    attn = F.dropout(attn_map, p)
    attn = attn @ value

    return attn.transpose(1, 2).contiguous(), attn_map.mean(dim=1)

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/cache_functions/cache_cutfresh.py
================================================
from .fresh_ratio_scheduler import fresh_ratio_scheduler
from .score_evaluate import score_evaluate
#from .token_merge import token_merge
import torch
def cache_cutfresh(cache_dic, tokens, current):
    '''
    Select the tokens to recompute ("fresh" tokens) and update the cache counters.

    cache_dic: dict holding the cache, the per-token counters under 'cache_index', and related information.
    tokens: torch.Tensor, the input tokens; fresh tokens are gathered along dim 1.
    current: dict with the current 'step', 'layer' and 'module'.

    Returns (fresh_indices, fresh_tokens): the indices of the selected tokens and the
    corresponding slices of ``tokens``.
    Raises ValueError when the module is not one of 'mlp', 'attn', 'cross-attn'
    (the counters have already been updated at that point).
    '''
    _step, layer, module = (current[field] for field in ('step', 'layer', 'module'))

    # Fraction of tokens to recompute, clamped into [0, 1].
    ratio = fresh_ratio_scheduler(cache_dic, current)
    ratio = torch.tensor(ratio, device=tokens.device).clamp(min=0, max=1)

    # Base score (s1, s2, s3 in the paper) plus the spatial-uniformity bonus (s4).
    ranking_score = score_evaluate(cache_dic, tokens, current)
    ranking_score = local_selection_with_bonus(ranking_score, 0.4, 4)

    # The highest-scoring tokens are the ones recomputed.
    order = torch.argsort(ranking_score, dim=-1, descending=True)
    num_fresh = int(ratio * ranking_score.shape[1])
    fresh_indices = order[:, :num_fresh]  # (B, fresh_ratio * N)

    # Cache-frequency counters: every token ages by one, fresh tokens reset to zero.
    reset_values = torch.zeros_like(fresh_indices, dtype=torch.int, device=fresh_indices.device)
    per_layer_counters = cache_dic['cache_index'][-1][layer]
    per_layer_counters[module] += 1
    per_layer_counters[module].scatter_(dim=1, index=fresh_indices, src=reset_values)
    shared_counters = cache_dic['cache_index']['layer_index']
    shared_counters[module] += 1
    shared_counters[module].scatter_(dim=1, index=fresh_indices, src=reset_values)

    gather_index = fresh_indices.unsqueeze(-1).expand(-1, -1, tokens.shape[-1])
    if module not in ['mlp', 'attn', 'cross-attn']:
        raise ValueError("Unrecognized module?", module)

    fresh_tokens = torch.gather(input=tokens, dim=1, index=gather_index)
    return fresh_indices, fresh_tokens
    
def local_selection_with_bonus(score, bonus_ratio, grid_size=2):
    '''
    Give a bonus to the best-scoring token of every grid_size x grid_size window.

    The (B, N) score is laid out as a square token map with side int(N ** 0.5)
    and tiled into non-overlapping windows. In each window, the token returned
    by max() has bonus_ratio times its own score added; every other token keeps
    its score.

    score: torch.Tensor of shape (B, N).
    bonus_ratio: factor applied to a window's maximum score to form its bonus.
    grid_size: side length of one window, in tokens.
    Returns a (B, N) tensor in the original token order.
    '''
    n_batch, n_tokens = score.shape
    side = int(n_tokens ** 0.5)
    tokens_per_window = grid_size * grid_size

    assert n_tokens % tokens_per_window == 0, "The number of tokens must be divisible by the block size."

    windows_per_side = side // grid_size

    # (B, N) -> (B, rows, g, cols, g) -> (B, rows, cols, g, g) -> (B, num_windows, g * g)
    windows = (
        score.view(n_batch, windows_per_side, grid_size, windows_per_side, grid_size)
        .permute(0, 1, 3, 2, 4)
        .contiguous()
        .view(n_batch, -1, tokens_per_window)
    )

    # Locate the winner of each window and build a one-hot mask for it.
    best_value, best_pos = windows.max(dim=-1, keepdim=True)  # both [B, num_windows, 1]
    is_best = torch.zeros_like(windows)
    is_best.scatter_(-1, best_pos, 1)

    # Only the winners receive the bonus.
    windows = windows + (is_best * best_value * bonus_ratio)

    # Undo the window grouping so tokens are back in row-major order.
    restored = (
        windows.view(n_batch, windows_per_side, windows_per_side, grid_size, grid_size)
        .permute(0, 1, 3, 2, 4)
        .contiguous()
        .view(n_batch, n_tokens)
    )
    return restored

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/cache_functions/cache_init.py
================================================
def cache_init(model_kwargs, num_steps, num_layers=28):
    '''
    Initialization for cache.

    Builds the empty per-layer containers that the other cache functions fill in,
    and copies the caching hyper-parameters out of model_kwargs.

    model_kwargs: dict, must contain 'cache_type', 'ratio_scheduler', 'fresh_ratio',
                  'fresh_threshold', 'force_fresh' and 'soft_fresh_weight'
                  (a missing key raises KeyError).
    num_steps: total number of sampling steps, stored as current['num_steps'].
    num_layers: number of layers to allocate per-layer slots for. Defaults to 28,
                the value that used to be hard-coded here.

    Returns (cache_dic, current).
    '''
    layer_ids = range(num_layers)

    # Key -1 is the only time slot used; each layer gets its own empty dict.
    cache = {-1: {j: {} for j in layer_ids}}
    cache_index = {-1: {j: {} for j in layer_ids}, 'layer_index': {}}

    cache_dic = {
        'attn_map': {-1: {j: {} for j in layer_ids}},
        'cross_attn_map': {-1: {j: {} for j in layer_ids}},
        'cache_type': model_kwargs['cache_type'],
        'cache_index': cache_index,
        'cache': cache,
        'fresh_ratio_schedule': model_kwargs['ratio_scheduler'],
        'fresh_ratio': model_kwargs['fresh_ratio'],
        'fresh_threshold': model_kwargs['fresh_threshold'],
        'force_fresh': model_kwargs['force_fresh'],
        'soft_fresh_weight': model_kwargs['soft_fresh_weight'],
    }

    current = {'num_steps': num_steps}
    return cache_dic, current
    

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/cache_functions/force_init.py
================================================
import torch
from .force_scheduler import force_scheduler
def force_init(cache_dic, current, tokens):
    '''
    Initialization for Force Activation step.

    Zeroes the per-token counter of the current layer/module, asks
    force_scheduler to refresh the scheduled threshold, and on layer 0 also
    zeroes the per-module 'layer_index' counter.
    '''
    layer_id = current['layer']
    module_name = current['module']
    counter_shape = (tokens.shape[0], tokens.shape[1])
    index_store = cache_dic['cache_index']

    index_store[-1][layer_id][module_name] = torch.zeros(*counter_shape, dtype=torch.int, device=tokens.device)

    force_scheduler(cache_dic, current)

    if layer_id == 0:
        index_store['layer_index'][module_name] = torch.zeros(*counter_shape, dtype=torch.int, device=tokens.device)

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/cache_functions/force_scheduler.py
================================================
import torch
def force_scheduler(cache_dic, current):
    '''
    Compute the step-dependent force-fresh threshold and store it in
    cache_dic['cal_threshold'] (a 0-dim tensor). Nothing is returned.
    '''
    # FORA (fresh_ratio == 0) keeps the threshold constant over the steps;
    # TokenCache scales it linearly with the sampling progress.
    linear_step_weight = 0.0 if cache_dic['fresh_ratio'] == 0 else 0.2

    progress_term = 2 * linear_step_weight * current['step'] / current['num_steps']
    step_factor = torch.tensor(1 - linear_step_weight + progress_term)

    # No force constraint is applied to sensitive steps, because the performance
    # is already good enough without it. You may give it a try.
    cache_dic['cal_threshold'] = torch.round(cache_dic['fresh_threshold'] / step_factor)

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/cache_functions/fresh_ratio_scheduler.py
================================================
import torch
def fresh_ratio_scheduler(cache_dic, current):
    '''
    Return the fresh ratio for the current step.

    The base ratio cache_dic['fresh_ratio'] is rescaled according to
    cache_dic['fresh_ratio_schedule']; an unknown schedule raises ValueError.
    '''
    base_ratio = cache_dic['fresh_ratio']
    schedule = cache_dic['fresh_ratio_schedule']
    step = current['step']
    num_steps = current['num_steps']
    threshold = cache_dic['fresh_threshold']
    weight = 0.9

    if schedule == 'constant':
        return base_ratio

    if schedule == 'linear':
        return base_ratio * (1 + weight - 2 * weight * step / num_steps)

    if schedule == 'exp':
        return base_ratio * (weight ** (step / num_steps))

    if schedule == 'linear-mode':
        # position of the step inside its threshold-long cycle, centred on zero
        mode = (step % threshold)/threshold - 0.5
        mode_weight = 0.1
        return base_ratio * (1 + weight - 2 * weight * step / num_steps + mode_weight * mode)

    if schedule == 'layerwise':
        return base_ratio * (1 + weight - 2 * weight * current['layer'] / 27)

    if schedule in ('linear-layerwise', 'ToCa'):
        # Both names select the same formula: a step factor, a layer factor and
        # a module factor multiplied together.
        step_weight = -0.9
        step_factor = 1 - step_weight + 2 * step_weight * step / num_steps

        layer_weight = 0.6
        layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27

        module_weight = 1.0
        module_time_weight = 0.6
        # This means 60*x% cross-attn computation and 160*x% mlp computation.
        # Cross-attn has the best temporal redundancy and mlp the worst, so
        # cross-attn computes less and mlp computes more.
        if current['module'] == 'cross-attn':
            module_factor = (1 - (1-module_time_weight) * module_weight)
        else:
            module_factor = (1 + module_time_weight * module_weight)

        return base_ratio * layer_factor * step_factor * module_factor

    raise ValueError("unrecognized fresh ratio schedule", schedule)


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/cache_functions/global_force_fresh.py
================================================
from .force_scheduler import force_scheduler
def global_force_fresh(cache_dic, current):
    '''
    Return whether to force fresh tokens globally.

    The first step (step == 0) always forces a refresh. With the 'global'
    strategy, every step that is a multiple of the threshold does too; the
    'local' and 'none' strategies only refresh on the first step.
    An unknown strategy raises ValueError.
    '''
    is_first_step = (current['step'] == 0)
    strategy = cache_dic['force_fresh']

    # The scheduled threshold is only read after the first step; the first
    # step falls back to the raw configured one.
    if is_first_step:
        interval = cache_dic['fresh_threshold']
    else:
        interval = cache_dic['cal_threshold']

    if strategy in ('local', 'none'):
        return is_first_step
    if strategy == 'global':
        return (is_first_step or (current['step'] % interval == 0))
    raise ValueError("unrecognized force fresh strategy", strategy)

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/cache_functions/score_evaluate.py
================================================
import torch
import torch.nn as nn
from .scores import attn_score, similarity_score, norm_score
def score_evaluate(cache_dic, tokens, current) -> torch.Tensor:
    '''
    Return the score tensor (B, N) for the given tokens.

    cache_dic: dict, reads 'cache_type' and 'force_fresh'; depending on the branch
               also 'attn_map', 'cache_index', 'fresh_threshold' and 'soft_fresh_weight'.
    tokens: torch.Tensor, indexed as (B, N, ...); only shape[0], shape[1] and the
            device are used here directly.
    current: dict with the current 'layer' and 'module'.

    Raises ValueError for an unrecognized cache_dic['cache_type'].
    '''

    #if ((not current['is_force_fresh']) and (cache_dic['force_fresh'] == 'local')):
    #    # abandoned branch, if you want to explore the local force fresh strategy, this may help.
    #    force_fresh_mask = torch.as_tensor((cache_dic['cache_index'][-1][current['layer']][current['module']] >= 2 * cache_dic['fresh_threshold']), dtype = int) # 2 because the threshold is for step, not module
    #    force_len = force_fresh_mask.sum(dim=1)
    #    force_indices = force_fresh_mask.argsort(dim = -1, descending = True)[:, :force_len.min()]
    #    force_indices = force_indices[:, torch.randperm(force_indices.shape[1])]

    # See the DiT-ToCa version for more explanation if needed.

    cache_type = cache_dic['cache_type']

    if cache_type == 'random':
        # One random score per token for half of the batch, duplicated so both
        # halves of the batch select the same tokens.
        score = torch.rand(int(tokens.shape[0]*0.5), tokens.shape[1], device=tokens.device)
        score = torch.cat([score, score], dim=0).to(tokens.device)

    elif cache_type == 'straight':
        score = torch.ones(tokens.shape[0], tokens.shape[1]).to(tokens.device)

    elif cache_type == 'attention':
        # cache_dic['attn_map'][step][layer] (B, N, N), the last dimension has been softmaxed
        score = attn_score(cache_dic, current)

    elif cache_type == 'similarity':
        score = similarity_score(cache_dic, current, tokens)

    elif cache_type == 'norm':
        score = norm_score(cache_dic, current, tokens)

    elif cache_type == 'compress':
        # Equal mix of a shared random score and the normalized attention column sums.
        score1 = torch.rand(int(tokens.shape[0]*0.5), tokens.shape[1])
        score1 = torch.cat([score1, score1], dim=0).to(tokens.device)
        score2 = cache_dic['attn_map'][-1][current['layer']].sum(dim=1)  # (B, N)
        # normalize
        score2 = score2 / score2.max(dim=1, keepdim=True)[0]
        score = 0.5 * score1 + 0.5 * score2

    else:
        # Previously an unknown type fell through and failed later with an
        # UnboundLocalError on `score`; fail early with a clear message instead.
        raise ValueError("unrecognized cache type", cache_type)

    # abandoned the branch, if you want to explore the local force fresh strategy, this may help.
    #if ((not current['is_force_fresh']) and (cache_dic['force_fresh'] == 'local')): # current['is_force_fresh'] is False, cause when it is True, no cut and fresh are needed
    #    score.scatter_(dim=1, index=force_indices, src=torch.ones_like(force_indices, dtype=torch.float32,
    #                                                                       device=force_indices.device))

    if cache_dic['force_fresh'] == 'global':
        # Tokens that have stayed cached for more steps get a proportionally larger
        # bonus. A similar per-layer term based on cache_dic['cache_index']['layer_index']
        # (divided by 27, weighted 0.1) was tried and left disabled.
        soft_step_score = cache_dic['cache_index'][-1][current['layer']][current['module']].float() / (cache_dic['fresh_threshold'])
        score = score + cache_dic['soft_fresh_weight'] * soft_step_score

    return score.to(tokens.device)

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/cache_functions/scores.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

def attn_score(cache_dic, current):
    '''
    Cross-attention entropy score (s2) for the current layer.

    Reads cache_dic['cross_attn_map'][-1][layer], whose first dimension is split
    into two equal halves (cond / uncond). Returns one score per token, repeated
    twice along the batch dimension so both halves share the same selection.
    '''
    # Note: it is important to use the same selection for cfg and no cfg, because
    # the influence of **Cross-Attention** in text-conditional models makes cfg and
    # no cfg a BIG difference.
    layer_cross_map = cache_dic['cross_attn_map'][-1][current['layer']]
    half = len(layer_cross_map) // 2
    cond_cmap, uncond_cmap = torch.split(layer_cross_map, half, dim=0)

    # Same selection for cfg and no cfg: blend the two maps before scoring.
    cond_weight = 0.5
    cmap = cond_weight * cond_cmap + (1 - cond_weight) * uncond_cmap

    # Entropy score over the last dimension of the blended map.
    cross_attention_entropy = -torch.sum(cmap * torch.log(cmap + 1e-7), dim=-1)
    # The "+ 1" does not change the sorted order, but provides stability.
    cross_attention_score = F.normalize(1 + cross_attention_entropy, dim=1, p=2)

    # In PixArt, the cross_attention_score (s2) alone is used as the score, for a
    # better text-image alignment. Combining it with a self-attention score (s1)
    # is possible; there exists a balance:
    #cross_weight = 0.0
    #score =  (1-cross_weight) * attention_score + cross_weight * cross_attention_score
    return cross_attention_score.repeat(2, 1)

def similarity_score(cache_dic, current, tokens):
    '''
    Score tokens by how far they moved from their cached version:
    1 - cosine similarity along the last dimension, L2-normalized over the last
    remaining dimension.
    '''
    cached_tokens = cache_dic['cache'][-1][current['layer']][current['module']]
    dissimilarity = 1 - F.cosine_similarity(tokens, cached_tokens, dim=-1)
    return F.normalize(dissimilarity, p=2, dim=-1)

def norm_score(cache_dic, current, tokens):
    '''
    Score tokens by the L2 norm of their last dimension, L2-normalized over the
    last remaining dimension. cache_dic and current are not used.
    '''
    token_norms = tokens.norm(p=2, dim=-1)
    return F.normalize(token_norms, p=2, dim=-1)


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/cache_functions/token_merge.py
================================================
import torch
def token_merge(cache_dic, tokens, current, fresh_indices, stale_indices):
    '''
    An abandoned branch in exploring if token merge helps. The answer is no, at least no for training-free strategy.

    For every stale token the closest fresh token is found; the stale tokens whose
    match value exceeds 0.995 (as many per sample as the sample with the fewest
    matches has) are recorded in cache_dic['merged_stale_sequence'], and the fresh
    token each of them maps to in cache_dic['merged_stale_fresh_indices'].
    '''
    if current['layer'] % 1 != 0:
        return

    hidden = tokens.shape[-1]
    fresh_tokens = torch.gather(input=tokens, dim=1, index=fresh_indices.unsqueeze(-1).expand(-1, -1, hidden))
    stale_tokens = torch.gather(input=tokens, dim=1, index=stale_indices.unsqueeze(-1).expand(-1, -1, hidden))

    method = 'similarity'
    if method == 'distance':
        # smaller is better: pick the nearest fresh token by L1 distance
        descending = False
        pairwise = torch.cdist(stale_tokens, fresh_tokens, p=1)
        match_value, match_pos = torch.min(pairwise, dim=2)
    elif method == 'similarity':
        # larger is better: pick the fresh token with the highest cosine similarity
        descending = True
        fresh_tokens = torch.nn.functional.normalize(fresh_tokens, p=2, dim=-1)
        stale_tokens = torch.nn.functional.normalize(stale_tokens, p=2, dim=-1)
        pairwise = stale_tokens @ fresh_tokens.transpose(1, 2)
        match_value, match_pos = torch.max(pairwise, dim=2)

    # Keep the same number of merged tokens for every sample in the batch.
    keep_count = int((match_value > 0.995).sum(dim=1).min())
    ranked = torch.sort(match_value, dim=1, descending=descending)[1]
    chosen = ranked[:, :keep_count]

    # Translate positions inside the stale/fresh subsets back to token indices.
    chosen_fresh_pos = match_pos.gather(1, chosen)
    cache_dic['merged_stale_fresh_indices'] = fresh_indices.gather(1, chosen_fresh_pos)
    cache_dic['merged_stale_sequence'] = stale_indices.gather(1, chosen)


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/cache_functions/update_cache.py
================================================
import torch
def update_cache(fresh_indices, fresh_tokens, cache_dic, current, fresh_attn_map=None):
    '''
    Update the cache with the fresh tokens.

    fresh_indices: torch.Tensor (B, K), token positions that were recomputed.
    fresh_tokens: torch.Tensor (B, K, C), the recomputed outputs for those positions.
    cache_dic: dict, the cache dictionary; 'cache' (and for the attention modules
               'attn_map' / 'cross_attn_map') is updated in place.
    current: dict with the current 'layer' and 'module'.
    fresh_attn_map: torch.Tensor (B, K, ...), required for 'attn' and 'cross-attn';
                    the recomputed attention rows for the fresh positions.

    Raises ValueError for an unrecognized module, or when fresh_attn_map is missing
    for a module that needs it.
    '''
    layer = current['layer']
    module = current['module']

    # Pick which attention-map cache (if any) belongs to this module. An unknown
    # module used to leave the index variable unbound and crash later on; reject
    # it up front instead.
    if module == 'attn':
        # This branch is not used in the final version, but if you explore the
        # partial fresh strategy of attention, it works (probably a few bugs).
        map_key = 'attn_map'
    elif module == 'cross-attn':
        map_key = 'cross_attn_map'
    elif module == 'mlp':
        map_key = None
    else:
        raise ValueError("Unrecognized module?", module)

    if map_key is not None:
        if fresh_attn_map is None:
            raise ValueError("fresh_attn_map is required for module", module)
        map_index = fresh_indices.unsqueeze(-1).expand(-1, -1, fresh_attn_map.shape[-1])
        cache_dic[map_key][-1][layer].scatter_(dim=1, index=map_index, src=fresh_attn_map)

    # Write the fresh token outputs into their positions of the cached tensor.
    token_index = fresh_indices.unsqueeze(-1).expand(-1, -1, fresh_tokens.shape[-1])
    cache_dic['cache'][-1][layer][module].scatter_(dim=1, index=token_index, src=fresh_tokens)
    
    
================================================
FILE: PixArt-alpha-ToCa/diffusion/model/diffusion_utils.py
================================================
# Modified from OpenAI's diffusion repos
#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py

import numpy as np
import torch as th


def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    Compute the KL divergence between two gaussians.

    Inputs broadcast against each other, so batches may be compared with
    scalars. At least one of the four arguments has to be a Tensor.
    """
    tensor = None
    for candidate in (mean1, logvar1, mean2, logvar2):
        if isinstance(candidate, th.Tensor):
            tensor = candidate
            break
    assert tensor is not None, "at least one argument must be a Tensor"

    # th.exp() needs Tensors, so promote scalar log-variances explicitly;
    # broadcasting alone would not do that.
    if not isinstance(logvar1, th.Tensor):
        logvar1 = th.tensor(logvar1, device=tensor.device)
    if not isinstance(logvar2, th.Tensor):
        logvar2 = th.tensor(logvar2, device=tensor.device)

    squared_mean_gap = (mean1 - mean2) ** 2
    return 0.5 * (
        -1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2) + squared_mean_gap * th.exp(-logvar2)
    )


def approx_standard_normal_cdf(x):
    """
    Fast tanh-based approximation of the standard normal cumulative
    distribution function, evaluated element-wise on x.
    """
    scale = np.sqrt(2.0 / np.pi)
    cubic = x + 0.044715 * th.pow(x, 3)
    return 0.5 * (1.0 + th.tanh(scale * cubic))


def continuous_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Evaluate the standard-normal log-density of x after standardizing it
    with the given means and log standard deviations.
    :param x: the targets
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    standardized = (x - means) * th.exp(-log_scales)
    unit_normal = th.distributions.Normal(th.zeros_like(x), th.ones_like(x))
    return unit_normal.log_prob(standardized)


def discretized_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Compute the log-likelihood of a Gaussian distribution discretizing to a
    given image.
    :param x: the target images. It is assumed that this was uint8 values,
              rescaled to the range [-1, 1].
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    assert x.shape == means.shape == log_scales.shape

    offset_from_mean = x - means
    inverse_std = th.exp(-log_scales)

    # CDF at the upper and lower edge of the bin around x (half width 1/255).
    upper_cdf = approx_standard_normal_cdf(inverse_std * (offset_from_mean + 1.0 / 255.0))
    lower_cdf = approx_standard_normal_cdf(inverse_std * (offset_from_mean - 1.0 / 255.0))

    # Clamp before every log so a vanishing probability cannot produce -inf.
    log_upper = th.log(upper_cdf.clamp(min=1e-12))
    log_above_lower = th.log((1.0 - lower_cdf).clamp(min=1e-12))
    log_bin_mass = th.log((upper_cdf - lower_cdf).clamp(min=1e-12))

    # The extreme bins absorb the tails: everything below the first bin's upper
    # edge, respectively everything above the last bin's lower edge.
    interior_or_top = th.where(x > 0.999, log_above_lower, log_bin_mass)
    log_probs = th.where(x < -0.999, log_upper, interior_or_top)

    assert log_probs.shape == x.shape
    return log_probs


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/dpm_solver.py
================================================
import torch
from tqdm import tqdm
from ..model.cache_functions import cache_init

class NoiseScheduleVP:
    # Noise schedule of a variance-preserving forward SDE. Exposes log(alpha_t),
    # alpha_t, sigma_t and the half-logSNR lambda_t for a continuous time label t,
    # plus the inverse map lambda -> t. See __init__ for the full description.
    def __init__(
            self,
            schedule='discrete',
            betas=None,
            alphas_cumprod=None,
            continuous_beta_0=0.1,
            continuous_beta_1=20.,
            dtype=torch.float32,
    ):
        r"""Create a wrapper class for the forward SDE (VP type).

        ***
        Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t.
                We recommend to use schedule='discrete' for the discrete-time diffusion models, especially for high-resolution images.
        ***

        The forward SDE ensures that the condition distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
        We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
        Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:

            log_alpha_t = self.marginal_log_mean_coeff(t)
            sigma_t = self.marginal_std(t)
            lambda_t = self.marginal_lambda(t)

        Moreover, as lambda(t) is an invertible function, we also support its inverse function:

            t = self.inverse_lambda(lambda_t)

        ===============================================================

        We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).

        1. For discrete-time DPMs:

            For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
                t_i = (i + 1) / N
            e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
            We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.

            Args:
                betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
                alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)

            Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
            If both are given, `betas` is used.

            **Important**:  Please pay special attention to the `alphas_cumprod` argument:
                The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that
                    q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
                Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
                    alpha_{t_n} = \sqrt{\hat{alpha_n}},
                and
                    log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).


        2. For continuous-time DPMs:

            We support the linear VPSDE for the continuous time setting. The hyperparameters for the noise
            schedule are the default settings in Yang Song's ScoreSDE:

            Args:
                continuous_beta_0: A `float` number. The smallest beta for the linear schedule.
                continuous_beta_1: A `float` number. The largest beta for the linear schedule.

            The ending time of the forward process is fixed to T = 1.

        ===============================================================

        Args:
            schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
                    'linear' for continuous-time DPMs.
            dtype: The dtype the discrete-schedule arrays (`log_alpha_array`, `t_array`) are stored in.
        Returns:
            A wrapper object of the forward SDE (VP type).
        Raises:
            ValueError: if `schedule` is neither 'discrete' nor 'linear'.

        ===============================================================

        Example:

        # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
        >>> ns = NoiseScheduleVP('discrete', betas=betas)

        # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
        >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)

        # For continuous-time DPMs (VPSDE), linear schedule:
        >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)

        """

        if schedule not in ['discrete', 'linear']:
            raise ValueError(
                f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear'"
            )

        self.schedule = schedule
        if schedule == 'discrete':
            # log(alpha_n) = 0.5 * log(alphas_cumprod_n); with betas the cumulative
            # product is taken in log space as a cumulative sum.
            if betas is not None:
                log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
            else:
                assert alphas_cumprod is not None
                log_alphas = 0.5 * torch.log(alphas_cumprod)
            self.T = 1.
            # Stored as a (1, N) row; N can be smaller than the input length
            # because numerical_clip_alpha may drop trailing entries.
            self.log_alpha_array = self.numerical_clip_alpha(log_alphas).reshape((1, -1,)).to(dtype=dtype)
            self.total_N = self.log_alpha_array.shape[1]
            # Time labels t_i = (i + 1) / N, i.e. linspace without the leading 0.
            self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1)).to(dtype=dtype)
        else:
            self.T = 1.
            self.total_N = 1000
            self.beta_0 = continuous_beta_0
            self.beta_1 = continuous_beta_1

    def numerical_clip_alpha(self, log_alphas, clipped_lambda=-5.1):
        """
        For some beta schedules such as cosine schedule, the log-SNR has numerical issues.
        We clip the log-SNR near t=T within -5.1 to ensure the stability.
        Such a trick is very useful for diffusion models with the cosine schedule, such as i-DDPM, guided-diffusion and GLIDE.

        Returns `log_alphas`, possibly with entries removed from its end.
        """
        log_sigmas = 0.5 * torch.log(1. - torch.exp(2. * log_alphas))
        lambs = log_alphas - log_sigmas
        # Search in the reversed array; NOTE(review): this presumes lambs is
        # decreasing, so the reversed array is sorted ascending -- confirm.
        idx = torch.searchsorted(torch.flip(lambs, [0]), clipped_lambda)
        if idx > 0:
            # Drop the last idx entries.
            log_alphas = log_alphas[:-idx]
        return log_alphas

    def marginal_log_mean_coeff(self, t):
        """
        Compute log(alpha_t) of a given continuous-time label t in [0, T].
        """
        if self.schedule == 'discrete':
            # Look t up in the stored (t_array, log_alpha_array) table via interpolate_fn
            # (defined elsewhere in this file); the result is flattened to 1-D.
            return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device),
                                  self.log_alpha_array.to(t.device)).reshape((-1))
        elif self.schedule == 'linear':
            # Closed form for the linear beta schedule.
            return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0

    def marginal_alpha(self, t):
        """
        Compute alpha_t of a given continuous-time label t in [0, T].
        """
        return torch.exp(self.marginal_log_mean_coeff(t))

    def marginal_std(self, t):
        """
        Compute sigma_t of a given continuous-time label t in [0, T].
        """
        # sigma_t = sqrt(1 - alpha_t^2)
        return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))

    def marginal_lambda(self, t):
        """
        Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
        """
        log_mean_coeff = self.marginal_log_mean_coeff(t)
        log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff))
        return log_mean_coeff - log_std

    def inverse_lambda(self, lamb):
        """
        Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
        """
        if self.schedule == 'linear':
            tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
            Delta = self.beta_0 ** 2 + tmp
            return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
        elif self.schedule == 'discrete':
            # log(alpha) = -0.5 * log(1 + exp(-2 * lambda))
            log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
            # Invert the table lookup: both arrays are flipped and log_alpha is used as the query.
            t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]),
                               torch.flip(self.t_array.to(lamb.device), [1]))
            return t.reshape((-1,))


def model_wrapper(
        model,
        noise_schedule,
        model_type="noise",
        model_kwargs={},
        guidance_type="uncond",
        condition=None,
        unconditional_condition=None,
        guidance_scale=1.,
        classifier_fn=None,
        classifier_kwargs={},
):
    """Create a wrapper function for the noise prediction model.

    DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to
    firstly wrap the model function to a noise prediction model that accepts the continuous time as the input.

    Every call into `model` additionally receives the two positional arguments `current` and `cache_dic`
    right after the time input. This wrapper treats them as opaque and forwards them unchanged.

    We support four types of the diffusion model by setting `model_type`:

        1. "noise": noise prediction model. (Trained by predicting noise).

        2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).

        3. "v": velocity prediction model. (Trained by predicting the velocity).
            The "v" prediction is derivation detailed in Appendix D of [1], and is used in Imagen-Video [2].

            [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
                arXiv preprint arXiv:2202.00512 (2022).
            [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
                arXiv preprint arXiv:2210.02303 (2022).

        4. "score": marginal score function. (Trained by denoising score matching).
            Note that the score function and the noise prediction model follows a simple relationship:
            ```
                noise(x_t, t) = -sigma_t * score(x_t, t)
            ```

    We support three types of guided sampling by DPMs by setting `guidance_type`:
        1. "uncond": unconditional sampling by DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, current, cache_dic, **model_kwargs) -> noise | x_start | v | score
            ``

        2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
            The input `model` has the following format:
            ``
                model(x, t_input, current, cache_dic, **model_kwargs) -> noise | x_start | v | score
            ``

            The input `classifier_fn` has the following format:
            ``
                classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
            ``

            [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
                in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.

        3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, current, cache_dic, cond, **model_kwargs) -> noise | x_start | v | score
            ``
            And if cond == `unconditional_condition`, the model output is the unconditional DPM output.
            When `guidance_scale == 1.` or `unconditional_condition` is None, only the conditional
            branch is evaluated (a single forward pass with `cond=condition`).

            [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
                arXiv preprint arXiv:2207.12598 (2022).


    The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
    or continuous-time labels (i.e. epsilon to T).

    We wrap the model function to accept `x`, `t_continuous`, `current` and `cache_dic` as inputs, and output
    the predicted noise:
    ``
        def model_fn(x, t_continuous, current, cache_dic) -> noise:
            t_input = get_model_input_time(t_continuous)
            return noise_pred(model, x, t_input, current, cache_dic, **model_kwargs)
    ``
    where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for DPM-Solver.

    ===============================================================

    Args:
        model: A diffusion model with the corresponding format described above.
        noise_schedule: A noise schedule object, such as NoiseScheduleVP.
        model_type: A `str`. The parameterization type of the diffusion model.
                    "noise" or "x_start" or "v" or "score".
        model_kwargs: A `dict`. A dict for the other inputs of the model function.
        guidance_type: A `str`. The type of the guidance for sampling.
                    "uncond" or "classifier" or "classifier-free".
        condition: A pytorch tensor. The condition for the guided sampling.
                    Only used for "classifier" or "classifier-free" guidance type.
        unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
                    Only used for "classifier-free" guidance type.
        guidance_scale: A `float`. The scale for the guided sampling.
        classifier_fn: A classifier function. Only used for the classifier guidance.
        classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function.
    Returns:
        A noise prediction function `model_fn(x, t_continuous, current, cache_dic)`.
    Raises:
        AssertionError: if `model_type` or `guidance_type` is not one of the supported values.
    """
    # NOTE: the dict defaults are only ever unpacked with `**`, never mutated, so sharing them is harmless.
    assert model_type in ["noise", "x_start", "v", "score"]
    assert guidance_type in ["uncond", "classifier", "classifier-free"]

    def get_model_input_time(t_continuous):
        """
        Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
        For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
        For continuous-time DPMs, we just use `t_continuous`.
        """
        if noise_schedule.schedule == 'discrete':
            return (t_continuous - 1. / noise_schedule.total_N) * 1000.
        else:
            return t_continuous

    def noise_pred_fn(x, t_continuous, current, cache_dic, cond=None):
        """
        Run the model once and convert its output to a noise prediction according to `model_type`.
        `cond`, when given, is passed positionally right after `cache_dic`.
        """
        t_input = get_model_input_time(t_continuous)
        if cond is None:
            output = model(x, t_input, current, cache_dic, **model_kwargs)
        else:
            output = model(x, t_input, current, cache_dic, cond, **model_kwargs)
        if model_type == "noise":
            return output
        elif model_type == "x_start":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return (x - expand_dims(alpha_t, x.dim()) * output) / expand_dims(sigma_t, x.dim())
        elif model_type == "v":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return expand_dims(alpha_t, x.dim()) * output + expand_dims(sigma_t, x.dim()) * x
        elif model_type == "score":
            sigma_t = noise_schedule.marginal_std(t_continuous)
            return -expand_dims(sigma_t, x.dim()) * output

    def cond_grad_fn(x, t_input):
        """
        Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
        """
        with torch.enable_grad():
            x_in = x.detach().requires_grad_(True)
            log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
            return torch.autograd.grad(log_prob.sum(), x_in)[0]

    def model_fn(x, t_continuous, current, cache_dic):
        """
        The noise prediction model function that is used for DPM-Solver.
        """
        if guidance_type == "uncond":
            # `current` / `cache_dic` are required by noise_pred_fn; omitting them raised TypeError.
            return noise_pred_fn(x, t_continuous, current, cache_dic)
        elif guidance_type == "classifier":
            assert classifier_fn is not None
            t_input = get_model_input_time(t_continuous)
            cond_grad = cond_grad_fn(x, t_input)
            sigma_t = noise_schedule.marginal_std(t_continuous)
            noise = noise_pred_fn(x, t_continuous, current, cache_dic)
            return noise - guidance_scale * expand_dims(sigma_t, x.dim()) * cond_grad
        elif guidance_type == "classifier-free":
            if guidance_scale == 1. or unconditional_condition is None:
                # No guidance mixing needed: a single conditional forward pass suffices.
                return noise_pred_fn(x, t_continuous, current, cache_dic, cond=condition)
            # Batch the unconditional and conditional passes together, then split the result.
            x_in = torch.cat([x] * 2)
            t_in = torch.cat([t_continuous] * 2)
            c_in = torch.cat([unconditional_condition, condition])
            noise_uncond, noise = noise_pred_fn(x_in, t_in, current, cache_dic, cond=c_in).chunk(2)
            return noise_uncond + guidance_scale * (noise - noise_uncond)

    return model_fn


class DPM_Solver:
    def __init__(
            self,
            model_fn,
            noise_schedule,
            algorithm_type="dpmsolver++",
            correcting_x0_fn=None,
            correcting_xt_fn=None,
            thresholding_max_val=1.,
            dynamic_thresholding_ratio=0.995,
    ):
        """Construct a DPM-Solver.

        We support both DPM-Solver (`algorithm_type="dpmsolver"`) and DPM-Solver++ (`algorithm_type="dpmsolver++"`).

        We also support the "dynamic thresholding" method in Imagen[1]. For pixel-space diffusion models, you
        can set both `algorithm_type="dpmsolver++"` and `correcting_x0_fn="dynamic_thresholding"` to use the
        dynamic thresholding. The "dynamic thresholding" can greatly improve the sample quality for pixel-space
        DPMs with large guidance scales. Note that the thresholding method is **unsuitable** for latent-space
        DPMs (such as stable-diffusion).

        To support advanced algorithms in image-to-image applications, we also support corrector functions for
        both x0 and xt.

        Args:
            model_fn: A noise prediction model function which accepts the continuous-time input (t in [epsilon, T]):
                ``
                def model_fn(x, t_continuous):
                    return noise
                ``
                The shape of `x` is `(batch_size, **shape)`, and the shape of `t_continuous` is `(batch_size,)`.
            noise_schedule: A noise schedule object, such as NoiseScheduleVP.
            algorithm_type: A `str`. Either "dpmsolver" or "dpmsolver++".
            correcting_x0_fn: A `str` or a function with the following format:
                ```
                def correcting_x0_fn(x0, t):
                    x0_new = ...
                    return x0_new
                ```
                This function is to correct the outputs of the data prediction model at each sampling step. e.g.,
                ```
                x0_pred = data_pred_model(xt, t)
                if correcting_x0_fn is not None:
                    x0_pred = correcting_x0_fn(x0_pred, t)
                xt_1 = update(x0_pred, xt, t)
                ```
                If `correcting_x0_fn="dynamic_thresholding"`, we use the dynamic thresholding proposed in Imagen[1].
            correcting_xt_fn: A function with the following format:
                ```
                def correcting_xt_fn(xt, t, step):
                    x_new = ...
                    return x_new
                ```
                This function is to correct the intermediate samples xt at each sampling step. e.g.,
                ```
                xt = ...
                xt = correcting_xt_fn(xt, t, step)
                ```
            thresholding_max_val: A `float`. The max value for thresholding.
                Valid only when use `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`.
            dynamic_thresholding_ratio: A `float`. The ratio for dynamic thresholding (see Imagen[1] for details).
                Valid only when use `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`.

        [1] Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour,
            Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models
            with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b.
        """
        self.model = lambda x, t, current, cache_dic: model_fn(x, t.expand((x.shape[0])), current, cache_dic)
        self.noise_schedule = noise_schedule
        assert algorithm_type in ["dpmsolver", "dpmsolver++"]
        self.algorithm_type = algorithm_type
        if correcting_x0_fn == "dynamic_thresholding":
            self.correcting_x0_fn = self.dynamic_thresholding_fn
        else:
            self.correcting_x0_fn = correcting_x0_fn
        self.correcting_xt_fn = correcting_xt_fn
        self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
        self.thresholding_max_val = thresholding_max_val

    def dynamic_thresholding_fn(self, x0, t):
        """
        Apply the dynamic thresholding corrector to a predicted `x0`.

        For each sample, the `dynamic_thresholding_ratio` quantile of |x0| (taken over all
        non-batch entries) is floored at `thresholding_max_val`; `x0` is then clamped to
        [-s, s] and divided by s. `t` is part of the corrector signature but is not used here.
        """
        n_dims = x0.dim()
        flat_abs = torch.abs(x0).reshape((x0.shape[0], -1))
        per_sample_quantile = torch.quantile(flat_abs, self.dynamic_thresholding_ratio, dim=1)
        floor = self.thresholding_max_val * torch.ones_like(per_sample_quantile).to(per_sample_quantile.device)
        scale = expand_dims(torch.maximum(per_sample_quantile, floor), n_dims)
        return torch.clamp(x0, -scale, scale) / scale

    def noise_prediction_fn(self, x, t, current, cache_dic):
        """
        Return the noise prediction model.
        """
        return self.model(x, t, current, cache_dic)

    def data_prediction_fn(self, x, t, current, cache_dic):
        """
        Return the data prediction model (with corrector).
        """
        noise = self.noise_prediction_fn(x, t, current, cache_dic)
        alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
        x0 = (x - sigma_t * noise) / alpha_t
        if self.correcting_x0_fn is not None:
            x0 = self.correcting_x0_fn(x0, t)
        return x0

    def model_fn(self, x, t, current, cache_dic):
        """
        Convert the model to the noise prediction model or the data prediction model.
        """
        if self.algorithm_type == "dpmsolver++":
            return self.data_prediction_fn(x, t, current, cache_dic)
        else:
            return self.noise_prediction_fn(x, t, current, cache_dic)

    def get_time_steps(self, skip_type, t_T, t_0, N, device):
        """Compute the intermediate time steps for sampling.

        Args:
            skip_type: A `str`. The type for the spacing of the time steps. We support three types:
                - 'logSNR': uniform logSNR for the time steps.
                - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.)
                - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
            t_T: A `float`. The starting time of the sampling (default is T).
            t_0: A `float`. The ending time of the sampling (default is epsilon).
            N: A `int`. The total number of the spacing of the time steps.
            device: A torch device.
        Returns:
            A pytorch tensor of the time steps, with the shape (N + 1,).
        """
        if skip_type == 'logSNR':
            lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
            lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
            logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
            return self.noise_schedule.inverse_lambda(logSNR_steps)
        elif skip_type == 'time_uniform':
            return torch.linspace(t_T, t_0, N + 1).to(device)
        elif skip_type == 'time_quadratic':
            t_order = 2
            return (
                torch.linspace(
                    t_T ** (1.0 / t_order), t_0 ** (1.0 / t_order), N + 1
                )
                .pow(t_order)
                .to(device)
            )
        else:
            raise ValueError(
                f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'"
            )

    def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
        """
        Get the order of each step for sampling by the singlestep DPM-Solver.

        We combine both DPM-Solver-1,2,3 to use all the function evaluations, which is named as "DPM-Solver-fast".
        Given a fixed number of function evaluations by `steps`, the sampling procedure by DPM-Solver-fast is:
            - If order == 1:
                We take `steps` of DPM-Solver-1 (i.e. DDIM).
            - If order == 2:
                - Denote K = (steps // 2). We take K or (K + 1) intermediate time steps for sampling.
                - If steps % 2 == 0, we use K steps of DPM-Solver-2.
                - If steps % 2 == 1, we use K steps of DPM-Solver-2 and 1 step of DPM-Solver-1.
            - If order == 3:
                - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
                - If steps % 3 == 0, we use (K - 2) steps of DPM-Solver-3, and 1 step of DPM-Solver-2 and 1 step of DPM-Solver-1.
                - If steps % 3 == 1, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-1.
                - If steps % 3 == 2, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-2.

        ============================================
        Args:
            order: A `int`. The max order for the solver (2 or 3).
            steps: A `int`. The total number of function evaluations (NFE).
            skip_type: A `str`. The type for the spacing of the time steps. We support three types:
                - 'logSNR': uniform logSNR for the time steps.
                - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.)
                - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
            t_T: A `float`. The starting time of the sampling (default is T).
            t_0: A `float`. The ending time of the sampling (default is epsilon).
            device: A torch device.
        Returns:
            orders: A list of the solver order of each step.
        """
        if order == 3:
            K = steps // 3 + 1
            if steps % 3 == 0:
                orders = [3, ] * (K - 2) + [2, 1]
            elif steps % 3 == 1:
                orders = [3, ] * (K - 1) + [1]
            else:
                orders = [3, ] * (K - 1) + [2]
        elif order == 2:
            if steps % 2 == 0:
                K = steps // 2
                orders = [2, ] * K
            else:
                K = steps // 2 + 1
                orders = [2, ] * (K - 1) + [1]
        elif order == 1:
            K = 1
            orders = [1, ] * steps
        else:
            raise ValueError("'order' must be '1' or '2' or '3'.")
        if skip_type == 'logSNR':
            # To reproduce the results in DPM-Solver paper
            timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
        else:
            timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[
                torch.cumsum(torch.tensor([0, ] + orders), 0).to(device)]
        return timesteps_outer, orders

    def denoise_to_zero_fn(self, x, s):
        """
        Denoise at the final step, which is equivalent to solve the ODE from lambda_s to infty by first-order discretization.
        """
        return self.data_prediction_fn(x, s)

    def dpm_solver_first_update(self, x, s, t, current, cache_dic, model_s=None, return_intermediate=False):
        """
        DPM-Solver-1 (equivalent to DDIM) from time `s` to time `t`.

        Args:
            x: A pytorch tensor. The initial value at time `s`.
            s: A pytorch tensor. The starting time, with the shape (1,).
            t: A pytorch tensor. The ending time, with the shape (1,).
            model_s: A pytorch tensor. The model function evaluated at time `s`.
                If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
            return_intermediate: A `bool`. If true, also return the model value at time `s`.
        Returns:
            x_t: A pytorch tensor. The approximated solution at time `t`.
        """
        ns = self.noise_schedule
        dims = x.dim()
        lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
        h = lambda_t - lambda_s
        log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t)
        sigma_s, sigma_t = ns.marginal_std(s), ns.marginal_std(t)
        alpha_t = torch.exp(log_alpha_t)

        if self.algorithm_type == "dpmsolver++":
            phi_1 = torch.expm1(-h)
            if model_s is None:
                model_s = self.model_fn(x, s, current, cache_dic)
            x_t = (
                    sigma_t / sigma_s * x
                    - alpha_t * phi_1 * model_s
            )
        else:
            phi_1 = torch.expm1(h)
            if model_s is None:
                model_s = self.model_fn(x, s, current, cache_dic)
            x_t = (
                    torch.exp(log_alpha_t - log_alpha_s) * x
                    - (sigma_t * phi_1) * model_s
            )
        return (x_t, {'model_s': model_s}) if return_intermediate else x_t

    def singlestep_dpm_solver_second_update(self, x, s, t, current, cache_dic, r1=0.5, model_s=None, return_intermediate=False,
                                            solver_type='dpmsolver'):
        """
        Singlestep solver DPM-Solver-2 from time `s` to time `t`.

        Args:
            x: A pytorch tensor. The initial value at time `s`.
            s: A pytorch tensor. The starting time, with the shape (1,).
            t: A pytorch tensor. The ending time, with the shape (1,).
            r1: A `float`. The hyperparameter of the second-order solver.
            model_s: A pytorch tensor. The model function evaluated at time `s`.
                If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
            return_intermediate: A `bool`. If true, also return the model value at time `s` and `s1` (the intermediate time).
            solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
                The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
        Returns:
            x_t: A pytorch tensor. The approximated solution at time `t`.
        """
        if solver_type not in ['dpmsolver', 'taylor']:
            raise ValueError(
                f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}"
            )
        if r1 is None:
            r1 = 0.5
        ns = self.noise_schedule
        lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
        h = lambda_t - lambda_s
        lambda_s1 = lambda_s + r1 * h
        s1 = ns.inverse_lambda(lambda_s1)
        log_alpha_s, log_alpha_s1, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(
            s1), ns.marginal_log_mean_coeff(t)
        sigma_s, sigma_s1, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(t)
        alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t)

        if self.algorithm_type == "dpmsolver++":
            phi_11 = torch.expm1(-r1 * h)
            phi_1 = torch.expm1(-h)

            if model_s is None:
                model_s = self.model_fn(x, s, current, cache_dic)
            x_s1 = (
                    (sigma_s1 / sigma_s) * x
                    - (alpha_s1 * phi_11) * model_s
            )
            model_s1 = self.model_fn(x_s1, s1, current, cache_dic)
            if solver_type == 'dpmsolver':
                x_t = (
                        (sigma_t / sigma_s) * x
                        - (alpha_t * phi_1) * model_s
                        - (0.5 / r1) * (alpha_t * phi_1) * (model_s1 - model_s)
                )
            elif solver_type == 'taylor':
                x_t = (
                        (sigma_t / sigma_s) * x
                        - (alpha_t * phi_1) * model_s
                        + (1. / r1) * (alpha_t * (phi_1 / h + 1.)) * (model_s1 - model_s)
                )
        else:
            phi_11 = torch.expm1(r1 * h)
            phi_1 = torch.expm1(h)

            if model_s is None:
                model_s = self.model_fn(x, s, current, cache_dic)
            x_s1 = (
                    torch.exp(log_alpha_s1 - log_alpha_s) * x
                    - (sigma_s1 * phi_11) * model_s
            )
            model_s1 = self.model_fn(x_s1, s1, current, cache_dic)
            if solver_type == 'dpmsolver':
                x_t = (
                        torch.exp(log_alpha_t - log_alpha_s) * x
                        - (sigma_t * phi_1) * model_s
                        - (0.5 / r1) * (sigma_t * phi_1) * (model_s1 - model_s)
                )
            elif solver_type == 'taylor':
                x_t = (
                        torch.exp(log_alpha_t - log_alpha_s) * x
                        - (sigma_t * phi_1) * model_s
                        - (1. / r1) * (sigma_t * (phi_1 / h - 1.)) * (model_s1 - model_s)
                )
        if return_intermediate:
            return x_t, {'model_s': model_s, 'model_s1': model_s1}
        else:
            return x_t

    def singlestep_dpm_solver_third_update(self, x, s, t, current, cache_dic, r1=1. / 3., r2=2. / 3., model_s=None, model_s1=None,
                                           return_intermediate=False, solver_type='dpmsolver'):
        """
        Singlestep solver DPM-Solver-3 from time `s` to time `t`.

        The model is evaluated at `s`, at the intermediate time `s1` (fraction `r1` of the step in
        half-logSNR) and at `s2` (fraction `r2`). Evaluations at `s` and `s1` are skipped when
        `model_s` / `model_s1` are supplied; the evaluation at `s2` always happens.

        Args:
            x: A pytorch tensor. The initial value at time `s`.
            s: A pytorch tensor. The starting time, with the shape (1,).
            t: A pytorch tensor. The ending time, with the shape (1,).
            current: Forwarded unchanged to every `self.model_fn` call.
            cache_dic: Forwarded unchanged to every `self.model_fn` call.
            r1: A `float`. The hyperparameter of the third-order solver. None means 1/3.
            r2: A `float`. The hyperparameter of the third-order solver. None means 2/3.
            model_s: A pytorch tensor. The model function evaluated at time `s`.
                If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
            model_s1: A pytorch tensor. The model function evaluated at time `s1` (the intermediate time given by `r1`).
                If `model_s1` is None, we evaluate the model at `s1`; otherwise we directly use it.
            return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
            solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
                The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
        Returns:
            x_t: A pytorch tensor. The approximated solution at time `t`. With `return_intermediate`,
                a tuple `(x_t, {'model_s': ..., 'model_s1': ..., 'model_s2': ...})` instead.
        Raises:
            ValueError: if `solver_type` is neither 'dpmsolver' nor 'taylor'.
        """
        if solver_type not in ['dpmsolver', 'taylor']:
            raise ValueError(
                f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}"
            )
        # Explicit None selects the default intermediate positions.
        if r1 is None:
            r1 = 1. / 3.
        if r2 is None:
            r2 = 2. / 3.
        ns = self.noise_schedule
        lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
        # Step size in half-logSNR; the intermediate points sit at fractions r1 and r2 of it.
        h = lambda_t - lambda_s
        lambda_s1 = lambda_s + r1 * h
        lambda_s2 = lambda_s + r2 * h
        # Map the intermediate half-logSNR values back to time labels.
        s1 = ns.inverse_lambda(lambda_s1)
        s2 = ns.inverse_lambda(lambda_s2)
        log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ns.marginal_log_mean_coeff(
            s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t)
        sigma_s, sigma_s1, sigma_s2, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(
            s2), ns.marginal_std(t)
        alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t)

        if self.algorithm_type == "dpmsolver++":
            # Data-prediction form: coefficients are built from expm1 of the negated step.
            phi_11 = torch.expm1(-r1 * h)
            phi_12 = torch.expm1(-r2 * h)
            phi_1 = torch.expm1(-h)
            phi_22 = torch.expm1(-r2 * h) / (r2 * h) + 1.
            phi_2 = phi_1 / h + 1.
            phi_3 = phi_2 / h - 0.5

            if model_s is None:
                model_s = self.model_fn(x, s, current, cache_dic)
            if model_s1 is None:
                # First-order estimate at s1, used only to evaluate the model there.
                x_s1 = (
                        (sigma_s1 / sigma_s) * x
                        - (alpha_s1 * phi_11) * model_s
                )
                model_s1 = self.model_fn(x_s1, s1, current, cache_dic)
            # Estimate at s2, corrected with the difference between the s1 and s evaluations.
            x_s2 = (
                    (sigma_s2 / sigma_s) * x
                    - (alpha_s2 * phi_12) * model_s
                    + r2 / r1 * (alpha_s2 * phi_22) * (model_s1 - model_s)
            )
            model_s2 = self.model_fn(x_s2, s2, current, cache_dic)
            if solver_type == 'dpmsolver':
                # This variant combines only the s and s2 evaluations in the final update.
                x_t = (
                        (sigma_t / sigma_s) * x
                        - (alpha_t * phi_1) * model_s
                        + (1. / r2) * (alpha_t * phi_2) * (model_s2 - model_s)
                )
            elif solver_type == 'taylor':
                # Scaled differences at s1 and s2, combined into D1 and D2 for the final update.
                D1_0 = (1. / r1) * (model_s1 - model_s)
                D1_1 = (1. / r2) * (model_s2 - model_s)
                D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
                D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
                x_t = (
                        (sigma_t / sigma_s) * x
                        - (alpha_t * phi_1) * model_s
                        + (alpha_t * phi_2) * D1
                        - (alpha_t * phi_3) * D2
                )
        else:
            # Noise-prediction form: same structure, with expm1 of the (non-negated) step.
            phi_11 = torch.expm1(r1 * h)
            phi_12 = torch.expm1(r2 * h)
            phi_1 = torch.expm1(h)
            phi_22 = torch.expm1(r2 * h) / (r2 * h) - 1.
            phi_2 = phi_1 / h - 1.
            phi_3 = phi_2 / h - 0.5

            if model_s is None:
                model_s = self.model_fn(x, s, current, cache_dic)
            if model_s1 is None:
                # First-order estimate at s1, used only to evaluate the model there.
                x_s1 = (
                        (torch.exp(log_alpha_s1 - log_alpha_s)) * x
                        - (sigma_s1 * phi_11) * model_s
                )
                model_s1 = self.model_fn(x_s1, s1, current, cache_dic)
            # Estimate at s2, corrected with the difference between the s1 and s evaluations.
            x_s2 = (
                    (torch.exp(log_alpha_s2 - log_alpha_s)) * x
                    - (sigma_s2 * phi_12) * model_s
                    - r2 / r1 * (sigma_s2 * phi_22) * (model_s1 - model_s)
            )
            model_s2 = self.model_fn(x_s2, s2, current, cache_dic)
            if solver_type == 'dpmsolver':
                # This variant combines only the s and s2 evaluations in the final update.
                x_t = (
                        (torch.exp(log_alpha_t - log_alpha_s)) * x
                        - (sigma_t * phi_1) * model_s
                        - (1. / r2) * (sigma_t * phi_2) * (model_s2 - model_s)
                )
            elif solver_type == 'taylor':
                # Scaled differences at s1 and s2, combined into D1 and D2 for the final update.
                D1_0 = (1. / r1) * (model_s1 - model_s)
                D1_1 = (1. / r2) * (model_s2 - model_s)
                D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
                D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
                x_t = (
                        (torch.exp(log_alpha_t - log_alpha_s)) * x
                        - (sigma_t * phi_1) * model_s
                        - (sigma_t * phi_2) * D1
                        - (sigma_t * phi_3) * D2
                )

        if return_intermediate:
            return x_t, {'model_s': model_s, 'model_s1': model_s1, 'model_s2': model_s2}
        else:
            return x_t

    def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpmsolver"):
        """
        Second-order multistep update from time `t_prev_list[-1]` to time `t`.

        Only the last two entries of `model_prev_list` and `t_prev_list` are read;
        the model itself is not evaluated here.

        Args:
            x: A pytorch tensor. The current value at time `t_prev_list[-1]`.
            model_prev_list: A list of pytorch tensor. The previously computed model values
                (oldest first).
            t_prev_list: A list of pytorch tensor. The times matching `model_prev_list`.
            t: A pytorch tensor. The ending time.
            solver_type: either 'dpmsolver' or 'taylor'. Selects how the first-difference
                correction term is weighted.
        Returns:
            x_t: A pytorch tensor. The approximated solution at time `t`.
        Raises:
            ValueError: if `solver_type` is neither 'dpmsolver' nor 'taylor'.
        """
        if solver_type not in ('dpmsolver', 'taylor'):
            raise ValueError(
                f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}"
            )
        schedule = self.noise_schedule
        older_model, newest_model = model_prev_list[-2], model_prev_list[-1]
        older_t, newest_t = t_prev_list[-2], t_prev_list[-1]

        lam_older = schedule.marginal_lambda(older_t)
        lam_newest = schedule.marginal_lambda(newest_t)
        lam_target = schedule.marginal_lambda(t)
        log_alpha_newest = schedule.marginal_log_mean_coeff(newest_t)
        log_alpha_target = schedule.marginal_log_mean_coeff(t)
        sigma_newest = schedule.marginal_std(newest_t)
        sigma_target = schedule.marginal_std(t)
        alpha_target = torch.exp(log_alpha_target)

        # Step sizes are measured in lambda (what `marginal_lambda` returns).
        step = lam_target - lam_newest
        ratio = (lam_newest - lam_older) / step
        # Finite difference of the two stored model outputs, rescaled to the new step.
        first_diff = (1. / ratio) * (newest_model - older_model)

        data_prediction = self.algorithm_type == "dpmsolver++"
        if data_prediction:
            phi_1 = torch.expm1(-step)
            carried = (sigma_target / sigma_newest) * x
            scale = alpha_target
        else:
            phi_1 = torch.expm1(step)
            carried = (torch.exp(log_alpha_target - log_alpha_newest)) * x
            scale = sigma_target

        # First-order part, shared by both solver types.
        x_t = carried - (scale * phi_1) * newest_model
        if solver_type == 'dpmsolver':
            return x_t - 0.5 * (scale * phi_1) * first_diff
        # 'taylor': the sign and offset of the correction depend on the parameterisation.
        if data_prediction:
            return x_t + (scale * (phi_1 / step + 1.)) * first_diff
        return x_t - (scale * (phi_1 / step - 1.)) * first_diff

    def multistep_dpm_solver_third_update(self, x, model_prev_list, t_prev_list, t, solver_type='dpmsolver'):
        """
        Third-order multistep update from time `t_prev_list[-1]` to time `t`.

        `model_prev_list` and `t_prev_list` must each hold exactly three entries
        (oldest first); the model itself is not evaluated here.

        Args:
            x: A pytorch tensor. The current value at time `t_prev_list[-1]`.
            model_prev_list: A list of three pytorch tensors. The previously computed model values.
            t_prev_list: A list of three pytorch tensors. The times matching `model_prev_list`.
            t: A pytorch tensor. The ending time.
            solver_type: accepted for signature parity with the other updates; this
                method does not read it.
        Returns:
            x_t: A pytorch tensor. The approximated solution at time `t`.
        """
        schedule = self.noise_schedule
        oldest_model, middle_model, newest_model = model_prev_list
        oldest_t, middle_t, newest_t = t_prev_list

        lam_oldest = schedule.marginal_lambda(oldest_t)
        lam_middle = schedule.marginal_lambda(middle_t)
        lam_newest = schedule.marginal_lambda(newest_t)
        lam_target = schedule.marginal_lambda(t)
        log_alpha_newest = schedule.marginal_log_mean_coeff(newest_t)
        log_alpha_target = schedule.marginal_log_mean_coeff(t)
        sigma_newest = schedule.marginal_std(newest_t)
        sigma_target = schedule.marginal_std(t)
        alpha_target = torch.exp(log_alpha_target)

        # Step sizes in lambda, expressed relative to the step being taken now.
        step = lam_target - lam_newest
        ratio_0 = (lam_newest - lam_middle) / step
        ratio_1 = (lam_middle - lam_oldest) / step

        # Divided differences of the stored model outputs.
        diff_0 = (1. / ratio_0) * (newest_model - middle_model)
        diff_1 = (1. / ratio_1) * (middle_model - oldest_model)
        ratio_sum = ratio_0 + ratio_1
        diff_gap = diff_0 - diff_1
        first_deriv = diff_0 + (ratio_0 / ratio_sum) * diff_gap
        second_deriv = (1. / ratio_sum) * diff_gap

        if self.algorithm_type != "dpmsolver++":
            phi_1 = torch.expm1(step)
            phi_2 = phi_1 / step - 1.
            phi_3 = phi_2 / step - 0.5
            x_t = (torch.exp(log_alpha_target - log_alpha_newest)) * x
            x_t = x_t - (sigma_target * phi_1) * newest_model
            x_t = x_t - (sigma_target * phi_2) * first_deriv
            return x_t - (sigma_target * phi_3) * second_deriv

        phi_1 = torch.expm1(-step)
        phi_2 = phi_1 / step + 1.
        phi_3 = phi_2 / step - 0.5
        x_t = (sigma_target / sigma_newest) * x
        x_t = x_t - (alpha_target * phi_1) * newest_model
        x_t = x_t + (alpha_target * phi_2) * first_deriv
        return x_t - (alpha_target * phi_3) * second_deriv

    def singlestep_dpm_solver_update(self, x, s, t, current, cache_dic, order, return_intermediate=False, solver_type='dpmsolver', r1=None,
                                     r2=None):
        """
        Dispatch one singlestep DPM-Solver update of the given `order` from time `s` to time `t`.

        Args:
            x: A pytorch tensor. The initial value at time `s`.
            s: A pytorch tensor. The starting time, with the shape (1,).
            t: A pytorch tensor. The ending time, with the shape (1,).
            current: Passed through unchanged to the selected update routine.
            cache_dic: Passed through unchanged to the selected update routine.
            order: A `int`. 1, 2 or 3; selects the first-, second- or third-order routine.
            return_intermediate: A `bool`. Forwarded to the selected routine.
            solver_type: either 'dpmsolver' or 'taylor'. Forwarded to the order-2 and order-3 routines.
            r1: A `float`. Hyperparameter forwarded to the order-2 and order-3 routines.
            r2: A `float`. Hyperparameter forwarded to the order-3 routine.
        Returns:
            Whatever the selected update routine returns.
        Raises:
            ValueError: if `order` is not 1, 2 or 3.
        """
        shared = {'return_intermediate': return_intermediate}
        if order == 1:
            return self.dpm_solver_first_update(x, s, t, current, cache_dic, **shared)
        if order == 2:
            return self.singlestep_dpm_solver_second_update(
                x, s, t, current, cache_dic, solver_type=solver_type, r1=r1, **shared
            )
        if order == 3:
            return self.singlestep_dpm_solver_third_update(
                x, s, t, current, cache_dic, solver_type=solver_type, r1=r1, r2=r2, **shared
            )
        raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}")

    def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, current, cache_dic, order, solver_type='dpmsolver'):
        """
        Dispatch one multistep DPM-Solver update of the given `order` from time `t_prev_list[-1]` to time `t`.

        Args:
            x: A pytorch tensor. The current value at time `t_prev_list[-1]`.
            model_prev_list: A list of pytorch tensor. The previously computed model values.
            t_prev_list: A list of pytorch tensor. The times matching `model_prev_list`.
            t: A pytorch tensor. The ending time, with the shape (1,).
            current: Forwarded to the first-order routine only (order == 1).
            cache_dic: Forwarded to the first-order routine only (order == 1).
            order: A `int`. 1, 2 or 3; selects the first-, second- or third-order routine.
            solver_type: either 'dpmsolver' or 'taylor'. Forwarded to the order-2 and order-3 routines.
        Returns:
            x_t: The value returned by the selected update routine.
        Raises:
            ValueError: if `order` is not 1, 2 or 3.
        """
        if order == 1:
            # The first-order step reuses the most recent stored model output
            # (passed as `model_s`) together with the most recent time.
            latest_time = t_prev_list[-1]
            latest_model = model_prev_list[-1]
            return self.dpm_solver_first_update(x, latest_time, t, current, cache_dic, model_s=latest_model)
        if order == 2:
            higher_order_update = self.multistep_dpm_solver_second_update
        elif order == 3:
            higher_order_update = self.multistep_dpm_solver_third_update
        else:
            raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}")
        return higher_order_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)

    def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5,
                            solver_type='dpmsolver', current=None, cache_dic=None):
        """
        The adaptive step size solver based on singlestep DPM-Solver.

        Each iteration takes one lower-order and one higher-order step over the same
        interval, uses their difference as an error estimate, accepts the higher-order
        result when the scaled error is at most 1, and then rescales the step size.

        Args:
            x: A pytorch tensor. The initial value at time `t_T`.
            order: A `int`. The (higher) order of the solver. We only support order == 2 or 3.
            t_T: A `float`. The starting time of the sampling (default is T).
            t_0: A `float`. The ending time of the sampling (default is epsilon).
            h_init: A `float`. The initial step size (for logSNR).
            atol: A `float`. The absolute tolerance of the solver. For image data, the default setting is 0.0078, followed [1].
            rtol: A `float`. The relative tolerance of the solver. The default setting is 0.05.
            theta: A `float`. The safety hyperparameter for adapting the step size. The default setting is 0.9, followed [1].
            t_err: A `float`. The tolerance for the time. We solve the diffusion ODE until the absolute error between the
                current time and `t_0` is less than `t_err`. The default setting is 1e-5.
            solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
                The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
            current: Forwarded positionally to the singlestep update routines, which take
                `(x, s, t, current, cache_dic, ...)`. Defaults to None.
                NOTE(review): presumably the per-step cache state consumed by `model_fn`;
                confirm whether None is acceptable there.
            cache_dic: Forwarded together with `current`. Defaults to None.
        Returns:
            x_0: A pytorch tensor. The approximated solution at time `t_0`.
        Raises:
            ValueError: if `order` is not 2 or 3.

        [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021.
        """
        ns = self.noise_schedule
        s = t_T * torch.ones((1,)).to(x)
        lambda_s = ns.marginal_lambda(s)
        lambda_0 = ns.marginal_lambda(t_0 * torch.ones_like(s).to(x))
        h = h_init * torch.ones_like(s).to(x)
        x_prev = x
        nfe = 0

        # The update routines require `current` and `cache_dic` as positional
        # arguments right after `t`; omitting them (as the previous lambdas did)
        # makes every adaptive step fail with a TypeError.
        if order == 2:
            r1 = 0.5

            def lower_update(x_in, s_in, t_in):
                return self.dpm_solver_first_update(x_in, s_in, t_in, current, cache_dic,
                                                    return_intermediate=True)

            def higher_update(x_in, s_in, t_in, **kwargs):
                return self.singlestep_dpm_solver_second_update(x_in, s_in, t_in, current, cache_dic,
                                                                r1=r1, solver_type=solver_type, **kwargs)
        elif order == 3:
            r1, r2 = 1. / 3., 2. / 3.

            def lower_update(x_in, s_in, t_in):
                return self.singlestep_dpm_solver_second_update(x_in, s_in, t_in, current, cache_dic,
                                                                r1=r1, return_intermediate=True,
                                                                solver_type=solver_type)

            def higher_update(x_in, s_in, t_in, **kwargs):
                return self.singlestep_dpm_solver_third_update(x_in, s_in, t_in, current, cache_dic,
                                                               r1=r1, r2=r2, solver_type=solver_type,
                                                               **kwargs)
        else:
            raise ValueError(
                f"For adaptive step size solver, order must be 2 or 3, got {order}"
            )

        def norm_fn(v):
            # Root-mean-square over everything but the leading dimension.
            return torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))

        while torch.abs((s - t_0)).mean() > t_err:
            t = ns.inverse_lambda(lambda_s + h)
            # The lower-order step also returns the model outputs it computed, which
            # are handed to the higher-order step as keyword arguments.
            x_lower, lower_noise_kwargs = lower_update(x, s, t)
            x_higher = higher_update(x, s, t, **lower_noise_kwargs)
            delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
            E = norm_fn((x_higher - x_lower) / delta).max()
            if torch.all(E <= 1.):
                # Accept the step and move the starting point forward.
                x = x_higher
                s = t
                x_prev = x_lower
                lambda_s = ns.marginal_lambda(s)
            # Rescale the step by the error estimate, never stepping past lambda_0.
            h = torch.min(theta * h * torch.float_power(E, -1. / order).float(), lambda_0 - lambda_s)
            nfe += order
        print('adaptive solver nfe', nfe)
        return x

    def add_noise(self, x, t, noise=None):
        """
        Noise `x` to each time in `t`: xt = alpha_t * x + sigma_t * noise.

        Args:
            x: A `torch.Tensor` with shape `(batch_size, *shape)`.
            t: A `torch.Tensor` with shape `(t_size,)`.
            noise: Optional tensor used as the noise term. When None, standard normal
                noise of shape `(t_size, batch_size, *shape)` is drawn on `x.device`.
        Returns:
            xt with shape `(t_size, batch_size, *shape)`; the leading dimension is
            squeezed away when `t_size == 1`.
        """
        schedule = self.noise_schedule
        alpha_t = schedule.marginal_alpha(t)
        sigma_t = schedule.marginal_std(t)
        num_times = t.shape[0]
        if noise is None:
            noise = torch.randn((num_times, *x.shape), device=x.device)
        # Prepend a singleton time axis so the per-time coefficients broadcast against `x`.
        x = x.reshape((-1, *x.shape))
        rank = x.dim()
        xt = expand_dims(alpha_t, rank) * x + expand_dims(sigma_t, rank) * noise
        if num_times == 1:
            return xt.squeeze(0)
        return xt

    def inverse(self, x, steps=20, t_start=None, t_end=None, order=2, skip_type='time_uniform',
                method='multistep', lower_order_final=True, denoise_to_zero=False, solver_type='dpmsolver',
                atol=0.0078, rtol=0.05, return_intermediate=False,
                ):
        """
        Invert the sample `x` from time `t_start` to `t_end` by DPM-Solver.

        This is a thin wrapper around `sample` with the default time bounds swapped:
        `t_start` defaults to `1 / self.noise_schedule.total_N` and `t_end` defaults to
        `self.noise_schedule.T`. All other arguments are forwarded to `sample` unchanged.
        For discrete-time DPMs, we use `t_start=1/N`, where `N` is the total time steps during training.
        """
        t_0 = t_start if t_start is not None else 1. / self.noise_schedule.total_N
        t_T = t_end if t_end is not None else self.noise_schedule.T
        assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
        forwarded = dict(
            steps=steps,
            t_start=t_0,
            t_end=t_T,
            order=order,
            skip_type=skip_type,
            method=method,
            lower_order_final=lower_order_final,
            denoise_to_zero=denoise_to_zero,
            solver_type=solver_type,
            atol=atol,
            rtol=rtol,
            return_intermediate=return_intermediate,
        )
        return self.sample(x, **forwarded)

    def sample(self, x, steps=20, t_start=None, t_end=None, order=2, skip_type='time_uniform',
               method='multistep', lower_order_final=True, denoise_to_zero=False, solver_type='dpmsolver',
               atol=0.0078, rtol=0.05, return_intermediate=False, model_kwargs=None, rank=None,
               ):
        """
        Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`.

        =====================================================

        We support the following algorithms for both noise prediction model and data prediction model:
            - 'singlestep':
                Singlestep DPM-Solver (i.e. "DPM-Solver-fast" in the paper), which combines different orders of singlestep DPM-Solver.
                We combine all the singlestep solvers with order <= `order` to use up all the function evaluations (steps).
                The total number of function evaluations (NFE) == `steps`.
                Given a fixed NFE == `steps`, the sampling procedure is:
                    - If `order` == 1:
                        - Denote K = steps. We use K steps of DPM-Solver-1 (i.e. DDIM).
                    - If `order` == 2:
                        - Denote K = (steps // 2) + (steps % 2). We take K intermediate time steps for sampling.
                        - If steps % 2 == 0, we use K steps of singlestep DPM-Solver-2.
                        - If steps % 2 == 1, we use (K - 1) steps of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
                    - If `order` == 3:
                        - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
                        - If steps % 3 == 0, we use (K - 2) steps of singlestep DPM-Solver-3, and 1 step of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
                        - If steps % 3 == 1, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of DPM-Solver-1.
                        - If steps % 3 == 2, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of singlestep DPM-Solver-2.
            - 'multistep':
                Multistep DPM-Solver with the order of `order`. The total number of function evaluations (NFE) == `steps`.
                We initialize the first `order` values by lower order multistep solvers.
                Given a fixed NFE == `steps`, the sampling procedure is:
                    Denote K = steps.
                    - If `order` == 1:
                        - We use K steps of DPM-Solver-1 (i.e. DDIM).
                    - If `order` == 2:
                        - We firstly use 1 step of DPM-Solver-1, then use (K - 1) step of multistep DPM-Solver-2.
                    - If `order` == 3:
                        - We firstly use 1 step of DPM-Solver-1, then 1 step of multistep DPM-Solver-2, then (K - 2) step of multistep DPM-Solver-3.
            - 'singlestep_fixed':
                Fixed order singlestep DPM-Solver (i.e. DPM-Solver-1 or singlestep DPM-Solver-2 or singlestep DPM-Solver-3).
                We use singlestep DPM-Solver-`order` for `order`=1 or 2 or 3, with total [`steps` // `order`] * `order` NFE.
            - 'adaptive':
                Adaptive step size DPM-Solver (i.e. "DPM-Solver-12" and "DPM-Solver-23" in the paper).
                We ignore `steps` and use adaptive step size DPM-Solver with a higher order of `order`.
                You can adjust the absolute tolerance `atol` and the relative tolerance `rtol` to balance the computation costs
                (NFE) and the sample quality.
                    - If `order` == 2, we use DPM-Solver-12 which combines DPM-Solver-1 and singlestep DPM-Solver-2.
                    - If `order` == 3, we use DPM-Solver-23 which combines singlestep DPM-Solver-2 and singlestep DPM-Solver-3.

        =====================================================

        Some advice for choosing the algorithm:
            - For **unconditional sampling** or **guided sampling with small guidance scale** by DPMs:
                Use singlestep DPM-Solver or DPM-Solver++ ("DPM-Solver-fast" in the paper) with `order = 3`.
                e.g., DPM-Solver:
                    >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver")
                    >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
                            skip_type='time_uniform', method='singlestep')
                e.g., DPM-Solver++:
                    >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
                    >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
                            skip_type='time_uniform', method='singlestep')
            - For **guided sampling with large guidance scale** by DPMs:
                Use multistep DPM-Solver with `algorithm_type="dpmsolver++"` and `order = 2`.
                e.g.
                    >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
                    >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=2,
                            skip_type='time_uniform', method='multistep')

        We support three types of `skip_type`:
            - 'logSNR': uniform logSNR for the time steps. **Recommended for low-resolutional images**
            - 'time_uniform': uniform time for the time steps. **Recommended for high-resolutional images**.
            - 'time_quadratic': quadratic time for the time steps.

        =====================================================
        Args:
            x: A pytorch tensor. The initial value at time `t_start`
                e.g. if `t_start` == T, then `x` is a sample from the standard normal distribution.
            steps: A `int`. The total number of function evaluations (NFE).
            t_start: A `float`. The starting time of the sampling.
                If `t_start` is None, we use self.noise_schedule.T (default is 1.0).
            t_end: A `float`. The ending time of the sampling.
                If `t_end` is None, we use 1. / self.noise_schedule.total_N.
                e.g. if total_N == 1000, we have `t_end` == 1e-3.
                For discrete-time DPMs:
                    - We recommend `t_end` == 1. / self.noise_schedule.total_N.
                For continuous-time DPMs:
                    - We recommend `t_end` == 1e-3 when `steps` <= 15; and `t_end` == 1e-4 when `steps` > 15.
            order: A `int`. The order of DPM-Solver.
            skip_type: A `str`. The type for the spacing of the time steps. 'time_uniform' or 'logSNR' or 'time_quadratic'.
            method: A `str`. The method for sampling. 'singlestep' or 'multistep' or 'singlestep_fixed' or 'adaptive'.
            denoise_to_zero: A `bool`. Whether to denoise to time 0 at the final step.
                Default is `False`. If `denoise_to_zero` is `True`, the total NFE is (`steps` + 1).

                This trick is firstly proposed by DDPM (https://arxiv.org/abs/2006.11239) and
                score_sde (https://arxiv.org/abs/2011.13456). Such trick can improve the FID
                for diffusion models sampling by diffusion SDEs for low-resolutional images
                (such as CIFAR-10). However, we observed that such trick does not matter for
                high-resolutional images. As it needs an additional NFE, we do not recommend
                it for high-resolutional images.
            lower_order_final: A `bool`. Whether to use lower order solvers at the final steps.
                Only takes effect for `method=multistep` and `steps < 10`. We empirically find that
                this trick is a key to stabilizing the sampling by DPM-Solver with very few steps
                (especially for steps <= 10). So we recommend to set it to be `True`.
            solver_type: A `str`. The taylor expansion type for the solver. `dpmsolver` or `taylor`. We recommend `dpmsolver`.
            atol: A `float`. The absolute tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
            rtol: A `float`. The relative tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
            return_intermediate: A `bool`. Whether to save the xt at each step.
                When set to `True`, method returns a tuple (x0, intermediates); when set to False, method returns only x0.
            model_kwargs: A `dict` or None. Passed to `cache_init` together with `steps` to build the
                `cache_dic` / `current` state handed to every model evaluation of this call.
                None (the default) is treated as an empty dict.
            rank: The tqdm progress bar of the multistep loop is shown only when `rank` is 0 or None.
        Returns:
            x_end: A pytorch tensor. The approximated solution at time `t_end`.
                A tuple `(x_end, intermediates)` when `return_intermediate` is True.
        Raises:
            ValueError: if `method` is not one of the supported names.

        """
        # A `{}` default would be a single dict shared by every call; build a fresh one instead.
        if model_kwargs is None:
            model_kwargs = {}
        t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
        t_T = self.noise_schedule.T if t_start is None else t_start
        assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
        if return_intermediate:
            assert method in ['multistep', 'singlestep',
                              'singlestep_fixed'], "Cannot use adaptive solver when saving intermediate values"
        if self.correcting_xt_fn is not None:
            assert method in ['multistep', 'singlestep',
                              'singlestep_fixed'], "Cannot use adaptive solver when correcting_xt_fn is not None"
        device = x.device
        intermediates = []

        # Cache state shared by all model evaluations below; `current['step']` is
        # updated before each solver step.
        cache_dic, current = cache_init(model_kwargs=model_kwargs, num_steps=steps)

        with torch.no_grad():
            if method == 'adaptive':
                # NOTE(review): `current` / `cache_dic` are not forwarded on this path, so the
                # adaptive solver is not wired to the cache state — confirm whether it is
                # meant to be supported.
                x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol,
                                             solver_type=solver_type)
            elif method == 'multistep':
                assert steps >= order
                timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
                assert timesteps.shape[0] - 1 == steps
                # Init the initial values.
                step = 0
                current['step'] = step
                t = timesteps[step]
                t_prev_list = [t]
                model_prev_list = [self.model_fn(x, t, current, cache_dic)]
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)
                # Init the first `order` values by lower order multistep DPM-Solver.
                for step in range(1, order):
                    current['step'] = step
                    t = timesteps[step]
                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, t, current, cache_dic, step,
                                                         solver_type=solver_type)
                    if self.correcting_xt_fn is not None:
                        x = self.correcting_xt_fn(x, t, step)
                    if return_intermediate:
                        intermediates.append(x)
                    t_prev_list.append(t)
                    model_prev_list.append(self.model_fn(x, t, current, cache_dic))
                # Compute the remaining values by `order`-th order multistep DPM-Solver.
                remaining_steps = range(order, steps + 1)
                show_progress = rank is None or rank == 0
                pbar = tqdm(remaining_steps, leave=False) if show_progress else remaining_steps
                for step in pbar:
                    current['step'] = step
                    t = timesteps[step]
                    # We only use lower order for steps < 10
                    if lower_order_final and steps < 10:
                        step_order = min(order, steps + 1 - step)
                    else:
                        step_order = order
                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, t, current, cache_dic, step_order,
                                                         solver_type=solver_type)
                    if self.correcting_xt_fn is not None:
                        x = self.correcting_xt_fn(x, t, step)
                    if return_intermediate:
                        intermediates.append(x)
                    # Slide the history window forward by one entry.
                    for i in range(order - 1):
                        t_prev_list[i] = t_prev_list[i + 1]
                        model_prev_list[i] = model_prev_list[i + 1]
                    t_prev_list[-1] = t
                    # We do not need to evaluate the final model value.
                    if step < steps:
                        model_prev_list[-1] = self.model_fn(x, t, current, cache_dic)
            elif method in ['singlestep', 'singlestep_fixed']:
                if method == 'singlestep':
                    timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(steps=steps,
                                                                                                  order=order,
                                                                                                  skip_type=skip_type,
                                                                                                  t_T=t_T, t_0=t_0,
                                                                                                  device=device)
                elif method == 'singlestep_fixed':
                    K = steps // order
                    orders = [order, ] * K
                    timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device)
                # `step_order` (not `order`) so the loop does not overwrite the argument.
                for step, step_order in enumerate(orders):
                    # Mirror the multistep branch: expose the outer step index through `current`
                    # (presumably read by the cache logic inside `model_fn` — confirm).
                    current['step'] = step
                    s, t = timesteps_outer[step], timesteps_outer[step + 1]
                    timesteps_inner = self.get_time_steps(skip_type=skip_type, t_T=s.item(), t_0=t.item(),
                                                          N=step_order, device=device)
                    lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner)
                    h = lambda_inner[-1] - lambda_inner[0]
                    r1 = None if step_order <= 1 else (lambda_inner[1] - lambda_inner[0]) / h
                    r2 = None if step_order <= 2 else (lambda_inner[2] - lambda_inner[0]) / h
                    # `singlestep_dpm_solver_update` takes (x, s, t, current, cache_dic, order, ...);
                    # the cache arguments must be passed or `order` lands in the `current` slot.
                    x = self.singlestep_dpm_solver_update(x, s, t, current, cache_dic, step_order,
                                                          solver_type=solver_type, r1=r1, r2=r2)
                    if self.correcting_xt_fn is not None:
                        x = self.correcting_xt_fn(x, t, step)
                    if return_intermediate:
                        intermediates.append(x)
            else:
                raise ValueError(f"Got wrong method {method}")
            if denoise_to_zero:
                t = torch.ones((1,)).to(device) * t_0
                x = self.denoise_to_zero_fn(x, t)
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step + 1)
                if return_intermediate:
                    intermediates.append(x)
        return (x, intermediates) if return_intermediate else x


#############################################################
# other utility functions
#############################################################

def interpolate_fn(x, xp, yp):
    """
    Evaluate the piecewise linear function through the keypoints (xp, yp) at `x`.

    The computation uses only differentiable tensor ops (sort / gather / where), so it
    can be used under autograd. Outside the keypoint range the first or last segment
    is extended linearly.

    Args:
        x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver).
        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
        yp: PyTorch tensor with shape [C, K].
    Returns:
        The function values f(x), with shape [N, C].
    """
    batch, num_keys = x.shape[0], xp.shape[1]
    dev = x.device

    # Merge each query with its channel's keypoints and sort; the position where
    # source index 0 (the query itself) lands tells which segment contains it.
    tiled_xp = xp.unsqueeze(0).repeat((batch, 1, 1))
    merged = torch.cat([x.unsqueeze(2), tiled_xp], dim=2)
    merged_sorted, source_pos = torch.sort(merged, dim=2)
    rank_of_x = torch.argmin(source_pos, dim=2)
    below = rank_of_x - 1

    left_of_all = torch.eq(rank_of_x, 0)
    right_of_all = torch.eq(rank_of_x, num_keys)
    # Queries right of every keypoint fall back to the last segment.
    clamped_high = torch.where(right_of_all, torch.tensor(num_keys - 2, device=dev), below)

    # Positions, inside the merged-and-sorted array, of the segment's two x endpoints.
    lo = torch.where(left_of_all, torch.tensor(1, device=dev), clamped_high)
    # For an interior query the upper endpoint is two slots up (the query sits between them).
    hi = torch.where(torch.eq(lo, below), lo + 2, lo + 1)
    x_lo = torch.gather(merged_sorted, dim=2, index=lo.unsqueeze(2)).squeeze(2)
    x_hi = torch.gather(merged_sorted, dim=2, index=hi.unsqueeze(2)).squeeze(2)

    # Index of the segment's lower keypoint in `yp` (which does not contain the query).
    segment = torch.where(left_of_all, torch.tensor(0, device=dev), clamped_high)
    tiled_yp = yp.unsqueeze(0).expand(batch, -1, -1)
    y_lo = torch.gather(tiled_yp, dim=2, index=segment.unsqueeze(2)).squeeze(2)
    y_hi = torch.gather(tiled_yp, dim=2, index=(segment + 1).unsqueeze(2)).squeeze(2)

    return y_lo + (x - x_lo) * (y_hi - y_lo) / (x_hi - x_lo)


def expand_dims(v, dims):
    """
    Append trailing singleton axes to `v`.

    Args:
        `v`: a PyTorch tensor with shape [N].
        `dims`: a `int`. `dims - 1` singleton axes are appended.
    Returns:
        a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
    """
    # Indexing with None inserts a new axis; Ellipsis keeps every existing axis in front.
    trailing_axes = [None] * (dims - 1)
    return v[(Ellipsis, *trailing_axes)]

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/edm_sample.py
================================================
import random
import numpy as np
from tqdm import tqdm

from diffusion.model.utils import *


# ----------------------------------------------------------------------------
# Proposed EDM sampler (Algorithm 2).

def edm_sampler(
        net, latents, class_labels=None, cfg_scale=None, randn_like=torch.randn_like,
        num_steps=18, sigma_min=0.002, sigma_max=80, rho=7,
        S_churn=0, S_min=0, S_max=float('inf'), S_noise=1, **kwargs
):
    """
    Stochastic sampler with a 2nd-order (Heun) correction on a rho-spaced
    noise-level grid.

    :param net: denoiser. Must expose `sigma_min`, `sigma_max` and
                `round_sigma(sigma)`, and be callable as
                `net(x, sigma, class_labels, cfg_scale, **kwargs)` returning a
                mapping whose 'x' entry is the denoised tensor.
    :param latents: initial noise tensor; it is scaled by the first noise level.
    :param class_labels: forwarded to `net` unchanged.
    :param cfg_scale: forwarded to `net` unchanged.
    :param randn_like: callable producing noise shaped like its argument.
    :param num_steps: number of noise levels on the grid (a final level of 0
                      is appended automatically).
    :param sigma_min: smallest noise level, clamped from below by `net.sigma_min`.
    :param sigma_max: largest noise level, clamped from above by `net.sigma_max`.
    :param rho: exponent controlling the spacing of the noise levels.
    :param S_churn: total amount of extra noise; the per-step factor is
                    `min(S_churn / num_steps, sqrt(2) - 1)`.
    :param S_min: lower bound of the noise-level range where churn is applied.
    :param S_max: upper bound of the noise-level range where churn is applied.
    :param S_noise: multiplier for the noise injected by the churn step.
    :param kwargs: extra keyword arguments forwarded to `net`.
    :return: the final sample as a float64 tensor.
    """
    # Adjust noise levels based on what's supported by the network.
    sigma_min = max(sigma_min, net.sigma_min)
    sigma_max = min(sigma_max, net.sigma_max)

    # Time step discretization.
    # With a single step the interpolation divisor `num_steps - 1` would be 0
    # and the whole grid would become NaN; clamping it to 1 makes the grid
    # collapse to [sigma_max, 0] instead. For num_steps >= 2 nothing changes.
    interp_denominator = max(num_steps - 1, 1)
    step_indices = torch.arange(num_steps, dtype=torch.float64, device=latents.device)
    t_steps = (sigma_max ** (1 / rho) + step_indices / interp_denominator * (
                sigma_min ** (1 / rho) - sigma_max ** (1 / rho))) ** rho
    t_steps = torch.cat([net.round_sigma(t_steps), torch.zeros_like(t_steps[:1])])  # t_N = 0

    # Main sampling loop.
    x_next = latents.to(torch.float64) * t_steps[0]
    for i, (t_cur, t_next) in tqdm(list(enumerate(zip(t_steps[:-1], t_steps[1:])))):  # 0, ..., N-1
        x_cur = x_next

        # Increase noise temporarily.
        gamma = min(S_churn / num_steps, np.sqrt(2) - 1) if S_min <= t_cur <= S_max else 0
        t_hat = net.round_sigma(t_cur + gamma * t_cur)
        x_hat = x_cur + (t_hat ** 2 - t_cur ** 2).sqrt() * S_noise * randn_like(x_cur)

        # Euler step.
        denoised = net(x_hat.float(), t_hat, class_labels, cfg_scale, **kwargs)['x'].to(torch.float64)
        d_cur = (x_hat - denoised) / t_hat
        x_next = x_hat + (t_next - t_hat) * d_cur

        # Apply 2nd order correction (skipped on the last step, where
        # t_next == 0 would make the slope at t_next undefined).
        if i < num_steps - 1:
            denoised = net(x_next.float(), t_next, class_labels, cfg_scale, **kwargs)['x'].to(torch.float64)
            d_prime = (x_next - denoised) / t_next
            x_next = x_hat + (t_next - t_hat) * (0.5 * d_cur + 0.5 * d_prime)

    return x_next


# ----------------------------------------------------------------------------
# Generalized ablation sampler, representing the superset of all sampling
# methods discussed in the paper.

def ablation_sampler(
        net, latents, class_labels=None, cfg_scale=None, feat=None, randn_like=torch.randn_like,
        num_steps=18, sigma_min=None, sigma_max=None, rho=7,
        solver='heun', discretization='edm', schedule='linear', scaling='none',
        epsilon_s=1e-3, C_1=0.001, C_2=0.008, M=1000, alpha=1,
        S_churn=0, S_min=0, S_max=float('inf'), S_noise=1,
):
    """
    Configurable sampler combining a choice of solver, time-step
    discretization, noise-level schedule and scaling schedule.

    :param net: denoiser. Must expose `sigma_min`, `sigma_max` and
                `round_sigma(sigma)`, and be callable as
                `net(x, sigma, class_labels, cfg_scale, feat=feat)` returning a
                mapping whose 'x' entry is the denoised tensor.
    :param latents: initial noise tensor.
    :param class_labels: forwarded to `net` unchanged.
    :param cfg_scale: forwarded to `net` unchanged.
    :param feat: forwarded to `net` as the `feat` keyword argument.
    :param randn_like: callable producing noise shaped like its argument.
    :param num_steps: number of noise levels. It is used as `num_steps - 1` in
                      several divisors, so a value of 1 is not supported.
    :param sigma_min: smallest noise level; if None a default is chosen per
                      `discretization`. Clamped from below by `net.sigma_min`.
    :param sigma_max: largest noise level; if None a default is chosen per
                      `discretization`. Clamped from above by `net.sigma_max`.
    :param rho: spacing exponent, only used by the 'edm' discretization.
    :param solver: 'euler' or 'heun'.
    :param discretization: 'vp', 've', 'iddpm' or 'edm'.
    :param schedule: 'vp', 've' or 'linear'.
    :param scaling: 'vp' or 'none'.
    :param epsilon_s: smallest time value used by the VP formulas.
    :param C_1: lower clip applied to the alpha_bar ratio in the 'iddpm' grid.
    :param C_2: offset used inside the 'iddpm' alpha_bar function.
    :param M: number of levels in the 'iddpm' base grid.
    :param alpha: position of the intermediate point of the 2nd-order step.
    :param S_churn: total amount of extra noise; the per-step factor is
                    `min(S_churn / num_steps, sqrt(2) - 1)`.
    :param S_min: lower bound of the noise-level range where churn is applied.
    :param S_max: upper bound of the noise-level range where churn is applied.
    :param S_noise: multiplier for the noise injected by the churn step.
    :return: the final sample as a float64 tensor.
    """
    # Fail early on unsupported option names.
    assert solver in ['euler', 'heun']
    assert discretization in ['vp', 've', 'iddpm', 'edm']
    assert schedule in ['vp', 've', 'linear']
    assert scaling in ['vp', 'none']

    # Helper functions for VP & VE noise level schedules.
    vp_sigma = lambda beta_d, beta_min: lambda t: (np.e ** (0.5 * beta_d * (t ** 2) + beta_min * t) - 1) ** 0.5
    # NOTE: `sigma` below is not a parameter of this lambda; it is looked up at
    # call time and resolves to the local `sigma` assigned further down in
    # "Define noise level schedule".
    vp_sigma_deriv = lambda beta_d, beta_min: lambda t: 0.5 * (beta_min + beta_d * t) * (sigma(t) + 1 / sigma(t))
    vp_sigma_inv = lambda beta_d, beta_min: lambda sigma: ((beta_min ** 2 + 2 * beta_d * (
            sigma ** 2 + 1).log()).sqrt() - beta_min) / beta_d
    ve_sigma = lambda t: t.sqrt()
    ve_sigma_deriv = lambda t: 0.5 / t.sqrt()
    ve_sigma_inv = lambda sigma: sigma ** 2

    # Select default noise level range based on the specified time step discretization.
    if sigma_min is None:
        vp_def = vp_sigma(beta_d=19.1, beta_min=0.1)(t=epsilon_s)
        sigma_min = {'vp': vp_def, 've': 0.02, 'iddpm': 0.002, 'edm': 0.002}[discretization]
    if sigma_max is None:
        vp_def = vp_sigma(beta_d=19.1, beta_min=0.1)(t=1)
        sigma_max = {'vp': vp_def, 've': 100, 'iddpm': 81, 'edm': 80}[discretization]

    # Adjust noise levels based on what's supported by the network.
    sigma_min = max(sigma_min, net.sigma_min)
    sigma_max = min(sigma_max, net.sigma_max)

    # Compute corresponding betas for VP.
    vp_beta_d = 2 * (np.log(sigma_min ** 2 + 1) / epsilon_s - np.log(sigma_max ** 2 + 1)) / (epsilon_s - 1)
    vp_beta_min = np.log(sigma_max ** 2 + 1) - 0.5 * vp_beta_d

    # Define time steps in terms of noise level.
    step_indices = torch.arange(num_steps, dtype=torch.float64, device=latents.device)
    if discretization == 'vp':
        orig_t_steps = 1 + step_indices / (num_steps - 1) * (epsilon_s - 1)
        sigma_steps = vp_sigma(vp_beta_d, vp_beta_min)(orig_t_steps)
    elif discretization == 've':
        orig_t_steps = (sigma_max ** 2) * ((sigma_min ** 2 / sigma_max ** 2) ** (step_indices / (num_steps - 1)))
        sigma_steps = ve_sigma(orig_t_steps)
    elif discretization == 'iddpm':
        # Build the M-level base grid `u` backwards from u[M] = 0, keep only the
        # levels inside [sigma_min, sigma_max], then pick `num_steps` of them
        # at evenly spaced (rounded) positions.
        u = torch.zeros(M + 1, dtype=torch.float64, device=latents.device)
        alpha_bar = lambda j: (0.5 * np.pi * j / M / (C_2 + 1)).sin() ** 2
        for j in torch.arange(M, 0, -1, device=latents.device):  # M, ..., 1
            u[j - 1] = ((u[j] ** 2 + 1) / (alpha_bar(j - 1) / alpha_bar(j)).clip(min=C_1) - 1).sqrt()
        u_filtered = u[torch.logical_and(u >= sigma_min, u <= sigma_max)]
        sigma_steps = u_filtered[((len(u_filtered) - 1) / (num_steps - 1) * step_indices).round().to(torch.int64)]
    else:
        assert discretization == 'edm'
        sigma_steps = (sigma_max ** (1 / rho) + step_indices / (num_steps - 1) * (
                sigma_min ** (1 / rho) - sigma_max ** (1 / rho))) ** rho

    # Define noise level schedule.
    if schedule == 'vp':
        sigma = vp_sigma(vp_beta_d, vp_beta_min)
        sigma_deriv = vp_sigma_deriv(vp_beta_d, vp_beta_min)
        sigma_inv = vp_sigma_inv(vp_beta_d, vp_beta_min)
    elif schedule == 've':
        sigma = ve_sigma
        sigma_deriv = ve_sigma_deriv
        sigma_inv = ve_sigma_inv
    else:
        assert schedule == 'linear'
        sigma = lambda t: t
        sigma_deriv = lambda t: 1
        sigma_inv = lambda sigma: sigma

    # Define scaling schedule.
    if scaling == 'vp':
        s = lambda t: 1 / (1 + sigma(t) ** 2).sqrt()
        s_deriv = lambda t: -sigma(t) * sigma_deriv(t) * (s(t) ** 3)
    else:
        assert scaling == 'none'
        s = lambda t: 1
        s_deriv = lambda t: 0

    # Compute final time steps based on the corresponding noise levels.
    t_steps = sigma_inv(net.round_sigma(sigma_steps))
    t_steps = torch.cat([t_steps, torch.zeros_like(t_steps[:1])])  # t_N = 0

    # Main sampling loop.
    t_next = t_steps[0]
    x_next = latents.to(torch.float64) * (sigma(t_next) * s(t_next))
    for i, (t_cur, t_next) in enumerate(zip(t_steps[:-1], t_steps[1:])):  # 0, ..., N-1
        x_cur = x_next

        # Increase noise temporarily.
        gamma = min(S_churn / num_steps, np.sqrt(2) - 1) if S_min <= sigma(t_cur) <= S_max else 0
        t_hat = sigma_inv(net.round_sigma(sigma(t_cur) + gamma * sigma(t_cur)))
        x_hat = s(t_hat) / s(t_cur) * x_cur + (sigma(t_hat) ** 2 - sigma(t_cur) ** 2).clip(min=0).sqrt() * s(
            t_hat) * S_noise * randn_like(x_cur)

        # Euler step.
        h = t_next - t_hat
        denoised = net(x_hat.float() / s(t_hat), sigma(t_hat), class_labels, cfg_scale, feat=feat)['x'].to(
            torch.float64)
        d_cur = (sigma_deriv(t_hat) / sigma(t_hat) + s_deriv(t_hat) / s(t_hat)) * x_hat - sigma_deriv(t_hat) * s(
            t_hat) / sigma(t_hat) * denoised
        x_prime = x_hat + alpha * h * d_cur
        t_prime = t_hat + alpha * h

        # Apply 2nd order correction.
        # The last step always falls back to Euler, regardless of `solver`.
        if solver == 'euler' or i == num_steps - 1:
            x_next = x_hat + h * d_cur
        else:
            assert solver == 'heun'
            denoised = net(x_prime.float() / s(t_prime), sigma(t_prime), class_labels, cfg_scale, feat=feat)['x'].to(
                torch.float64)
            d_prime = (sigma_deriv(t_prime) / sigma(t_prime) + s_deriv(t_prime) / s(t_prime)) * x_prime - sigma_deriv(
                t_prime) * s(t_prime) / sigma(t_prime) * denoised
            x_next = x_hat + h * ((1 - 1 / (2 * alpha)) * d_cur + 1 / (2 * alpha) * d_prime)

    return x_next


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/gaussian_diffusion.py
================================================
# Modified from OpenAI's diffusion repos
#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py


import enum
import math

import numpy as np
import torch as th
import torch.nn.functional as F

from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl
from .cache_functions import cache_init

def mean_flat(tensor):
    """
    Average a tensor over every dimension except the first (batch) one.
    """
    non_batch_axes = list(range(1, len(tensor.shape)))
    return tensor.mean(dim=non_batch_axes)


class ModelMeanType(enum.Enum):
    """
    The quantity that the model's output is interpreted as.
    """

    # the model predicts x_{t-1}
    PREVIOUS_X = enum.auto()
    # the model predicts x_0
    START_X = enum.auto()
    # the model predicts epsilon
    EPSILON = enum.auto()


class ModelVarType(enum.Enum):
    """
    How the variance of the model's output distribution is obtained.

    LEARNED_RANGE lets the model predict a value between FIXED_SMALL and
    FIXED_LARGE, which makes its job easier.
    """

    LEARNED = enum.auto()
    FIXED_SMALL = enum.auto()
    FIXED_LARGE = enum.auto()
    LEARNED_RANGE = enum.auto()


class LossType(enum.Enum):
    """
    The training objective.
    """

    # use raw MSE loss (and KL when learning variances)
    MSE = enum.auto()
    # use raw MSE loss (with RESCALED_KL when learning variances)
    RESCALED_MSE = enum.auto()
    # use the variational lower-bound
    KL = enum.auto()
    # like KL, but rescale to estimate the full VLB
    RESCALED_KL = enum.auto()

    def is_vb(self):
        """Return True for the two variational-bound objectives (KL, RESCALED_KL)."""
        return self is LossType.KL or self is LossType.RESCALED_KL


def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
    """
    Build a beta schedule that ramps linearly from `beta_start` to `beta_end`
    over the first `warmup_frac` of the steps and stays at `beta_end` afterwards.

    :param beta_start: beta value at the first step.
    :param beta_end: beta value reached at the end of the warm-up and held after it.
    :param num_diffusion_timesteps: total number of betas to produce.
    :param warmup_frac: fraction of the steps used for the linear ramp.
    :return: a float64 numpy array of length `num_diffusion_timesteps`.
    """
    ramp_length = int(num_diffusion_timesteps * warmup_frac)
    schedule = np.full(num_diffusion_timesteps, beta_end, dtype=np.float64)
    schedule[:ramp_length] = np.linspace(beta_start, beta_end, ramp_length, dtype=np.float64)
    return schedule


def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
    """
    Deprecated API for creating beta schedules; see get_named_beta_schedule()
    for the newer library of schedules.

    :param beta_schedule: one of "quad", "linear", "warmup10", "warmup50",
                          "const" or "jsd".
    :param beta_start: first beta value (unused by "const" and "jsd").
    :param beta_end: last beta value (unused by "jsd").
    :param num_diffusion_timesteps: number of betas to produce.
    :return: a numpy array of shape (num_diffusion_timesteps,).
    :raises NotImplementedError: if `beta_schedule` is not a known name.
    """
    steps = num_diffusion_timesteps
    if beta_schedule == "quad":
        # Linear in sqrt(beta), then squared.
        sqrt_start = beta_start ** 0.5
        sqrt_end = beta_end ** 0.5
        betas = np.linspace(sqrt_start, sqrt_end, steps, dtype=np.float64) ** 2
    elif beta_schedule == "linear":
        betas = np.linspace(beta_start, beta_end, steps, dtype=np.float64)
    elif beta_schedule == "warmup10":
        betas = _warmup_beta(beta_start, beta_end, steps, 0.1)
    elif beta_schedule == "warmup50":
        betas = _warmup_beta(beta_start, beta_end, steps, 0.5)
    elif beta_schedule == "const":
        betas = beta_end * np.ones(steps, dtype=np.float64)
    elif beta_schedule == "jsd":
        # 1/T, 1/(T-1), 1/(T-2), ..., 1
        denominators = np.linspace(steps, 1, steps, dtype=np.float64)
        betas = 1.0 / denominators
    else:
        raise NotImplementedError(beta_schedule)
    assert betas.shape == (steps,)
    return betas


def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Return one of the pre-defined beta schedules by name.

    The schedules in this library stay similar in the limit of
    num_diffusion_timesteps. New schedules may be added, but committed ones
    should not be removed or changed, to keep backwards compatibility.

    :param schedule_name: "linear" or "squaredcos_cap_v2".
    :param num_diffusion_timesteps: number of betas to produce.
    :raises NotImplementedError: if `schedule_name` is not a known name.
    """
    if schedule_name == "linear":
        # Linear schedule from Ho et al, extended to work for any number of
        # diffusion steps.
        rescale = 1000 / num_diffusion_timesteps
        start, end = rescale * 0.0001, rescale * 0.02
        return get_beta_schedule(
            "linear",
            beta_start=start,
            beta_end=end,
            num_diffusion_timesteps=num_diffusion_timesteps,
        )

    if schedule_name == "squaredcos_cap_v2":
        def cosine_alpha_bar(t):
            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

        return betas_for_alpha_bar(num_diffusion_timesteps, cosine_alpha_bar)

    raise NotImplementedError(f"unknown beta schedule: {schedule_name}")


def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Discretize a continuous alpha_bar(t) function into a beta schedule.

    alpha_bar defines the cumulative product of (1-beta) over time for t in
    [0, 1]; each beta is derived from the ratio of alpha_bar at the two ends
    of its sub-interval.
    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a callable that takes an argument t from 0 to 1 and
                      produces the cumulative product of (1-beta) up to that
                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    :return: a numpy array with `num_diffusion_timesteps` betas.
    """
    total = num_diffusion_timesteps
    return np.array([
        min(1 - alpha_bar((step + 1) / total) / alpha_bar(step / total), max_beta)
        for step in range(total)
    ])


class GaussianDiffusion:
    """
    Utilities for training and sampling diffusion models.
    Original ported from this codebase:
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
    :param betas: a 1-D numpy array of betas for each diffusion timestep,
                  starting at T and going to 1.
    """

    def __init__(
        self,
        *,
        betas,
        model_mean_type,
        model_var_type,
        loss_type,
        snr=False,
        return_startx=False,
    ):

        self.model_mean_type = model_mean_type
        self.model_var_type = model_var_type
        self.loss_type = loss_type
        self.snr = snr
        self.return_startx = return_startx

        # Use float64 for accuracy.
        betas = np.array(betas, dtype=np.float64)
        self.betas = betas
        assert len(betas.shape) == 1, "betas must be 1-D"
        assert (betas > 0).all() and (betas <= 1).all()

        self.num_timesteps = int(betas.shape[0])

        alphas = 1.0 - betas
        self.alphas_cumprod = np.cumprod(alphas, axis=0)
        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        self.posterior_variance = (
            betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
        self.posterior_log_variance_clipped = np.log(
            np.append(self.posterior_variance[1], self.posterior_variance[1:])
        ) if len(self.posterior_variance) > 1 else np.array([])

        self.posterior_mean_coef1 = (
            betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        self.posterior_mean_coef2 = (
            (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
        )

    def q_mean_variance(self, x_start, t):
        """
        Return the parameters of the forward distribution q(x_t | x_0).
        :param x_start: the [N x C x ...] tensor of noiseless inputs.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
        """
        target_shape = x_start.shape
        mean_scale = _extract_into_tensor(self.sqrt_alphas_cumprod, t, target_shape)
        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, target_shape)
        log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, target_shape)
        return mean_scale * x_start, variance, log_variance

    def q_sample(self, x_start, t, noise=None):
        """
        Sample from q(x_t | x_0), i.e. diffuse the data for a given number of
        diffusion steps.
        :param x_start: the initial data batch.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :param noise: if specified, the normal noise to mix in; otherwise
                      fresh noise shaped like x_start is drawn.
        :return: A noisy version of x_start.
        """
        eps = th.randn_like(x_start) if noise is None else noise
        assert eps.shape == x_start.shape
        target_shape = x_start.shape
        signal_scale = _extract_into_tensor(self.sqrt_alphas_cumprod, t, target_shape)
        noise_scale = _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, target_shape)
        return signal_scale * x_start + noise_scale * eps

    def q_posterior_mean_variance(self, x_start, x_t, t):
        """
        Return mean, variance and clipped log-variance of the diffusion
        posterior:
            q(x_{t-1} | x_t, x_0)
        """
        assert x_start.shape == x_t.shape
        target_shape = x_t.shape

        coef_start = _extract_into_tensor(self.posterior_mean_coef1, t, target_shape)
        coef_current = _extract_into_tensor(self.posterior_mean_coef2, t, target_shape)
        posterior_mean = coef_start * x_start + coef_current * x_t

        posterior_variance = _extract_into_tensor(self.posterior_variance, t, target_shape)
        posterior_log_variance_clipped = _extract_into_tensor(
            self.posterior_log_variance_clipped, t, target_shape
        )

        # All three outputs must share the batch size of the input.
        batch_size = x_start.shape[0]
        assert (
            posterior_mean.shape[0]
            == posterior_variance.shape[0]
            == posterior_log_variance_clipped.shape[0]
            == batch_size
        )
        return posterior_mean, posterior_variance, posterior_log_variance_clipped

    def p_mean_variance(self, model, x, t, current=None, cache_dic=None, clip_denoised=True, denoised_fn=None,
                        model_kwargs=None):
        """
        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
        the initial x, x_0.
        :param model: the model, which takes a signal and a batch of timesteps
                      as input, plus `current` and `cache_dic` keyword arguments.
        :param x: the [N x C x ...] tensor at time t.
        :param t: a 1-D Tensor of timesteps.
        :param current: forwarded unchanged to the model as `current=`.
                        Defaults to None so that callers that do not track
                        cache state (the DDIM samplers) can still call this
                        method, matching the defaults of p_sample().
        :param cache_dic: forwarded unchanged to the model as `cache_dic=`.
                          Defaults to None for the same reason.
        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample. Applies before
            clip_denoised.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict with the following keys:
                 - 'mean': the model mean output.
                 - 'variance': the model variance output.
                 - 'log_variance': the log of 'variance'.
                 - 'pred_xstart': the prediction for x_0.
                 - 'extra': the second element of the model output when the
                   model returns a tuple, otherwise None.
        """
        if model_kwargs is None:
            model_kwargs = {}

        B, C = x.shape[:2]
        assert t.shape == (B,)
        model_output = model(x, t, current=current, cache_dic=cache_dic, **model_kwargs)
        if isinstance(model_output, tuple):
            model_output, extra = model_output
        else:
            extra = None

        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
            # The model emits 2*C channels: the first C are the mean-related
            # prediction, the last C parameterize the variance.
            assert model_output.shape == (B, C * 2, *x.shape[2:])
            model_output, model_var_values = th.split(model_output, C, dim=1)
            min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
            max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
            # The model_var_values is [-1, 1] for [min_var, max_var].
            frac = (model_var_values + 1) / 2
            model_log_variance = frac * max_log + (1 - frac) * min_log
            model_variance = th.exp(model_log_variance)
        elif self.model_var_type in [ModelVarType.FIXED_LARGE, ModelVarType.FIXED_SMALL]:
            model_variance, model_log_variance = {
                # for fixedlarge, we set the initial (log-)variance like so
                # to get a better decoder log likelihood.
                ModelVarType.FIXED_LARGE: (
                    np.append(self.posterior_variance[1], self.betas[1:]),
                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
                ),
                ModelVarType.FIXED_SMALL: (
                    self.posterior_variance,
                    self.posterior_log_variance_clipped,
                ),
            }[self.model_var_type]
            model_variance = _extract_into_tensor(model_variance, t, x.shape)
            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
        else:
            model_variance = th.zeros_like(model_output)
            model_log_variance = th.zeros_like(model_output)

        def process_xstart(x):
            # Optional user hook first, optional clamp to [-1, 1] second.
            if denoised_fn is not None:
                x = denoised_fn(x)
            return x.clamp(-1, 1) if clip_denoised else x

        if self.model_mean_type == ModelMeanType.START_X:
            pred_xstart = process_xstart(model_output)
        else:
            # Every other mean type is treated as an epsilon prediction here.
            pred_xstart = process_xstart(
                self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
            )
        model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)

        assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
        return {
            "mean": model_mean,
            "variance": model_variance,
            "log_variance": model_log_variance,
            "pred_xstart": pred_xstart,
            "extra": extra,
        }

    def _predict_xstart_from_eps(self, x_t, t, eps):
        """Recover the x_0 estimate implied by x_t and a noise prediction eps."""
        assert x_t.shape == eps.shape
        target_shape = x_t.shape
        sample_scale = _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, target_shape)
        noise_scale = _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, target_shape)
        return sample_scale * x_t - noise_scale * eps

    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
        """Recover the noise estimate implied by x_t and an x_0 prediction."""
        target_shape = x_t.shape
        sample_scale = _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, target_shape)
        residual = sample_scale * x_t - pred_xstart
        return residual / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, target_shape)

    def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute the mean for the previous step, given a function cond_fn that
        computes the gradient of a conditional log probability with respect to
        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
        condition on y.
        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
        :param cond_fn: callable invoked as cond_fn(x, t, **model_kwargs).
        :param p_mean_var: dict providing at least 'mean' and 'variance'.
        :param x: the tensor passed to cond_fn.
        :param t: the timesteps passed to cond_fn.
        :param model_kwargs: if not None, a dict of extra keyword arguments
            passed to cond_fn.
        :return: the shifted mean, mean + variance * gradient.
        """
        # `**None` is a TypeError, so the documented default must be turned
        # into an empty dict before unpacking.
        if model_kwargs is None:
            model_kwargs = {}
        gradient = cond_fn(x, t, **model_kwargs)
        return p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()

    def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute what the p_mean_variance output would have been, should the
        model's score function be conditioned by cond_fn.
        See condition_mean() for details on cond_fn.
        Unlike condition_mean(), this instead uses the conditioning strategy
        from Song et al (2020).
        :param cond_fn: callable invoked as cond_fn(x, t, **model_kwargs).
        :param p_mean_var: dict providing at least 'pred_xstart'.
        :param x: the tensor at time t.
        :param t: a 1-D Tensor of timesteps.
        :param model_kwargs: if not None, a dict of extra keyword arguments
            passed to cond_fn.
        :return: a shallow copy of p_mean_var with 'pred_xstart' and 'mean'
                 replaced by their conditioned versions.
        """
        # `**None` is a TypeError, so the documented default must be turned
        # into an empty dict before unpacking.
        if model_kwargs is None:
            model_kwargs = {}

        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)

        # Shift the implied noise prediction by the scaled conditioning gradient.
        eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
        eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)

        # Re-derive x_0 and the posterior mean from the shifted noise.
        out = p_mean_var.copy()
        out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
        out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
        return out

    def p_sample(
        self,
        model,
        x,
        t,
        current=None,
        cache_dic=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
    ):
        """
        Draw x_{t-1} from the model, given x_t at the given timestep.
        :param model: the model to sample from.
        :param x: the current tensor x_t.
        :param t: the value of t, starting at 0 for the first diffusion step.
        :param current: forwarded unchanged to p_mean_variance().
        :param cache_dic: forwarded unchanged to p_mean_variance().
        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict containing the following keys:
                 - 'sample': a random sample from the model.
                 - 'pred_xstart': a prediction of x_0.
        """
        stats = self.p_mean_variance(
            model,
            x,
            t,
            current=current,
            cache_dic=cache_dic,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        noise = th.randn_like(x)

        # Per-sample 0/1 mask broadcastable against x: no noise when t == 0.
        broadcast_shape = [1] * (len(x.shape) - 1)
        nonzero_mask = (t != 0).float().view(-1, *broadcast_shape)

        if cond_fn is not None:
            stats["mean"] = self.condition_mean(cond_fn, stats, x, t, model_kwargs=model_kwargs)

        std = th.exp(0.5 * stats["log_variance"])
        sample = stats["mean"] + nonzero_mask * std * noise
        return {"sample": sample, "pred_xstart": stats["pred_xstart"]}

    def p_sample_loop(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False        
    ):
        """
        Generate samples from the model.
        :param model: the model module.
        :param shape: the shape of the samples, (N, C, H, W).
        :param noise: if specified, the noise from the encoder to sample.
                      Should be of the same shape as `shape`.
        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param device: if specified, the device to create the samples on.
                       If not specified, use a model parameter's device.
        :param progress: if True, show a tqdm progress bar.
        :return: a non-differentiable batch of samples.
        """
        final = None
        for sample in self.p_sample_loop_progressive(
            model,
            shape,
            noise=noise,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            cond_fn=cond_fn,
            model_kwargs=model_kwargs,
            device=device,
            progress=progress            
        ):
            final = sample
        return final["sample"]

    def p_sample_loop_progressive(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False
    ):
        """
        Generate samples from the model and yield intermediate samples from
        each timestep of diffusion.
        Arguments are the same as p_sample_loop().
        Returns a generator over dicts, where each dict is the return value of
        p_sample().

        The cache state is created once per call through cache_init(), and
        `current['step']` is set to the timestep index before every model
        evaluation.
        """
        if device is None:
            device = next(model.parameters()).device
        assert isinstance(shape, (tuple, list))
        img = noise if noise is not None else th.randn(*shape, device=device)
        indices = list(range(self.num_timesteps))[::-1]

        if progress:
            # Lazy import so that we don't depend on tqdm.
            from tqdm.auto import tqdm

            indices = tqdm(indices)

        cache_dic, current = cache_init(model_kwargs=model_kwargs, num_steps=self.num_timesteps)

        for i in indices:
            t = th.tensor([i] * shape[0], device=device)
            current['step'] = i
            with th.no_grad():
                out = self.p_sample(
                    model,
                    img,
                    t,
                    current=current,
                    cache_dic=cache_dic,
                    clip_denoised=clip_denoised,
                    denoised_fn=denoised_fn,
                    cond_fn=cond_fn,
                    model_kwargs=model_kwargs,
                )
            # Yield only after leaving the no_grad context: a generator that
            # suspends inside `with th.no_grad()` keeps gradient tracking
            # disabled for whatever code the consumer runs between iterations.
            yield out
            img = out["sample"]

    def ddim_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        eta=0.0,
        current=None,
        cache_dic=None,
    ):
        """
        Sample x_{t-1} from the model using DDIM.
        Same usage as p_sample().
        :param eta: scales the stochastic part of the update; 0 makes the
                    step deterministic.
        :param current: forwarded unchanged to p_mean_variance(), which hands
                        it to the model. Appended after `eta` so existing
                        positional callers keep working.
        :param cache_dic: forwarded unchanged to p_mean_variance().
        NOTE(review): whether the model accepts `current=None` /
        `cache_dic=None` depends on the model - confirm before relying on the
        defaults.
        """
        # p_mean_variance() takes `current` and `cache_dic`; they have to be
        # passed explicitly here, otherwise the call fails with a TypeError.
        out = self.p_mean_variance(
            model,
            x,
            t,
            current=current,
            cache_dic=cache_dic,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        if cond_fn is not None:
            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)

        # Usually our model outputs epsilon, but we re-derive it
        # in case we used x_start or x_prev prediction.
        eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])

        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
        sigma = (
            eta
            * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
            * th.sqrt(1 - alpha_bar / alpha_bar_prev)
        )
        # Equation 12.
        noise = th.randn_like(x)
        mean_pred = (
            out["pred_xstart"] * th.sqrt(alpha_bar_prev)
            + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
        )
        nonzero_mask = (
            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
        )  # no noise when t == 0
        sample = mean_pred + nonzero_mask * sigma * noise
        return {"sample": sample, "pred_xstart": out["pred_xstart"]}

    def ddim_reverse_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        eta=0.0,
        current=None,
        cache_dic=None,
    ):
        """
        Sample x_{t+1} from the model using DDIM reverse ODE.
        Arguments are the same as ddim_sample(); `eta` must be 0.0 because the
        reverse ODE only exists for the deterministic path.
        :param current: forwarded unchanged to p_mean_variance(), which hands
                        it to the model. Appended after `eta` so existing
                        positional callers keep working.
        :param cache_dic: forwarded unchanged to p_mean_variance().
        NOTE(review): whether the model accepts `current=None` /
        `cache_dic=None` depends on the model - confirm before relying on the
        defaults.
        """
        assert eta == 0.0, "Reverse ODE only for deterministic path"
        # p_mean_variance() takes `current` and `cache_dic`; they have to be
        # passed explicitly here, otherwise the call fails with a TypeError.
        out = self.p_mean_variance(
            model,
            x,
            t,
            current=current,
            cache_dic=cache_dic,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        if cond_fn is not None:
            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
        # Usually our model outputs epsilon, but we re-derive it
        # in case we used x_start or x_prev prediction.
        eps = (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
            - out["pred_xstart"]
        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
        alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)

        # Equation 12. reversed
        mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps

        return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}

    def ddim_sample_loop(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        eta=0.0,
    ):
        """
        Run the full DDIM sampling chain and return only the final sample.

        All arguments are forwarded unchanged to
        ddim_sample_loop_progressive(); usage mirrors p_sample_loop().

        :return: the 'sample' entry of the last dict yielded by the
                 progressive sampler.
        """
        steps = self.ddim_sample_loop_progressive(
            model,
            shape,
            noise=noise,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            cond_fn=cond_fn,
            model_kwargs=model_kwargs,
            device=device,
            progress=progress,
            eta=eta,
        )
        # Drain the generator, remembering only the most recent step.
        last = None
        for last in steps:
            pass
        return last["sample"]

    def ddim_sample_loop_progressive(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        eta=0.0,
    ):
        """
        Generator version of DDIM sampling: walks the timesteps from
        num_timesteps - 1 down to 0 and yields the dict returned by
        ddim_sample() at every step.
        Same usage as p_sample_loop_progressive().

        :param noise: optional starting tensor; when None, standard normal
            noise of the given shape is drawn on `device`.
        :param device: target device; when None, the device of the model's
            first parameter is used.
        :param progress: if True, wrap the timestep list in a tqdm bar.
        """
        if device is None:
            device = next(model.parameters()).device
        assert isinstance(shape, (tuple, list))

        if noise is not None:
            current = noise
        else:
            current = th.randn(*shape, device=device)

        timesteps = list(reversed(range(self.num_timesteps)))
        if progress:
            # Imported on demand so tqdm stays an optional dependency.
            from tqdm.auto import tqdm

            timesteps = tqdm(timesteps)

        for step in timesteps:
            # Every batch element shares the same timestep index.
            t = th.tensor([step] * shape[0], device=device)
            with th.no_grad():
                result = self.ddim_sample(
                    model,
                    current,
                    t,
                    clip_denoised=clip_denoised,
                    denoised_fn=denoised_fn,
                    cond_fn=cond_fn,
                    model_kwargs=model_kwargs,
                    eta=eta,
                )
                yield result
                current = result["sample"]

    def _vb_terms_bpd(
            self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
    ):
        """
        Compute one term of the variational lower-bound, in bits
        (not nats) so the numbers are comparable with other papers.

        :return: a dict with the following keys:
                 - 'output': a shape [N] tensor of NLLs or KLs.
                 - 'pred_xstart': the x_0 predictions.
        """
        posterior_mean, _, posterior_log_var = self.q_posterior_mean_variance(
            x_start=x_start, x_t=x_t, t=t
        )
        pred = self.p_mean_variance(
            model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
        )

        # KL between the true posterior and the model's reverse step,
        # converted from nats to bits.
        kl_nats = normal_kl(
            posterior_mean, posterior_log_var, pred["mean"], pred["log_variance"]
        )
        kl_bits = mean_flat(kl_nats) / np.log(2.0)

        # Discretized decoder likelihood of x_0 under the model's Gaussian.
        nll = -discretized_gaussian_log_likelihood(
            x_start, means=pred["mean"], log_scales=0.5 * pred["log_variance"]
        )
        assert nll.shape == x_start.shape
        nll_bits = mean_flat(nll) / np.log(2.0)

        # t == 0 uses the decoder NLL; every other timestep uses
        # KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t)).
        per_sample = th.where((t == 0), nll_bits, kl_bits)
        return {"output": per_sample, "pred_xstart": pred["pred_xstart"]}

    def training_losses(self, model, x_start, timestep, model_kwargs=None, noise=None, skip_noise=False):
        """
        Compute training losses for a single timestep.
        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param timestep: a batch of timestep indices.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning. The keys
            'mask_ratio' and 'mask_loss_coef' are also read here to select the
            masked-loss branch.
        :param noise: if specified, the specific Gaussian noise to try to remove.
        :param skip_noise: if True, x_start is used directly as x_t; q_sample()
            is not called and no noise is drawn.
        :return: a dict with the key "loss" containing a tensor of shape [N].
                 Some mean or variance settings may also have other keys.
                 Exception (MSE losses only): when self.return_startx is set
                 and the model predicts epsilon, the tuple produced by
                 _extracted_from_training_losses_diffusers() is returned instead.
        """
        t = timestep
        if model_kwargs is None:
            model_kwargs = {}
        if skip_noise:
            # The caller supplies the model input directly.
            # NOTE(review): `noise` is not populated on this path, so it stays
            # None unless passed in; the EPSILON target and the snr branch
            # below both read `noise` -- confirm callers always provide it.
            x_t = x_start
        else:
            if noise is None:
                noise = th.randn_like(x_start)
            x_t = self.q_sample(x_start, t, noise=noise)

        terms = {}

        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
            # Pure variational-bound loss.
            terms["loss"] = self._vb_terms_bpd(
                model=model,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
                model_kwargs=model_kwargs,
            )["output"]
            if self.loss_type == LossType.RESCALED_KL:
                terms["loss"] *= self.num_timesteps
        elif self.loss_type in [LossType.MSE, LossType.RESCALED_MSE]:
            model_output = model(x_t, t, **model_kwargs)
            # The model may return either a tensor or a dict carrying the
            # prediction under 'x' (the masked branch also expects 'mask').
            if isinstance(model_output, dict) and model_output.get('x', None) is not None:
                output = model_output['x']
            else:
                output = model_output

            # Early exit: hand back the raw prediction tuple instead of a loss dict.
            if self.return_startx and self.model_mean_type == ModelMeanType.EPSILON:
                return self._extracted_from_training_losses_diffusers(x_t, output, t)
            # With a learned variance the model emits 2*C channels: the first C
            # are the mean prediction, the remaining C parameterize the variance.
            if self.model_var_type in [
                ModelVarType.LEARNED,
                ModelVarType.LEARNED_RANGE,
            ]:
                B, C = x_t.shape[:2]
                assert output.shape == (B, C * 2, *x_t.shape[2:])
                output, model_var_values = th.split(output, C, dim=1)
                # Learn the variance using the variational bound, but don't let it affect our mean prediction.
                frozen_out = th.cat([output.detach(), model_var_values], dim=1)
                # The lambda replays the already-computed (mean-detached) output
                # so _vb_terms_bpd does not run the model a second time.
                terms["vb"] = self._vb_terms_bpd(
                    model=lambda *args, r=frozen_out, **kwargs: r,
                    x_start=x_start,
                    x_t=x_t,
                    t=t,
                    clip_denoised=False,
                )["output"]
                if self.loss_type == LossType.RESCALED_MSE:
                    # Divide by 1000 for equivalence with initial implementation.
                    # Without a factor of 1/1000, the VB term hurts the MSE term.
                    terms["vb"] *= self.num_timesteps / 1000.0

            # Regression target for the configured parameterization. All three
            # candidates are evaluated eagerly, including the posterior mean.
            target = {
                ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
                    x_start=x_start, x_t=x_t, t=t
                )[0],
                ModelMeanType.START_X: x_start,
                ModelMeanType.EPSILON: noise,
            }[self.model_mean_type]
            assert output.shape == target.shape == x_start.shape
            if self.snr:
                # Derive both an epsilon and an x_0 prediction from the output.
                # NOTE(review): no branch handles ModelMeanType.PREVIOUS_X, so
                # pred_noise / pred_startx would be unbound in that case.
                if self.model_mean_type == ModelMeanType.START_X:
                    pred_noise = self._predict_eps_from_xstart(x_t=x_t, t=t, pred_xstart=output)
                    pred_startx = output
                elif self.model_mean_type == ModelMeanType.EPSILON:
                    pred_noise = output
                    pred_startx = self._predict_xstart_from_eps(x_t=x_t, t=t, eps=output)
                # terms["mse_eps"] = mean_flat((noise - pred_noise) ** 2)
                # terms["mse_x0"] = mean_flat((x_start - pred_startx) ** 2)

                # From here on `t` is the timestep broadcast to the prediction
                # shape (the indexing assumes a 4-D prediction), not the [N] batch.
                t = t[:, None, None, None].expand(pred_startx.shape)  # [128, 4, 32, 32]
                # Per-element switch on a hard-coded threshold: timesteps above
                # 249 regress the noise, the rest regress x_start.
                target = th.where(t > 249, noise, x_start)
                output = th.where(t > 249, pred_noise, pred_startx)
            loss = (target - output) ** 2
            if model_kwargs.get('mask_ratio', False) and model_kwargs['mask_ratio'] > 0:
                # Masked training: this branch requires the dict-style model output.
                assert 'mask' in model_output
                # Average over channels, then pool to one value per patch and flatten.
                # NOTE(review): `model.model.module` assumes a specific wrapper
                # nesting around the network -- confirm against the training script.
                loss = F.avg_pool2d(loss.mean(dim=1), model.model.module.patch_size).flatten(1)
                mask = model_output['mask']
                unmask = 1 - mask
                # Rescale so 'mse' averages only over entries where unmask is set.
                terms['mse'] = mean_flat(loss * unmask) * unmask.shape[1]/unmask.sum(1)
                if model_kwargs['mask_loss_coef'] > 0:
                    # Weighted loss over the entries selected by `mask`.
                    terms['mae'] = model_kwargs['mask_loss_coef'] * mean_flat(loss * mask) * mask.shape[1]/mask.sum(1)
            else:
                terms["mse"] = mean_flat(loss)
            # Total loss = mse (+ vb when the variance is learned) (+ mae when masked).
            terms["loss"] = terms["mse"] + terms["vb"] if "vb" in terms else terms["mse"]
            if "mae" in terms:
                terms["loss"] = terms["loss"] + terms["mae"]
        else:
            raise NotImplementedError(self.loss_type)

        return terms

    def training_losses_diffusers(self, model, x_start, timestep, model_kwargs=None, noise=None, skip_noise=False):
        """
        Compute training losses for a single timestep, for a model that is
        called as model(x_t, timestep=..., return_dict=False) and whose
        prediction is the first element of the returned sequence.
        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param timestep: a batch of timestep indices.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param noise: if specified, the specific Gaussian noise to try to remove.
        :param skip_noise: if True, x_start is used directly as x_t; q_sample()
            is not called and no noise is drawn.
        :return: a dict with the key "loss" containing a tensor of shape [N].
                 Some mean or variance settings may also have other keys.
                 Exception (MSE losses only): when self.return_startx is set
                 and the model predicts epsilon, the tuple produced by
                 _extracted_from_training_losses_diffusers() is returned instead.
        """
        t = timestep
        if model_kwargs is None:
            model_kwargs = {}
        if skip_noise:
            # NOTE(review): `noise` stays None here unless the caller passed it;
            # the EPSILON target and the snr branch below both read `noise`.
            x_t = x_start
        else:
            if noise is None:
                noise = th.randn_like(x_start)
            x_t = self.q_sample(x_start, t, noise=noise)

        terms = {}

        if self.loss_type in [LossType.KL, LossType.RESCALED_KL]:
            # Pure variational-bound loss.
            terms["loss"] = self._vb_terms_bpd(
                model=model,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
                model_kwargs=model_kwargs,
            )["output"]
            if self.loss_type == LossType.RESCALED_KL:
                terms["loss"] *= self.num_timesteps
        elif self.loss_type in [LossType.MSE, LossType.RESCALED_MSE]:
            # return_dict=False makes the model return a sequence; element 0 is the prediction.
            output = model(x_t, timestep=t, **model_kwargs, return_dict=False)[0]

            # Early exit: hand back the raw prediction tuple instead of a loss dict.
            if self.return_startx and self.model_mean_type == ModelMeanType.EPSILON:
                return self._extracted_from_training_losses_diffusers(x_t, output, t)

            # With a learned variance the model emits 2*C channels: the first C
            # are the mean prediction, the remaining C parameterize the variance.
            if self.model_var_type in [
                ModelVarType.LEARNED,
                ModelVarType.LEARNED_RANGE,
            ]:
                B, C = x_t.shape[:2]
                assert output.shape == (B, C * 2, *x_t.shape[2:])
                output, model_var_values = th.split(output, C, dim=1)
                # Learn the variance using the variational bound, but don't let it affect our mean prediction.
                frozen_out = th.cat([output.detach(), model_var_values], dim=1)
                # The lambda replays the already-computed (mean-detached) output
                # so _vb_terms_bpd does not run the model a second time.
                terms["vb"] = self._vb_terms_bpd(
                    model=lambda *args, r=frozen_out, **kwargs: r,
                    x_start=x_start,
                    x_t=x_t,
                    t=t,
                    clip_denoised=False,
                )["output"]
                if self.loss_type == LossType.RESCALED_MSE:
                    # Divide by 1000 for equivalence with initial implementation.
                    # Without a factor of 1/1000, the VB term hurts the MSE term.
                    terms["vb"] *= self.num_timesteps / 1000.0

            # Regression target for the configured parameterization. All three
            # candidates are evaluated eagerly, including the posterior mean.
            target = {
                ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
                    x_start=x_start, x_t=x_t, t=t
                )[0],
                ModelMeanType.START_X: x_start,
                ModelMeanType.EPSILON: noise,
            }[self.model_mean_type]
            assert output.shape == target.shape == x_start.shape
            if self.snr:
                # Derive both an epsilon and an x_0 prediction from the output.
                # NOTE(review): no branch handles ModelMeanType.PREVIOUS_X, so
                # pred_noise / pred_startx would be unbound in that case.
                if self.model_mean_type == ModelMeanType.START_X:
                    pred_noise = self._predict_eps_from_xstart(x_t=x_t, t=t, pred_xstart=output)
                    pred_startx = output
                elif self.model_mean_type == ModelMeanType.EPSILON:
                    pred_noise = output
                    pred_startx = self._predict_xstart_from_eps(x_t=x_t, t=t, eps=output)
                # terms["mse_eps"] = mean_flat((noise - pred_noise) ** 2)
                # terms["mse_x0"] = mean_flat((x_start - pred_startx) ** 2)

                # From here on `t` is the timestep broadcast to the prediction
                # shape (the indexing assumes a 4-D prediction), not the [N] batch.
                t = t[:, None, None, None].expand(pred_startx.shape)  # [128, 4, 32, 32]
                # Per-element switch on a hard-coded threshold: timesteps above
                # 249 regress the noise, the rest regress x_start.
                target = th.where(t > 249, noise, x_start)
                output = th.where(t > 249, pred_noise, pred_startx)
            loss = (target - output) ** 2
            terms["mse"] = mean_flat(loss)
            terms["loss"] = terms["mse"] + terms["vb"] if "vb" in terms else terms["mse"]
            # NOTE(review): nothing in this method ever sets terms["mae"], so
            # this branch is never taken here.
            if "mae" in terms:
                terms["loss"] = terms["loss"] + terms["mae"]
        else:
            raise NotImplementedError(self.loss_type)

        return terms

    def _extracted_from_training_losses_diffusers(self, x_t, output, t):
        """
        Split a 2*C-channel model output and return the epsilon half together
        with the x_0 estimate derived from it.

        :param x_t: the model input; its first two dims give (B, C).
        :param output: model output, asserted to have shape (B, 2*C, ...).
        :param t: a batch of timestep indices.
        :return: a tuple (eps prediction, predicted x_start, x_t).
        """
        batch, channels = x_t.shape[:2]
        expected_shape = (batch, channels * 2, *x_t.shape[2:])
        assert output.shape == expected_shape
        # Keep only the first C channels (the mean/epsilon part).
        eps_pred = output.split(channels, dim=1)[0]
        x0_pred = self._predict_xstart_from_eps(x_t=x_t, t=t, eps=eps_pred)
        return eps_pred, x0_pred, x_t

    def _prior_bpd(self, x_start):
        """
        Prior KL term of the variational lower-bound, in bits-per-dim.

        This is the KL between q(x_T | x_0) at the last timestep and a
        standard normal; it depends only on the encoder, so it cannot be
        optimized.

        :param x_start: the [N x C x ...] tensor of inputs.
        :return: a batch of [N] KL values (in bits), one per batch element.
        """
        n = x_start.shape[0]
        final_t = th.tensor([self.num_timesteps - 1] * n, device=x_start.device)
        mean_at_T, _, log_var_at_T = self.q_mean_variance(x_start, final_t)
        prior_kl = normal_kl(
            mean1=mean_at_T, logvar1=log_var_at_T, mean2=0.0, logvar2=0.0
        )
        # Convert nats to bits.
        return mean_flat(prior_kl) / np.log(2.0)

    def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
        """
        Evaluate the full variational lower-bound (bits-per-dim) by visiting
        every timestep from last to first, along with per-timestep errors.

        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param clip_denoised: if True, clip denoised samples.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict containing the following keys:
                 - total_bpd: the total variational lower-bound, per batch element.
                 - prior_bpd: the prior term in the lower-bound.
                 - vb: an [N x T] tensor of terms in the lower-bound.
                 - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
                 - mse: an [N x T] tensor of epsilon MSEs for each timestep.
        """
        device = x_start.device
        batch_size = x_start.shape[0]

        vb_terms, x0_errors, eps_errors = [], [], []
        for step in reversed(range(self.num_timesteps)):
            t_batch = th.tensor([step] * batch_size, device=device)
            # Fresh noise per timestep; x_t is sampled from q(x_t | x_0).
            noise = th.randn_like(x_start)
            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
            # VLB term at this timestep, computed without tracking gradients.
            with th.no_grad():
                step_out = self._vb_terms_bpd(
                    model,
                    x_start=x_start,
                    x_t=x_t,
                    t=t_batch,
                    clip_denoised=clip_denoised,
                    model_kwargs=model_kwargs,
                )
            pred_xstart = step_out["pred_xstart"]
            vb_terms.append(step_out["output"])
            x0_errors.append(mean_flat((pred_xstart - x_start) ** 2))
            eps_hat = self._predict_eps_from_xstart(x_t, t_batch, pred_xstart)
            eps_errors.append(mean_flat((eps_hat - noise) ** 2))

        # Stack the per-timestep [N] vectors into [N x T] tensors.
        vb = th.stack(vb_terms, dim=1)
        xstart_mse = th.stack(x0_errors, dim=1)
        mse = th.stack(eps_errors, dim=1)

        prior_bpd = self._prior_bpd(x_start)
        return {
            "total_bpd": vb.sum(dim=1) + prior_bpd,
            "prior_bpd": prior_bpd,
            "vb": vb,
            "xstart_mse": xstart_mse,
            "mse": mse,
        }


def _extract_into_tensor(arr, timesteps, broadcast_shape):
    """
    Look up per-sample coefficients in a 1-D numpy table and broadcast them.

    :param arr: the 1-D numpy array.
    :param timesteps: a tensor of indices into the array to extract.
    :param broadcast_shape: a larger shape of K dimensions with the batch
                            dimension equal to the length of timesteps.
    :return: a float tensor of shape `broadcast_shape` whose entries along the
             batch dimension are the gathered values.
    """
    device = timesteps.device
    picked = th.from_numpy(arr).to(device=device)[timesteps].float()
    # Append trailing singleton axes until the rank matches the target shape.
    missing = len(broadcast_shape) - len(picked.shape)
    picked = picked[(...,) + (None,) * missing]
    # Adding zeros materializes the broadcast into a full-size tensor.
    return picked + th.zeros(broadcast_shape, device=device)


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/hed.py
================================================
# This is an improved version and model of HED edge detection with Apache License, Version 2.0.
# Please use this implementation in your products
# This implementation may produce slightly different results from Saining Xie's official implementations,
# but it generates smoother edges and is more suitable for ControlNet as well as other image-to-image translations.
# Different from official models and other implementations, this is an RGB-input model (rather than BGR)
# and in this way it works better for gradio's RGB protocol
import sys
from pathlib import Path
# Prepend the directory three levels above this file to sys.path (presumably the
# project root, so that project-absolute imports resolve when this module is run
# as a script -- confirm against how the script is launched).
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent.parent))
from torch import nn
import torch
import numpy as np
from torchvision import transforms as T
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import json
from PIL import Image
import torchvision.transforms.functional as TF
from accelerate import Accelerator
from diffusers.models import AutoencoderKL
import os

# Target side length in pixels: InternData resizes and center-crops images to this
# size, and main() embeds it in the output directory name.
image_resize = 1024


class DoubleConvBlock(nn.Module):
    """
    A stack of 3x3 convolutions, each followed by ReLU, plus a 1x1 projection
    head that maps the resulting features to a single channel.
    """

    def __init__(self, input_channel, output_channel, layer_number):
        super().__init__()
        conv_kwargs = dict(kernel_size=(3, 3), stride=(1, 1), padding=1)
        # First conv changes the channel count; the remaining ones keep it.
        stack = [torch.nn.Conv2d(in_channels=input_channel, out_channels=output_channel, **conv_kwargs)]
        stack += [
            torch.nn.Conv2d(in_channels=output_channel, out_channels=output_channel, **conv_kwargs)
            for _ in range(1, layer_number)
        ]
        self.convs = torch.nn.Sequential(*stack)
        self.projection = torch.nn.Conv2d(in_channels=output_channel, out_channels=1, kernel_size=(1, 1), stride=(1, 1), padding=0)

    def forward(self, x, down_sampling=False):
        """
        :param x: input feature map.
        :param down_sampling: if True, apply 2x2 max-pooling before the convs.
        :return: a tuple (features after the last ReLU, 1-channel projection).
        """
        functional = torch.nn.functional
        feat = functional.max_pool2d(x, kernel_size=(2, 2), stride=(2, 2)) if down_sampling else x
        for layer in self.convs:
            feat = functional.relu(layer(feat))
        return feat, self.projection(feat)


class ControlNetHED_Apache2(nn.Module):
    """
    Five-stage HED network built from DoubleConvBlock stages. Each stage
    contributes one single-channel side output; stages 2-5 down-sample
    their input first.
    """

    def __init__(self):
        super().__init__()
        # Learnable per-channel offset subtracted from the input.
        self.norm = torch.nn.Parameter(torch.zeros(size=(1, 3, 1, 1)))
        # Arguments are (input_channel, output_channel, layer_number).
        self.block1 = DoubleConvBlock(3, 64, 2)
        self.block2 = DoubleConvBlock(64, 128, 2)
        self.block3 = DoubleConvBlock(128, 256, 3)
        self.block4 = DoubleConvBlock(256, 512, 3)
        self.block5 = DoubleConvBlock(512, 512, 3)

    def forward(self, x):
        """Return the tuple of five side-output projections, shallowest first."""
        feat, first = self.block1(x - self.norm)
        projections = [first]
        for stage in (self.block2, self.block3, self.block4, self.block5):
            feat, side = stage(feat, down_sampling=True)
            projections.append(side)
        return tuple(projections)


class InternData(Dataset):
    """
    Dataset over the entries of data_info.json; each item is an
    (image tensor, relative path) pair.
    """

    def __init__(self):
        with open('data/InternData/partition/data_info.json', 'r') as f:
            self.j = json.load(f)
        # RGB conversion -> resize -> square center crop -> tensor.
        steps = [
            T.Lambda(lambda img: img.convert('RGB')),
            T.Resize(image_resize),  # Image.BICUBIC
            T.CenterCrop(image_resize),
            T.ToTensor(),
        ]
        self.transform = T.Compose(steps)

    def __len__(self):
        return len(self.j)

    def getdata(self, idx):
        """Load and transform the image for entry `idx`; return (tensor, path)."""
        rel_path = self.j[idx]['path']
        tensor = self.transform(Image.open("data/InternImgs/" + rel_path))
        return tensor, rel_path

    def __getitem__(self, idx):
        """
        Return the item at `idx`. If loading fails, print the error and retry
        with a randomly drawn index, giving up after 20 attempts.
        """
        for _ in range(20):
            try:
                return self.getdata(idx)
            except Exception as e:
                print(f"Error details: {str(e)}")
                idx = np.random.randint(len(self))
        raise RuntimeError('Too many bad data.')

class HEDdetector(nn.Module):
    """
    Frozen HED edge detector, optionally followed by a frozen VAE encoder
    that turns the edge map into latent statistics.
    """

    def __init__(self, feature=True, vae=None):
        """
        :param feature: if True, keep a VAE so forward() returns latent features.
        :param vae: an existing VAE to reuse; when None (and `feature` is True)
            one is loaded from the local pretrained directory.
        """
        super().__init__()
        hed_net = ControlNetHED_Apache2()
        hed_net.load_state_dict(torch.load('output/pretrained_models/ControlNetHED.pth', map_location='cpu'))
        hed_net.eval()
        hed_net.requires_grad_(False)
        self.model = hed_net

        if not feature:
            self.vae = None
            return

        self.vae = AutoencoderKL.from_pretrained("output/pretrained_models/sd-vae-ft-ema") if vae is None else vae
        self.vae.eval()
        self.vae.requires_grad_(False)

    def forward(self, input_image):
        """
        :param input_image: a 4-D image batch; it is scaled by 255 before
            entering the HED network.
        :return: without a VAE, the single-channel edge map tensor; with a VAE,
                 a numpy array holding the posterior mean and std concatenated
                 along dim 1.
        """
        _, _, height, width = input_image.shape
        with torch.inference_mode():
            side_outputs = self.model(input_image * 255.)
            # Bring every side output back to the input resolution and fuse
            # them by averaging, then squash through a logistic function.
            resized = [TF.resize(side, [height, width]) for side in side_outputs]
            fused = torch.mean(torch.cat(resized, dim=1), dim=1, keepdim=True)
            edge = 1 / (1 + torch.exp(-fused))
            edge.clip_(0, 1)
            if self.vae:
                # Normalize with mean 0.5 / std 0.5 and replicate to 3 channels
                # before encoding.
                edge = TF.normalize(edge, [.5], [.5]).repeat(1, 3, 1, 1)
                posterior = self.vae.encode(edge).latent_dist
                edge = torch.cat([posterior.mean, posterior.std], dim=1).cpu().numpy()
        return edge


def main():
    """
    Run the HED detector over every InternData image and store each result as
    a compressed .npz file, skipping outputs that already exist.
    """
    dataset = InternData()
    loader = DataLoader(dataset, batch_size=10, shuffle=False, num_workers=8, pin_memory=True)
    detector = HEDdetector()

    accelerator = Accelerator()
    detector, loader = accelerator.prepare(detector, loader)

    out_root = f'data/InternalData/hed_feature_{image_resize}/'
    for images, rel_paths in tqdm(loader):
        features = detector(images.cuda())
        for rel_path, feature in zip(rel_paths, features):
            target = out_root + rel_path.replace('.png', '.npz')
            if not os.path.exists(target):
                os.makedirs(os.path.dirname(target), exist_ok=True)
                np.savez_compressed(target, feature)


if __name__ == "__main__":
    main()


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/llava/__init__.py
================================================
from diffusion.model.llava.llava_mpt import LlavaMPTForCausalLM, LlavaMPTConfig

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/llava/llava_mpt.py
================================================
#    Copyright 2023 Haotian Liu
#
#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.


from typing import List, Optional, Tuple, Union
import warnings

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

import math

from transformers import AutoConfig, AutoModelForCausalLM, CLIPVisionModel, CLIPImageProcessor

from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast

from diffusion.model.llava.mpt.modeling_mpt import MPTConfig, MPTForCausalLM, MPTModel


# Literal strings for the image-related special tokens. They are not referenced
# in the visible part of this module -- presumably consumed by tokenizer setup
# code elsewhere; verify against callers.
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"


class LlavaMPTConfig(MPTConfig):
    """MPTConfig subclass whose only change is the `model_type` identifier."""
    # Type identifier string for the LLaVA-MPT variant.
    model_type = "llava_mpt"


class LlavaMPTModel(MPTModel):
    """
    MPTModel extended with a CLIP vision tower and a linear projector.
    In forward(), projected image features are spliced into the token
    embeddings at the positions of the image placeholder tokens before the
    sequence is handed to the base MPT model.
    """
    config_class = LlavaMPTConfig

    def __init__(self, config: MPTConfig, mm_vision_tower=None, mm_hidden_size=None):
        """
        :param config: model config; the optional attributes `mm_vision_tower`
            and `use_mm_proj` trigger creation of the vision tower / projector.
        :param mm_vision_tower: not used in this method body.
        :param mm_hidden_size: not used in this method body.
        """
        super(LlavaMPTModel, self).__init__(config)

        if hasattr(config, "mm_vision_tower"):
            # HACK: for FSDP
            # The tower is kept inside a plain Python list, which nn.Module does
            # not register as a child module.
            self.vision_tower = [CLIPVisionModel.from_pretrained(config.mm_vision_tower)]
            # self.vision_tower = CLIPVisionModel.from_pretrained(config.mm_vision_tower)

        if hasattr(config, "use_mm_proj"):
            # Linear map from vision hidden size to the language model width.
            self.mm_projector = nn.Linear(config.mm_hidden_size, config.d_model)

    def initialize_vision_modules(self, vision_tower, mm_vision_select_layer,
                                  pretrain_mm_mlp_adapter=None, tune_mm_mlp_adapter=False):
        """
        Attach (or reuse) the CLIP vision tower and the projector, and record
        the multimodal settings on self.config.

        :param vision_tower: name or path passed to the CLIP `from_pretrained` loaders.
        :param mm_vision_select_layer: index of the vision hidden-state layer to
            use; stored on the config and read back in forward().
        :param pretrain_mm_mlp_adapter: optional checkpoint path; entries whose
            key contains 'mm_projector' are loaded into the projector.
        :param tune_mm_mlp_adapter: not used in this method body.
        :return: dict with `image_processor`, `image_token_len` (number of
            patches per image) and `vision_config`.
        """
        self.config.mm_vision_tower = vision_tower

        image_processor = CLIPImageProcessor.from_pretrained(vision_tower)

        # Reuse the tower created in __init__ if there is one; otherwise load it.
        if not hasattr(self, 'vision_tower'):
            vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
        else:
            vision_tower = self.vision_tower[0]
        # The tower is frozen and cast to half precision.
        vision_tower.requires_grad_(False)
        vision_tower = vision_tower.to(torch.float16)
        self.vision_tower = [vision_tower]

        vision_config = vision_tower.config
        num_patches = (vision_config.image_size // vision_config.patch_size) ** 2

        self.config.use_mm_proj = True
        self.config.mm_hidden_size = vision_config.hidden_size
        self.config.mm_vision_select_layer = mm_vision_select_layer

        if not hasattr(self, 'mm_projector'):
            self.mm_projector = nn.Linear(vision_config.hidden_size, self.config.d_model)

        if pretrain_mm_mlp_adapter is not None:
            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
            # Keep only the last dotted component of each key (e.g. 'weight', 'bias').
            self.mm_projector.load_state_dict({k.split('.')[-1]: v for k, v in mm_projector_weights.items() if 'mm_projector' in k})

        return dict(
            image_processor=image_processor,
            image_token_len=num_patches,
            vision_config=vision_config
        )

    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, images=None):
        """
        Embed `input_ids`, optionally replace the image placeholder positions
        with projected vision features, and run the base MPT forward on the
        resulting embeddings (passed as `tok_emb`, with `input_ids=None`).

        :param images: either a batched tensor or a list of per-image tensors;
            when None, no image features are inserted.
        :raises ValueError: if the image placeholder tokens in a sample are
            inconsistent with the image features (see the checks below).
        """

        # HACK: replace back original embeddings for LLaVA pretraining
        orig_embeds_params = getattr(self, 'orig_embeds_params', None)
        # if orig_embeds_params is not None:
        #     orig_embeds_params = orig_embeds_params[0]
        #     with torch.no_grad():
        #         self.get_input_embeddings().weight.data[:-2] = orig_embeds_params[:-2].data

        inputs_embeds = self.wte(input_ids)

        vision_tower = getattr(self, 'vision_tower', None)
        # Image features are only inserted when a tower exists, images were
        # given, and the call is not a single-token step outside training.
        if vision_tower is not None and (input_ids.shape[1] != 1 or self.training) and images is not None:
            # TODO: this is a modified multimodal LLM -- Haotian Liu
            vision_tower = vision_tower[0]  # HACK: for FSDP
            # The vision tower runs without gradient tracking.
            with torch.no_grad():
                if type(images) is list:
                    # variable length images
                    image_features = []
                    for image in images:
                        image_forward_out = vision_tower(image.unsqueeze(0), output_hidden_states=True)
                        select_hidden_state_layer = getattr(self.config, "mm_vision_select_layer", -1)
                        select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
                        # Drop the first position of the selected hidden state.
                        image_feature = select_hidden_state[:, 1:]
                        image_features.append(image_feature)
                else:
                    image_forward_outs = vision_tower(images, output_hidden_states=True)
                    select_hidden_state_layer = getattr(self.config, "mm_vision_select_layer", -1)
                    select_hidden_state = image_forward_outs.hidden_states[select_hidden_state_layer]
                    image_features = select_hidden_state[:, 1:]
            # The projector runs outside no_grad, so it can receive gradients.
            if type(images) is list:
                image_features = [self.mm_projector(image_feature)[0] for image_feature in image_features]
            else:
                image_features = self.mm_projector(image_features)
            # Zero-valued stand-in with hard-coded size (256, 1024), used below
            # for samples without image tokens.
            dummy_image_features = torch.zeros(256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
            dummy_image_features = self.mm_projector(dummy_image_features)

            new_input_embeds = []
            cur_image_idx = 0
            # NOTE(review): im_patch_token / im_start_token / im_end_token /
            # use_im_start_end are read from the vision tower's config but are
            # not set anywhere in the visible code -- presumably attached by
            # tokenizer setup elsewhere.
            for cur_input_ids, cur_input_embeds in zip(input_ids, inputs_embeds):
                if (cur_input_ids == vision_tower.config.im_patch_token).sum() == 0:
                    # multimodal LLM, but the current sample is not multimodal
                    # Adds zero times the projector output: the values are
                    # unchanged (presumably this keeps the projector in the
                    # autograd graph for such samples).
                    cur_input_embeds = cur_input_embeds + (0. * dummy_image_features).sum()
                    new_input_embeds.append(cur_input_embeds)
                    continue
                cur_image_features = image_features[cur_image_idx]
                num_patches = cur_image_features.shape[0]
                if vision_tower.config.use_im_start_end:
                    if (cur_input_ids == vision_tower.config.im_start_token).sum() != (cur_input_ids == vision_tower.config.im_end_token).sum():
                        raise ValueError("The number of image start tokens and image end tokens should be the same.")
                    image_start_tokens = torch.where(cur_input_ids == vision_tower.config.im_start_token)[0]
                    # NOTE(review): every iteration rebuilds cur_new_input_embeds
                    # from the unmodified cur_input_embeds, so with several start
                    # tokens only the last splice is kept -- confirm whether
                    # multiple images per sample are meant to be supported.
                    for image_start_token_pos in image_start_tokens:
                        cur_image_features = image_features[cur_image_idx].to(device=cur_input_embeds.device)
                        num_patches = cur_image_features.shape[0]
                        if cur_input_ids[image_start_token_pos + num_patches + 1] != vision_tower.config.im_end_token:
                            raise ValueError("The image end token should follow the image start token.")
                        if orig_embeds_params is not None:
                            # Only the start/end token embeddings and the image
                            # features keep their gradient path; the rest is detached.
                            cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos].detach(), cur_input_embeds[image_start_token_pos:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:image_start_token_pos + num_patches + 2], cur_input_embeds[image_start_token_pos + num_patches + 2:].detach()), dim=0)
                        else:
                            cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0)
                        cur_image_idx += 1
                else:
                    # Without start/end markers the sample must contain exactly
                    # num_patches consecutive patch tokens.
                    if (cur_input_ids == vision_tower.config.im_patch_token).sum() != num_patches:
                        raise ValueError("The number of image patch tokens should be the same as the number of image patches.")
                    masked_indices = torch.where(cur_input_ids == vision_tower.config.im_patch_token)[0]
                    mask_index_start = masked_indices[0]
                    if (masked_indices != torch.arange(mask_index_start, mask_index_start+num_patches, device=masked_indices.device, dtype=masked_indices.dtype)).any():
                        raise ValueError("The image patch tokens should be consecutive.")
                    # NOTE(review): cur_image_idx is not advanced in this branch.
                    if orig_embeds_params is not None:
                        cur_new_input_embeds = torch.cat((cur_input_embeds[:mask_index_start].detach(), cur_image_features, cur_input_embeds[mask_index_start+num_patches:].detach()), dim=0)
                    else:
                        cur_new_input_embeds = torch.cat((cur_input_embeds[:mask_index_start], cur_image_features, cur_input_embeds[mask_index_start+num_patches:]), dim=0)
                new_input_embeds.append(cur_new_input_embeds)
            inputs_embeds = torch.stack(new_input_embeds, dim=0)

        # The base model receives the (possibly spliced) embeddings via tok_emb.
        return super(LlavaMPTModel, self).forward(input_ids=None, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, tok_emb=inputs_embeds)


class LlavaMPTForCausalLM(MPTForCausalLM):
    """MPT causal language model whose transformer is a ``LlavaMPTModel``.

    ``forward`` and ``prepare_inputs_for_generation`` pass an extra ``images``
    argument through to the transformer, and ``initialize_vision_tokenizer``
    adds the image placeholder tokens to a tokenizer and records their ids on
    the vision tower's config.
    """
    config_class = LlavaMPTConfig
    supports_gradient_checkpointing = True

    def __init__(self, config):
        """Build the model from ``config``.

        Raises:
            ValueError: if ``config.tie_word_embeddings`` is false, or if
                ``config.logit_scale`` is a string other than
                ``'inv_sqrt_d_model'``.
        """
        # super(MPTForCausalLM, self) starts the MRO lookup *after*
        # MPTForCausalLM, so MPTForCausalLM.__init__ itself is not executed.
        # NOTE(review): the rest of this method presumably mirrors that
        # initializer with LlavaMPTModel swapped in - confirm against
        # MPTForCausalLM.
        super(MPTForCausalLM, self).__init__(config)

        if not config.tie_word_embeddings:
            raise ValueError('MPTForCausalLM only supports tied word embeddings')
        self.transformer = LlavaMPTModel(config)
        # logit_scale stays None unless the config provides a number or the
        # symbolic value 'inv_sqrt_d_model' (resolved to 1/sqrt(d_model)).
        self.logit_scale = None
        if config.logit_scale is not None:
            logit_scale = config.logit_scale
            if isinstance(logit_scale, str):
                if logit_scale == 'inv_sqrt_d_model':
                    logit_scale = 1 / math.sqrt(config.d_model)
                else:
                    raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
            self.logit_scale = logit_scale

    def get_model(self):
        """Return the underlying ``LlavaMPTModel`` transformer."""
        return self.transformer

    def _set_gradient_checkpointing(self, module, value=False):
        """Set ``gradient_checkpointing`` on ``module`` if it is a ``LlavaMPTModel``."""
        if isinstance(module, LlavaMPTModel):
            module.gradient_checkpointing = value

    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, images=None):
        """Run the transformer and project hidden states to vocabulary logits.

        ``return_dict`` and ``use_cache`` fall back to the config values when
        ``None``. When ``labels`` is given, a next-token cross-entropy loss is
        computed as well.

        Returns:
            CausalLMOutputWithPast carrying ``loss`` (or ``None``), ``logits``,
            ``past_key_values`` and ``hidden_states``.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, images=images)
        # The output projection reuses the input embedding matrix (tied weights).
        logits = F.linear(outputs.last_hidden_state, self.transformer.wte.weight)
        if self.logit_scale is not None:
            if self.logit_scale == 0:
                warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
            logits *= self.logit_scale
        loss = None
        if labels is not None:
            # Shift targets one position to the left so position t is scored
            # against token t+1. torch.roll returns a new tensor, so the
            # caller's ``labels`` is not modified by the in-place write below.
            # Without ``dims`` the roll acts on the flattened tensor; the value
            # that wraps into each row's last slot is overwritten with -100,
            # which is F.cross_entropy's default ignore_index.
            labels = torch.roll(labels, shifts=-1)
            labels[:, -1] = -100
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states)

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Assemble the keyword arguments for one generation step.

        Reads ``attention_mask`` (required), ``use_cache`` and ``images`` from
        ``kwargs``.

        Raises:
            NotImplementedError: if ``inputs_embeds`` is given, if any row of
                the attention mask has an unset last position (right padding),
                or if ``prefix_lm`` is enabled together with
                ``use_cache=False``.
        """
        if inputs_embeds is not None:
            raise NotImplementedError('inputs_embeds is not implemented for MPT yet')
        attention_mask = kwargs['attention_mask'].bool()
        # Every row must have its final mask position set, i.e. no right padding.
        if attention_mask[:, -1].sum() != attention_mask.shape[0]:
            raise NotImplementedError('MPT does not support generation with right padding.')
        if self.transformer.attn_uses_sequence_id and self.training:
            sequence_id = torch.zeros_like(input_ids[:1])
        else:
            sequence_id = None
        # With a key/value cache only the newest token needs to be fed in.
        if past_key_values is not None:
            input_ids = input_ids[:, -1].unsqueeze(-1)
        if self.transformer.prefix_lm:
            prefix_mask = torch.ones_like(attention_mask)
            if kwargs.get('use_cache') == False:
                raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.')
        else:
            prefix_mask = None
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True), "images": kwargs.get("images", None)}

    def initialize_vision_tokenizer(self, mm_use_im_start_end, tokenizer, device,
                                    tune_mm_mlp_adapter=False, pretrain_mm_mlp_adapter=None):
        """Add the image placeholder tokens to ``tokenizer`` and resize embeddings.

        Args:
            mm_use_im_start_end: when truthy, also add the image start/end
                tokens. The ``tune_mm_mlp_adapter`` and
                ``pretrain_mm_mlp_adapter`` handling below only runs inside
                this branch.
            tokenizer: tokenizer to extend; must provide ``add_tokens``,
                ``convert_tokens_to_ids`` and ``__len__``.
            device: device for the saved copy of the input embedding weights.
            tune_mm_mlp_adapter: when truthy, store a clone of the input
                embedding weights on the model as ``orig_embeds_params``, make
                input embeddings trainable and freeze output embeddings.
            pretrain_mm_mlp_adapter: optional path given to ``torch.load``; the
                rows for the new tokens are copied from its
                ``'transformer.wte.weight'`` entry.

        Raises:
            ValueError: if the loaded embedding weight has an unexpected shape.
            AssertionError: if ``pretrain_mm_mlp_adapter`` is given but the
                tokenizer did not add exactly two new tokens.
        """
        vision_config = self.get_model().vision_tower[0].config
        vision_config.use_im_start_end = mm_use_im_start_end
        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        self.resize_token_embeddings(len(tokenizer))

        if mm_use_im_start_end:
            num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))
            vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])

            # ``input_embeddings`` is only bound when tokens were actually
            # added; the assert in the pretrain branch guards its later use.
            if num_new_tokens > 0:
                input_embeddings = (
                    self._extracted_from_initialize_vision_tokenizer_14(
                        num_new_tokens
                    )
                )
            if tune_mm_mlp_adapter:
                self.get_model().orig_embeds_params = [self.get_input_embeddings().weight.data.clone().to(device=device)]
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = True
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False

            if pretrain_mm_mlp_adapter:
                # NOTE(review): torch.load unpickles the file; only point this
                # at trusted checkpoints.
                mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
                embed_tokens_weight = mm_projector_weights['transformer.wte.weight']
                assert num_new_tokens == 2
                # Accept either a full embedding matrix (take its last rows) or
                # a matrix holding only the new-token rows.
                if input_embeddings.shape == embed_tokens_weight.shape:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
                elif embed_tokens_weight.shape[0] == num_new_tokens:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
                else:
                    raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")

        vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]

    # TODO Rename this here and in `initialize_vision_tokenizer`
    def _extracted_from_initialize_vision_tokenizer_14(self, num_new_tokens):
        """Fill the last ``num_new_tokens`` embedding rows with the mean of the rest.

        Both the input and the output embedding weights are updated in place
        (through ``.data``).

        Returns:
            The input embedding weight tensor (``.data``) after the update.
        """
        result = self.get_input_embeddings().weight.data
        output_embeddings = self.get_output_embeddings().weight.data

        # Mean over all rows except the newly added ones.
        input_embeddings_avg = result[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
            dim=0, keepdim=True)

        result[-num_new_tokens:] = input_embeddings_avg
        output_embeddings[-num_new_tokens:] = output_embeddings_avg

        return result

# Register the config under the "llava_mpt" model type and pair it with the
# model class so the transformers Auto* factories can resolve them.
AutoConfig.register("llava_mpt", LlavaMPTConfig)
AutoModelForCausalLM.register(LlavaMPTConfig, LlavaMPTForCausalLM)


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/llava/mpt/attention.py
================================================
"""Attention layers."""
import math
import warnings
from typing import Optional
import torch
import torch.nn as nn
from einops import rearrange
from torch import nn
from .norm import LPLayerNorm

def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
    if original_is_causal and num_query_tokens != num_key_tokens:
        if num_query_tokens != 1:
            raise NotImplementedError('MPT does not support query and key with different number of tokens, unless number of query tokens is 1.')
        else:
            return False
    return original_is_causal

def scaled_multihead_dot_product_attention(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
    """Compute scaled dot-product attention with plain PyTorch ops.

    Args:
        query: tensor laid out as ``(batch, seq_q, n_heads * head_dim)``.
        key: tensor laid out as ``(batch, seq_k, kv_heads * head_dim)`` where
            ``kv_heads`` is 1 when ``multiquery`` is true, else ``n_heads``.
        value: same layout as ``key``.
        n_heads: number of query heads.
        softmax_scale: multiplier applied to the raw scores; defaults to
            ``1 / sqrt(head_dim)``.
        attn_bias: optional additive bias; its last two dims must each be 1 or
            equal to ``seq_q`` / ``seq_k`` respectively.
        key_padding_mask: optional boolean mask viewable as
            ``(batch, 1, 1, seq_k)``; positions where it is False are filled
            with the dtype minimum before the softmax.
        is_causal: when true, mask out keys that lie in the future of each
            query (aligned on the last positions of both sequences).
        dropout_p: dropout probability applied to the attention weights.
        training: forwarded to dropout; dropout is a no-op when false.
        needs_weights: when true, also return the attention weights.
        multiquery: when true, key/value carry a single shared head.

    Returns:
        ``(out, attn_weight)`` where ``out`` is ``(batch, seq_q,
        n_heads * head_dim)`` and ``attn_weight`` is ``None`` unless
        ``needs_weights`` is true.

    Raises:
        RuntimeError: if ``attn_bias`` cannot broadcast to the score shape.
    """
    q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
    # Keys are laid out pre-transposed (d before s) so q @ k gives the scores.
    k = rearrange(key, 'b s (h d) -> b h d s', h=1 if multiquery else n_heads)
    v = rearrange(value, 'b s (h d) -> b h s d', h=1 if multiquery else n_heads)
    min_val = torch.finfo(q.dtype).min
    (b, _, s_q, d) = q.shape
    s_k = k.size(-1)
    if softmax_scale is None:
        softmax_scale = 1 / math.sqrt(d)
    attn_weight = q.matmul(k) * softmax_scale
    if attn_bias is not None:
        if attn_bias.size(-1) not in [1, s_k] or attn_bias.size(-2) not in [
            1,
            s_q,
        ]:
            raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
        attn_weight = attn_weight + attn_bias
    if key_padding_mask is not None:
        if attn_bias is not None:
            warnings.warn('Propogating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unneccessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
        attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
    if is_causal:
        # Build a square lower-triangular mask and keep its bottom-right
        # (s_q, s_k) corner so the last query lines up with the last key.
        s = max(s_q, s_k)
        causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
        causal_mask = causal_mask.tril()
        causal_mask = causal_mask.to(torch.bool)
        causal_mask = ~causal_mask
        causal_mask = causal_mask[-s_q:, -s_k:]
        attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val)
    attn_weight = torch.softmax(attn_weight, dim=-1)
    if dropout_p:
        # Dropout must be out-of-place: autograd keeps the softmax output for
        # its backward pass, and mutating it in place makes backward() raise
        # when training with dropout_p > 0.
        attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=False)
    out = attn_weight.matmul(v)
    out = rearrange(out, 'b h s d -> b s (h d)')
    return (out, attn_weight) if needs_weights else (out, None)

def check_valid_inputs(*tensors, valid_dtypes=None):
    """Validate that every tensor has an accepted dtype and lives on CUDA.

    Args:
        *tensors: tensors to check, in order.
        valid_dtypes: accepted dtypes; defaults to ``torch.float16`` and
            ``torch.bfloat16`` when ``None``.

    Raises:
        TypeError: on the first tensor whose dtype is not accepted or that is
            not a CUDA tensor (the dtype is checked first).
    """
    allowed = [torch.float16, torch.bfloat16] if valid_dtypes is None else valid_dtypes
    for candidate in tensors:
        if candidate.dtype not in allowed:
            raise TypeError(f'tensor.dtype={candidate.dtype!r} must be in valid_dtypes={allowed!r}.')
        if not candidate.is_cuda:
            raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={candidate.is_cuda!r}).')

def flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
    """Run attention through the flash-attn unpadded kernel.

    Args:
        query: tensor laid out as ``(batch, seq_q, n_heads * head_dim)``.
        key: tensor laid out as ``(batch, seq_k, kv_heads * head_dim)`` where
            ``kv_heads`` is 1 when ``multiquery`` is true, else ``n_heads``.
        value: same layout as ``key``.
        n_heads: number of query heads.
        softmax_scale: forwarded to the kernel.
        attn_bias: must be ``None``; additive bias is not supported here.
        key_padding_mask: optional ``(batch, seq_k)`` boolean mask; defaults
            to all-True. The query mask is its last ``seq_q`` columns.
        is_causal: requested causality, adjusted by ``_reset_is_causal``.
        dropout_p: dropout probability; forced to 0 when not training.
        training: whether the caller is in training mode.
        needs_weights: forwarded as ``return_attn_probs``.
        multiquery: when true, the single key/value head is expanded to
            ``n_heads``.

    Returns:
        ``(output, None)`` with ``output`` re-padded to ``(batch, seq_q, ...)``.

    Raises:
        RuntimeError: if ``flash_attn`` cannot be imported.
        NotImplementedError: if ``attn_bias`` is given.
        TypeError: from ``check_valid_inputs`` for non-fp16/bf16 or non-CUDA
            inputs.
    """
    try:
        from flash_attn import bert_padding, flash_attn_interface
    except ImportError as err:
        # Catch only import failures (not KeyboardInterrupt/SystemExit) and
        # keep the underlying reason attached for debugging.
        raise RuntimeError('Please install flash-attn==1.0.3.post0') from err
    check_valid_inputs(query, key, value)
    if attn_bias is not None:
        raise NotImplementedError('attn_bias not implemented for flash attn.')
    (batch_size, seqlen) = query.shape[:2]
    if key_padding_mask is None:
        key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
    # Queries correspond to the trailing positions of the key sequence.
    query_padding_mask = key_padding_mask[:, -query.size(1):]
    (query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input(query, query_padding_mask)
    query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
    (key_unpad, _, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input(key, key_padding_mask)
    key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=1 if multiquery else n_heads)
    (value_unpad, _, _, _) = bert_padding.unpad_input(value, key_padding_mask)
    value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=1 if multiquery else n_heads)
    if multiquery:
        # Broadcast the shared key/value head across all query heads.
        key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1))
        value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, value_unpad.size(-1))
    dropout_p = dropout_p if training else 0.0
    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
    # NOTE(review): with needs_weights=True the kernel may return a tuple
    # rather than a single tensor, which the rearrange below would not accept;
    # confirm against the installed flash-attn version.
    output_unpad = flash_attn_interface.flash_attn_unpadded_func(query_unpad, key_unpad, value_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
    output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
    return (output, None)

def triton_flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
    """Run attention through the Triton flash-attention kernel.

    Args:
        query: tensor laid out as ``(batch, seq_q, n_heads * head_dim)``.
        key: tensor laid out as ``(batch, seq_k, kv_heads * head_dim)`` where
            ``kv_heads`` is 1 when ``multiquery`` is true, else ``n_heads``.
        value: same layout as ``key``.
        n_heads: number of query heads.
        softmax_scale: forwarded to the kernel.
        attn_bias: optional additive bias forwarded to the kernel.
        key_padding_mask: optional ``(batch, seq_k)`` boolean mask; it is
            folded into ``attn_bias`` by filling masked keys with the dtype
            minimum.
        is_causal: requested causality, adjusted by ``_reset_is_causal``.
        dropout_p: must be falsy; dropout is not supported here.
        training: unused by this implementation.
        needs_weights: must be falsy; weights cannot be returned.
        multiquery: when true, the single key/value head is expanded to
            ``n_heads``.

    Returns:
        ``(output, None)`` with the head and head_dim axes merged.

    Raises:
        RuntimeError: if ``flash_attn.flash_attn_triton`` cannot be imported.
        NotImplementedError: if ``dropout_p`` or ``needs_weights`` is set.
        TypeError: from ``check_valid_inputs`` for non-fp16/bf16 or non-CUDA
            inputs.
    """
    try:
        from flash_attn import flash_attn_triton
    except ImportError as err:
        # Catch only import failures (not KeyboardInterrupt/SystemExit) and
        # keep the underlying reason attached for debugging.
        raise RuntimeError('Please install flash-attn==1.0.3.post0 and triton==2.0.0.dev20221202') from err
    check_valid_inputs(query, key, value)
    if dropout_p:
        raise NotImplementedError('Dropout not implemented for attn_impl: triton.')
    if needs_weights:
        raise NotImplementedError('attn_impl: triton cannot return attn weights.')
    if key_padding_mask is not None:
        warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unnecessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
        (b_size, s_k) = key_padding_mask.shape[:2]
        # Express the padding mask as an additive bias the kernel understands.
        if attn_bias is None:
            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
        attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min)
    query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
    key = rearrange(key, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
    value = rearrange(value, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
    if multiquery:
        # Broadcast the shared key/value head across all query heads.
        key = key.expand(*key.shape[:2], n_heads, key.size(-1))
        value = value.expand(*value.shape[:2], n_heads, value.size(-1))
    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
    attn_output = flash_attn_triton.flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
    output = attn_output.view(*attn_output.shape[:2], -1)
    return (output, None)

class MultiheadAttention(nn.Module):
    """Multi-head self-attention with a fused query/key/value projection.

    The torch and triton attention implementations additionally allow the
    caller to supply an additive bias.
    """

    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, device: Optional[str]=None):
        """Create the projections and pick the attention kernel.

        Raises:
            ValueError: if ``attn_impl`` is not 'flash', 'triton' or 'torch'.
        """
        super().__init__()
        self.attn_impl = attn_impl
        self.clip_qkv = clip_qkv
        self.qk_ln = qk_ln
        self.d_model = d_model
        self.n_heads = n_heads
        # Fall back to 1/sqrt(head_dim) when no explicit scale is supplied.
        if softmax_scale is None:
            softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
        self.softmax_scale = softmax_scale
        self.attn_dropout_p = attn_pdrop

        # One linear layer emits q, k and v; ``_fused`` records the split
        # points along output dim 0.
        self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
        self.Wqkv._fused = (0, (d_model, 2 * d_model))

        if self.qk_ln:
            if low_precision_layernorm:
                layernorm_class = LPLayerNorm
            else:
                layernorm_class = nn.LayerNorm
            self.q_ln = layernorm_class(self.d_model, device=device)
            self.k_ln = layernorm_class(self.d_model, device=device)

        if self.attn_impl == 'torch':
            self.attn_fn = scaled_multihead_dot_product_attention
            if torch.cuda.is_available():
                warnings.warn(
                    'Using `attn_impl: torch`. If your model does not use `alibi` or '
                    '`prefix_lm` we recommend using `attn_impl: flash` otherwise '
                    'we recommend using `attn_impl: triton`.'
                )
        elif self.attn_impl == 'triton':
            self.attn_fn = triton_flash_attn_fn
            warnings.warn(
                'While `attn_impl: triton` can be faster than `attn_impl: flash` '
                'it uses more memory. When training larger models this can trigger '
                'alloc retries which hurts performance. If encountered, we recommend '
                'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.'
            )
        elif self.attn_impl == 'flash':
            self.attn_fn = flash_attn_fn
        else:
            raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')

        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
        self.out_proj._is_residual = True

    def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
        """Apply self-attention to ``x``.

        Returns:
            ``(output, attn_weights, past_key_value)``; ``past_key_value`` is
            the updated ``(key, value)`` pair when a cache was passed in,
            otherwise ``None``.
        """
        fused = self.Wqkv(x)
        if self.clip_qkv:
            fused.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
        query, key, value = fused.chunk(3, dim=2)

        if self.qk_ln:
            # Layer norm may change the dtype; cast back afterwards.
            orig_dtype = query.dtype
            query = self.q_ln(query).to(orig_dtype)
            key = self.k_ln(key).to(orig_dtype)

        if past_key_value is not None:
            # An empty cache means "start caching"; otherwise prepend it.
            if len(past_key_value) != 0:
                cached_key, cached_value = past_key_value[0], past_key_value[1]
                key = torch.cat([cached_key, key], dim=1)
                value = torch.cat([cached_value, value], dim=1)
            past_key_value = (key, value)

        if attn_bias is not None:
            # Keep only the bias rows/columns for the current query/key span.
            num_q, num_k = query.size(1), key.size(1)
            attn_bias = attn_bias[:, :, -num_q:, -num_k:]

        context, attn_weights = self.attn_fn(
            query,
            key,
            value,
            self.n_heads,
            softmax_scale=self.softmax_scale,
            attn_bias=attn_bias,
            key_padding_mask=attention_mask,
            is_causal=is_causal,
            dropout_p=self.attn_dropout_p,
            training=self.training,
            needs_weights=needs_weights,
        )
        return (self.out_proj(context), attn_weights, past_key_value)

class MultiQueryAttention(nn.Module):
    """Multi-query self-attention: many query heads share one key/value head.

    The torch and triton attention implementations additionally allow the
    caller to supply an additive bias.
    """

    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, device: Optional[str]=None):
        """Create the projections and pick the attention kernel.

        Raises:
            ValueError: if ``attn_impl`` is not 'flash', 'triton' or 'torch'.
        """
        super().__init__()
        self.attn_impl = attn_impl
        self.clip_qkv = clip_qkv
        self.qk_ln = qk_ln
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        # Fall back to 1/sqrt(head_dim) when no explicit scale is supplied.
        if softmax_scale is None:
            softmax_scale = 1 / math.sqrt(self.head_dim)
        self.softmax_scale = softmax_scale
        self.attn_dropout_p = attn_pdrop

        # Output holds d_model query features plus one key head and one value
        # head; ``_fused`` records the split points along output dim 0.
        self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device)
        self.Wqkv._fused = (0, (d_model, d_model + self.head_dim))

        if self.qk_ln:
            if low_precision_layernorm:
                layernorm_class = LPLayerNorm
            else:
                layernorm_class = nn.LayerNorm
            self.q_ln = layernorm_class(d_model, device=device)
            self.k_ln = layernorm_class(self.head_dim, device=device)

        if self.attn_impl == 'torch':
            self.attn_fn = scaled_multihead_dot_product_attention
            if torch.cuda.is_available():
                warnings.warn(
                    'Using `attn_impl: torch`. If your model does not use `alibi` or '
                    '`prefix_lm` we recommend using `attn_impl: flash` otherwise '
                    'we recommend using `attn_impl: triton`.'
                )
        elif self.attn_impl == 'triton':
            self.attn_fn = triton_flash_attn_fn
            warnings.warn(
                'While `attn_impl: triton` can be faster than `attn_impl: flash` '
                'it uses more memory. When training larger models this can trigger '
                'alloc retries which hurts performance. If encountered, we recommend '
                'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.'
            )
        elif self.attn_impl == 'flash':
            self.attn_fn = flash_attn_fn
        else:
            raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')

        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
        self.out_proj._is_residual = True

    def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
        """Apply multi-query self-attention to ``x``.

        Returns:
            ``(output, attn_weights, past_key_value)``; ``past_key_value`` is
            the updated ``(key, value)`` pair when a cache was passed in,
            otherwise ``None``.
        """
        fused = self.Wqkv(x)
        if self.clip_qkv:
            fused.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
        section_sizes = [self.d_model, self.head_dim, self.head_dim]
        query, key, value = fused.split(section_sizes, dim=2)

        if self.qk_ln:
            # Layer norm may change the dtype; cast back afterwards.
            orig_dtype = query.dtype
            query = self.q_ln(query).to(orig_dtype)
            key = self.k_ln(key).to(orig_dtype)

        if past_key_value is not None:
            # An empty cache means "start caching"; otherwise prepend it.
            if len(past_key_value) != 0:
                cached_key, cached_value = past_key_value[0], past_key_value[1]
                key = torch.cat([cached_key, key], dim=1)
                value = torch.cat([cached_value, value], dim=1)
            past_key_value = (key, value)

        if attn_bias is not None:
            # Keep only the bias rows/columns for the current query/key span.
            num_q, num_k = query.size(1), key.size(1)
            attn_bias = attn_bias[:, :, -num_q:, -num_k:]

        context, attn_weights = self.attn_fn(
            query,
            key,
            value,
            self.n_heads,
            softmax_scale=self.softmax_scale,
            attn_bias=attn_bias,
            key_padding_mask=attention_mask,
            is_causal=is_causal,
            dropout_p=self.attn_dropout_p,
            training=self.training,
            needs_weights=needs_weights,
            multiquery=True,
        )
        return (self.out_proj(context), attn_weights, past_key_value)

def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):
    """Return the shape of the attention-bias buffer, or ``None`` if none is needed.

    'flash' never uses a bias. For 'torch' and 'triton':
      * with alibi, the bias is per-head; it is a full ``seq_len x seq_len``
        square when prefix-LM, sequence ids or non-causal attention are in
        use, and a single row otherwise;
      * without alibi, a head-shared square bias is needed only for prefix-LM
        or sequence ids.

    Raises:
        ValueError: if ``attn_impl`` is not 'flash', 'torch' or 'triton'.
    """
    if attn_impl == 'flash':
        return None
    if attn_impl not in ['torch', 'triton']:
        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')

    needs_pairwise = prefix_lm or use_sequence_id
    if not alibi:
        if needs_pairwise:
            return (1, 1, seq_len, seq_len)
        return None
    if needs_pairwise or not causal:
        return (1, n_heads, seq_len, seq_len)
    return (1, n_heads, 1, seq_len)

def build_attn_bias(attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8):
    """Return the attention bias to use, adding the ALiBi term when requested.

    'flash' yields ``None``. For 'torch' and 'triton' the given ``attn_bias``
    is returned as is, or with the ALiBi bias added (out of place) when
    ``alibi`` is true; the ALiBi bias is built on ``attn_bias``'s device and
    dtype, and is the full pairwise form when ``causal`` is false.

    Raises:
        ValueError: if ``attn_impl`` is not 'flash', 'torch' or 'triton'.
    """
    if attn_impl == 'flash':
        return None
    if attn_impl not in ['torch', 'triton']:
        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
    if not alibi:
        return attn_bias
    alibi_term = build_alibi_bias(
        n_heads,
        seq_len,
        full=not causal,
        alibi_bias_max=alibi_bias_max,
        device=attn_bias.device,
        dtype=attn_bias.dtype,
    )
    return attn_bias.add(alibi_term)

def gen_slopes(n_heads, alibi_bias_max=8, device=None):
    """Return per-head ALiBi slopes shaped ``(1, n_heads, 1, 1)``.

    Slopes are ``2 ** -(i * alibi_bias_max / P)`` for ``i = 1..P`` where ``P``
    is ``n_heads`` rounded up to a power of two. When ``n_heads`` is not a
    power of two, the odd-indexed slopes are listed first, then the
    even-indexed ones, and the first ``n_heads`` of that sequence are kept.
    """
    padded_heads = 2 ** math.ceil(math.log2(n_heads))
    step = alibi_bias_max / padded_heads
    exponents = torch.arange(1, padded_heads + 1, dtype=torch.float32, device=device).mul(step)
    slopes = 1.0 / torch.pow(2, exponents)
    if padded_heads == n_heads:
        return slopes.view(1, n_heads, 1, 1)
    reordered = torch.concat([slopes[1::2], slopes[::2]])
    return reordered[:n_heads].view(1, n_heads, 1, 1)

def build_alibi_bias(n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None):
    """Build the ALiBi attention bias.

    With ``full`` false the result is ``(1, n_heads, 1, seq_len)`` holding
    ``slope * (key_pos - (seq_len - 1))``. With ``full`` true it is
    ``(1, n_heads, seq_len, seq_len)`` holding ``-slope * |key_pos - query_pos|``.
    The result is cast to ``dtype`` at the end.
    """
    # Relative offsets -(seq_len - 1) .. 0, one per position.
    offsets = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device)
    bias = offsets.view(1, 1, 1, seq_len)
    if full:
        pairwise = bias - offsets.view(1, 1, seq_len, 1)
        bias = pairwise.abs().mul(-1)
    bias = bias * gen_slopes(n_heads, alibi_bias_max, device=device)
    return bias.to(dtype=dtype)
# Maps the attention-type name to the attention module class implementing it.
ATTN_CLASS_REGISTRY = {'multihead_attention': MultiheadAttention, 'multiquery_attention': MultiQueryAttention}

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/llava/mpt/blocks.py
================================================
"""GPT Blocks used for the GPT Model."""
from typing import Dict, Optional, Tuple
import torch
import torch.nn as nn
from .attention import ATTN_CLASS_REGISTRY
from .norm import NORM_CLASS_REGISTRY

class MPTMLP(nn.Module):
    """Feed-forward sub-layer: linear up-projection, exact GELU, linear down-projection."""

    def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
        """Create the two projections; the hidden width is ``expansion_ratio * d_model``."""
        super().__init__()
        hidden_width = expansion_ratio * d_model
        self.up_proj = nn.Linear(d_model, hidden_width, device=device)
        self.act = nn.GELU(approximate='none')
        self.down_proj = nn.Linear(hidden_width, d_model, device=device)
        # Tag the projection that writes back into the residual stream.
        self.down_proj._is_residual = True

    def forward(self, x):
        """Return ``down_proj(gelu(up_proj(x)))``."""
        hidden = self.up_proj(x)
        hidden = self.act(hidden)
        return self.down_proj(hidden)

class MPTBlock(nn.Module):
    """Transformer block: normalised attention and MLP sub-layers, each added back residually."""

    def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict = None, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', device: Optional[str]=None, **kwargs):
        """Build the norm, attention, MLP and dropout sub-modules.

        ``attn_config`` falls back to a built-in default when ``None``; extra
        keyword arguments are accepted and discarded.
        """
        del kwargs
        if attn_config is None:
            attn_config = {
                'attn_type': 'multihead_attention',
                'attn_pdrop': 0.0,
                'attn_impl': 'triton',
                'qk_ln': False,
                'clip_qkv': None,
                'softmax_scale': None,
                'prefix_lm': False,
                'attn_uses_sequence_id': False,
                'alibi': False,
                'alibi_bias_max': 8,
            }
        super().__init__()
        make_norm = NORM_CLASS_REGISTRY[norm_type.lower()]
        make_attn = ATTN_CLASS_REGISTRY[attn_config['attn_type']]

        self.norm_1 = make_norm(d_model, device=device)
        self.attn = make_attn(
            d_model=d_model,
            n_heads=n_heads,
            attn_impl=attn_config['attn_impl'],
            clip_qkv=attn_config['clip_qkv'],
            qk_ln=attn_config['qk_ln'],
            softmax_scale=attn_config['softmax_scale'],
            attn_pdrop=attn_config['attn_pdrop'],
            device=device,
        )
        self.norm_2 = make_norm(d_model, device=device)
        self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device)
        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)

    def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
        """Run the block and return ``(hidden_states, past_key_value)``."""
        # Attention sub-layer; the attention weights it returns are discarded.
        attn_out, _, past_key_value = self.attn(
            self.norm_1(x),
            past_key_value=past_key_value,
            attn_bias=attn_bias,
            attention_mask=attention_mask,
            is_causal=is_causal,
        )
        x = x + self.resid_attn_dropout(attn_out)
        # Feed-forward sub-layer.
        ffn_out = self.ffn(self.norm_2(x))
        x = x + self.resid_ffn_dropout(ffn_out)
        return (x, past_key_value)

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/llava/mpt/configuration_mpt.py
================================================
"""A HuggingFace-style model configuration."""
from typing import Dict, Optional, Union
from transformers import PretrainedConfig
# Default attention settings; used as the default value of MPTConfig's
# ``attn_config`` parameter.
attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}
# Default parameter-initialisation settings; used as the default value of
# MPTConfig's ``init_config`` parameter.
init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu'}

class MPTConfig(PretrainedConfig):
    """HuggingFace-style configuration object for MPT models."""
    model_type = 'mpt'

    def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs):
        """The MPT configuration class.

        Args:
            d_model (int): The size of the embedding dimension of the model.
            n_heads (int): The number of attention heads.
            n_layers (int): The number of layers in the model.
            expansion_ratio (int): The ratio of the up/down scale in the MLP.
            max_seq_len (int): The maximum sequence length of the model.
            vocab_size (int): The size of the vocabulary.
            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
            emb_pdrop (float): The dropout probability for the embedding layer.
            learned_pos_emb (bool): Whether to use learned positional embeddings
            attn_config (Dict):  A dictionary used to configure the model's attention module
                (copied on entry; missing keys are filled from ``attn_config_defaults``):
                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
                attn_pdrop (float): The dropout probability for the attention layers.
                attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
                qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
                clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
                    this value.
                softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
                    use the default scale of ``1/sqrt(d_keys)``.
                prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
                    extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
                    can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
                attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
                    When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
                    which sub-sequence each token belongs to.
                    Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
                alibi (bool): Whether to use the alibi bias instead of position embeddings.
                alibi_bias_max (int): The maximum value of the alibi bias.
            init_device (str): The device to use for parameter initialization.
            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
            no_bias (bool): Whether to use bias in all layers.
            verbose (int): The verbosity level. 0 is silent.
            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
            norm_type (str): choose type of norm to use
            use_cache (bool): Whether or not the model should return the last key/values attentions
            init_config (Dict): A dictionary used to configure the model initialization
                (copied on entry; missing keys are filled from ``init_config_defaults``):
                init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
                    'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
                    'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
                init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
                emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
                emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
                    used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
                init_std (float): The standard deviation of the normal distribution used to initialize the model,
                    if using the baseline_ parameter initialization scheme.
                init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
                fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
                init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
                ---
                See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
            **kwargs: Forwarded to ``PretrainedConfig.__init__`` after the ``name`` and ``loss_fn``
                entries (if present) are dropped.

        Raises:
            ValueError / NotImplementedError: If ``_validate_config`` rejects the resulting settings.
        """
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.expansion_ratio = expansion_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        # Shallow-copy the dicts: the signature defaults are shared module-level
        # objects and `_set_config_defaults` (and later writers) mutate the stored
        # dict in place. Without the copy one config could alter the defaults seen
        # by every other config, or silently modify the caller's own dict.
        self.attn_config = dict(attn_config)
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.verbose = verbose
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.use_cache = use_cache
        self.init_config = dict(init_config)
        # These two keys are not forwarded to PretrainedConfig.
        kwargs.pop('name', None)
        kwargs.pop('loss_fn', None)
        super().__init__(**kwargs)
        self._validate_config()

    def _set_config_defaults(self, config, config_defaults):
        """Fill keys missing from ``config`` with the values in ``config_defaults``.

        ``config`` is updated in place and also returned.
        """
        for (k, v) in config_defaults.items():
            config.setdefault(k, v)
        return config

    def _validate_config(self):
        """Apply the attention/init defaults and check the settings for consistency.

        Raises:
            ValueError: For indivisible ``d_model``/``n_heads``, out-of-range dropout
                probabilities or ``embedding_fraction``, an unknown ``attn_impl`` or
                ``logit_scale`` string, a missing init ``name``, or when neither
                learned position embeddings nor alibi are enabled.
            NotImplementedError: When ``prefix_lm``, ``alibi`` or
                ``attn_uses_sequence_id`` is requested with an attention
                implementation other than 'torch' or 'triton'.
        """
        self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
        self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
        if self.d_model % self.n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')
        if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
            raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
            raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
        if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('prefix_lm only implemented with torch and triton attention.')
        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('alibi only implemented with torch and triton attention.')
        if self.attn_config['attn_uses_sequence_id'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('attn_uses_sequence_id only implemented with torch and triton attention.')
        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!')
        if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
        if self.init_config.get('name', None) is None:
            raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
        if not self.learned_pos_emb and (not self.attn_config['alibi']):
            raise ValueError(
                'Positional information must be provided to the model using either learned_pos_emb or alibi.'
            )

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/llava/mpt/modeling_mpt.py
================================================
"""A simple, flexible implementation of a GPT model.

Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
"""
import math
import warnings
from typing import List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from .attention import attn_bias_shape, build_attn_bias
from .blocks import MPTBlock
from .norm import NORM_CLASS_REGISTRY
from .configuration_mpt import MPTConfig
from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
# Type alias covering both HuggingFace tokenizer implementations.
Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]

# Module-level logger obtained through the transformers logging utilities.
from transformers.utils import logging
logger = logging.get_logger(__name__)

class MPTPreTrainedModel(PreTrainedModel):
    """Common base class for the MPT models in this module, binding them to ``MPTConfig``."""
    # Configuration class associated with these models.
    config_class = MPTConfig
    # NOTE(review): presumably the attribute prefix HuggingFace uses to locate the base model — confirm against the transformers docs.
    base_model_prefix = 'model'

class MPTModel(MPTPreTrainedModel):
    """MPT transformer backbone: token embeddings, optional learned position
    embeddings, a stack of ``MPTBlock`` layers and a final normalisation layer.
    """

    def __init__(self, config: MPTConfig):
        """Build the embeddings, blocks and final norm described by ``config``.

        Raises:
            NotImplementedError: If ``config.norm_type`` is not a key of
                ``NORM_CLASS_REGISTRY``.
        """
        config._validate_config()
        super().__init__(config)
        self.attn_impl = config.attn_config['attn_impl']
        self.prefix_lm = config.attn_config['prefix_lm']
        self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
        self.alibi = config.attn_config['alibi']
        self.alibi_bias_max = config.attn_config['alibi_bias_max']
        if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
            norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
            raise NotImplementedError(f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).')
        norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
        self.embedding_fraction = config.embedding_fraction
        self.wte = nn.Embedding(config.vocab_size, config.d_model, device=config.init_device)
        # Learned position embeddings are only created when alibi is not used.
        if not self.alibi:
            self.wpe = nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
        self.emb_drop = nn.Dropout(config.emb_pdrop)
        self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
        self.norm_f = norm_class(config.d_model, device=config.init_device)
        # Parameters created on the 'meta' device are not initialised here.
        if config.init_device != 'meta':
            self.apply(self.param_init_fn)
        self.is_causal = not self.prefix_lm
        # The attention bias tensor is built lazily on the first `_attn_bias` call.
        self._attn_bias_initialized = False
        self.attn_bias = None
        self.attn_bias_shape = attn_bias_shape(self.attn_impl, config.n_heads, config.max_seq_len, self.alibi, prefix_lm=self.prefix_lm, causal=self.is_causal, use_sequence_id=self.attn_uses_sequence_id)
        if config.no_bias:
            for module in self.modules():
                if hasattr(module, 'bias') and isinstance(module.bias, nn.Parameter):
                    if config.verbose:
                        warnings.warn(f'Removing bias ({module.bias}) from {module}.')
                    module.register_parameter('bias', None)
        if config.verbose and config.verbose > 2:
            print(self)
        if 'verbose' not in self.config.init_config:
            self.config.init_config['verbose'] = self.config.verbose
        if self.config.init_config['verbose'] > 1:
            init_fn_name = self.config.init_config['name']
            warnings.warn(f'Using {init_fn_name} initialization.')
        self.gradient_checkpointing = False

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.wte

    def set_input_embeddings(self, value):
        """Replace the token embedding module."""
        self.wte = value

    @torch.no_grad()
    def _attn_bias(self, device, dtype, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None):
        """Return the ``(attn_bias, attention_mask)`` pair handed to every block.

        For the 'flash' implementation the cached bias and the unmodified
        ``attention_mask`` are returned. Otherwise the prefix mask, sequence ids
        and padding mask are folded into the bias and ``None`` is returned in
        place of the mask.

        Raises:
            ValueError: If ``prefix_mask`` and ``attention_mask`` have different shapes.
        """
        if not self._attn_bias_initialized:
            if self.attn_bias_shape:
                self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype)
                self.attn_bias = build_attn_bias(self.attn_impl, self.attn_bias, self.config.n_heads, self.config.max_seq_len, causal=self.is_causal, alibi=self.alibi, alibi_bias_max=self.alibi_bias_max)
            self._attn_bias_initialized = True
        if self.attn_impl == 'flash':
            return (self.attn_bias, attention_mask)
        if self.attn_bias is not None:
            self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
        attn_bias = self.attn_bias
        if self.prefix_lm:
            assert isinstance(attn_bias, torch.Tensor)
            assert isinstance(prefix_mask, torch.Tensor)
            attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
        if self.attn_uses_sequence_id and sequence_id is not None:
            assert isinstance(attn_bias, torch.Tensor)
            attn_bias = self._apply_sequence_id(attn_bias, sequence_id)
        if attention_mask is not None:
            s_k = attention_mask.shape[-1]
            if attn_bias is None:
                attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
            else:
                # Keep only the last s_k key positions of the bias.
                attn_bias = attn_bias[:, :, :, -s_k:]
            if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
                raise ValueError(f'attention_mask shape={attention_mask.shape} ' + f'and prefix_mask shape={prefix_mask.shape} are not equal.')
            # Masked-out key positions get the most negative representable value.
            min_val = torch.finfo(attn_bias.dtype).min
            attn_bias = attn_bias.masked_fill(~attention_mask.view(-1, 1, 1, s_k), min_val)
        return (attn_bias, None)

    def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor):
        """Mask ``attn_bias`` so a position can attend to causal or prefix positions only.

        Raises:
            ValueError: If the last two dimensions of ``attn_bias`` are not both
                ``max_seq_len``, or if ``prefix_mask`` is longer than ``max_seq_len``.
        """
        (s_k, s_q) = attn_bias.shape[-2:]
        if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len:
            # Report `max_seq_len`, the value the check above actually compares against.
            raise ValueError(
                f'attn_bias does not match the expected shape. The last two dimensions should both be {self.config.max_seq_len} '
                + f'but are {s_k} and {s_q}.'
            )
        seq_len = prefix_mask.shape[-1]
        if seq_len > self.config.max_seq_len:
            raise ValueError(f'prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
        attn_bias = attn_bias[..., :seq_len, :seq_len]
        causal = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
        prefix = prefix_mask.view(-1, 1, 1, seq_len)
        cannot_attend = ~torch.logical_or(causal, prefix.bool())
        return self._fill_cannot_attend(attn_bias, cannot_attend)

    def _apply_sequence_id(self, attn_bias: torch.Tensor, sequence_id: torch.LongTensor):
        """Mask ``attn_bias`` so positions only attend to positions with an equal ``sequence_id``.

        Raises:
            ValueError: If ``sequence_id`` is longer than ``max_seq_len``.
        """
        seq_len = sequence_id.shape[-1]
        if seq_len > self.config.max_seq_len:
            raise ValueError(f'sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
        attn_bias = attn_bias[..., :seq_len, :seq_len]
        cannot_attend = torch.logical_not(torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len))).unsqueeze(1)
        return self._fill_cannot_attend(attn_bias, cannot_attend)

    def _fill_cannot_attend(self, attn_bias, cannot_attend):
        """Return ``attn_bias`` with every ``cannot_attend`` position set to the dtype minimum."""
        min_val = torch.finfo(attn_bias.dtype).min
        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
        return attn_bias

    # Backward-compatible alias for the previous auto-generated helper name.
    _extracted_from__apply_sequence_id_15 = _fill_cannot_attend

    def forward(self, input_ids: Optional[torch.LongTensor], past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, tok_emb: Optional[torch.FloatTensor]=None):
        """Run the transformer over token ids or precomputed token embeddings.

        Args:
            input_ids: Token ids; may be ``None`` when ``tok_emb`` is given instead.
            past_key_values: Per-layer cached key/values; must have ``n_layers`` entries.
            attention_mask: Padding mask, converted to bool.
            prefix_mask: Required when the model is configured with ``prefix_lm=True``.
            sequence_id: Required in train mode when ``attn_uses_sequence_id=True``.
            return_dict: Must resolve to a truthy value.
            output_attentions: Must be falsy.
            output_hidden_states: If truthy, collect the input of every block.
            use_cache: If truthy, collect and return per-layer key/values.
            tok_emb: Precomputed token embeddings, used only when ``input_ids`` is ``None``.

        Returns:
            A ``BaseModelOutputWithPast`` with the normalised final hidden state,
            the (possibly updated) ``past_key_values`` and the collected hidden states.

        Raises:
            NotImplementedError: For ``return_dict=False``, ``output_attentions=True``
                or left padding while training.
            ValueError: For a missing ``prefix_mask``/``sequence_id`` when required, a
                ``past_key_values`` of the wrong length, or a total sequence length
                above ``max_seq_len``.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False
        if attention_mask is not None:
            attention_mask = attention_mask.bool()
        if prefix_mask is not None:
            prefix_mask = prefix_mask.bool()
        if not return_dict:
            raise NotImplementedError('return_dict False is not implemented yet for MPT')
        if output_attentions:
            raise NotImplementedError('output_attentions is not implemented yet for MPT')
        if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training:
            raise NotImplementedError('MPT does not support training with left padding.')
        if self.prefix_lm and prefix_mask is None:
            raise ValueError('prefix_mask is a required argument when MPT is configured with prefix_lm=True.')
        if self.training:
            if self.attn_uses_sequence_id and sequence_id is None:
                raise ValueError('sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True ' + 'and the model is in train mode.')
            elif self.attn_uses_sequence_id is False and sequence_id is not None:
                warnings.warn('MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. ' + 'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.')
        if input_ids is not None:
            S = input_ids.size(1)
            assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
            tok_emb = self.wte(input_ids)
        else:
            assert tok_emb is not None
            S = tok_emb.size(1)
        if self.alibi:
            x = tok_emb
        else:
            past_position = 0
            if past_key_values is not None:
                if len(past_key_values) != self.config.n_layers:
                    raise ValueError(
                        f'past_key_values must provide a past_key_value for each attention layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r}).'
                    )
                past_position = past_key_values[0][0].size(1)
            if S + past_position > self.config.max_seq_len:
                raise ValueError(f'Cannot forward input with past sequence length {past_position} and current sequence length {S}, this model only supports total sequence length <= {self.config.max_seq_len}.')
            # Take the device from tok_emb: input_ids is None when embeddings are supplied directly.
            pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=tok_emb.device).unsqueeze(0)
            if attention_mask is not None:
                # Shift positions down by the number of masked tokens seen so far, never below 0.
                pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0)
            pos_emb = self.wpe(pos)
            x = tok_emb + pos_emb
        if self.embedding_fraction == 1:
            x = self.emb_drop(x)
        else:
            # Same forward value as x, but only `embedding_fraction` of the gradient reaches the embeddings.
            x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
            assert isinstance(self.emb_drop, nn.Module)
            x = self.emb_drop(x_shrunk)
        (attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=x.dtype, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id)
        if use_cache and past_key_values is None:
            past_key_values = [() for _ in range(self.config.n_layers)]
        all_hidden_states = () if output_hidden_states else None
        for (b_idx, block) in enumerate(self.blocks):
            if output_hidden_states:
                assert all_hidden_states is not None
                all_hidden_states = all_hidden_states + (x,)
            past_key_value = past_key_values[b_idx] if past_key_values is not None else None
            if self.gradient_checkpointing and self.training:
                (x, past_key_value) = torch.utils.checkpoint.checkpoint(
                    block,
                    x, past_key_value, attn_bias, attention_mask, self.is_causal
                )
            else:
                (x, past_key_value) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=self.is_causal)
            if past_key_values is not None:
                past_key_values[b_idx] = past_key_value
        x = self.norm_f(x)
        return BaseModelOutputWithPast(last_hidden_state=x, past_key_values=past_key_values, hidden_states=all_hidden_states)

    def param_init_fn(self, module):
        """Initialise ``module`` with the scheme named by ``init_config['name']``."""
        init_fn_name = self.config.init_config['name']
        MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)

    def fsdp_wrap_fn(self, module):
        """Return True for ``MPTBlock`` modules."""
        return isinstance(module, MPTBlock)

    def activation_checkpointing_fn(self, module):
        """Return True for ``MPTBlock`` modules."""
        return isinstance(module, MPTBlock)

class MPTForCausalLM(MPTPreTrainedModel):
    """Causal language model: an ``MPTModel`` whose logits reuse the token embedding matrix."""

    def __init__(self, config: MPTConfig):
        """Build the transformer and resolve the configured logit scale.

        Raises:
            ValueError: If word embeddings are not tied, or ``config.logit_scale``
                is a string other than 'inv_sqrt_d_model'.
        """
        super().__init__(config)
        if not config.tie_word_embeddings:
            raise ValueError('MPTForCausalLM only supports tied word embeddings')
        self.transformer = MPTModel(config)
        # A string scale is translated to its numeric value; None and numbers pass through.
        logit_scale = config.logit_scale
        if isinstance(logit_scale, str):
            if logit_scale != 'inv_sqrt_d_model':
                raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
            logit_scale = 1 / math.sqrt(config.d_model)
        self.logit_scale = logit_scale

    def get_input_embeddings(self):
        """Return the transformer's token embedding module."""
        return self.transformer.wte

    def set_input_embeddings(self, value):
        """Replace the transformer's token embedding module."""
        self.transformer.wte = value

    def get_output_embeddings(self):
        """Return the token embedding module, which also serves as the output projection."""
        return self.transformer.wte

    def set_output_embeddings(self, new_embeddings):
        """Replace the token embedding module (shared with the output projection)."""
        self.transformer.wte = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped transformer."""
        self.transformer = decoder

    def get_decoder(self):
        """Return the wrapped transformer."""
        return self.transformer

    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
        """Compute logits and, when ``labels`` is given, the cross-entropy loss.

        Labels are rolled by one position and the last column is set to -100
        before being passed to ``F.cross_entropy``.

        Returns:
            A ``CausalLMOutputWithPast`` holding loss, logits, cached key/values
            and hidden states.
        """
        if return_dict is None:
            return_dict = self.config.return_dict
        if use_cache is None:
            use_cache = self.config.use_cache
        transformer_out = self.transformer(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            prefix_mask=prefix_mask,
            sequence_id=sequence_id,
            return_dict=return_dict,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
        )
        # Project hidden states onto the vocabulary with the tied embedding weights.
        logits = F.linear(transformer_out.last_hidden_state, self.transformer.wte.weight)
        if self.logit_scale is not None:
            if self.logit_scale == 0:
                warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
            logits *= self.logit_scale
        if labels is None:
            loss = None
        else:
            shifted_labels = torch.roll(labels, shifts=-1)
            shifted_labels[:, -1] = -100
            flat_logits = logits.view(-1, logits.size(-1))
            flat_labels = shifted_labels.to(logits.device).view(-1)
            loss = F.cross_entropy(flat_logits, flat_labels)
        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=transformer_out.past_key_values,
            hidden_states=transformer_out.hidden_states,
        )

    def param_init_fn(self, module):
        """Initialise ``module`` with the scheme named by ``init_config['name']``."""
        cfg = self.config
        init_fn_ = MODEL_INIT_REGISTRY[cfg.init_config['name']]
        init_fn_(module=module, n_layers=cfg.n_layers, d_model=cfg.d_model, **cfg.init_config)

    def fsdp_wrap_fn(self, module):
        """Return True for ``MPTBlock`` modules."""
        return isinstance(module, MPTBlock)

    def activation_checkpointing_fn(self, module):
        """Return True for ``MPTBlock`` modules."""
        return isinstance(module, MPTBlock)

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Assemble the keyword arguments for one generation step.

        Raises:
            NotImplementedError: If ``inputs_embeds`` is given, the attention mask
                indicates right padding, or ``prefix_lm`` is combined with
                ``use_cache=False``.
        """
        if inputs_embeds is not None:
            raise NotImplementedError('inputs_embeds is not implemented for MPT yet')
        attention_mask = kwargs['attention_mask'].bool()
        if attention_mask[:, -1].sum() != attention_mask.shape[0]:
            raise NotImplementedError('MPT does not support generation with right padding.')
        # Computed from the untrimmed input_ids, before the cache-based slicing below.
        needs_sequence_id = self.transformer.attn_uses_sequence_id and self.training
        sequence_id = torch.zeros_like(input_ids[:1]) if needs_sequence_id else None
        if past_key_values is not None:
            # With a cache only the newest token has to be fed.
            input_ids = input_ids[:, -1].unsqueeze(-1)
        prefix_mask = None
        if self.transformer.prefix_lm:
            prefix_mask = torch.ones_like(attention_mask)
            if kwargs.get('use_cache') == False:
                raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.')
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'prefix_mask': prefix_mask,
            'sequence_id': sequence_id,
            'past_key_values': past_key_values,
            'use_cache': kwargs.get('use_cache', True),
        }

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Used by HuggingFace generate when using beam search with kv-caching.

        See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
        for an example in transformers.
        """
        reordered = []
        for layer_past in past_key_values:
            # Select the surviving beams along dimension 0 of every cached tensor.
            reordered.append(tuple(state.index_select(0, beam_idx) for state in layer_past))
        return reordered

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/llava/mpt/norm.py
================================================
import torch

def _cast_if_autocast_enabled(tensor):
    if torch.is_autocast_enabled():
        if tensor.device.type == 'cuda':
            dtype = torch.get_autocast_gpu_dtype()
        elif tensor.device.type == 'cpu':
            dtype = torch.get_autocast_cpu_dtype()
        else:
            raise NotImplementedError()
        return tensor.to(dtype=dtype)
    return tensor

class LPLayerNorm(torch.nn.LayerNorm):
    """LayerNorm that casts its input and parameters to the autocast dtype and then
    runs the normalisation with autocast disabled.
    """

    def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
        super().__init__(
            normalized_shape=normalized_shape,
            eps=eps,
            elementwise_affine=elementwise_affine,
            device=device,
            dtype=dtype,
        )

    def forward(self, x):
        """Layer-normalise ``x`` using the (possibly down-cast) weight and bias."""
        inputs = _cast_if_autocast_enabled(x)
        # Weight and bias may be None; only cast them when present.
        weight = self.weight if self.weight is None else _cast_if_autocast_enabled(self.weight)
        bias = self.bias if self.bias is None else _cast_if_autocast_enabled(self.bias)
        with torch.autocast(enabled=False, device_type=x.device.type):
            return torch.nn.functional.layer_norm(inputs, self.normalized_shape, weight, bias, self.eps)

def rms_norm(x, weight=None, eps=1e-05):
    """Apply root-mean-square normalisation over the last dimension of ``x``.

    Computes ``x / sqrt(mean(x**2, dim=-1) + eps)`` and, when ``weight`` is
    given, multiplies the result element-wise by ``weight``.

    Args:
        x: Input tensor.
        weight: Optional scale applied to the normalised output.
        eps: Value added to the mean square before the reciprocal square root.

    Returns:
        The normalised (and optionally scaled) tensor.
    """
    # Multiply by the reciprocal RMS. Dividing by `rsqrt` would instead scale
    # the input up by its RMS rather than normalise it.
    output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return output * weight if weight is not None else output

class RMSNorm(torch.nn.Module):
    """Module wrapper around ``rms_norm`` with an optional learnable scale."""

    def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
        super().__init__()
        self.eps = eps
        if not weight:
            # Register an explicit None so `self.weight` still resolves.
            self.register_parameter('weight', None)
            return
        initial_scale = torch.ones(normalized_shape, dtype=dtype, device=device)
        self.weight = torch.nn.Parameter(initial_scale)

    def forward(self, x):
        """Normalise ``x`` in float32 and cast the result back to ``x``'s dtype."""
        normed = rms_norm(x.float(), self.weight, self.eps)
        return normed.to(dtype=x.dtype)

class LPRMSNorm(RMSNorm):
    """RMSNorm that casts its input and weight to the autocast dtype and then
    runs the normalisation with autocast disabled.
    """

    def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
        super().__init__(
            normalized_shape=normalized_shape,
            eps=eps,
            weight=weight,
            dtype=dtype,
            device=device,
        )

    def forward(self, x):
        """Normalise ``x`` and return the result in ``x``'s original dtype."""
        inputs = _cast_if_autocast_enabled(x)
        # The weight may be None; only cast it when present.
        scale = self.weight if self.weight is None else _cast_if_autocast_enabled(self.weight)
        with torch.autocast(enabled=False, device_type=x.device.type):
            normed = rms_norm(inputs, scale, self.eps)
            return normed.to(dtype=x.dtype)
# Maps the lower-cased norm-type name to the class that implements it.
NORM_CLASS_REGISTRY = {
    'layernorm': torch.nn.LayerNorm,
    'low_precision_layernorm': LPLayerNorm,
    'rmsnorm': RMSNorm,
    'low_precision_rmsnorm': LPRMSNorm,
}

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/llava/mpt/param_init_fns.py
================================================
import math
import warnings
from collections.abc import Sequence
from functools import partial
from typing import Optional, Tuple, Union
import torch
from torch import nn
from .norm import NORM_CLASS_REGISTRY

def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
    """Re-initialise ``module`` through its own ``reset_parameters`` method, if it has one.

    Args:
        module: Module to initialise.
        verbose: Emit a warning describing the scheme when greater than 1.
        **kwargs: Accepted and discarded.
    """
    del kwargs
    if verbose > 1:
        warnings.warn("Initializing network using module's reset_parameters attribute")
    if not hasattr(module, 'reset_parameters'):
        return
    module.reset_parameters()

def fused_init_helper_(module: nn.Module, init_fn_):
    """Initialise each segment of a fused weight separately.

    ``module._fused`` must be a ``(dim, splits)`` pair: ``splits`` lists the
    interior boundaries along dimension ``dim`` of ``module.weight``. ``init_fn_``
    is called once per resulting slice.

    Args:
        module: Module with a ``weight`` tensor and a ``_fused`` attribute.
        init_fn_: Callable applied to every weight slice.

    Raises:
        RuntimeError: If ``module`` has no ``_fused`` attribute (or it is None).
    """
    _fused = getattr(module, '_fused', None)
    if _fused is None:
        raise RuntimeError('Internal logic error')
    (dim, splits) = _fused
    # Add the outer boundaries so consecutive pairs describe every segment.
    splits = (0, *splits, module.weight.size(dim))
    for (s, e) in zip(splits[:-1], splits[1:]):
        slice_indices = [slice(None)] * module.weight.ndim
        slice_indices[dim] = slice(s, e)
        # Index with a tuple: indexing a tensor with a list of slices is a
        # deprecated non-tuple sequence index in recent PyTorch releases.
        init_fn_(module.weight[tuple(slice_indices)])

def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    """Initialize the parameters owned directly by ``module`` with ``init_fn_``.

    Handled module kinds are ``nn.Linear``, ``nn.Embedding``, the classes in
    ``NORM_CLASS_REGISTRY`` and ``nn.MultiheadAttention``. Biases are zeroed,
    norm weights are set to one, and weights of layers flagged with a truthy
    ``_is_residual`` attribute are divided by a residual divisor.

    Args:
        module: The module to initialize (only its own parameters are touched).
        init_fn_: In-place initializer applied to weight tensors.
        n_layers: Number of layers; used for the default residual divisor
            ``sqrt(2 * n_layers)``.
        d_model: Model width. Required for ``nn.MultiheadAttention`` modules
            that use a single fused ``in_proj_weight``.
        init_div_is_residual: ``False`` disables the residual scaling,
            ``True`` selects ``sqrt(2 * n_layers)``, a number is used as the
            divisor directly, and a string is parsed with ``float()``.
        emb_init_std: If given, embeddings are drawn from a normal
            distribution with mean 0 and this standard deviation.
        emb_init_uniform_lim: If given (and ``emb_init_std`` is not),
            embeddings are drawn uniformly from ``[a, b]`` for a two-element
            sequence, or from ``[-lim, lim]`` for a single number.
        verbose: Values above 1 emit warnings describing what is done.
        **kwargs: Accepted and discarded.

    Raises:
        ValueError: If ``init_div_is_residual`` is neither boolean nor
            numeric, or if ``emb_init_uniform_lim`` is a sequence whose length
            is not exactly two.
        NotImplementedError: If ``module`` is of an unhandled type and owns
            parameters itself.
    """
    del kwargs
    if verbose > 1:
        warnings.warn('If model has bias parameters they are initialized to 0.')

    # Resolve the divisor applied to `_is_residual` layers.
    if init_div_is_residual is False:
        div_is_residual = 1.0
    elif init_div_is_residual is True:
        div_is_residual = math.sqrt(2 * n_layers)
    elif isinstance(init_div_is_residual, (float, int)):
        div_is_residual = init_div_is_residual
    elif isinstance(init_div_is_residual, str):
        # Parse with float() rather than str.isnumeric(): the latter rejects
        # ordinary decimal strings such as "0.5".
        try:
            div_is_residual = float(init_div_is_residual)
        except ValueError:
            raise ValueError(f'Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}') from None
    else:
        raise ValueError(f'Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}')
    if init_div_is_residual is not False and verbose > 1:
        warnings.warn(
            f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. Set `init_div_is_residual: false` in init config to disable this.'
        )

    if isinstance(module, nn.Linear):
        if hasattr(module, '_fused'):
            # Fused projections get each sub-matrix initialized on its own.
            fused_init_helper_(module, init_fn_)
        else:
            init_fn_(module.weight)
        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)
        if init_div_is_residual is not False and getattr(module, '_is_residual', False):
            with torch.no_grad():
                module.weight.div_(div_is_residual)
    elif isinstance(module, nn.Embedding):
        if emb_init_std is not None:
            std = emb_init_std
            if std == 0:
                warnings.warn('Embedding layer initialized to 0.')
            emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
            if verbose > 1:
                warnings.warn(f'Embedding layer initialized using normal distribution with mean=0 and std={std!r}.')
        elif emb_init_uniform_lim is not None:
            lim = emb_init_uniform_lim
            if isinstance(lim, Sequence):
                # Exactly two limits are needed; anything else used to fail
                # later with an unrelated IndexError/unpacking error.
                if len(lim) != 2:
                    raise ValueError(f'Uniform init requires a min and a max limit. User input: {lim}.')
                if lim[0] == lim[1]:
                    warnings.warn(f'Embedding layer initialized to {lim[0]}.')
            else:
                if lim == 0:
                    warnings.warn('Embedding layer initialized to 0.')
                lim = [-lim, lim]
            (a, b) = lim
            emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
            if verbose > 1:
                warnings.warn(f'Embedding layer initialized using uniform distribution in range {lim}.')
        else:
            # No embedding-specific scheme requested: fall back to init_fn_.
            emb_init_fn_ = init_fn_
        emb_init_fn_(module.weight)
    elif isinstance(module, tuple(set(NORM_CLASS_REGISTRY.values()))):
        if verbose > 1:
            warnings.warn(
                'Norm weights are set to 1. If norm layer has a bias it is initialized to 0.'
            )
        if hasattr(module, 'weight') and module.weight is not None:
            torch.nn.init.ones_(module.weight)
        if hasattr(module, 'bias') and module.bias is not None:
            torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.MultiheadAttention):
        if module._qkv_same_embed_dim:
            # Single fused in_proj_weight: initialize q, k and v thirds separately.
            _extracted_from_generic_param_init_fn__69(module, d_model, init_fn_)
        else:
            assert module.q_proj_weight is not None and module.k_proj_weight is not None and (module.v_proj_weight is not None)
            assert module.in_proj_weight is None
            init_fn_(module.q_proj_weight)
            init_fn_(module.k_proj_weight)
            init_fn_(module.v_proj_weight)
        if module.in_proj_bias is not None:
            torch.nn.init.zeros_(module.in_proj_bias)
        if module.bias_k is not None:
            torch.nn.init.zeros_(module.bias_k)
        if module.bias_v is not None:
            torch.nn.init.zeros_(module.bias_v)
        init_fn_(module.out_proj.weight)
        if init_div_is_residual is not False and getattr(module.out_proj, '_is_residual', False):
            with torch.no_grad():
                module.out_proj.weight.div_(div_is_residual)
        if module.out_proj.bias is not None:
            torch.nn.init.zeros_(module.out_proj.bias)
    else:
        # Unknown module type: fine as long as it owns no parameters itself
        # (children are visited separately by the caller).
        owns_parameters = next(module.parameters(recurse=False), None) is not None
        if owns_parameters:
            raise NotImplementedError(f'{module.__class__.__name__} parameters are not initialized by param_init_fn.')


# TODO: give this tool-generated helper a descriptive name; the call site in
# `generic_param_init_fn_` has to be renamed together with it.
def _extracted_from_generic_param_init_fn__69(module, d_model, init_fn_):
    """Initialize the three ``d_model``-row slabs of a fused ``in_proj_weight``.

    ``module`` must carry a fused ``in_proj_weight`` and no separate q/k/v
    projection weights, and ``d_model`` must be given. ``init_fn_`` is applied
    in place to rows ``[0, d)``, ``[d, 2d)`` and ``[2d, 3d)`` in that order.
    """
    assert module.in_proj_weight is not None
    assert module.q_proj_weight is None and module.k_proj_weight is None and (module.v_proj_weight is None)
    assert d_model is not None
    for slab in range(3):
        first_row = slab * d_model
        init_fn_(module.in_proj_weight[first_row:first_row + d_model])

def _normal_init_(std, mean=0.0):
    """Return ``torch.nn.init.normal_`` with ``mean`` and ``std`` pre-bound."""
    bound_kwargs = {'mean': mean, 'std': std}
    return partial(torch.nn.init.normal_, **bound_kwargs)

def _normal_param_init_fn_(
    module: nn.Module,
    std: float,
    n_layers: int,
    d_model: Optional[int]=None,
    init_div_is_residual: Union[int, float, str, bool]=True,
    emb_init_std: Optional[float]=None,
    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None,
    verbose: int=0,
    **kwargs,
):
    """Run ``generic_param_init_fn_`` with a zero-mean normal initializer.

    ``std`` is the standard deviation of that normal distribution; every
    other argument is forwarded unchanged. Extra keyword arguments are
    discarded.
    """
    del kwargs
    init_fn_ = _normal_init_(std=std)
    if verbose > 1:
        warnings.warn(f'Using torch.nn.init.normal_ init fn mean=0.0, std={std}')
    generic_param_init_fn_(
        module=module,
        init_fn_=init_fn_,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )

def baseline_param_init_fn_(
    module: nn.Module,
    init_std: float,
    n_layers: int,
    d_model: Optional[int]=None,
    init_div_is_residual: Union[int, float, str, bool]=True,
    emb_init_std: Optional[float]=None,
    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None,
    verbose: int=0,
    **kwargs,
):
    """Normal initialization with a caller-supplied standard deviation.

    Raises:
        ValueError: If ``init_std`` is ``None``.
    """
    del kwargs
    if init_std is None:
        raise ValueError("You must set model.init_config['init_std'] to a float value to use the default initialization scheme.")
    _normal_param_init_fn_(
        module=module,
        std=init_std,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )

def small_param_init_fn_(
    module: nn.Module,
    n_layers: int,
    d_model: int,
    init_div_is_residual: Union[int, float, str, bool]=True,
    emb_init_std: Optional[float]=None,
    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None,
    verbose: int=0,
    **kwargs,
):
    """Normal initialization whose std is derived from the model width.

    The standard deviation is ``sqrt(2 / (5 * d_model))``; the remaining
    arguments are forwarded to ``_normal_param_init_fn_``.
    """
    del kwargs
    width_scaled_std = math.sqrt(2 / (5 * d_model))
    _normal_param_init_fn_(
        module=module,
        std=width_scaled_std,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )

def neox_param_init_fn_(
    module: nn.Module,
    n_layers: int,
    d_model: int,
    emb_init_std: Optional[float]=None,
    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None,
    verbose: int=0,
    **kwargs,
):
    """Small-init combined with a residual divisor of ``n_layers / sqrt(10)``.

    From section 2.3.1 of "GPT-NeoX-20B: An Open-Source Autoregressive
    Language Model" — Black et al. (2022)
    see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151
    and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py
    """
    del kwargs
    residual_div = n_layers / math.sqrt(10)
    if verbose > 1:
        warnings.warn(f'setting init_div_is_residual to {residual_div}')
    small_param_init_fn_(
        module=module,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=residual_div,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )

def kaiming_uniform_param_init_fn_(
    module: nn.Module,
    n_layers: int,
    d_model: Optional[int]=None,
    init_div_is_residual: Union[int, float, str, bool]=True,
    emb_init_std: Optional[float]=None,
    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None,
    init_gain: float=0,
    fan_mode: str='fan_in',
    init_nonlinearity: str='leaky_relu',
    verbose: int=0,
    **kwargs,
):
    """Run ``generic_param_init_fn_`` with ``nn.init.kaiming_uniform_``.

    ``init_gain``, ``fan_mode`` and ``init_nonlinearity`` are handed to the
    initializer as ``a``, ``mode`` and ``nonlinearity`` respectively.
    """
    del kwargs
    if verbose > 1:
        warnings.warn(
            f'Using nn.init.kaiming_uniform_ init fn with parameters: a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}'
        )
    initializer = partial(
        nn.init.kaiming_uniform_,
        a=init_gain,
        mode=fan_mode,
        nonlinearity=init_nonlinearity,
    )
    generic_param_init_fn_(
        module=module,
        init_fn_=initializer,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )

def kaiming_normal_param_init_fn_(
    module: nn.Module,
    n_layers: int,
    d_model: Optional[int]=None,
    init_div_is_residual: Union[int, float, str, bool]=True,
    emb_init_std: Optional[float]=None,
    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None,
    init_gain: float=0,
    fan_mode: str='fan_in',
    init_nonlinearity: str='leaky_relu',
    verbose: int=0,
    **kwargs,
):
    """Run ``generic_param_init_fn_`` with ``torch.nn.init.kaiming_normal_``.

    ``init_gain``, ``fan_mode`` and ``init_nonlinearity`` are handed to the
    initializer as ``a``, ``mode`` and ``nonlinearity`` respectively.
    """
    del kwargs
    if verbose > 1:
        warnings.warn(
            f'Using nn.init.kaiming_normal_ init fn with parameters: a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}'
        )
    initializer = partial(
        torch.nn.init.kaiming_normal_,
        a=init_gain,
        mode=fan_mode,
        nonlinearity=init_nonlinearity,
    )
    generic_param_init_fn_(
        module=module,
        init_fn_=initializer,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )

def xavier_uniform_param_init_fn_(
    module: nn.Module,
    n_layers: int,
    d_model: Optional[int]=None,
    init_div_is_residual: Union[int, float, str, bool]=True,
    emb_init_std: Optional[float]=None,
    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None,
    init_gain: float=0,
    verbose: int=0,
    **kwargs,
):
    """Run ``generic_param_init_fn_`` with ``torch.nn.init.xavier_uniform_``.

    ``init_gain`` is passed to the initializer as ``gain``.
    NOTE(review): the default ``init_gain=0`` scales the sampling range to
    zero — confirm callers always pass a non-zero gain.
    """
    del kwargs
    initializer = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
    if verbose > 1:
        warnings.warn(
            f'Using torch.nn.init.xavier_uniform_ init fn with parameters: gain={init_gain}'
        )
    generic_param_init_fn_(
        module=module,
        init_fn_=initializer,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )

def xavier_normal_param_init_fn_(
    module: nn.Module,
    n_layers: int,
    d_model: Optional[int]=None,
    init_div_is_residual: Union[int, float, str, bool]=True,
    emb_init_std: Optional[float]=None,
    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None,
    init_gain: float=0,
    verbose: int=0,
    **kwargs,
):
    """Run ``generic_param_init_fn_`` with ``torch.nn.init.xavier_normal_``.

    ``init_gain`` is passed to the initializer as ``gain``.
    NOTE(review): the default ``init_gain=0`` scales the sampling spread to
    zero — confirm callers always pass a non-zero gain.
    """
    # Extra keyword arguments are accepted but never used.
    del kwargs
    initializer = partial(torch.nn.init.xavier_normal_, gain=init_gain)
    if verbose > 1:
        warnings.warn(
            f'Using torch.nn.init.xavier_normal_ init fn with parameters: gain={init_gain}'
        )
    generic_param_init_fn_(
        module=module,
        init_fn_=initializer,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
# Lookup table from an init-scheme name to the function implementing it.
MODEL_INIT_REGISTRY = {
    'default_': torch_default_param_init_fn_,
    'baseline_': baseline_param_init_fn_,
    'kaiming_uniform_': kaiming_uniform_param_init_fn_,
    'kaiming_normal_': kaiming_normal_param_init_fn_,
    'neox_init_': neox_param_init_fn_,
    'small_init_': small_param_init_fn_,
    'xavier_uniform_': xavier_uniform_param_init_fn_,
    'xavier_normal_': xavier_normal_param_init_fn_,
}

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/nets/PixArt.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import math
import torch
import torch.nn as nn
import os
import numpy as np
from timm.models.layers import DropPath
from timm.models.vision_transformer import PatchEmbed, Mlp

from diffusion.model.builder import MODELS
from diffusion.model.utils import auto_grad_checkpoint, to_2tuple
from diffusion.model.nets.PixArt_blocks import t2i_modulate, CaptionEmbedder, WindowAttention, MultiHeadCrossAttention, T2IFinalLayer, TimestepEmbedder, LabelEmbedder, FinalLayer
from diffusion.utils.logger import get_root_logger
from diffusion.model.cache_functions import global_force_fresh, cache_cutfresh, update_cache, force_init
import json

class PixArtBlock(nn.Module):
    """
    A PixArt block with adaptive layer norm (adaLN-single) conditioning.

    ``forward`` additionally receives ``current`` and ``cache_dic``. Each
    sub-module output (self-attention, cross-attention, MLP) is stored under
    ``cache_dic['cache'][-1][layer][module]``; depending on the result of
    ``global_force_fresh`` the block either recomputes everything or reuses
    the stored outputs and refreshes only a subset of tokens.
    """

    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., window_size=0, input_size=None, use_rel_pos=False, **block_kwargs):
        """Create the norms, self-attention, cross-attention and MLP.

        ``block_kwargs`` is forwarded to both attention modules. When
        ``window_size`` is non-zero the self-attention is built with
        ``(window_size, window_size)`` as its input size instead of
        ``input_size``.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = WindowAttention(hidden_size, num_heads=num_heads, qkv_bias=True,
                                    input_size=input_size if window_size == 0 else (window_size, window_size),
                                    use_rel_pos=use_rel_pos, **block_kwargs)
        self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads, **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        # to be compatible with lower version pytorch
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.window_size = window_size
        # Learned per-block table added to the timestep conditioning `t`;
        # its six rows become shift/scale/gate for attention and for the MLP.
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5)

    def forward(self, x, y, t, current, cache_dic, mask=None, **kwargs):
        """Apply the block to ``x`` and return the updated tokens.

        Args:
            x: Token tensor, unpacked as ``(B, N, C)``.
            y: Conditioning passed to the cross-attention.
            t: Timestep conditioning; reshaped to ``(B, 6, -1)`` here.
            current: Mutable dict describing the running step. This method
                reads ``current['layer']`` and writes ``current['module']``
                and ``current['is_force_fresh']``.
            cache_dic: Mutable cache. Outputs are stored in
                ``cache_dic['cache'][-1][layer][module]`` and attention maps
                in ``cache_dic['attn_map']`` / ``cache_dic['cross_attn_map']``.
            mask: Forwarded to the cross-attention.
        """
        B, N, C = x.shape

        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1)
        is_force_fresh = global_force_fresh(cache_dic, current)
        current['is_force_fresh'] = is_force_fresh
        
        if is_force_fresh: # Compute all tokens, and save them to cache
            # `current['module']` must be set before each helper call below:
            # the cache entry written/read is keyed by it.
            current['module'] = 'attn'
            cache_dic['cache'][-1][current['layer']][current['module']], cache_dic['attn_map'][-1][current['layer']] = self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa))#.reshape(B, N, C)
            force_init(cache_dic, current, x)
            x = x + self.drop_path(gate_msa * cache_dic['cache'][-1][current['layer']][current['module']])

            current['module'] = 'cross-attn'
            cache_dic['cache'][-1][current['layer']][current['module']], cache_dic['cross_attn_map'][-1][current['layer']] = self.cross_attn(x, y, mask)
            force_init(cache_dic, current, x)
            x = x + cache_dic['cache'][-1][current['layer']][current['module']]

            current['module'] = 'mlp'
            cache_dic['cache'][-1][current['layer']][current['module']] = self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp))
            force_init(cache_dic, current, x)
            x = x + self.drop_path(gate_mlp * cache_dic['cache'][-1][current['layer']][current['module']])

        else: 
            current['module'] = 'attn' 
            # no partial computation for attn. if you want to have an exploration, below may help.
            #fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x, current)
            #fresh_tokens, fresh_attn_map = self.attn(t2i_modulate(self.norm1(fresh_tokens), shift_msa, scale_msa))#.reshape(B, N, C)
            #update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current, fresh_attn_map=fresh_attn_map)
            #cache_dic['cache'][-1][current['layer']][current['module']], cache_dic['attn_map'][-1][current['layer']] = self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa))#.reshape(B, N, C)
            
            # Self-attention is not recomputed here: the cached output is reused as is.
            x = x + self.drop_path(gate_msa * cache_dic['cache'][-1][current['layer']][current['module']])

            current['module'] = 'cross-attn'
            # Only the tokens returned by `cache_cutfresh` go through the
            # cross-attention; `update_cache` receives the results together
            # with their indices (presumably merging them into the cached
            # output — see update_cache for the exact semantics).
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x, current)
            fresh_tokens, fresh_cross_attn_map = self.cross_attn(fresh_tokens, y, mask)
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current, fresh_attn_map=fresh_cross_attn_map)

            x = x + cache_dic['cache'][-1][current['layer']][current['module']]

            current['module'] = 'mlp'
            # Same selective refresh for the MLP (no attention map involved).
            fresh_indices, fresh_tokens = cache_cutfresh(cache_dic, x, current)
            fresh_tokens = self.mlp(t2i_modulate(self.norm2(fresh_tokens), shift_mlp, scale_mlp))
            update_cache(fresh_indices, fresh_tokens=fresh_tokens, cache_dic=cache_dic, current=current)
            
            x = x + self.drop_path(gate_mlp * cache_dic['cache'][-1][current['layer']][current['module']])

        return x

#############################################################################
#                                 Core PixArt Model                                #
#################################################################################
@MODELS.register_module()
class PixArt(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    The forward methods take two extra arguments, ``current`` and
    ``cache_dic``; both are handed unchanged to every block, and ``forward``
    records the index of the running block in ``current['layer']``.
    """

    def __init__(self, input_size=32, patch_size=2, in_channels=4, hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0, class_dropout_prob=0.1, pred_sigma=True, drop_path: float = 0., window_size=0, window_block_indexes=None, use_rel_pos=False, caption_channels=4096, lewei_scale=1.0, config=None, model_max_length=120, **kwargs):
        """Build the embedders, the stack of ``depth`` blocks and the output layer.

        ``window_block_indexes`` lists the block indices that use
        ``window_size`` / ``use_rel_pos``; all other blocks get a window size
        of 0 and no relative position. When ``config`` is given, the scale
        summary is logged to ``<config.work_dir>/train_log.log``; otherwise it
        is printed. Extra keyword arguments are accepted and ignored.
        """
        if window_block_indexes is None:
            window_block_indexes = []
        super().__init__()
        self.pred_sigma = pred_sigma
        self.in_channels = in_channels
        # Twice the channels when the model also predicts sigma.
        self.out_channels = in_channels * 2 if pred_sigma else in_channels
        self.patch_size = patch_size
        self.num_heads = num_heads
        # Keep this a plain scalar (no trailing comma): it is used as a
        # divisor when the position grid is built and is shown in the log
        # line below.
        self.lewei_scale = lewei_scale

        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        num_patches = self.x_embedder.num_patches
        self.base_size = input_size // self.patch_size
        # Will use fixed sin-cos embedding:
        self.register_buffer("pos_embed", torch.zeros(1, num_patches, hidden_size))

        approx_gelu = lambda: nn.GELU(approximate="tanh")
        # Maps the timestep embedding to the 6 * hidden_size conditioning
        # vector shared by all blocks.
        self.t_block = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )
        self.y_embedder = CaptionEmbedder(in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=model_max_length)
        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            PixArtBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i],
                          input_size=(input_size // patch_size, input_size // patch_size),
                          window_size=window_size if i in window_block_indexes else 0,
                          use_rel_pos=use_rel_pos if i in window_block_indexes else False)
            for i in range(depth)
        ])
        self.final_layer = T2IFinalLayer(hidden_size, patch_size, self.out_channels)

        self.initialize_weights()

        if config:
            logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))
            logger.warning(f"lewei scale: {self.lewei_scale}, base size: {self.base_size}")
        else:
            print(f'Warning: lewei scale: {self.lewei_scale}, base size: {self.base_size}')

    def forward(self, x, timestep, current, cache_dic, y, mask=None, data_info=None, **kwargs):
        """
        Forward pass of PixArt.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        timestep: (N,) tensor of diffusion timesteps
        current: mutable dict passed to every block; ``current['layer']`` is
            set to the block index before each block runs
        cache_dic: cache object passed to every block
        y: (N, 1, 120, C) tensor of caption embeddings
        mask: optional mask over the caption tokens; when given, only the
            positions with a non-zero mask value are kept in ``y``
        data_info: accepted but not used by this class
        """
        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)
        pos_embed = self.pos_embed.to(self.dtype)
        self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size
        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(timestep.to(x.dtype))  # (N, D)
        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # Pack all unmasked caption tokens of the batch into one sequence;
            # y_lens remembers how many belong to each sample.
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])
        for i, block in enumerate(self.blocks):
            current['layer'] = i
            # y_lens is passed in the position of the block's `mask` argument.
            x = auto_grad_checkpoint(block, x, y, t0, current, cache_dic, y_lens)  # (N, T, D) #support grad checkpoint
        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)
        return x

    def forward_with_dpmsolver(self, x, timestep, current, cache_dic, y, mask=None, **kwargs):
        """
        Forward pass for DPM-Solver, which does not need the variance
        prediction: only the first half of the output channels is returned.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        model_out = self.forward(x, timestep, current, cache_dic, y, mask)
        return model_out.chunk(2, dim=1)[0]

    def forward_with_cfg(self, x, timestep, current, cache_dic, y, cfg_scale, mask=None, **kwargs):
        """
        Forward pass of PixArt, but also batches the unconditional forward pass for classifier-free guidance.

        The first half of ``x`` is duplicated, run through ``forward`` and the
        two halves of the result are combined as
        ``uncond + cfg_scale * (cond - uncond)``.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        # NOTE(review): `kwargs` is passed positionally, so the whole dict
        # binds to `forward`'s `data_info` parameter (unused in this class).
        model_out = self.forward(combined, timestep, current, cache_dic, y, mask, kwargs)
        model_out = model_out['x'] if isinstance(model_out, dict) else model_out
        # NOTE(review): guidance is applied to the first three channels only,
        # regardless of `in_channels` — confirm this is intended.
        eps, rest = model_out[:, :3], model_out[:, 3:]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([half_eps, half_eps], dim=0)
        return torch.cat([eps, rest], dim=1)

    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C)
        imgs: (N, C, H, W)

        T must be a perfect square (the token grid is assumed square).
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        h = w = int(x.shape[1] ** 0.5)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        return x.reshape(shape=(x.shape[0], c, h * p, w * p))

    def initialize_weights(self):
        """Initialize all weights and fill ``pos_embed`` with the sin-cos table."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize (and freeze) pos_embed by sin-cos embedding:
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5), lewei_scale=self.lewei_scale, base_size=self.base_size)
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)

        # Initialize caption embedding MLP:
        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)

        # Zero-out the cross-attention output projection in PixArt blocks:
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.proj.weight, 0)
            nn.init.constant_(block.cross_attn.proj.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    @property
    def dtype(self):
        """Dtype of the model's first parameter."""
        return next(self.parameters()).dtype


def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, lewei_scale=1.0, base_size=16):
    """
    Build a 2-D sine/cosine position embedding table.

    grid_size: int (expanded to a pair with ``to_2tuple``) or a pair giving
        the grid height and width
    lewei_scale, base_size: each axis coordinate ``i`` is mapped to
        ``i / (size / base_size) / lewei_scale`` before being encoded
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or
        [extra_tokens+grid_size*grid_size, embed_dim] when ``cls_token`` is
        set and ``extra_tokens > 0`` (the extra rows are zeros)
    """
    if isinstance(grid_size, int):
        grid_size = to_2tuple(grid_size)
    size_h, size_w = grid_size[0], grid_size[1]

    def _axis_coords(length):
        # Rescale indices so that `length` cells span `base_size` units.
        return np.arange(length, dtype=np.float32) / (length / base_size) / lewei_scale

    coords_h = _axis_coords(size_h)
    coords_w = _axis_coords(size_w)
    mesh = np.stack(np.meshgrid(coords_w, coords_h), axis=0)  # here w goes first
    mesh = mesh.reshape([2, 1, size_w, size_h])

    table = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)
    if not (cls_token and extra_tokens > 0):
        return table
    padding = np.zeros([extra_tokens, embed_dim])
    return np.concatenate([padding, table], axis=0)


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """
    Encode a stacked pair of coordinate grids.

    Half of the ``embed_dim`` channels encode ``grid[0]`` and the other half
    encode ``grid[1]``; the two halves are concatenated along the channel
    axis. ``embed_dim`` must be even.
    """
    assert embed_dim % 2 == 0

    per_axis_dim = embed_dim // 2
    # Each entry is (H*W, D/2); grid[0] is encoded first, then grid[1].
    halves = [
        get_1d_sincos_pos_embed_from_grid(per_axis_dim, grid[axis])
        for axis in (0, 1)
    ]
    return np.concatenate(halves, axis=1)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    Sine/cosine encoding of a set of scalar positions.

    embed_dim: output dimension for each position (must be even)
    pos: array of positions to be encoded; flattened to size (M,)
    out: (M, D), the first D/2 columns are sines, the last D/2 cosines
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2
    exponents = np.arange(half_dim, dtype=np.float64) / (embed_dim / 2.)
    frequencies = 1. / 10000 ** exponents  # (D/2,)

    flat_pos = pos.reshape(-1)  # (M,)
    angles = np.einsum('m,d->md', flat_pos, frequencies)  # (M, D/2), outer product

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)


#################################################################################
#                                   PixArt Configs                                  #
#################################################################################
@MODELS.register_module()
def PixArt_XL_2(**kwargs):
    """Build the XL/2 configuration: depth 28, width 1152, patch size 2, 16 heads."""
    return PixArt(
        depth=28,
        hidden_size=1152,
        patch_size=2,
        num_heads=16,
        **kwargs,
    )


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/nets/PixArtMS.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import torch
import torch.nn as nn
from timm.models.layers import DropPath
from timm.models.vision_transformer import Mlp

from diffusion.model.builder import MODELS
from diffusion.model.utils import auto_grad_checkpoint, to_2tuple
from diffusion.model.nets.PixArt_blocks import t2i_modulate, CaptionEmbedder, WindowAttention, MultiHeadCrossAttention, T2IFinalLayer, TimestepEmbedder, SizeEmbedder
from diffusion.model.nets.PixArt import PixArt, get_2d_sincos_pos_embed


class PatchEmbed(nn.Module):
    """ 2D Image to Patch Embedding

    Patches are extracted with a strided convolution whose kernel size and
    stride both equal the patch size, so no input size has to be fixed at
    construction time.
    """
    def __init__(self, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True, bias=True):
        super().__init__()
        self.patch_size = to_2tuple(patch_size)
        self.flatten = flatten
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=bias,
        )
        # Optional normalization over the embedding dimension.
        self.norm = nn.Identity() if not norm_layer else norm_layer(embed_dim)

    def forward(self, x):
        """Project ``x`` to patch embeddings, optionally flattening the grid."""
        patches = self.proj(x)
        if self.flatten:
            patches = patches.flatten(2).transpose(1, 2)  # BCHW -> BNC
        return self.norm(patches)


class PixArtMSBlock(nn.Module):
    """
    A PixArt block with adaptive layer norm zero (adaLN-Zero) conditioning.

    NOTE(review): ``forward`` uses the outputs of ``self.attn`` and
    ``self.cross_attn`` directly as tensors, whereas ``PixArtBlock`` in
    PixArt.py unpacks two values from the same classes — confirm which
    return convention the attention modules follow.
    """

    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., window_size=0, input_size=None, use_rel_pos=False, **block_kwargs):
        super().__init__()
        self.hidden_size = hidden_size

        # A windowed block attends over a (window_size, window_size) grid.
        attn_input_size = (window_size, window_size) if window_size != 0 else input_size
        mlp_hidden = int(hidden_size * mlp_ratio)

        # to be compatible with lower version pytorch
        def approx_gelu():
            return nn.GELU(approximate="tanh")

        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = WindowAttention(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            input_size=attn_input_size,
            use_rel_pos=use_rel_pos,
            **block_kwargs,
        )
        self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads, **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden, act_layer=approx_gelu, drop=0)
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.window_size = window_size
        # Learned table added to the timestep conditioning; its six rows
        # become shift/scale/gate for attention and for the MLP.
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5)

    def forward(self, x, y, t, mask=None, **kwargs):
        """Run self-attention, cross-attention and the MLP, each as a residual update."""
        batch, _, _ = x.shape

        modulation = self.scale_shift_table[None] + t.reshape(batch, 6, -1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=1)

        attn_out = self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa))
        x = x + self.drop_path(gate_msa * attn_out)

        x = x + self.cross_attn(x, y, mask)

        mlp_out = self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp))
        x = x + self.drop_path(gate_mlp * mlp_out)

        return x


#############################################################################
#                                 Core PixArt Model                                #
#################################################################################
@MODELS.register_module()
class PixArtMS(PixArt):
    """
    Multi-scale PixArt: a diffusion model with a Transformer backbone.

    On top of PixArt, the timestep embedding is additionally conditioned on the
    image size and aspect ratio (``csize_embedder`` / ``ar_embedder``), and the
    2D sin-cos positional embedding is recomputed in ``forward`` from the spatial
    size of each input instead of being fixed.
    """

    def __init__(self, input_size=32, patch_size=2, in_channels=4, hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0, class_dropout_prob=0.1, learn_sigma=True, pred_sigma=True, drop_path: float = 0., window_size=0, window_block_indexes=None, use_rel_pos=False, caption_channels=4096, lewei_scale=1., config=None, model_max_length=120, **kwargs):
        """
        Build the embedders, transformer blocks and final layer, then initialize weights.

        Every argument listed in the ``super().__init__`` call below is forwarded to
        PixArt unchanged. ``caption_channels`` is used only here, as the input width
        of the caption embedder. ``window_block_indexes`` lists the block indices
        that receive ``window_size`` / ``use_rel_pos``; all other blocks get 0 / False.
        """
        if window_block_indexes is None:
            window_block_indexes = []
        super().__init__(
            input_size=input_size,
            patch_size=patch_size,
            in_channels=in_channels,
            hidden_size=hidden_size,
            depth=depth,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            class_dropout_prob=class_dropout_prob,
            learn_sigma=learn_sigma,
            pred_sigma=pred_sigma,
            drop_path=drop_path,
            window_size=window_size,
            window_block_indexes=window_block_indexes,
            use_rel_pos=use_rel_pos,
            lewei_scale=lewei_scale,
            config=config,
            model_max_length=model_max_length,
            **kwargs,
        )
        # Token-grid height/width of the most recent input; set in forward() and read by unpatchify().
        self.h = self.w = 0
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        # The modules below are (re)assigned after PixArt.__init__ has run
        # (presumably replacing same-named modules created by the parent — confirm against PixArt).
        self.t_block = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )
        self.x_embedder = PatchEmbed(patch_size, in_channels, hidden_size, bias=True)
        self.y_embedder = CaptionEmbedder(in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=model_max_length)
        # Each size embedder outputs hidden_size//3 features per scalar; forward() concatenates
        # both outputs and adds them to the timestep embedding.
        self.csize_embedder = SizeEmbedder(hidden_size//3)  # c_size embed
        self.ar_embedder = SizeEmbedder(hidden_size//3)     # aspect ratio embed
        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            PixArtMSBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i],
                          input_size=(input_size // patch_size, input_size // patch_size),
                          window_size=window_size if i in window_block_indexes else 0,
                          use_rel_pos=use_rel_pos if i in window_block_indexes else False)
            for i in range(depth)
        ])
        self.final_layer = T2IFinalLayer(hidden_size, patch_size, self.out_channels)

        self.initialize()

    def forward(self, x, timestep, y, mask=None, data_info=None, **kwargs):
        """
        Forward pass of PixArt.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        timestep: (N,) tensor of diffusion timesteps
        y: (N, 1, L, C) tensor of caption embeddings (fed to the caption embedder)
        mask: optional caption-token mask; non-zero entries mark tokens that are kept
        data_info: dict with 'img_hw' and 'aspect_ratio' tensors. Despite the None
            default it is required: it is indexed unconditionally below.
        kwargs: forwarded to every transformer block.
        Returns the unpatchified output of shape (N, out_channels, H, W).
        """
        bs = x.shape[0]
        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)
        c_size, ar = data_info['img_hw'].to(self.dtype), data_info['aspect_ratio'].to(self.dtype)
        self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size
        # The positional embedding is rebuilt on every call for the current token grid.
        pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.pos_embed.shape[-1], (self.h, self.w), lewei_scale=self.lewei_scale, base_size=self.base_size)).unsqueeze(0).to(x.device).to(self.dtype)
        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(timestep)  # (N, D)
        csize = self.csize_embedder(c_size, bs)  # (N, D)
        ar = self.ar_embedder(ar, bs)  # (N, D)
        # presumably img_hw has two columns and aspect_ratio one, so the concatenation is D wide — confirm
        t = t + torch.cat([csize, ar], dim=1)
        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, D)
        if mask is not None:
            # Tile the mask when the caption batch is a multiple of the mask batch.
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # Keep only the unmasked caption tokens, packed into one (1, total_tokens, D) sequence.
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            # No mask: every sample contributes its full caption length.
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])
        for block in self.blocks:
            # y_lens is passed in the block's `mask` position (per-sample caption lengths).
            x = auto_grad_checkpoint(block, x, y, t0, y_lens, **kwargs)  # (N, T, D) #support grad checkpoint
        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)
        return x

    def forward_with_dpmsolver(self, x, timestep, y, data_info, **kwargs):
        """
        Forward pass for DPM-Solver, which does not need the variance prediction:
        only the first half of the output channels is returned.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        model_out = self.forward(x, timestep, y, data_info=data_info, **kwargs)
        return model_out.chunk(2, dim=1)[0]

    def forward_with_cfg(self, x, timestep, y, cfg_scale, data_info, **kwargs):
        """
        Forward pass of PixArt, but also batches the unconditional forward pass for classifier-free guidance.

        The first half of the batch is duplicated, and the two halves of the model
        output are treated as conditional and unconditional predictions respectively.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        # NOTE(review): **kwargs (including any `mask`) are accepted but not forwarded to forward() — confirm intended.
        model_out = self.forward(combined, timestep, y, data_info=data_info)
        # NOTE(review): guidance is applied to the first 3 output channels only; the rest pass through unchanged.
        eps, rest = model_out[:, :3], model_out[:, 3:]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([half_eps, half_eps], dim=0)
        return torch.cat([eps, rest], dim=1)

    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C)
        returns: (N, C, h * patch_size, w * patch_size), where (h, w) is the token
        grid recorded by the last forward() call and T must equal h * w.
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        assert self.h * self.w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], self.h, self.w, p, p, c))
        # Move channels first and interleave each patch's rows/cols with the grid rows/cols.
        x = torch.einsum('nhwpqc->nchpwq', x)
        return x.reshape(shape=(x.shape[0], c, self.h * p, self.w * p))

    def initialize(self):
        """Initialize all weights: Xavier for linear layers, then module-specific overrides."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)
        nn.init.normal_(self.csize_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.csize_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.ar_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.ar_embedder.mlp[2].weight, std=0.02)

        # Initialize caption embedding MLP:
        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)

        # Zero-out the cross-attention output projection in every PixArt block,
        # so each cross-attention branch initially contributes nothing:
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.proj.weight, 0)
            nn.init.constant_(block.cross_attn.proj.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)


#################################################################################
#                                   PixArt Configs                                  #
#################################################################################
@MODELS.register_module()
def PixArtMS_XL_2(**kwargs):
    """Build the XL/2 PixArtMS variant (28 blocks, width 1152, patch size 2, 16 heads)."""
    xl_2_config = dict(depth=28, hidden_size=1152, patch_size=2, num_heads=16)
    return PixArtMS(**xl_2_config, **kwargs)


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/nets/PixArt_blocks.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import math
import torch
import torch.nn as nn
from timm.models.vision_transformer import Mlp, Attention as Attention_
from einops import rearrange, repeat
import xformers.ops

from diffusion.model.utils import add_decomposed_rel_pos
from diffusion.model.cache_functions import cached_attention_forward


def modulate(x, shift, scale):
    """
    Apply a per-sample affine modulation: ``x * (1 + scale) + shift``.

    ``shift`` and ``scale`` get a new axis inserted at dim 1 so that they
    broadcast over that axis of ``x``.
    """
    scale_b = scale.unsqueeze(1)
    shift_b = shift.unsqueeze(1)
    return x * (1 + scale_b) + shift_b


def t2i_modulate(x, shift, scale):
    """
    Apply the affine modulation ``x * (1 + scale) + shift``.

    Unlike ``modulate``, no axis is inserted: ``shift`` and ``scale`` must
    already broadcast against ``x``.
    """
    gain = 1 + scale
    return x * gain + shift


class MultiHeadCrossAttention(nn.Module):
    """
    Multi-head cross-attention from image tokens (queries) to condition tokens
    (keys/values) that also returns the attention map.
    """

    def __init__(self, d_model, num_heads, attn_drop=0., proj_drop=0., **block_kwargs):
        """
        d_model: token width; must be divisible by num_heads.
        num_heads: number of attention heads.
        attn_drop: dropout probability handed to the attention kernel.
        proj_drop: dropout probability applied after the output projection.
        block_kwargs: accepted for signature compatibility and ignored.
        """
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        # Keys and values share a single projection of width 2 * d_model.
        self.q_linear = nn.Linear(d_model, d_model)
        self.kv_linear = nn.Linear(d_model, d_model*2)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(d_model, d_model)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, cond, mask=None):
        """
        x: (B, N, C) image tokens used as queries.
        cond: condition tokens used as keys/values.
        mask: when given, passed as the key/value sequence lengths of a
            block-diagonal attention bias whose query lengths are ``[N] * B``.

        Returns ``(out, attn_map)``: the projected (B, -1, C) output and the
        attention map reshaped to start with the batch dimension.
        """
        batch, tokens, channels = x.shape
        heads, head_dim = self.num_heads, self.head_dim

        # All samples are flattened into one long sequence (leading dim of 1).
        q = self.q_linear(x).view(1, -1, heads, head_dim)
        k, v = self.kv_linear(cond).view(1, -1, 2, heads, head_dim).unbind(2)

        if mask is None:
            attn_bias = None
        else:
            attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([tokens] * batch, mask)

        # xformers.ops.memory_efficient_attention cannot return the attention map,
        # which has to be kept here, so the project's own attention function is used.
        out, attn_map = cached_attention_forward(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)

        out = out.view(batch, -1, channels)
        attn_map = attn_map.view(batch, -1, attn_map.shape[-1])
        out = self.proj_drop(self.proj(out))

        return out, attn_map


class WindowAttention(Attention_):
    """
    Multi-head self-attention that returns the attention map alongside its output.

    Relative position parameters are created when ``use_rel_pos`` is set; the
    forward pass in this class does not read them.
    """

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=True,
        use_rel_pos=False,
        rel_pos_zero_init=True,
        input_size=None,
        **block_kwargs,
    ):
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            use_rel_pos (bool): If True, create relative positional embedding parameters.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters;
                otherwise they are drawn from a truncated normal (std 0.02).
            input_size (tuple or None): Input resolution (height, width) used to size the
                relative positional parameters; required when use_rel_pos is True.
            block_kwargs: Extra keyword arguments forwarded to the base attention class.
        """
        super().__init__(dim, num_heads=num_heads, qkv_bias=qkv_bias, **block_kwargs)

        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            # One embedding row per possible relative offset along each axis.
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, self.head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, self.head_dim))

            if not rel_pos_zero_init:
                for rel_pos in (self.rel_pos_h, self.rel_pos_w):
                    nn.init.trunc_normal_(rel_pos, std=0.02)

    def forward(self, x, mask=None):
        """
        x: (B, N, C) tokens.
        mask: optional mask; positions where it equals 0 receive a bias of -inf.

        Returns ``(out, attn_map)``, where ``out`` has shape (B, N, C).
        """
        batch, tokens, channels = x.shape
        head_dim = channels // self.num_heads
        q, k, v = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, head_dim).unbind(2)

        # Optional per-module switch: run the attention in float32.
        if getattr(self, 'fp32_attention', False):
            q, k, v = q.float(), k.float(), v.float()

        attn_bias = None
        if mask is not None:
            bias_shape = [batch * self.num_heads, q.shape[1], k.shape[1]]
            attn_bias = torch.zeros(bias_shape, dtype=q.dtype, device=q.device)
            attn_bias.masked_fill_(mask.squeeze(1).repeat(self.num_heads, 1, 1) == 0, float('-inf'))

        # The project's own attention function is used instead of
        # xformers.ops.memory_efficient_attention because the latter cannot return the
        # attention map. Only the cross-attention map is used in the final version, so
        # xformers can be substituted here for speed if the self-attention score (s1)
        # is not needed.
        out, attn_map = cached_attention_forward(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)
        out = out.view(batch, tokens, channels)
        out = self.proj_drop(self.proj(out))
        return out, attn_map


#################################################################################
#   AMP attention with fp32 softmax to fix loss NaN problem during training     #
#################################################################################
class Attention(Attention_):
    """Self-attention whose score/softmax step can be forced to float32 via ``fp32_attention``."""

    def forward(self, x):
        """
        x: (B, N, C) tokens. Returns the attended and projected (B, N, C) tokens.

        When the module has a truthy ``fp32_attention`` attribute, q and k are cast
        to float32 and autocast is disabled while scores and softmax are computed.
        """
        batch, tokens, channels = x.shape
        head_dim = channels // self.num_heads

        qkv = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, head_dim)
        # -> (3, B, heads, N, head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)

        fp32 = getattr(self, 'fp32_attention', False)
        if fp32:
            q, k = q.float(), k.float()

        with torch.cuda.amp.autocast(enabled=not fp32):
            scores = (q @ k.transpose(-2, -1)) * self.scale
            weights = scores.softmax(dim=-1)

        weights = self.attn_drop(weights)

        out = (weights @ v).transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj_drop(self.proj(out))


class FinalLayer(nn.Module):
    """
    Output head: LayerNorm, adaLN shift/scale modulation, then a linear projection
    to ``patch_size * patch_size * out_channels`` values per token.
    """

    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        patch_dim = patch_size * patch_size * out_channels
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_dim, bias=True)
        # Maps the conditioning vector to a (shift, scale) pair of width hidden_size each.
        to_shift_scale = nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), to_shift_scale)

    def forward(self, x, c):
        """Modulate the normalized tokens ``x`` with shift/scale derived from ``c`` and project."""
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
        modulated = modulate(self.norm_final(x), shift, scale)
        return self.linear(modulated)


class T2IFinalLayer(nn.Module):
    """
    Output head that modulates with a learned (2, hidden_size) shift/scale table
    offset by the conditioning ``t``, then projects each token to patch values.
    """

    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        patch_dim = patch_size * patch_size * out_channels
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_dim, bias=True)
        table_init = torch.randn(2, hidden_size) / hidden_size ** 0.5
        self.scale_shift_table = nn.Parameter(table_init)
        self.out_channels = out_channels

    def forward(self, x, t):
        """
        x: tokens to project.
        t: conditioning; a new axis is inserted at dim 1 before it is added to the table
           (presumably t is (N, hidden_size) — confirm against callers).
        """
        table = self.scale_shift_table[None] + t[:, None]
        shift, scale = table.chunk(2, dim=1)
        modulated = t2i_modulate(self.norm_final(x), shift, scale)
        return self.linear(modulated)


class MaskFinalLayer(nn.Module):
    """
    Output head whose conditioning width (``c_emb_size``) may differ from the
    token width (``final_hidden_size``).
    """

    def __init__(self, final_hidden_size, c_emb_size, patch_size, out_channels):
        super().__init__()
        patch_dim = patch_size * patch_size * out_channels
        self.norm_final = nn.LayerNorm(final_hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(final_hidden_size, patch_dim, bias=True)
        # Conditioning of width c_emb_size -> (shift, scale), each final_hidden_size wide.
        to_shift_scale = nn.Linear(c_emb_size, 2 * final_hidden_size, bias=True)
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), to_shift_scale)

    def forward(self, x, t):
        """Modulate the normalized tokens ``x`` with shift/scale derived from ``t`` and project."""
        shift, scale = self.adaLN_modulation(t).chunk(2, dim=1)
        modulated = modulate(self.norm_final(x), shift, scale)
        return self.linear(modulated)


class DecoderLayer(nn.Module):
    """
    Projects tokens from ``hidden_size`` to ``decoder_hidden_size`` after an
    adaLN shift/scale modulation of the normalized input.
    """

    def __init__(self, hidden_size, decoder_hidden_size):
        super().__init__()
        self.norm_decoder = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, decoder_hidden_size, bias=True)
        to_shift_scale = nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), to_shift_scale)

    def forward(self, x, t):
        """Modulate the normalized tokens ``x`` with shift/scale derived from ``t`` and project."""
        shift, scale = self.adaLN_modulation(t).chunk(2, dim=1)
        modulated = modulate(self.norm_decoder(x), shift, scale)
        return self.linear(modulated)


#################################################################################
#               Embedding Layers for Timesteps and Class Labels                 #
#################################################################################
class TimestepEmbedder(nn.Module):
    """
    Maps scalar timesteps to vectors: sinusoidal features followed by a two-layer MLP.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                          These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings, laid out as all
                 cosines, then all sines, then one zero column when dim is odd.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        positions = torch.arange(start=0, end=half, dtype=torch.float32, device=t.device)
        # Geometric frequency ladder from 1 down towards 1 / max_period.
        freqs = torch.exp(-math.log(max_period) * positions / half)
        angles = t[:, None].float() * freqs[None]
        parts = [torch.cos(angles), torch.sin(angles)]
        embedding = torch.cat(parts, dim=-1)
        if dim % 2:
            # Odd output width: pad with a single zero column.
            zero_col = torch.zeros_like(embedding[:, :1])
            embedding = torch.cat([embedding, zero_col], dim=-1)
        return embedding

    def forward(self, t):
        """Embed timesteps ``t`` and return the MLP output of shape (N, hidden_size)."""
        features = self.timestep_embedding(t, self.frequency_embedding_size)
        return self.mlp(features.to(self.dtype))

    @property
    def dtype(self):
        # Data type of the module's parameters (taken from the first parameter).
        first_param = next(self.parameters())
        return first_param.dtype


class SizeEmbedder(TimestepEmbedder):
    """
    Embeds scalar size values (one or more per sample) with the same sinusoidal
    features + MLP scheme as TimestepEmbedder, concatenating the per-value
    embeddings of each sample.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__(hidden_size=hidden_size, frequency_embedding_size=frequency_embedding_size)
        # Re-created here exactly as in the parent constructor.
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size
        # Embedding width produced for each individual scalar.
        self.outdim = hidden_size

    def forward(self, s, bs):
        """
        s: 1-D (B,) or 2-D (B, D) tensor of scalars; a 1-D input is treated as (B, 1).
        bs: target batch size; when it differs from B, ``s`` is tiled ``bs // B`` times.

        Returns a (bs, D * outdim) tensor.
        """
        sizes = s[:, None] if s.ndim == 1 else s
        assert sizes.ndim == 2
        rows = sizes.shape[0]
        if rows != bs:
            sizes = sizes.repeat(bs // rows, 1)
            assert sizes.shape[0] == bs
        batch = sizes.shape[0]
        values_per_sample = sizes.shape[1]

        # Embed every scalar independently, then regroup them per sample.
        flat = rearrange(sizes, "b d -> (b d)")
        features = self.timestep_embedding(flat, self.frequency_embedding_size).to(self.dtype)
        embedded = self.mlp(features)
        return rearrange(embedded, "(b d) d2 -> b (d d2)", b=batch, d=values_per_sample, d2=self.outdim)

    @property
    def dtype(self):
        # Data type of the module's parameters (taken from the first parameter).
        first_param = next(self.parameters())
        return first_param.dtype


class LabelEmbedder(nn.Module):
    """
    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.

    When ``dropout_prob > 0`` the embedding table gets one extra row, at index
    ``num_classes``, which represents the dropped (unconditional) label.
    """

    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        use_cfg_embedding = dropout_prob > 0
        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """
        Drops labels to enable classifier-free guidance.

        labels: (N,) integer class labels.
        force_drop_ids: optional (N,) tensor; entries equal to 1 are dropped
            deterministically. When None, each label is dropped independently
            with probability ``dropout_prob``.

        Returns the labels with dropped entries replaced by ``num_classes``.
        """
        if force_drop_ids is None:
            # Sample the mask on the labels' own device so this works on CPU
            # and on whichever GPU the inputs live on.
            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
        else:
            drop_ids = force_drop_ids == 1
        labels = torch.where(drop_ids, self.num_classes, labels)
        return labels

    def forward(self, labels, train, force_drop_ids=None):
        """
        Look up label embeddings, applying label dropout first when training with
        ``dropout_prob > 0`` or when ``force_drop_ids`` is given.
        """
        use_dropout = self.dropout_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            labels = self.token_drop(labels, force_drop_ids)
        return self.embedding_table(labels)


class CaptionEmbedder(nn.Module):
    """
    Projects caption features into the model width. Also handles caption dropout for classifier-free guidance.

    Dropped captions are replaced by ``y_embedding``, a (token_num, in_channels)
    tensor registered as a buffer.
    """

    # NOTE(review): the default ``act_layer`` is a module instance, whereas the call site
    # in this repository passes a factory (a lambda returning nn.GELU) — confirm which
    # form Mlp expects.
    def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=120):
        super().__init__()
        self.y_proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer, drop=0)
        self.register_buffer("y_embedding", nn.Parameter(torch.randn(token_num, in_channels) / in_channels ** 0.5))
        self.uncond_prob = uncond_prob

    def token_drop(self, caption, force_drop_ids=None):
        """
        Drops captions to enable classifier-free guidance.

        caption: 4-D tensor whose trailing two dims broadcast against ``y_embedding``.
        force_drop_ids: optional (N,) tensor; entries equal to 1 are dropped
            deterministically. When None, each sample is dropped independently
            with probability ``uncond_prob``.

        Returns the caption tensor with dropped samples replaced by ``y_embedding``.
        """
        if force_drop_ids is None:
            # Sample the mask on the caption's own device so this works on CPU
            # and on whichever GPU the inputs live on.
            drop_ids = torch.rand(caption.shape[0], device=caption.device) < self.uncond_prob
        else:
            drop_ids = force_drop_ids == 1
        caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
        return caption

    def forward(self, caption, train, force_drop_ids=None):
        """
        Optionally drop captions, then project them with ``y_proj``.

        During training the caption's trailing dims must match ``y_embedding``.
        Dropout runs when training with ``uncond_prob > 0`` or when
        ``force_drop_ids`` is given.
        """
        if train:
            assert caption.shape[2:] == self.y_embedding.shape
        use_dropout = self.uncond_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            caption = self.token_drop(caption, force_drop_ids)
        caption = self.y_proj(caption)
        return caption


class CaptionEmbedderDoubleBr(nn.Module):
    """
    Two-branch caption embedder: returns a projected global caption vector (the mean
    over tokens) together with the per-token caption. Also handles caption dropout
    for classifier-free guidance, using learned replacements for both branches.
    """

    # NOTE(review): the default ``act_layer`` is a module instance, whereas other call
    # sites in this repository pass a factory — confirm which form Mlp expects.
    def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=120):
        super().__init__()
        self.proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer, drop=0)
        # Learned "unconditional" replacements for the global and per-token branches.
        self.embedding = nn.Parameter(torch.randn(1, in_channels) / 10 ** 0.5)
        self.y_embedding = nn.Parameter(torch.randn(token_num, in_channels) / 10 ** 0.5)
        self.uncond_prob = uncond_prob

    def token_drop(self, global_caption, caption, force_drop_ids=None):
        """
        Drops captions to enable classifier-free guidance.

        global_caption: (N, in_channels) pooled caption.
        caption: 4-D per-token caption whose trailing dims broadcast against ``y_embedding``.
        force_drop_ids: optional (N,) tensor; entries equal to 1 are dropped
            deterministically. When None, each sample is dropped independently
            with probability ``uncond_prob``.

        Returns ``(global_caption, caption)`` with dropped samples replaced by
        ``embedding`` and ``y_embedding`` respectively.
        """
        if force_drop_ids is None:
            # Sample the mask on the inputs' own device so this works on CPU
            # and on whichever GPU the inputs live on.
            drop_ids = torch.rand(global_caption.shape[0], device=global_caption.device) < self.uncond_prob
        else:
            drop_ids = force_drop_ids == 1
        global_caption = torch.where(drop_ids[:, None], self.embedding, global_caption)
        caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
        return global_caption, caption

    def forward(self, caption, train, force_drop_ids=None):
        """
        caption: 4-D tensor whose trailing two dims equal ``y_embedding.shape``.

        Returns ``(y_embed, caption)``: the projected global caption and the
        (possibly dropped) per-token caption.
        """
        assert caption.shape[2: ] == self.y_embedding.shape
        # Pool over the token axis, then remove only the singleton axis at dim 1.
        # A bare squeeze() would also drop the batch axis when the batch size is 1.
        global_caption = caption.mean(dim=2).squeeze(1)
        use_dropout = self.uncond_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            global_caption, caption = self.token_drop(global_caption, caption, force_drop_ids)
        y_embed = self.proj(global_caption)
        return y_embed, caption

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/nets/__init__.py
================================================
from .PixArt import PixArt, PixArt_XL_2
from .PixArtMS import PixArtMS, PixArtMS_XL_2, PixArtMSBlock
from .pixart_controlnet import ControlPixArtHalf, ControlPixArtMSHalf

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/nets/pixart_controlnet.py
================================================
import re
import torch
import torch.nn as nn

from copy import deepcopy
from torch import Tensor
from torch.nn import Module, Linear, init
from typing import Any, Mapping

from diffusion.model.nets import PixArtMSBlock, PixArtMS, PixArt
from diffusion.model.nets.PixArt import get_2d_sincos_pos_embed
from diffusion.model.utils import auto_grad_checkpoint


# The implementation of the ControlNet-Half architecture
# https://github.com/lllyasviel/ControlNet/discussions/188
class ControlT2IDitBlockHalf(Module):
    """
    Trainable copy of one base transformer block, used as a control branch.

    The copy is followed by a zero-initialized ``after_proj`` that produces the
    skip signal; the block with index 0 additionally owns a zero-initialized
    ``before_proj`` applied to the control input before it is added to ``x``.
    """

    def __init__(self, base_block: PixArtMSBlock, block_index: int) -> None:
        super().__init__()
        self.copied_block = deepcopy(base_block)
        self.block_index = block_index

        # The copy is trained even if the base block's parameters are frozen.
        for param in self.copied_block.parameters():
            param.requires_grad_(True)

        self.copied_block.load_state_dict(base_block.state_dict())
        self.copied_block.train()

        hidden_size = base_block.hidden_size
        self.hidden_size = hidden_size
        if self.block_index == 0:
            self.before_proj = self._zero_linear(hidden_size)
        self.after_proj = self._zero_linear(hidden_size)

    @staticmethod
    def _zero_linear(width):
        # Square linear layer whose weight and bias both start at zero.
        layer = Linear(width, width)
        init.zeros_(layer.weight)
        init.zeros_(layer.bias)
        return layer

    def forward(self, x, y, t, mask=None, c=None):
        """
        x: tokens of the base branch (only read by the block with index 0).
        y, t, mask: passed through to the copied block.
        c: control tokens; required by every block.

        Returns ``(c, c_skip)``: the copied block's output and its projection
        through ``after_proj``.
        """
        if self.block_index == 0:
            # First block: inject the projected control signal into x.
            block_input = x + self.before_proj(c)
        else:
            # Later blocks continue from the previous control state.
            block_input = c

        c = self.copied_block(block_input, y, t, mask)
        return c, self.after_proj(c)
        

# The implementation of ControlPixArtHalf net
class ControlPixArtHalf(Module):
    """
    ControlNet-Half wrapper around a frozen PixArt base model.

    The first ``copy_blocks_num`` base blocks are copied into trainable control
    blocks (``self.controlnet``). Attributes not found on the wrapper are looked
    up on ``base_model`` through ``__getattr__``.
    """
    # only support single res model
    def __init__(self, base_model: PixArt, copy_blocks_num: int = 13) -> None:
        """Put ``base_model`` in eval mode, freeze its parameters and copy its first ``copy_blocks_num`` blocks."""
        super().__init__()
        self.base_model = base_model.eval()
        self.controlnet = []
        self.copy_blocks_num = copy_blocks_num
        self.total_blocks_num = len(base_model.blocks)
        for p in self.base_model.parameters():
            p.requires_grad_(False)

        # Copy first copy_blocks_num block
        for i in range(copy_blocks_num):
            self.controlnet.append(ControlT2IDitBlockHalf(base_model.blocks[i], i))
        self.controlnet = nn.ModuleList(self.controlnet)
    
    def __getattr__(self, name: str) -> Tensor or Module:
        """
        Fallback attribute lookup, used only when normal lookup fails.

        'base_model' and 'controlnet' are resolved by ``Module.__getattr__``; the
        listed method names are read from the instance ``__dict__``; every other
        name is delegated to ``base_model``.
        """
        if name in ['forward', 'forward_with_dpmsolver', 'forward_with_cfg', 'forward_c', 'load_state_dict']:
            return self.__dict__[name]
        elif name in ['base_model', 'controlnet']:
            return super().__getattr__(name)
        else:
            return getattr(self.base_model, name)

    def forward_c(self, c):
        """Patch-embed the control input ``c`` and add a positional embedding sized to its spatial dims."""
        # NOTE(review): c.shape is read before the `c is not None` check on the return line,
        # so a None input fails here rather than being passed through.
        self.h, self.w = c.shape[-2]//self.patch_size, c.shape[-1]//self.patch_size
        pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.pos_embed.shape[-1], (self.h, self.w), lewei_scale=self.lewei_scale, base_size=self.base_size)).unsqueeze(0).to(c.device).to(self.dtype)
        return self.x_embedder(c) + pos_embed if c is not None else c

    # def forward(self, x, t, c, **kwargs):
    #     return self.base_model(x, t, c=self.forward_c(c), **kwargs)
    def forward(self, x, timestep, y, mask=None, data_info=None, c=None, **kwargs):
        """
        Forward pass with optional control input ``c``.

        When ``c`` is given, base blocks 1..copy_blocks_num each receive
        ``x + c_skip`` from the matching control block. When ``c`` is None, the
        base blocks run unmodified. ``data_info`` is accepted but not used here.
        """
        # modify the original PixArtMS forward function
        if c is not None:
            c = c.to(self.dtype)
            c = self.forward_c(c)
        """
        Forward pass of PixArt.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        t: (N,) tensor of diffusion timesteps
        y: (N, 1, 120, C) tensor of class labels
        """
        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)
        # Uses the stored positional embedding (presumably the base model's, via __getattr__ — confirm).
        pos_embed = self.pos_embed.to(self.dtype)
        self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size
        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(timestep.to(x.dtype))  # (N, D)
        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
        if mask is not None:
            # Tile the mask when the caption batch is a multiple of the mask batch.
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # Keep only the unmasked caption tokens, packed into one (1, total_tokens, D) sequence.
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])

        # define the first layer
        x = auto_grad_checkpoint(self.base_model.blocks[0], x, y, t0, y_lens, **kwargs)  # (N, T, D) #support grad checkpoint

        if c is not None:
            # update c
            # Control block index-1 feeds base block index through its skip output.
            for index in range(1, self.copy_blocks_num + 1):
                c, c_skip = auto_grad_checkpoint(self.controlnet[index - 1], x, y, t0, y_lens, c, **kwargs)
                x = auto_grad_checkpoint(self.base_model.blocks[index], x + c_skip, y, t0, y_lens, **kwargs)
        
            # update x
            # Remaining base blocks run without any control signal.
            for index in range(self.copy_blocks_num + 1, self.total_blocks_num):
                x = auto_grad_checkpoint(self.base_model.blocks[index], x, y, t0, y_lens, **kwargs)
        else:
            for index in range(1, self.total_blocks_num):
                x = auto_grad_checkpoint(self.base_model.blocks[index], x, y, t0, y_lens, **kwargs)

        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)
        return x

    def forward_with_dpmsolver(self, x, t, y, data_info, c, **kwargs):
        """Run ``forward`` and return only the first half of the output channels."""
        model_out = self.forward(x, t, y, data_info=data_info, c=c, **kwargs)
        return model_out.chunk(2, dim=1)[0]

    # def forward_with_dpmsolver(self, x, t, y, data_info, c, **kwargs):
    #     return self.base_model.forward_with_dpmsolver(x, t, y, data_info=data_info, c=self.forward_c(c), **kwargs)

    def forward_with_cfg(self, x, t, y, cfg_scale, data_info, c, **kwargs):
        """Delegate to the base model's ``forward_with_cfg``, passing the embedded control input as ``c``."""
        return self.base_model.forward_with_cfg(x, t, y, cfg_scale, data_info, c=self.forward_c(c), **kwargs)

    def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
        """
        Load either a full wrapper checkpoint or a bare base-model checkpoint.

        If every key starts with 'base_model' or 'controlnet', the dict is loaded
        into the wrapper. Otherwise keys matching ``blocks.<n>...`` are renamed to
        ``blocks.<n>.base_block...`` (mutating ``state_dict`` in place and printing
        each rename) and the result is loaded into ``base_model``.
        """
        if all((k.startswith('base_model') or k.startswith('controlnet')) for k in state_dict.keys()):
            return super().load_state_dict(state_dict, strict)
        else:
            new_key = {}
            for k in state_dict.keys():
                new_key[k] = re.sub(r"(blocks\.\d+)(.*)", r"\1.base_block\2", k)
            for k, v in new_key.items():
                if k != v:
                    print(f"replace {k} to {v}")
                    state_dict[v] = state_dict.pop(k)

            return self.base_model.load_state_dict(state_dict, strict)
    
    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C)
        imgs: (N, C, h * patch_size, w * patch_size), where (h, w) is the token grid
        recorded by the last forward()/forward_c() call and T must equal h * w.
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        assert self.h * self.w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], self.h, self.w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], c, self.h * p, self.w * p))
        return imgs

    @property
    def dtype(self):
        # Return the data type of the model parameters (taken from the first parameter).
        return next(self.parameters()).dtype


# The implementation for PixArtMS_Half + 1024 resolution
class ControlPixArtMSHalf(ControlPixArtHalf):
    """ControlNet wrapper around a multi-scale PixArt base model.

    Supports the multi-scale resolution model (which can also be applied to
    single-resolution training and inference).
    """

    def __init__(self, base_model: PixArtMS, copy_blocks_num: int = 13) -> None:
        """Forward ``base_model`` and ``copy_blocks_num`` to the parent constructor."""
        super().__init__(base_model=base_model, copy_blocks_num=copy_blocks_num)

    def forward(self, x, timestep, y, mask=None, data_info=None, c=None, **kwargs):
        """
        Forward pass of PixArt (modified from the original PixArtMS forward).

        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        timestep: (N,) tensor of diffusion timesteps
        y: (N, 1, 120, C) tensor of class labels
        mask: optional tensor; when given, only the entries of ``y`` whose mask is
            non-zero are kept
        data_info: mapping providing the 'img_hw' and 'aspect_ratio' tensors
        c: optional control input; when given it is passed through ``self.forward_c``
            and injected via the ``self.controlnet`` blocks

        Side effect: stores the patch-grid size in ``self.h`` / ``self.w``.
        """
        model_dtype = self.dtype
        if c is not None:
            c = self.forward_c(c.to(model_dtype))

        batch_size = x.shape[0]
        x = x.to(model_dtype)
        timestep = timestep.to(model_dtype)
        y = y.to(model_dtype)
        c_size = data_info['img_hw'].to(model_dtype)
        ar = data_info['aspect_ratio'].to(model_dtype)
        self.h = x.shape[-2] // self.patch_size
        self.w = x.shape[-1] // self.patch_size

        # Positional embedding is rebuilt per call for the current grid size.
        grid_embed = get_2d_sincos_pos_embed(
            self.pos_embed.shape[-1], (self.h, self.w), lewei_scale=self.lewei_scale, base_size=self.base_size
        )
        pos_embed = torch.from_numpy(grid_embed).unsqueeze(0).to(x.device).to(model_dtype)

        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(timestep)  # (N, D)
        csize = self.csize_embedder(c_size, batch_size)  # (N, D)
        ar = self.ar_embedder(ar, batch_size)  # (N, D)
        t = t + torch.cat([csize, ar], dim=1)
        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, D)

        hidden_size = x.shape[-1]
        if mask is None:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, hidden_size)
        else:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, hidden_size)
            y_lens = mask.sum(dim=1).tolist()

        blocks = self.base_model.blocks

        # The first base block always runs without control injection.
        x = auto_grad_checkpoint(blocks[0], x, y, t0, y_lens, **kwargs)  # (N, T, D), supports grad checkpoint

        if c is not None:
            # Blocks 1..copy_blocks_num: update the control stream, then add its
            # skip output to the input of the matching base block.
            for index in range(1, self.copy_blocks_num + 1):
                c, c_skip = auto_grad_checkpoint(self.controlnet[index - 1], x, y, t0, y_lens, c, **kwargs)
                x = auto_grad_checkpoint(blocks[index], x + c_skip, y, t0, y_lens, **kwargs)
            first_plain_block = self.copy_blocks_num + 1
        else:
            first_plain_block = 1

        # Remaining base blocks run without control injection.
        for index in range(first_plain_block, self.total_blocks_num):
            x = auto_grad_checkpoint(blocks[index], x, y, t0, y_lens, **kwargs)

        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)
        return x


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/respace.py
================================================
# Modified from OpenAI's diffusion repos
#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py

import numpy as np
import torch as th

from .gaussian_diffusion import GaussianDiffusion


def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.
    For example, if there's 300 timesteps and the section counts are [10,15,20]
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.
    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.
    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    :raises ValueError: if no integer stride yields exactly N steps in "ddimN"
                        mode, or if a section holds fewer steps than requested
                        from it.
    """
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim"):])
            # Try every integer stride and keep the first one that produces
            # exactly the requested number of steps.
            for stride in range(1, num_timesteps):
                candidate = range(0, num_timesteps, stride)
                if len(candidate) == desired_count:
                    return set(candidate)
            # Report the count that was asked for, not the length of the
            # original process.
            raise ValueError(
                f"cannot create exactly {desired_count} steps with an integer stride"
            )
        section_counts = [int(x) for x in section_counts.split(",")]

    # Split the process into len(section_counts) contiguous sections; the first
    # `extra` sections absorb the remainder, one step each.
    size_per = num_timesteps // len(section_counts)
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(
                f"cannot divide section of {size} steps into {section_count}"
            )
        # A fractional stride makes the first and last step of the section land
        # on the section's first and last index.
        frac_stride = 1 if section_count <= 1 else (size - 1) / (section_count - 1)
        cur_idx = 0.0
        for _ in range(section_count):
            all_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        start_idx += size
    return set(all_steps)


class SpacedDiffusion(GaussianDiffusion):
    """
    A diffusion process which can skip steps in a base diffusion process.

    The betas of the shortened process are recomputed from the base process's
    cumulative alphas at the retained steps, and ``timestep_map`` records which
    original step each shortened step corresponds to.

    :param use_timesteps: a collection (sequence or set) of timesteps from the
                          original diffusion process to retain.
    :param kwargs: the kwargs to create the base diffusion process.
    """

    def __init__(self, use_timesteps, **kwargs):
        self.use_timesteps = set(use_timesteps)
        self.timestep_map = []
        self.original_num_steps = len(kwargs["betas"])

        # Instantiate the full-length process only to read its cumulative alphas.
        full_process = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
        respaced_betas = []
        prev_alpha_cumprod = 1.0
        for step, alpha_cumprod in enumerate(full_process.alphas_cumprod):
            if step not in self.use_timesteps:
                continue
            # Beta that takes the previous retained step directly to this one.
            respaced_betas.append(1 - alpha_cumprod / prev_alpha_cumprod)
            prev_alpha_cumprod = alpha_cumprod
            self.timestep_map.append(step)
        kwargs["betas"] = np.array(respaced_betas)
        super().__init__(**kwargs)

    def p_mean_variance(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        """Call the parent implementation with ``model`` wrapped for timestep remapping."""
        wrapped = self._wrap_model(model)
        return super().p_mean_variance(wrapped, *args, **kwargs)

    def training_losses(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        """Call the parent implementation with ``model`` wrapped for timestep remapping."""
        wrapped = self._wrap_model(model)
        return super().training_losses(wrapped, *args, **kwargs)

    def training_losses_diffusers(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        """Call the parent implementation with ``model`` wrapped for timestep remapping."""
        wrapped = self._wrap_model(model)
        return super().training_losses_diffusers(wrapped, *args, **kwargs)

    def condition_mean(self, cond_fn, *args, **kwargs):
        """Call the parent implementation with ``cond_fn`` wrapped for timestep remapping."""
        wrapped = self._wrap_model(cond_fn)
        return super().condition_mean(wrapped, *args, **kwargs)

    def condition_score(self, cond_fn, *args, **kwargs):
        """Call the parent implementation with ``cond_fn`` wrapped for timestep remapping."""
        wrapped = self._wrap_model(cond_fn)
        return super().condition_score(wrapped, *args, **kwargs)

    def _wrap_model(self, model):
        """Return ``model`` wrapped in ``_WrappedModel`` unless it already is one."""
        if not isinstance(model, _WrappedModel):
            model = _WrappedModel(
                model, self.timestep_map, self.original_num_steps
            )
        return model

    def _scale_timesteps(self, t):
        """Return ``t`` unchanged; scaling is done by the wrapped model."""
        return t


class _WrappedModel:
    """Callable adapter that translates spaced-schedule indices to original timesteps.

    ``timestep_map[i]`` is the original timestep for index ``i`` of the shortened
    schedule. No rescaling of the mapped timesteps is applied.
    """

    def __init__(self, model, timestep_map, original_num_steps):
        self.model = model
        self.timestep_map = timestep_map
        self.original_num_steps = original_num_steps

    def __call__(self, x, timestep, **kwargs):
        """Look up the original timesteps for ``timestep`` and call the wrapped model."""
        # The lookup table is built on the same device and dtype as the incoming indices.
        lookup = th.tensor(self.timestep_map, device=timestep.device, dtype=timestep.dtype)
        original_ts = lookup[timestep]
        return self.model(x, timestep=original_ts, **kwargs)


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/sa_solver.py
================================================
import torch
import torch.nn.functional as F
import math
from tqdm import tqdm


class NoiseScheduleVP:
    """Noise schedule of a variance-preserving (VP) forward SDE; see ``__init__`` for details."""

    def __init__(
            self,
            schedule='discrete',
            betas=None,
            alphas_cumprod=None,
            continuous_beta_0=0.1,
            continuous_beta_1=20.,
            dtype=torch.float32,
    ):
        r"""Create a wrapper class for the forward SDE (VP type).

        Thanks to DPM-Solver for their code base.
        ***
        Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t.
                We recommend to use schedule='discrete' for the discrete-time diffusion models, especially for high-resolution images.
        ***
        The forward SDE ensures that the condition distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
        We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
        Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:
            log_alpha_t = self.marginal_log_mean_coeff(t)
            sigma_t = self.marginal_std(t)
            lambda_t = self.marginal_lambda(t)
        Moreover, as lambda(t) is an invertible function, we also support its inverse function:
            t = self.inverse_lambda(lambda_t)
        ===============================================================
        We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).
        1. For discrete-time DPMs:
            For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
                t_i = (i + 1) / N
            e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
            We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.
            Args:
                betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
                alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)
            Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`;
            `betas` takes precedence when both are given.
            **Important**:  Please pay special attention for the args for `alphas_cumprod`:
                The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that
                    q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
                Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
                    alpha_{t_n} = \sqrt{\hat{alpha_n}},
                and
                    log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).
        2. For continuous-time DPMs:
            We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM).
            Args:
                continuous_beta_0: A `float` number. The smallest beta for the linear schedule.
                continuous_beta_1: A `float` number. The largest beta for the linear schedule.
            The cosine hyperparameters are fixed inside this constructor (cosine_s = 0.008,
            cosine_beta_max = 999.), and the ending time T is 0.9946 for 'cosine' and 1. otherwise.
        ===============================================================
        Args:
            schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
                    'linear' or 'cosine' for continuous-time DPMs.
            dtype: The torch dtype used for the stored discrete-time lookup arrays.
        Returns:
            A wrapper object of the forward SDE (VP type).
        Raises:
            ValueError: if `schedule` is not one of 'discrete', 'linear' or 'cosine'.

        ===============================================================
        Example:
        # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
        >>> ns = NoiseScheduleVP('discrete', betas=betas)
        # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
        >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
        # For continuous-time DPMs (VPSDE), linear schedule:
        >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)
        """

        if schedule not in ['discrete', 'linear', 'cosine']:
            raise ValueError(
                f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear' or 'cosine'"
            )

        self.schedule = schedule
        if schedule == 'discrete':
            if betas is not None:
                log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
            else:
                assert alphas_cumprod is not None
                log_alphas = 0.5 * torch.log(alphas_cumprod)
            self.total_N = len(log_alphas)
            self.T = 1.
            # t_i = (i + 1) / N for i = 0..N-1, stored as a (1, N) row for interpolation.
            self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1)).to(dtype=dtype)
            self.log_alpha_array = log_alphas.reshape((1, -1,)).to(dtype=dtype)
        else:
            self.total_N = 1000
            self.beta_0 = continuous_beta_0
            self.beta_1 = continuous_beta_1
            self.cosine_s = 0.008
            self.cosine_beta_max = 999.
            self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (
                        1. + self.cosine_s) / math.pi - self.cosine_s
            self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.))
            self.T = 0.9946 if schedule == 'cosine' else 1.

    def marginal_log_mean_coeff(self, t):
        """
        Compute log(alpha_t) of a given continuous-time label t in [0, T].
        """
        if self.schedule == 'discrete':
            # Piecewise-linear interpolation over the stored (t_i, log_alpha_i) table.
            return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device),
                                  self.log_alpha_array.to(t.device)).reshape((-1))
        elif self.schedule == 'linear':
            return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
        elif self.schedule == 'cosine':
            log_alpha = torch.log(torch.cos((t + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
            return log_alpha - self.cosine_log_alpha_0

    def marginal_alpha(self, t):
        """
        Compute alpha_t of a given continuous-time label t in [0, T].
        """
        return torch.exp(self.marginal_log_mean_coeff(t))

    def marginal_std(self, t):
        """
        Compute sigma_t of a given continuous-time label t in [0, T].
        """
        return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))

    def marginal_lambda(self, t):
        """
        Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
        """
        log_mean_coeff = self.marginal_log_mean_coeff(t)
        log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff))
        return log_mean_coeff - log_std

    def inverse_lambda(self, lamb):
        """
        Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
        """
        if self.schedule == 'linear':
            tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
            Delta = self.beta_0 ** 2 + tmp
            return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
        elif self.schedule == 'discrete':
            log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
            # The tables are flipped before interpolating from log_alpha back to t.
            t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]),
                               torch.flip(self.t_array.to(lamb.device), [1]))
            return t.reshape((-1,))
        else:
            log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
            return torch.arccos(torch.exp(log_alpha + self.cosine_log_alpha_0)) * 2. * (
                        1. + self.cosine_s) / math.pi - self.cosine_s

    def edm_sigma(self, t):
        """
        Compute sigma_t / alpha_t of a given continuous-time label t.
        """
        return self.marginal_std(t) / self.marginal_alpha(t)

    def edm_inverse_sigma(self, edmsigma):
        """
        Compute the continuous-time label t whose sigma_t / alpha_t equals `edmsigma`.
        """
        alpha = 1 / (edmsigma ** 2 + 1).sqrt()
        sigma = alpha * edmsigma
        lambda_t = torch.log(alpha / sigma)
        return self.inverse_lambda(lambda_t)


def model_wrapper(
        model,
        noise_schedule,
        model_type="noise",
        model_kwargs=None,
        guidance_type="uncond",
        condition=None,
        unconditional_condition=None,
        guidance_scale=1.,
        classifier_fn=None,
        classifier_kwargs=None,
):
    """Create a wrapper function for the noise prediction model.

    Thanks to DPM-Solver for their code base.
    SA-Solver needs to solve the continuous-time diffusion SDEs. For DPMs trained on discrete-time labels, we need to
    firstly wrap the model function to a noise prediction model that accepts the continuous time as the input.
    We support four types of the diffusion model by setting `model_type`:
        1. "noise": noise prediction model. (Trained by predicting noise).
        2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).
        3. "v": velocity prediction model. (Trained by predicting the velocity).
            The "v" prediction is derivation detailed in Appendix D of [1], and is used in Imagen-Video [2].
            [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
                arXiv preprint arXiv:2202.00512 (2022).
            [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
                arXiv preprint arXiv:2210.02303 (2022).

        4. "score": marginal score function. (Trained by denoising score matching).
            Note that the score function and the noise prediction model follows a simple relationship:
            ```
                noise(x_t, t) = -sigma_t * score(x_t, t)
            ```
    We support three types of guided sampling by DPMs by setting `guidance_type`:
        1. "uncond": unconditional sampling by DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
            ``
        2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
            The input `model` has the following format:
            ``
                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
            ``
            The input `classifier_fn` has the following format:
            ``
                classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
            ``
            [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
                in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.
        3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
            ``
            And if cond == `unconditional_condition`, the model output is the unconditional DPM output.
            [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
                arXiv preprint arXiv:2207.12598 (2022).

    The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
    or continuous-time labels (i.e. epsilon to T).
    We wrap the model function to accept only `x` and `t_continuous` as inputs, and outputs the predicted noise:
    ``
        def model_fn(x, t_continuous) -> noise:
            t_input = get_model_input_time(t_continuous)
            return noise_pred(model, x, t_input, **model_kwargs)
    ``
    where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for SA-Solver.
    ===============================================================
    Args:
        model: A diffusion model with the corresponding format described above.
        noise_schedule: A noise schedule object, such as NoiseScheduleVP.
        model_type: A `str`. The parameterization type of the diffusion model.
                    "noise" or "x_start" or "v" or "score".
        model_kwargs: A `dict`. A dict for the other inputs of the model function.
                    `None` (the default) means no extra inputs.
        guidance_type: A `str`. The type of the guidance for sampling.
                    "uncond" or "classifier" or "classifier-free".
        condition: A pytorch tensor. The condition for the guided sampling.
                    Only used for "classifier" or "classifier-free" guidance type.
        unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
                    Only used for "classifier-free" guidance type.
        guidance_scale: A `float`. The scale for the guided sampling.
        classifier_fn: A classifier function. Only used for the classifier guidance.
        classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function.
                    `None` (the default) means no extra inputs.
    Returns:
        A noise prediction model that accepts the noised data and the continuous time as the inputs.
    """
    assert model_type in ["noise", "x_start", "v", "score"]
    assert guidance_type in ["uncond", "classifier", "classifier-free"]

    # Fresh dicts per call instead of shared mutable defaults.
    model_kwargs = {} if model_kwargs is None else model_kwargs
    classifier_kwargs = {} if classifier_kwargs is None else classifier_kwargs

    def get_model_input_time(t_continuous):
        """
        Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
        For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
        For continuous-time DPMs, we just use `t_continuous`.
        """
        if noise_schedule.schedule == 'discrete':
            return (t_continuous - 1. / noise_schedule.total_N) * 1000.
        return t_continuous

    def noise_pred_fn(x, t_continuous, cond=None):
        """
        Call the model and convert its output to a noise prediction according to `model_type`.
        """
        t_input = get_model_input_time(t_continuous)
        if cond is None:
            output = model(x, t_input, **model_kwargs)
        else:
            output = model(x, t_input, cond, **model_kwargs)
        if model_type == "noise":
            return output
        # The conversions below take the coefficient of the first entry of
        # `t_continuous` only, i.e. they assume one shared time for the batch.
        if model_type == "x_start":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return (x - alpha_t[0] * output) / sigma_t[0]
        if model_type == "v":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return alpha_t[0] * output + sigma_t[0] * x
        if model_type == "score":
            sigma_t = noise_schedule.marginal_std(t_continuous)
            return -sigma_t[0] * output

    def cond_grad_fn(x, t_input):
        """
        Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
        """
        with torch.enable_grad():
            x_in = x.detach().requires_grad_(True)
            log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
            return torch.autograd.grad(log_prob.sum(), x_in)[0]

    def model_fn(x, t_continuous):
        """
        The noise prediction model function that is used for SA-Solver.
        """
        if guidance_type == "uncond":
            return noise_pred_fn(x, t_continuous)
        if guidance_type == "classifier":
            assert classifier_fn is not None
            t_input = get_model_input_time(t_continuous)
            cond_grad = cond_grad_fn(x, t_input)
            sigma_t = noise_schedule.marginal_std(t_continuous)
            # Reshape the per-sample sigma to (N, 1, ..., 1) so it broadcasts over
            # the non-batch axes of the gradient; a bare (N,) vector would be
            # aligned with the last axis instead.
            sigma_t = sigma_t.reshape(-1, *([1] * (x.dim() - 1)))
            noise = noise_pred_fn(x, t_continuous)
            return noise - guidance_scale * sigma_t * cond_grad
        if guidance_type == "classifier-free":
            if guidance_scale == 1. or unconditional_condition is None:
                return noise_pred_fn(x, t_continuous, cond=condition)
            # One batched call: first half unconditional, second half conditional.
            x_in = torch.cat([x] * 2)
            t_in = torch.cat([t_continuous] * 2)
            c_in = torch.cat([unconditional_condition, condition])
            noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
            return noise_uncond + guidance_scale * (noise - noise_uncond)

    return model_fn


class SASolver:
    def __init__(
            self,
            model_fn,
            noise_schedule,
            algorithm_type="data_prediction",
            correcting_x0_fn=None,
            correcting_xt_fn=None,
            thresholding_max_val=1.,
            dynamic_thresholding_ratio=0.995
    ):
        """
        Construct a SA-Solver
        The default value for algorithm_type is "data_prediction" and we recommend not to change it to
        "noise_prediction". For details, please see Appendix A.2.4 in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
        """

        self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
        self.noise_schedule = noise_schedule
        assert algorithm_type in ["data_prediction", "noise_prediction"]

        if correcting_x0_fn == "dynamic_thresholding":
            self.correcting_x0_fn = self.dynamic_thresholding_fn
        else:
            self.correcting_x0_fn = correcting_x0_fn

        self.correcting_xt_fn = correcting_xt_fn
        self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
        self.thresholding_max_val = thresholding_max_val

        self.predict_x0 = algorithm_type == "data_prediction"

        self.sigma_min = float(self.noise_schedule.edm_sigma(torch.tensor([1e-3])))
        self.sigma_max = float(self.noise_schedule.edm_sigma(torch.tensor([1])))

    def dynamic_thresholding_fn(self, x0, t=None):
        """
        The dynamic thresholding method.

        For every sample, take the `dynamic_thresholding_ratio` quantile of |x0|
        over all non-batch elements, raise it to at least `thresholding_max_val`,
        clamp x0 to [-s, s] and divide by s. `t` is accepted but not used.

        The result has the same dtype as `x0`.
        """
        dims = x0.dim()
        p = self.dynamic_thresholding_ratio
        # torch.quantile only accepts float32/float64 input, so half/bfloat16
        # predictions are processed in float32 and cast back at the end.
        work = x0 if x0.dtype in (torch.float32, torch.float64) else x0.float()
        s = torch.quantile(torch.abs(work).reshape((work.shape[0], -1)), p, dim=1)
        # expand_dims is defined elsewhere in this file; presumably it appends
        # singleton dims so the per-sample threshold broadcasts against x0.
        s = expand_dims(torch.maximum(s, self.thresholding_max_val * torch.ones_like(s)), dims)
        return (torch.clamp(work, -s, s) / s).to(x0.dtype)

    def noise_prediction_fn(self, x, t):
        """
        Return the noise prediction model.
        """
        return self.model(x, t)

    def data_prediction_fn(self, x, t):
        """
        Return the data prediction model (with corrector).
        """
        noise = self.noise_prediction_fn(x, t)
        alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
        x0 = (x - sigma_t * noise) / alpha_t
        if self.correcting_x0_fn is not None:
            x0 = self.correcting_x0_fn(x0)
        return x0

    def model_fn(self, x, t):
        """
        Convert the model to the noise prediction model or the data prediction model.
        """

        if self.predict_x0:
            return self.data_prediction_fn(x, t)
        else:
            return self.noise_prediction_fn(x, t)

    def get_time_steps(self, skip_type, t_T, t_0, N, order, device):
        """Compute the intermediate time steps for sampling.
        """
        if skip_type == 'logSNR':
            lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
            lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
            logSNR_steps = lambda_T + torch.linspace(torch.tensor(0.).cpu().item(),
                                                     (lambda_0 - lambda_T).cpu().item() ** (1. / order), N + 1).pow(
                order).to(device)
            return self.noise_schedule.inverse_lambda(logSNR_steps)
        elif skip_type == 'time':
            t = torch.linspace(t_T ** (1. / order), t_0 ** (1. / order), N + 1).pow(order).to(device)
            return t
        elif skip_type == 'karras':
            sigma_min = max(0.002, self.sigma_min)
            sigma_max = min(80, self.sigma_max)
            sigma_steps = torch.linspace(sigma_max ** (1. / 7), sigma_min ** (1. / 7), N + 1).pow(7).to(device)
            return self.noise_schedule.edm_inverse_sigma(sigma_steps)
        else:
            raise ValueError(
                f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time' or 'karras'"
            )

    def denoise_to_zero_fn(self, x, s):
        """
        Denoise at the final step, which is equivalent to solve the ODE from lambda_s to infty by first-order discretization.
        """
        return self.data_prediction_fn(x, s)

    def get_coefficients_exponential_negative(self, order, interval_start, interval_end):
        """
        Calculate the integral of exp(-x) * x^order dx from interval_start to interval_end
        For calculating the coefficient of gradient terms after the lagrange interpolation,
        see Eq.(15) and Eq.(18) in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
        For noise_prediction formula.
        """
        assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"

        if order == 0:
            return torch.exp(-interval_end) * (torch.exp(interval_end - interval_start) - 1)
        elif order == 1:
            return torch.exp(-interval_end) * (
                        (interval_start + 1) * torch.exp(interval_end - interval_start) - (interval_end + 1))
        elif order == 2:
            return torch.exp(-interval_end) * (
                        (interval_start ** 2 + 2 * interval_start + 2) * torch.exp(interval_end - interval_start) - (
                            interval_end ** 2 + 2 * interval_end + 2))
        elif order == 3:
            return torch.exp(-interval_end) * (
                        (interval_start ** 3 + 3 * interval_start ** 2 + 6 * interval_start + 6) * torch.exp(
                    interval_end - interval_start) - (interval_end ** 3 + 3 * interval_end ** 2 + 6 * interval_end + 6))

    def get_coefficients_exponential_positive(self, order, interval_start, interval_end, tau):
        """
        Calculate the integral of exp(x(1+tau^2)) * x^order dx from interval_start to interval_end
        For calculating the coefficient of gradient terms after the lagrange interpolation,
        see Eq.(15) and Eq.(18) in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
        For data_prediction formula.
        """
        assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"

        # after change of variable(cov)
        interval_end_cov = (1 + tau ** 2) * interval_end
        interval_start_cov = (1 + tau ** 2) * interval_start

        if order == 0:
            return torch.exp(interval_end_cov) * (1 - torch.exp(-(interval_end_cov - interval_start_cov))) / (
            (1 + tau ** 2))
        elif order == 1:
            return torch.exp(interval_end_cov) * ((interval_end_cov - 1) - (interval_start_cov - 1) * torch.exp(
                -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 2)
        elif order == 2:
            return torch.exp(interval_end_cov) * ((interval_end_cov ** 2 - 2 * interval_end_cov + 2) - (
                        interval_start_cov ** 2 - 2 * interval_start_cov + 2) * torch.exp(
                -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 3)
        elif order == 3:
            return torch.exp(interval_end_cov) * (
                        (interval_end_cov ** 3 - 3 * interval_end_cov ** 2 + 6 * interval_end_cov - 6) - (
                            interval_start_cov ** 3 - 3 * interval_start_cov ** 2 + 6 * interval_start_cov - 6) * torch.exp(
                    -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 4)

    def lagrange_polynomial_coefficient(self, order, lambda_list):
        """
        Calculate the coefficient of lagrange polynomial
        For lagrange interpolation
        """
        assert order in [0, 1, 2, 3]
        assert order == len(lambda_list) - 1
        if order == 0:
            return [[1]]
        elif order == 1:
            return [[1 / (lambda_list[0] - lambda_list[1]), -lambda_list[1] / (lambda_list[0] - lambda_list[1])],
                    [1 / (lambda_list[1] - lambda_list[0]), -lambda_list[0] / (lambda_list[1] - lambda_list[0])]]
        elif order == 2:
            denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2])
            denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2])
            denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1])
            return [[1 / denominator1,
                     (-lambda_list[1] - lambda_list[2]) / denominator1,
                     lambda_list[1] * lambda_list[2] / denominator1],

                    [1 / denominator2,
                     (-lambda_list[0] - lambda_list[2]) / denominator2,
                     lambda_list[0] * lambda_list[2] / denominator2],

                    [1 / denominator3,
                     (-lambda_list[0] - lambda_list[1]) / denominator3,
                     lambda_list[0] * lambda_list[1] / denominator3]
                    ]
        elif order == 3:
            denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2]) * (
                        lambda_list[0] - lambda_list[3])
            denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2]) * (
                        lambda_list[1] - lambda_list[3])
            denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1]) * (
                        lambda_list[2] - lambda_list[3])
            denominator4 = (lambda_list[3] - lambda_list[0]) * (lambda_list[3] - lambda_list[1]) * (
                        lambda_list[3] - lambda_list[2])
            return [[1 / denominator1,
                     (-lambda_list[1] - lambda_list[2] - lambda_list[3]) / denominator1,
                     (lambda_list[1] * lambda_list[2] + lambda_list[1] * lambda_list[3] + lambda_list[2] * lambda_list[
                         3]) / denominator1,
                     (-lambda_list[1] * lambda_list[2] * lambda_list[3]) / denominator1],

                    [1 / denominator2,
                     (-lambda_list[0] - lambda_list[2] - lambda_list[3]) / denominator2,
                     (lambda_list[0] * lambda_list[2] + lambda_list[0] * lambda_list[3] + lambda_list[2] * lambda_list[
                         3]) / denominator2,
                     (-lambda_list[0] * lambda_list[2] * lambda_list[3]) / denominator2],

                    [1 / denominator3,
                     (-lambda_list[0] - lambda_list[1] - lambda_list[3]) / denominator3,
                     (lambda_list[0] * lambda_list[1] + lambda_list[0] * lambda_list[3] + lambda_list[1] * lambda_list[
                         3]) / denominator3,
                     (-lambda_list[0] * lambda_list[1] * lambda_list[3]) / denominator3],

                    [1 / denominator4,
                     (-lambda_list[0] - lambda_list[1] - lambda_list[2]) / denominator4,
                     (lambda_list[0] * lambda_list[1] + lambda_list[0] * lambda_list[2] + lambda_list[1] * lambda_list[
                         2]) / denominator4,
                     (-lambda_list[0] * lambda_list[1] * lambda_list[2]) / denominator4]

                    ]

    def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, tau):
        """
        Calculate the coefficient of gradients.
        """
        assert order in [1, 2, 3, 4]
        assert order == len(lambda_list), 'the length of lambda list must be equal to the order'
        coefficients = []
        lagrange_coefficient = self.lagrange_polynomial_coefficient(order - 1, lambda_list)
        for i in range(order):
            coefficient = sum(
                lagrange_coefficient[i][j]
                * self.get_coefficients_exponential_positive(
                    order - 1 - j, interval_start, interval_end, tau
                )
                if self.predict_x0
                else lagrange_coefficient[i][j]
                * self.get_coefficients_exponential_negative(
                    order - 1 - j, interval_start, interval_end
                )
                for j in range(order)
            )
            coefficients.append(coefficient)
        assert len(coefficients) == order, 'the length of coefficients does not match the order'
        return coefficients

    def adams_bashforth_update(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
        """
        SA-Predictor, without the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
        """
        assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"

        # get noise schedule
        ns = self.noise_schedule
        alpha_t = ns.marginal_alpha(t)
        sigma_t = ns.marginal_std(t)
        lambda_t = ns.marginal_lambda(t)
        alpha_prev = ns.marginal_alpha(t_prev_list[-1])
        sigma_prev = ns.marginal_std(t_prev_list[-1])
        gradient_part = torch.zeros_like(x)
        h = lambda_t - ns.marginal_lambda(t_prev_list[-1])
        lambda_list = [ns.marginal_lambda(t_prev_list[-(i + 1)]) for i in range(order)]
        gradient_coefficients = self.get_coefficients_fn(order, ns.marginal_lambda(t_prev_list[-1]), lambda_t,
                                                         lambda_list, tau)

        for i in range(order):
            if self.predict_x0:
                gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
                    i] * model_prev_list[-(i + 1)]
            else:
                gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]

        if self.predict_x0:
            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
        else:
            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise

        if self.predict_x0:
            x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x + gradient_part + noise_part
        else:
            x_t = (alpha_t / alpha_prev) * x + gradient_part + noise_part

        return x_t

    def adams_moulton_update(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
        """
        SA-Corrector, without the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
        """

        assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"

        # get noise schedule
        ns = self.noise_schedule
        alpha_t = ns.marginal_alpha(t)
        sigma_t = ns.marginal_std(t)
        lambda_t = ns.marginal_lambda(t)
        alpha_prev = ns.marginal_alpha(t_prev_list[-1])
        sigma_prev = ns.marginal_std(t_prev_list[-1])
        gradient_part = torch.zeros_like(x)
        h = lambda_t - ns.marginal_lambda(t_prev_list[-1])
        t_list = t_prev_list + [t]
        lambda_list = [ns.marginal_lambda(t_list[-(i + 1)]) for i in range(order)]
        gradient_coefficients = self.get_coefficients_fn(order, ns.marginal_lambda(t_prev_list[-1]), lambda_t,
                                                         lambda_list, tau)

        for i in range(order):
            if self.predict_x0:
                gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
                    i] * model_prev_list[-(i + 1)]
            else:
                gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]

        if self.predict_x0:
            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
        else:
            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise

        if self.predict_x0:
            x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x + gradient_part + noise_part
        else:
            x_t = (alpha_t / alpha_prev) * x + gradient_part + noise_part

        return x_t

    def adams_bashforth_update_few_steps(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
        """
        SA-Predictor, with the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
        """

        assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"

        # get noise schedule
        ns = self.noise_schedule
        alpha_t = ns.marginal_alpha(t)
        sigma_t = ns.marginal_std(t)
        lambda_t = ns.marginal_lambda(t)
        alpha_prev = ns.marginal_alpha(t_prev_list[-1])
        sigma_prev = ns.marginal_std(t_prev_list[-1])
        gradient_part = torch.zeros_like(x)
        h = lambda_t - ns.marginal_lambda(t_prev_list[-1])
        lambda_list = [ns.marginal_lambda(t_prev_list[-(i + 1)]) for i in range(order)]
        gradient_coefficients = self.get_coefficients_fn(order, ns.marginal_lambda(t_prev_list[-1]), lambda_t,
                                                         lambda_list, tau)

        if self.predict_x0:
            if order == 2:  ## if order = 2 we do a modification that does not influence the convergence order similar to unipc. Note: This is used only for few steps sampling.
                # The added term is O(h^3). Empirically we find it will slightly improve the image quality.
                # ODE case
                # gradient_coefficients[0] += 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2]))
                # gradient_coefficients[1] -= 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2]))
                gradient_coefficients[0] += 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                            h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                                (1 + tau ** 2) ** 2)) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(
                    t_prev_list[-2]))
                gradient_coefficients[1] -= 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                            h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                                (1 + tau ** 2) ** 2)) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(
                    t_prev_list[-2]))

        for i in range(order):
            if self.predict_x0:
                gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
                    i] * model_prev_list[-(i + 1)]
            else:
                gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]

        if self.predict_x0:
            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
        else:
            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise

        if self.predict_x0:
            x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x + gradient_part + noise_part
        else:
            x_t = (alpha_t / alpha_prev) * x + gradient_part + noise_part

        return x_t

    def adams_moulton_update_few_steps(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
        """
        SA-Corrector, without the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
        """

        assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"

        # get noise schedule
        ns = self.noise_schedule
        alpha_t = ns.marginal_alpha(t)
        sigma_t = ns.marginal_std(t)
        lambda_t = ns.marginal_lambda(t)
        alpha_prev = ns.marginal_alpha(t_prev_list[-1])
        sigma_prev = ns.marginal_std(t_prev_list[-1])
        gradient_part = torch.zeros_like(x)
        h = lambda_t - ns.marginal_lambda(t_prev_list[-1])
        t_list = t_prev_list + [t]
        lambda_list = [ns.marginal_lambda(t_list[-(i + 1)]) for i in range(order)]
        gradient_coefficients = self.get_coefficients_fn(order, ns.marginal_lambda(t_prev_list[-1]), lambda_t,
                                                         lambda_list, tau)

        if self.predict_x0:
            if order == 2:  ## if order = 2 we do a modification that does not influence the convergence order similar to UniPC. Note: This is used only for few steps sampling.
                # The added term is O(h^3). Empirically we find it will slightly improve the image quality.
                # ODE case
                # gradient_coefficients[0] += 1.0 * torch.exp(lambda_t) * (h / 2 - (h - 1 + torch.exp(-h)) / h)
                # gradient_coefficients[1] -= 1.0 * torch.exp(lambda_t) * (h / 2 - (h - 1 + torch.exp(-h)) / h)
                gradient_coefficients[0] += 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                            h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                                (1 + tau ** 2) ** 2 * h))
                gradient_coefficients[1] -= 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                            h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                                (1 + tau ** 2) ** 2 * h))

        for i in range(order):
            if self.predict_x0:
                gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
                    i] * model_prev_list[-(i + 1)]
            else:
                gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]

        if self.predict_x0:
            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
        else:
            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise

        if self.predict_x0:
            x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x + gradient_part + noise_part
        else:
            x_t = (alpha_t / alpha_prev) * x + gradient_part + noise_part

        return x_t

    def sample_few_steps(self, x, tau, steps=5, t_start=None, t_end=None, skip_type='time', skip_order=1,
                         predictor_order=3, corrector_order=4, pc_mode='PEC', return_intermediate=False
                         ):
        """
        Run the predictor-corrector sampling loop using the few-step update rules
        (adams_bashforth_update_few_steps / adams_moulton_update_few_steps).

        In this variant the last step applies only the predictor, with tau = 0 and no
        model evaluation, and no extra denoise-to-zero step is performed (see the
        hard-coded flags at the top of the body).

        For the PC-mode, please refer to the wiki page
        https://en.wikipedia.org/wiki/Predictor%E2%80%93corrector_method#PEC_mode_and_PECE_mode
        'PEC' needs one model evaluation per step while 'PECE' needs two model evaluations
        We recommend use pc_mode='PEC' for NFEs is limited. 'PECE' mode is only for test with sufficient NFEs.

        Args:
            x: starting tensor (presumably the noisy sample at t_start - confirm with caller).
            tau: callable; tau(t) is evaluated at each time step and passed to the update rules.
            steps: number of solver steps; must be >= max(predictor_order, corrector_order - 1).
            t_start: start time; defaults to noise_schedule.T.
            t_end: end time; defaults to 1 / noise_schedule.total_N.
            skip_type, skip_order: forwarded to get_time_steps.
            predictor_order: maximum order of the predictor.
            corrector_order: maximum order of the corrector; 0 disables the corrector.
            pc_mode: 'PEC' or 'PECE'.
            return_intermediate: if True, also return the list of x collected after every step.

        Returns:
            x, or (x, intermediates) when return_intermediate is True.
        """

        # Fixed configuration of this variant.
        skip_first_step = False
        skip_final_step = True
        lower_order_final = True
        denoise_to_zero = False

        assert pc_mode in ['PEC', 'PECE'], 'Predictor-corrector mode only supports PEC and PECE'
        t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
        t_T = self.noise_schedule.T if t_start is None else t_start
        assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"

        device = x.device
        intermediates = []
        with torch.no_grad():
            # The warm-up loop below needs this many steps to reach the requested orders.
            assert steps >= max(predictor_order, corrector_order - 1)
            timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, order=skip_order,
                                            device=device)
            assert timesteps.shape[0] - 1 == steps
            # Init the initial values.
            step = 0
            t = timesteps[step]
            # NOTE: this draw is not used before being overwritten in the loops, but it
            # still advances the random number generator.
            noise = torch.randn_like(x)
            # History of visited times and of model outputs, most recent last.
            t_prev_list = [t]
            # do not evaluate if skip_first_step
            if skip_first_step:
                if self.predict_x0:
                    alpha_t = self.noise_schedule.marginal_alpha(t)
                    sigma_t = self.noise_schedule.marginal_std(t)
                    model_prev_list = [(1 - sigma_t) / alpha_t * x]
                else:
                    model_prev_list = [x]
            else:
                model_prev_list = [self.model_fn(x, t)]

            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step)
            if return_intermediate:
                intermediates.append(x)

            # determine the first several values
            # (warm-up: the usable order grows with the number of stored model outputs)
            for step in tqdm(range(1, max(predictor_order, corrector_order - 1))):

                t = timesteps[step]
                predictor_order_used = min(predictor_order, step)
                corrector_order_used = min(corrector_order, step + 1)
                # The same noise draw is shared by the predictor and the corrector of this step.
                noise = torch.randn_like(x)
                # predictor step
                x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=tau(t),
                                                            model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                            noise=noise, t=t)
                # evaluation step
                model_x = self.model_fn(x_p, t)

                # update model_list
                model_prev_list.append(model_x)
                # corrector step
                if corrector_order > 0:
                    x = self.adams_moulton_update_few_steps(order=corrector_order_used, x=x, tau=tau(t),
                                                            model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                            noise=noise, t=t)
                else:
                    x = x_p

                # evaluation step if correction and mode = pece
                # (replaces the stored output at x_p with the one at the corrected x)
                if corrector_order > 0 and pc_mode == 'PECE':
                    model_x = self.model_fn(x, t)
                    del model_prev_list[-1]
                    model_prev_list.append(model_x)

                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)

                t_prev_list.append(t)

            # Main loop: full-order steps, with the order reduced near the end when
            # lower_order_final is set.
            for step in tqdm(range(max(predictor_order, corrector_order - 1), steps + 1)):
                if lower_order_final:
                    predictor_order_used = min(predictor_order, steps - step + 1)
                    corrector_order_used = min(corrector_order, steps - step + 2)

                else:
                    predictor_order_used = predictor_order
                    corrector_order_used = corrector_order
                t = timesteps[step]
                noise = torch.randn_like(x)

                # predictor step
                # (the final step uses tau=0, which zeroes the noise term of the update)
                if skip_final_step and step == steps and not denoise_to_zero:
                    x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=0,
                                                                model_prev_list=model_prev_list,
                                                                t_prev_list=t_prev_list, noise=noise, t=t)
                else:
                    x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=tau(t),
                                                                model_prev_list=model_prev_list,
                                                                t_prev_list=t_prev_list, noise=noise, t=t)

                # evaluation step
                # do not evaluate if skip_final_step and step = steps
                if not skip_final_step or step < steps:
                    model_x = self.model_fn(x_p, t)

                # update model_list
                # do not update if skip_final_step and step = steps
                if not skip_final_step or step < steps:
                    model_prev_list.append(model_x)

                # corrector step
                # do not correct if skip_final_step and step = steps
                if corrector_order > 0 and (not skip_final_step or step < steps):
                    x = self.adams_moulton_update_few_steps(order=corrector_order_used, x=x, tau=tau(t),
                                                            model_prev_list=model_prev_list,
                                                            t_prev_list=t_prev_list, noise=noise, t=t)
                else:
                    x = x_p

                # evaluation step if mode = pece and step != steps
                if corrector_order > 0 and (pc_mode == 'PECE' and step < steps):
                    model_x = self.model_fn(x, t)
                    del model_prev_list[-1]
                    model_prev_list.append(model_x)

                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)

                t_prev_list.append(t)
                # Drop the oldest model output; t_prev_list itself is never trimmed.
                del model_prev_list[0]

            # Not reached in this variant because denoise_to_zero is False above.
            if denoise_to_zero:
                t = torch.ones((1,)).to(device) * t_0
                x = self.denoise_to_zero_fn(x, t)
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step + 1)
                if return_intermediate:
                    intermediates.append(x)
        return (x, intermediates) if return_intermediate else x

    def sample_more_steps(self, x, tau, steps=20, t_start=None, t_end=None, skip_type='time', skip_order=1,
                          predictor_order=3, corrector_order=4, pc_mode='PEC', return_intermediate=False
                          ):
        """
        For the PC-mode, please refer to the wiki page
        https://en.wikipedia.org/wiki/Predictor%E2%80%93corrector_method#PEC_mode_and_PECE_mode
        'PEC' needs one model evaluation per step while 'PECE' needs two model evaluations
        We recommend use pc_mode='PEC' for NFEs is limited. 'PECE' mode is only for test with sufficient NFEs.
        """

        skip_first_step = False
        skip_final_step = False
        lower_order_final = True
        denoise_to_zero = True

        assert pc_mode in ['PEC', 'PECE'], 'Predictor-corrector mode only supports PEC and PECE'
        t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
        t_T = self.noise_schedule.T if t_start is None else t_start
        assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"

        device = x.device
        intermediates = []
        with torch.no_grad():
            assert steps >= max(predictor_order, corrector_order - 1)
            timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, order=skip_order,
                                            device=device)
            assert timesteps.shape[0] - 1 == steps
            # Init the initial values.
            step = 0
            t = timesteps[step]
            noise = torch.randn_like(x)
            t_prev_list = [t]
            # do not evaluate if skip_first_step
            if skip_first_step:
                if self.predict_x0:
                    alpha_t = self.noise_schedule.marginal_alpha(t)
                    sigma_t = self.noise_schedule.marginal_std(t)
                    model_prev_list = [(1 - sigma_t) / alpha_t * x]
                else:
                    model_prev_list = [x]
            else:
                model_prev_list = [self.model_fn(x, t)]

            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step)
            if return_intermediate:
                intermediates.append(x)

            # determine the first several values
            for step in tqdm(range(1, max(predictor_order, corrector_order - 1))):

                t = timesteps[step]
                predictor_order_used = min(predictor_order, step)
                corrector_order_used = min(corrector_order, step + 1)
                noise = torch.randn_like(x)
                # predictor step
                x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=tau(t),
                                                  model_prev_list=model_prev_list, t_prev_list=t_prev_list, noise=noise,
                                                  t=t)
                # evaluation step
                model_x = self.model_fn(x_p, t)

                # update model_list
                model_prev_list.append(model_x)
                # corrector step
                if corrector_order > 0:
                    x = self.adams_moulton_update(order=corrector_order_used, x=x, tau=tau(t),
                                                  model_prev_list=model_prev_list, t_prev_list=t_prev_list, noise=noise,
                                                  t=t)
                else:
                    x = x_p

                # evaluation step if mode = pece
                if corrector_order > 0 and pc_mode == 'PECE':
                    model_x = self.model_fn(x, t)
                    del model_prev_list[-1]
                    model_prev_list.append(model_x)
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)

                t_prev_list.append(t)

            for step in tqdm(range(max(predictor_order, corrector_order - 1), steps + 1)):
                if lower_order_final:
                    predictor_order_used = min(predictor_order, steps - step + 1)
                    corrector_order_used = min(corrector_order, steps - step + 2)

                else:
                    predictor_order_used = predictor_order
                    corrector_order_used = corrector_order
                t = timesteps[step]
                noise = torch.randn_like(x)

                # predictor step
                if skip_final_step and step == steps and not denoise_to_zero:
                    x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=0,
                                                      model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                      noise=noise, t=t)
                else:
                    x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=tau(t),
                                                      model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                      noise=noise, t=t)

                # evaluation step
                # do not evaluate if skip_final_step and step = steps
                if not skip_final_step or step < steps:
                    model_x = self.model_fn(x_p, t)

                # update model_list
                # do not update if skip_final_step and step = steps
                if not skip_final_step or step < steps:
                    model_prev_list.append(model_x)

                # corrector step
                # do not correct if skip_final_step and step = steps
                if corrector_order > 0:
                    if not skip_final_step or step < steps:
                        x = self.adams_moulton_update(order=corrector_order_used, x=x, tau=tau(t),
                                                      model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                      noise=noise, t=t)
                    else:
                        x = x_p
                else:
                    x = x_p

                # evaluation step if mode = pece and step != steps
                if corrector_order > 0 and (pc_mode == 'PECE' and step < steps):
                    model_x = self.model_fn(x, t)
                    del model_prev_list[-1]
                    model_prev_list.append(model_x)

                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)

                t_prev_list.append(t)
                del model_prev_list[0]

            if denoise_to_zero:
                t = torch.ones((1,)).to(device) * t_0
                x = self.denoise_to_zero_fn(x, t)
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step + 1)
                if return_intermediate:
                    intermediates.append(x)
        if return_intermediate:
            return x, intermediates
        else:
            return x

    def sample(self, mode, x, tau, steps, t_start=None, t_end=None, skip_type='time', skip_order=1, predictor_order=3,
               corrector_order=4, pc_mode='PEC', return_intermediate=False
               ):
        """
        Entry point that dispatches to one of the two sampling routines.

        `mode` selects the routine: 'few_steps' calls `self.sample_few_steps`, 'more_steps' calls
        `self.sample_more_steps`. Every other argument is forwarded unchanged (by keyword), and the
        selected routine's return value is returned as-is.

        Guidance from the original authors (not verifiable from this method alone):
        - pc_mode: see
          https://en.wikipedia.org/wiki/Predictor%E2%80%93corrector_method#PEC_mode_and_PECE_mode
          'PEC' needs one model evaluation per step, 'PECE' needs two. 'PEC' is recommended when the
          number of function evaluations (NFEs) is limited; 'PECE' is only meant for tests with
          sufficient NFEs.
        - 'few_steps' is the recommended mode. It does not correct at the final step and does not
          denoise to zero, whereas 'more_steps' does both, so NFEs are `steps` for 'few_steps' and
          `steps + 2` for 'more_steps'. The authors report that these two operations rarely help
          sample quality.
        - 'few_steps' additionally uses the rescaling trick from Appendix D of the SA-Solver paper
          (https://arxiv.org/pdf/2309.05019.pdf), reported to slightly improve quality with few steps.
        """
        assert mode in ['few_steps', 'more_steps'], "mode must be either 'few_steps' or 'more_steps'"
        # Both routines take exactly the same keyword arguments, so pick one and forward once.
        routine = self.sample_few_steps if mode == 'few_steps' else self.sample_more_steps
        return routine(
            x=x,
            tau=tau,
            steps=steps,
            t_start=t_start,
            t_end=t_end,
            skip_type=skip_type,
            skip_order=skip_order,
            predictor_order=predictor_order,
            corrector_order=corrector_order,
            pc_mode=pc_mode,
            return_intermediate=return_intermediate,
        )


#############################################################
# other utility functions
#############################################################

def interpolate_fn(x, xp, yp):
    """
    Evaluate a piecewise-linear function y = f(x) defined by the keypoints (xp, yp).

    The computation uses only torch tensor operations (the original authors describe it as usable
    with autograd). Queries that fall outside the range of `xp` are extrapolated along the line
    through the two outermost keypoints on that side.

    Args:
        x: PyTorch tensor with shape [N, C], where N is the batch size and C the number of channels
            (the original authors use C = 1 for DPM-Solver).
        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
        yp: PyTorch tensor with shape [C, K].
    Returns:
        The function values f(x), with shape [N, C].
    """
    N, K = x.shape[0], xp.shape[1]
    dev = x.device

    # Put each query value in front of its channel's keypoints and sort; the position at which
    # original index 0 (the query) lands tells us how many keypoints lie below it.
    merged = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2)
    merged_sorted, source_idx = torch.sort(merged, dim=2)
    rank = torch.argmin(source_idx, dim=2)
    left = rank - 1
    is_below = torch.eq(rank, 0)
    is_above = torch.eq(rank, K)
    # Shared by both index computations: clamp to the last segment when the query is above all keypoints.
    clamped_high = torch.where(is_above, torch.tensor(K - 2, device=dev), left)

    # Segment end points on the x axis, addressed inside the merged (query + keypoints) sorted array.
    lo_sorted = torch.where(is_below, torch.tensor(1, device=dev), clamped_high)
    # When the query sits between the two end points it occupies one slot, so skip over it.
    hi_sorted = torch.where(torch.eq(lo_sorted, left), lo_sorted + 2, lo_sorted + 1)
    x_lo = torch.gather(merged_sorted, dim=2, index=lo_sorted.unsqueeze(2)).squeeze(2)
    x_hi = torch.gather(merged_sorted, dim=2, index=hi_sorted.unsqueeze(2)).squeeze(2)

    # Matching end points on the y axis, addressed directly in the keypoint array.
    lo_key = torch.where(is_below, torch.tensor(0, device=dev), clamped_high)
    yp_batched = yp.unsqueeze(0).expand(N, -1, -1)
    y_lo = torch.gather(yp_batched, dim=2, index=lo_key.unsqueeze(2)).squeeze(2)
    y_hi = torch.gather(yp_batched, dim=2, index=(lo_key + 1).unsqueeze(2)).squeeze(2)

    return y_lo + (x - x_lo) * (y_hi - y_lo) / (x_hi - x_lo)


def expand_dims(v, dims):
    """
    Append trailing singleton axes to `v` until it has `dims` dimensions.

    Args:
        `v`: a PyTorch tensor with shape [N].
        `dims`: an `int`, the total number of dimensions of the result.
    Returns:
        a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
    """
    # Ellipsis keeps every existing axis; each None adds one new axis of size 1 at the end.
    trailing_axes = (None,) * (dims - 1)
    index = (...,) + trailing_axes
    return v[index]

================================================
FILE: PixArt-alpha-ToCa/diffusion/model/t5.py
================================================
# -*- coding: utf-8 -*-
import os
import re
import html
import urllib.parse as ul

import ftfy
import torch
from bs4 import BeautifulSoup
from transformers import T5EncoderModel, AutoTokenizer
from huggingface_hub import hf_hub_download

class T5Embedder:
    """
    Loads a T5 encoder plus tokenizer and turns caption strings into encoder hidden states.

    Construction resolves where the tokenizer and weights come from (a local cache directory, one of
    `available_models` downloaded from the `DeepFloyd/<name>` Hugging Face repo, or an arbitrary
    `dir_or_name` for the weights combined with the DeepFloyd tokenizer files), then instantiates both.
    `get_text_embeddings` cleans, tokenizes and encodes a batch of captions.
    """

    available_models = ['t5-v1_1-xxl']
    # Runs of these punctuation characters are collapsed to a single space by `clean_caption`.
    # Written as ONE raw string: the previous concatenation of normal literals such as '\)' relied on
    # invalid escape sequences (a SyntaxWarning on Python 3.12+). The compiled pattern is unchanged.
    bad_punct_regex = re.compile(r'[#®•©™&@·º½¾¿¡§~\)\(\]\[\}\{\|\\/\*]{1,}')  # noqa

    # Files fetched from the hub: tokenizer/config files are always needed, weight shards only when
    # the model itself is one of `available_models`.
    _TOKENIZER_FILES = ('config.json', 'special_tokens_map.json', 'spiece.model', 'tokenizer_config.json')
    _WEIGHT_FILES = (
        'pytorch_model.bin.index.json', 'pytorch_model-00001-of-00002.bin', 'pytorch_model-00002-of-00002.bin',
    )
    # Layout used when offloading: 24 encoder blocks, the first 12 stay on the device, the rest go to disk.
    _NUM_ENCODER_BLOCKS = 24
    _NUM_BLOCKS_ON_DEVICE = 12

    def __init__(self, device, dir_or_name='t5-v1_1-xxl', *, local_cache=False, cache_dir=None, hf_token=None, use_text_preprocessing=True,
                 t5_model_kwargs=None, torch_dtype=None, use_offload_folder=None, model_max_length=120):
        """
        Args:
            device: anything accepted by `torch.device`; inputs are moved there before encoding.
            dir_or_name: model name (see `available_models`) or a path/identifier passed to
                `T5EncoderModel.from_pretrained`.
            local_cache: if True, load tokenizer and weights from `<cache_dir>/<dir_or_name>` without
                downloading anything.
            cache_dir: root cache directory; defaults to `~/.cache/IF_`.
            hf_token: token forwarded to `hf_hub_download`.
            use_text_preprocessing: if True, captions go through `clean_caption` (twice); otherwise
                they are only lower-cased and stripped.
            t5_model_kwargs: keyword arguments for `T5EncoderModel.from_pretrained`. When None, a
                default set is built from `torch_dtype` and `use_offload_folder`.
            torch_dtype: dtype for the default model kwargs; defaults to `torch.bfloat16`.
            use_offload_folder: if given (and `t5_model_kwargs` is None), part of the encoder is
                mapped to disk using this folder.
            model_max_length: tokenizer padding/truncation length.
        """
        self.device = torch.device(device)
        self.torch_dtype = torch_dtype or torch.bfloat16
        if t5_model_kwargs is None:
            t5_model_kwargs = {'low_cpu_mem_usage': True, 'torch_dtype': self.torch_dtype}
            if use_offload_folder is not None:
                t5_model_kwargs['offload_folder'] = use_offload_folder
                t5_model_kwargs['device_map'] = self._offload_device_map()
            else:
                t5_model_kwargs['device_map'] = {'shared': self.device, 'encoder': self.device}

        self.use_text_preprocessing = use_text_preprocessing
        self.hf_token = hf_token
        self.cache_dir = cache_dir or os.path.expanduser('~/.cache/IF_')
        self.dir_or_name = dir_or_name
        tokenizer_path, path = dir_or_name, dir_or_name
        if local_cache:
            cache_dir = os.path.join(self.cache_dir, dir_or_name)
            tokenizer_path, path = cache_dir, cache_dir
        elif dir_or_name in self.available_models:
            cache_dir = os.path.join(self.cache_dir, dir_or_name)
            self._download(self._TOKENIZER_FILES + self._WEIGHT_FILES, dir_or_name, cache_dir)
            tokenizer_path, path = cache_dir, cache_dir
        else:
            # Unknown model location: weights are loaded from `dir_or_name` itself, only the
            # tokenizer files are taken from the DeepFloyd t5-v1_1-xxl repo.
            cache_dir = os.path.join(self.cache_dir, 't5-v1_1-xxl')
            self._download(self._TOKENIZER_FILES, 't5-v1_1-xxl', cache_dir)
            tokenizer_path = cache_dir

        print(tokenizer_path)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        self.model = T5EncoderModel.from_pretrained(path, **t5_model_kwargs).eval()
        self.model_max_length = model_max_length

    def _offload_device_map(self):
        """Build the device map that keeps the first encoder blocks on `self.device` and offloads the rest to disk."""
        device_map = {'shared': self.device, 'encoder.embed_tokens': self.device}
        for idx in range(self._NUM_ENCODER_BLOCKS):
            device_map[f'encoder.block.{idx}'] = self.device if idx < self._NUM_BLOCKS_ON_DEVICE else 'disk'
        device_map['encoder.final_layer_norm'] = 'disk'
        device_map['encoder.dropout'] = 'disk'
        return device_map

    def _download(self, filenames, repo_name, cache_dir):
        """Fetch each of `filenames` from the `DeepFloyd/<repo_name>` hub repo into `cache_dir`."""
        for filename in filenames:
            hf_hub_download(repo_id=f'DeepFloyd/{repo_name}', filename=filename, cache_dir=cache_dir,
                            force_filename=filename, token=self.hf_token)

    def get_text_embeddings(self, texts):
        """
        Encode a batch of captions.

        Args:
            texts: iterable of caption strings.
        Returns:
            A tuple `(embeddings, attention_mask)`: the encoder's `last_hidden_state` (detached) and
            the tokenizer attention mask, both on `self.device`.
        """
        texts = [self.text_preprocessing(text) for text in texts]

        text_tokens_and_mask = self.tokenizer(
            texts,
            max_length=self.model_max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        # Move the inputs to the target device once and reuse the mask for the return value.
        input_ids = text_tokens_and_mask['input_ids'].to(self.device)
        attention_mask = text_tokens_and_mask['attention_mask'].to(self.device)

        with torch.no_grad():
            text_encoder_embs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )['last_hidden_state'].detach()
        return text_encoder_embs, attention_mask

    def text_preprocessing(self, text):
        """Return the cleaned caption: two passes of `clean_caption` if enabled, else lower-case + strip."""
        if self.use_text_preprocessing:
            # The exact text cleaning as was in the training stage (applied twice on purpose there):
            text = self.clean_caption(text)
            text = self.clean_caption(text)
            return text
        else:
            return text.lower().strip()

    @staticmethod
    def basic_clean(text):
        """Fix text encoding issues with ftfy, unescape HTML entities twice, and strip whitespace."""
        text = ftfy.fix_text(text)
        text = html.unescape(html.unescape(text))
        return text.strip()

    def clean_caption(self, caption):
        """
        Normalize a raw caption with a fixed sequence of regex substitutions.

        The steps, in order: URL-unquote, lower-case, drop URLs / HTML / @handles / CJK ranges,
        normalize dashes and quotes, drop IP addresses, ids, file names and assorted spam phrases,
        collapse junk punctuation and whitespace, and finally trim stray leading/trailing punctuation.
        The order matters because later patterns operate on the output of earlier ones.

        Args:
            caption: any object; it is converted with `str()` first.
        Returns:
            The cleaned, stripped caption string.
        """
        caption = str(caption)
        caption = ul.unquote_plus(caption)
        caption = caption.strip().lower()
        caption = re.sub('<person>', 'person', caption)
        # urls:
        caption = re.sub(
            r'\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))',  # noqa
            '', caption)  # regex for urls
        caption = re.sub(
            r'\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))',  # noqa
            '', caption)  # regex for urls
        # html:
        caption = BeautifulSoup(caption, features='html.parser').text

        # @<nickname>
        caption = re.sub(r'@[\w\d]+\b', '', caption)

        # 31C0—31EF CJK Strokes
        # 31F0—31FF Katakana Phonetic Extensions
        # 3200—32FF Enclosed CJK Letters and Months
        # 3300—33FF CJK Compatibility
        # 3400—4DBF CJK Unified Ideographs Extension A
        # 4DC0—4DFF Yijing Hexagram Symbols
        # 4E00—9FFF CJK Unified Ideographs
        caption = re.sub(r'[\u31c0-\u31ef]+', '', caption)
        caption = re.sub(r'[\u31f0-\u31ff]+', '', caption)
        caption = re.sub(r'[\u3200-\u32ff]+', '', caption)
        caption = re.sub(r'[\u3300-\u33ff]+', '', caption)
        caption = re.sub(r'[\u3400-\u4dbf]+', '', caption)
        caption = re.sub(r'[\u4dc0-\u4dff]+', '', caption)
        caption = re.sub(r'[\u4e00-\u9fff]+', '', caption)
        #######################################################

        # all types of dash --> "-"
        caption = re.sub(
            r'[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+',  # noqa
            '-', caption)

        # normalize quotation marks to one standard
        caption = re.sub(r'[`´«»“”¨]', '"', caption)
        caption = re.sub(r'[‘’]', "'", caption)

        # &quot;
        caption = re.sub(r'&quot;?', '', caption)
        # &amp
        caption = re.sub(r'&amp', '', caption)

        # ip addresses:
        caption = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' ', caption)

        # article ids:
        caption = re.sub(r'\d:\d\d\s+$', '', caption)

        # \n
        caption = re.sub(r'\\n', ' ', caption)

        # "#123"
        caption = re.sub(r'#\d{1,3}\b', '', caption)
        # "#12345.."
        caption = re.sub(r'#\d{5,}\b', '', caption)
        # "123456.."
        caption = re.sub(r'\b\d{6,}\b', '', caption)
        # filenames:
        caption = re.sub(r'[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)', '', caption)

        #
        caption = re.sub(r'[\"\']{2,}', r'"', caption)  # """AUSVERKAUFT"""
        caption = re.sub(r'[\.]{2,}', r' ', caption)  # """AUSVERKAUFT"""

        caption = re.sub(self.bad_punct_regex, r' ', caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
        caption = re.sub(r'\s+\.\s+', r' ', caption)  # " . "

        # this-is-my-cute-cat / this_is_my_cute_cat
        regex2 = re.compile(r'(?:\-|\_)')
        if len(re.findall(regex2, caption)) > 3:
            caption = re.sub(regex2, ' ', caption)

        caption = self.basic_clean(caption)

        caption = re.sub(r'\b[a-zA-Z]{1,3}\d{3,15}\b', '', caption)  # jc6640
        caption = re.sub(r'\b[a-zA-Z]+\d+[a-zA-Z]+\b', '', caption)  # jc6640vc
        caption = re.sub(r'\b\d+[a-zA-Z]+\d+\b', '', caption)  # 6640vc231

        caption = re.sub(r'(worldwide\s+)?(free\s+)?shipping', '', caption)
        caption = re.sub(r'(free\s)?download(\sfree)?', '', caption)
        caption = re.sub(r'\bclick\b\s(?:for|on)\s\w+', '', caption)
        caption = re.sub(r'\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?', '', caption)
        caption = re.sub(r'\bpage\s+\d+\b', '', caption)

        caption = re.sub(r'\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b', r' ', caption)  # j2d1a2a...

        caption = re.sub(r'\b\d+\.?\d*[xх×]\d+\.?\d*\b', '', caption)

        caption = re.sub(r'\b\s+\:\s+', r': ', caption)
        caption = re.sub(r'(\D[,\./])\b', r'\1 ', caption)
        caption = re.sub(r'\s+', ' ', caption)

        # NOTE(review): a bare `caption.strip()` used to sit here with its result discarded, so it
        # never had any effect. It is intentionally NOT turned into an assignment: doing so would
        # change which captions the anchored patterns below match, and therefore the cleaned text
        # fed to the encoder. Confirm against the training-time preprocessing before changing this.

        caption = re.sub(r'^[\"\']([\w\W]+)[\"\']$', r'\1', caption)
        caption = re.sub(r'^[\'\_,\-\:;]', r'', caption)
        caption = re.sub(r'[\'\_,\-\:\-\+]$', r'', caption)
        caption = re.sub(r'^\.\S+$', '', caption)

        return caption.strip()


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/timestep_sampler.py
================================================
# Modified from OpenAI's diffusion repos
#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py

from abc import ABC, abstractmethod

import numpy as np
import torch as th
import torch.distributed as dist


def create_named_schedule_sampler(name, diffusion):
    """
    Build one of the pre-defined schedule samplers by name.

    :param name: "uniform" or "loss-second-moment".
    :param diffusion: the diffusion object to sample for; passed to the sampler's constructor.
    :return: a UniformSampler or LossSecondMomentResampler instance.
    :raises NotImplementedError: if `name` is not one of the known sampler names.
    """
    if name == "uniform":
        return UniformSampler(diffusion)
    if name == "loss-second-moment":
        return LossSecondMomentResampler(diffusion)
    raise NotImplementedError(f"unknown schedule sampler: {name}")


class ScheduleSampler(ABC):
    """
    Base class for distributions over diffusion timesteps.

    Subclasses provide per-timestep weights via `weights()`. The default `sample()` draws timesteps
    proportionally to those weights and returns matching importance weights, so that (per the
    original authors) the mean of the objective is unchanged. Subclasses may override `sample()`
    to reweight differently and thereby change the objective.
    """

    @abstractmethod
    def weights(self):
        """
        Return a numpy array with one weight per diffusion step.

        The weights do not have to sum to one, but must be positive.
        """

    def sample(self, batch_size, device):
        """
        Draw importance-sampled timesteps for a batch.

        :param batch_size: how many timesteps to draw.
        :param device: the torch device the returned tensors are moved to.
        :return: a tuple (timesteps, weights):
                 - timesteps: a long tensor of timestep indices.
                 - weights: a float tensor of weights to scale the resulting losses.
        """
        raw = self.weights()
        probs = raw / np.sum(raw)
        num_steps = len(probs)
        drawn = np.random.choice(num_steps, size=(batch_size,), p=probs)
        # Inverse of (num_steps * probability): equals 1 everywhere for a uniform distribution.
        importance = 1 / (num_steps * probs[drawn])
        indices = th.from_numpy(drawn).long().to(device)
        weights = th.from_numpy(importance).float().to(device)
        return indices, weights


class UniformSampler(ScheduleSampler):
    """Schedule sampler that gives every diffusion timestep the same weight."""

    def __init__(self, diffusion):
        self.diffusion = diffusion
        # One weight of 1.0 per timestep, created once and reused on every call to weights().
        step_count = diffusion.num_timesteps
        self._weights = np.ones([step_count])

    def weights(self):
        """Return the constant all-ones weight array."""
        return self._weights


class LossAwareSampler(ScheduleSampler):
    """Schedule sampler whose weights are updated from training losses gathered across all ranks."""

    @staticmethod
    def _pad_to(tensor, length):
        """Return `tensor` zero-padded at the end to `length` elements (same dtype and device)."""
        if len(tensor) == length:
            return tensor
        padded = th.zeros(length, dtype=tensor.dtype, device=tensor.device)
        padded[:len(tensor)] = tensor
        return padded

    def update_with_local_losses(self, local_ts, local_losses):
        """
        Update the reweighting using losses from a model.
        Call this method from each rank with a batch of timesteps and the
        corresponding losses for each of those timesteps.
        This method will perform synchronization to make sure all of the ranks
        maintain the exact same reweighting.
        :param local_ts: an integer Tensor of timesteps.
        :param local_losses: a 1D Tensor of losses.
        """
        # First exchange the per-rank batch sizes so every rank knows how much of each gathered
        # buffer is real data.
        batch_sizes = [
            th.tensor([0], dtype=th.int32, device=local_ts.device)
            for _ in range(dist.get_world_size())
        ]
        dist.all_gather(
            batch_sizes,
            th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
        )

        # Pad all_gather batches to be the maximum batch size.
        batch_sizes = [x.item() for x in batch_sizes]
        max_bs = max(batch_sizes)

        # The receive buffers must have the dtype of what is being gathered: with the default
        # float dtype the timesteps would come back as floats and could not be used as indices.
        timestep_batches = [
            th.zeros(max_bs, dtype=local_ts.dtype, device=local_ts.device) for _ in batch_sizes
        ]
        loss_batches = [
            th.zeros(max_bs, dtype=local_losses.dtype, device=local_losses.device) for _ in batch_sizes
        ]
        # The local tensors are padded too, so ranks with a smaller batch send the buffer size.
        dist.all_gather(timestep_batches, self._pad_to(local_ts, max_bs))
        dist.all_gather(loss_batches, self._pad_to(local_losses, max_bs))
        # Drop the padding again using the true per-rank batch sizes.
        timesteps = [
            x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
        ]
        losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
        self.update_with_all_losses(timesteps, losses)

    @abstractmethod
    def update_with_all_losses(self, ts, losses):
        """
        Update the reweighting using losses from a model.
        Sub-classes should override this method to update the reweighting
        using losses from the model.
        This method directly updates the reweighting without synchronizing
        between workers. It is called by update_with_local_losses from all
        ranks with identical arguments. Thus, it should have deterministic
        behavior to maintain state across workers.
        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """


class LossSecondMomentResampler(LossAwareSampler):
    """
    Loss-aware sampler that weights each timestep by the root mean square of its recent losses.

    A fixed-size history of the last `history_per_term` losses is kept per timestep. Until every
    timestep has a full history, sampling is uniform.
    """

    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
        """
        :param diffusion: object exposing `num_timesteps`.
        :param history_per_term: number of most recent losses remembered per timestep.
        :param uniform_prob: total probability mass spread uniformly over all timesteps.
        """
        self.diffusion = diffusion
        self.history_per_term = history_per_term
        self.uniform_prob = uniform_prob
        self._loss_history = np.zeros(
            [diffusion.num_timesteps, history_per_term], dtype=np.float64
        )
        # `np.int` was only an alias of the builtin `int` and was removed in NumPy 1.24;
        # the builtin gives the same dtype on every NumPy version.
        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=int)

    def weights(self):
        """Return per-timestep weights: all ones until warmed up, then RMS-of-loss mixed with uniform."""
        if not self._warmed_up():
            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
        weights /= np.sum(weights)
        # Blend with a uniform distribution so every timestep keeps a non-zero probability.
        weights *= 1 - self.uniform_prob
        weights += self.uniform_prob / len(weights)
        return weights

    def update_with_all_losses(self, ts, losses):
        """Record each (timestep, loss) pair in that timestep's history."""
        for t, loss in zip(ts, losses):
            if self._loss_counts[t] == self.history_per_term:
                # Shift out the oldest loss term.
                self._loss_history[t, :-1] = self._loss_history[t, 1:]
                self._loss_history[t, -1] = loss
            else:
                self._loss_history[t, self._loss_counts[t]] = loss
                self._loss_counts[t] += 1

    def _warmed_up(self):
        """True once every timestep has collected `history_per_term` losses."""
        return (self._loss_counts == self.history_per_term).all()


================================================
FILE: PixArt-alpha-ToCa/diffusion/model/utils.py
================================================
import os
import sys
import torch.nn as nn
from torch.utils.checkpoint import checkpoint, checkpoint_sequential
import torch.nn.functional as F
import torch
import torch.distributed as dist
import re
import math
from collections.abc import Iterable
from itertools import repeat
from torchvision import transforms as T
import random
from PIL import Image


def _ntuple(n):
    """
    Return a converter that turns a scalar into an n-tuple.

    The returned function passes non-string iterables through unchanged and repeats anything else
    (including strings) `n` times into a tuple.
    """
    def parse(x):
        already_collection = isinstance(x, Iterable) and not isinstance(x, str)
        return x if already_collection else tuple(repeat(x, n))
    return parse


# Ready-made converters for the common sizes.
to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)

def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1):
    """
    Mark `model` and all of its sub-modules for gradient checkpointing.

    Sets three attributes on every module visited by `model.apply`: `grad_checkpointing = True`,
    `fp32_attention = use_fp32_attention` and `grad_checkpointing_step = gc_step`.
    `model` must be an `nn.Module`.
    """
    assert isinstance(model, nn.Module)

    flags = (
        ('grad_checkpointing', True),
        ('fp32_attention', use_fp32_attention),
        ('grad_checkpointing_step', gc_step),
    )

    def tag(submodule):
        for attr_name, attr_value in flags:
            setattr(submodule, attr_name, attr_value)

    model.apply(tag)


def auto_grad_checkpoint(module, *args, **kwargs):
    """
    Call `module(*args, **kwargs)`, going through gradient checkpointing when the module asks for it.

    A module opts in by having a truthy `grad_checkpointing` attribute. Opted-in iterable modules
    are run through `checkpoint_sequential` using the `grad_checkpointing_step` of their first
    element; other opted-in modules are run through `checkpoint`.
    """
    if not getattr(module, 'grad_checkpointing', False):
        return module(*args, **kwargs)
    if isinstance(module, Iterable):
        gc_step = module[0].grad_checkpointing_step
        return checkpoint_sequential(module, gc_step, *args, **kwargs)
    return checkpoint(module, *args, **kwargs)


def checkpoint_sequential(functions, step, input, *args, **kwargs):
    """
    Run a sequence of callables in chunks of `step`, checkpointing every chunk except the last.

    Each callable is invoked as `fn(value, *args)`, where `value` is the running result starting
    from `input`. The only accepted keyword argument is `preserve_rng_state` (default True), which
    is forwarded to `checkpoint`; any other keyword raises ValueError. An `nn.Sequential` is
    unpacked into the list of its children first.
    """
    # Hack for keyword-only parameter in a python 2.7-compliant way
    keep_rng_state = kwargs.pop('preserve_rng_state', True)
    if kwargs:
        raise ValueError("Unexpected keyword arguments: " + ",".join(kwargs))

    if isinstance(functions, torch.nn.Sequential):
        functions = list(functions.children())

    def make_chunk(first, last):
        # Closure running functions[first..last] (inclusive) one after another.
        def forward(value):
            for position in range(first, last + 1):
                value = functions[position](value, *args)
            return value
        return forward

    chunk_count = len(functions) // step
    tail_start = 0
    for first in range(0, step * (chunk_count - 1), step):
        last = first + step - 1
        input = checkpoint(make_chunk(first, last), input, preserve_rng_state=keep_rng_state)
        tail_start = last + 1

    # the last chunk has to be non-volatile, so it is run without checkpointing
    return make_chunk(tail_start, len(functions) - 1)(input)


def window_partition(x, window_size):
    """
    Split a feature map into non-overlapping square windows, zero-padding on the bottom/right
    when the height or width is not a multiple of the window size.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    """
    B, H, W, C = x.shape

    # Amount needed to reach the next multiple of window_size (0 if already a multiple).
    extra_rows = (window_size - H % window_size) % window_size
    extra_cols = (window_size - W % window_size) % window_size
    needs_padding = extra_rows > 0 or extra_cols > 0
    if needs_padding:
        # F.pad takes pairs starting from the last dimension: (C, C, W_left, W_right, H_top, H_bottom).
        x = F.pad(x, (0, 0, 0, extra_cols, 0, extra_rows))
    Hp = H + extra_rows
    Wp = W + extra_cols

    rows, cols = Hp // window_size, Wp // window_size
    grid = x.view(B, rows, window_size, cols, window_size, C)
    # Bring the two window-grid axes next to each other, then flatten them into the batch axis.
    grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
    windows = grid.view(-1, window_size, window_size, C)
    return windows, (Hp, Wp)


def window_unpartition(windows, window_size, pad_hw, hw):
    """
    Reassemble windows into the original feature map and crop away any padding.
    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw
    H, W = hw
    windows_per_image = Hp * Wp // window_size // window_size
    B = windows.shape[0] // windows_per_image

    rows, cols = Hp // window_size, Wp // window_size
    grid = windows.view(B, rows, cols, window_size, window_size, -1)
    # Interleave window rows/cols back into full image rows/cols.
    x = grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)

    was_padded = Hp > H or Wp > W
    if was_padded:
        x = x[:, :H, :W, :].contiguous()
    return x


def get_rel_pos(q_size, k_size, rel_pos):
    """
    Look up relative positional embeddings for every (query, key) position pair.
    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions, indexed as
        [query position, key position, C].
    """
    target_len = int(2 * max(q_size, k_size) - 1)
    if rel_pos.shape[0] == target_len:
        table = rel_pos
    else:
        # The table has the wrong length: linearly interpolate it along L to target_len.
        channels_first = rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1)
        stretched = F.interpolate(
            channels_first,
            size=target_len,
            mode="linear",
        )
        table = stretched.reshape(-1, target_len).permute(1, 0)

    # Scale the coords with short length if shapes for q and k are different.
    q_scale = max(k_size / q_size, 1.0)
    k_scale = max(q_size / k_size, 1.0)
    q_coords = torch.arange(q_size)[:, None] * q_scale
    k_coords = torch.arange(k_size)[None, :] * k_scale
    # Shift so that the smallest relative offset maps to index 0.
    relative_coords = (q_coords - k_coords) + (k_size - 1) * k_scale

    return table[relative_coords.long()]


def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size):
    """
    Add decomposed (separate height and width) relative positional terms to an attention map,
    following :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py   # noqa B950
    Args:
        attn (Tensor): attention map.
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    q_h, q_w = q_size
    k_h, k_w = k_size
    height_table = get_rel_pos(q_h, k_h, rel_pos_h)
    width_table = get_rel_pos(q_w, k_w, rel_pos_w)

    B, _, dim = q.shape
    q_grid = q.reshape(B, q_h, q_w, dim)
    # Dot each query with the embedding of its offset to every key row / key column.
    height_term = torch.einsum("bhwc,hkc->bhwk", q_grid, height_table)
    width_term = torch.einsum("bhwc,wkc->bhwk", q_grid, width_table)

    # Broadcast the height term over key columns and the width term over key rows.
    attn_grid = attn.view(B, q_h, q_w, k_h, k_w)
    attn_grid = attn_grid + height_term[:, :, :, :, None] + width_term[:, :, :, None, :]
    attn = attn_grid.view(B, q_h * q_w, k_h * k_w)

    return attn

def mean_flat(tensor):
    """Return the mean of `tensor` taken over every dimension except the first."""
    reduce_dims = list(range(1, tensor.ndim))
    return tensor.mean(dim=reduce_dims)


#################################################################################
#                          Token Masking and Unmasking                          #
#################################################################################
def get_mask(batch, length, mask_ratio, device, mask_type=None, data_info=None, extra_len=0):
    """
    Get the binary mask for the input sequence.

    Args:
        - batch: batch size
        - length: sequence length
        - mask_ratio: ratio of tokens to mask
        - device: device on which the mask (and the random noise) is created
        - mask_type: one of 'random', 'group', 'fft', 'laplacian'. 'random' and
          'group' shuffle tokens uniformly; 'fft' and 'laplacian' sample the token
          order with probability proportional to a per-patch strength.
        - data_info: dictionary used by 'fft' / 'laplacian'. Either holds a
          precomputed 'strength' of shape (batch, length), or 'N' (patch size, read
          as data_info['N'][0]) and 'ori_img' (a 4-D image batch) to compute it from.
        - extra_len: number subtracted from the amount of kept tokens
    return:
        mask_dict with following keys:
        - mask: binary mask, 0 is keep, 1 is remove
        - ids_keep: indices of tokens to keep
        - ids_restore: indices to restore the original order
        - ids_removed: indices of tokens that are removed
    """
    assert mask_type in ['random', 'fft', 'laplacian', 'group']
    mask = torch.ones([batch, length], device=device)
    len_keep = int(length * (1 - mask_ratio)) - extra_len

    if mask_type in ['random', 'group']:
        noise = torch.rand(batch, length, device=device)  # noise in [0, 1]
        ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
    else:
        if data_info is None:
            raise ValueError(f"mask_type '{mask_type}' requires data_info")

        if 'strength' in data_info:
            strength = data_info['strength']
        else:
            N = data_info['N'][0]
            img = data_info['ori_img']
            # Size information of the original image.
            _, C, H, W = img.shape
            if mask_type == 'fft':
                # Split the image into patches: (batch, C, H/N, N, W/N, N).
                reshaped_image = img.reshape((batch, -1, H // N, N, W // N, N))
                fft_image = torch.fft.fftn(reshaped_image, dim=(3, 5))
                # Sum of spectrum magnitudes per patch is the frequency strength.
                strength = torch.sum(torch.abs(fft_image), dim=(1, 3, 5)).reshape((batch, -1,))
            else:  # mask_type == 'laplacian'
                # Build the kernel with the image's dtype/device so conv2d accepts it.
                laplacian_kernel = torch.tensor(
                    [[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]],
                    dtype=img.dtype,
                    device=img.device,
                ).reshape(1, 1, 3, 3)
                laplacian_kernel = laplacian_kernel.repeat(C, 1, 1, 1)
                # Turn every N x N patch into its own image: (batch * patches, C, N, N).
                reshaped_image = img.reshape(-1, C, H // N, N, W // N, N).permute(0, 2, 4, 1, 3, 5).reshape(-1, C, N, N)
                laplacian_response = F.conv2d(reshaped_image, laplacian_kernel, padding=1, groups=C)
                strength = laplacian_response.sum(dim=[1, 2, 3]).reshape((batch, -1,))

        # Normalise the strength per sample, then draw a full ordering without
        # replacement; the clip keeps every weight strictly positive for multinomial.
        probabilities = strength / (strength.max(dim=1)[0][:, None]+1e-5)
        ids_shuffle = torch.multinomial(probabilities.clip(1e-5, 1), length, replacement=False)

    # The first len_keep entries of the ordering are kept, the rest removed.
    ids_keep = ids_shuffle[:, :len_keep]
    ids_removed = ids_shuffle[:, len_keep:]
    ids_restore = torch.argsort(ids_shuffle, dim=1)

    mask[:, :len_keep] = 0
    mask = torch.gather(mask, dim=1, index=ids_restore)

    return {'mask': mask,
            'ids_keep': ids_keep,
            'ids_restore': ids_restore,
            'ids_removed': ids_removed}


def mask_out_token(x, ids_keep, ids_removed=None):
    """
    Select the tokens listed in ids_keep (and, optionally, those in ids_removed).
    Args:
        - x: input sequence, [N, L, D]
        - ids_keep: indices of tokens to keep
        - ids_removed: optional indices of the removed tokens
    return:
        - the kept tokens when ids_removed is None, otherwise the pair
          (kept tokens, removed tokens)
    """
    _, _, dim = x.shape  # batch, length, dim

    def _pick(token_ids):
        # Broadcast the token index over the feature axis so gather copies whole tokens.
        return torch.gather(x, dim=1, index=token_ids.unsqueeze(-1).expand(-1, -1, dim))

    kept = _pick(ids_keep)
    if ids_removed is None:
        return kept
    return kept, _pick(ids_removed)


def mask_tokens(x, mask_ratio):
    """
    Randomly drop a fraction of tokens, independently for every sample.
    A random score is drawn per token; argsort of the scores gives the shuffle.
    x: [N, L, D], sequence
    Returns (kept tokens, binary mask with 0 = keep / 1 = remove, ids_restore).
    """
    batch, seq_len, dim = x.shape
    num_keep = int(seq_len * (1 - mask_ratio))

    # Lowest scores are kept, highest are removed.
    scores = torch.rand(batch, seq_len, device=x.device)
    shuffle_order = torch.argsort(scores, dim=1)
    ids_restore = torch.argsort(shuffle_order, dim=1)

    kept_ids = shuffle_order[:, :num_keep]
    x_masked = torch.gather(x, dim=1, index=kept_ids.unsqueeze(-1).expand(-1, -1, dim))

    # Build the mask in shuffled order, then map it back to the original order.
    shuffled_mask = torch.ones([batch, seq_len], device=x.device)
    shuffled_mask[:, :num_keep] = 0
    mask = torch.gather(shuffled_mask, dim=1, index=ids_restore)

    return x_masked, mask, ids_restore


def unmask_tokens(x, ids_restore, mask_token):
    """
    Pad x with copies of mask_token up to the full length and undo the shuffle.
    x: [N, T, D] if extras == 0 (i.e., no cls token) else x: [N, T+1, D]
    """
    num_missing = ids_restore.shape[1] - x.shape[1]
    filler = mask_token.repeat(x.shape[0], num_missing, 1)
    padded = torch.cat([x, filler], dim=1)
    gather_index = ids_restore.unsqueeze(-1).expand(-1, -1, padded.shape[2])
    return torch.gather(padded, dim=1, index=gather_index)  # unshuffle


# Parse 'None' to None and others to float value
def parse_float_none(s):
    """Convert the string 'None' to None and any other string to a float."""
    assert isinstance(s, str)
    if s == 'None':
        return None
    return float(s)


#----------------------------------------------------------------------------
# Parse a comma separated list of numbers or ranges and return a list of ints.
# Example: '1,2,5-10' returns [1, 2, 5, 6, 7, 8, 9, 10]

def parse_int_list(s):
    """
    Expand a comma separated string of integers and inclusive 'a-b' ranges into
    a list of ints. A list argument is returned unchanged.
    """
    if isinstance(s, list):
        return s

    values = []
    for piece in s.split(','):
        span = re.match(r'^(\d+)-(\d+)$', piece)
        if span is None:
            values.append(int(piece))
            continue
        first, last = int(span.group(1)), int(span.group(2))
        values.extend(range(first, last + 1))
    return values


def init_processes(fn, args):
    """
    Set up the distributed environment, run `fn(args)`, then tear down.

    MASTER_ADDR is taken from `args.master_address`; MASTER_PORT is a random
    integer in [2000, 6000] drawn in this process. The NCCL process group is
    initialised from those environment variables, and `cleanup()` is called
    afterwards only when `args.global_size > 1`.
    """
    master_addr = args.master_address
    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = str(random.randint(2000, 6000))
    for key in ('MASTER_ADDR', 'MASTER_PORT'):
        print(f'{key} = {os.environ[key]}')

    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(
        backend='nccl',
        init_method='env://',
        rank=args.global_rank,
        world_size=args.global_size,
    )

    fn(args)

    if args.global_size > 1:
        cleanup()


def mprint(*args, **kwargs):
    """
    Forward the arguments to `print`, but only on the rank-0 process.
    """
    if dist.get_rank() != 0:
        return
    print(*args, **kwargs)


def cleanup():
    """
    Finish a distributed run: synchronise, report, and destroy the process group.
    """
    # First barrier: every rank has reached the end before anything is reported.
    dist.barrier()
    mprint("Done!")
    # Second barrier: no rank tears the group down while another is still reporting.
    dist.barrier()
    dist.destroy_process_group()


#----------------------------------------------------------------------------
# logging info.
class Logger(object):
    """
    Redirect stderr to stdout, optionally print stdout to a file,
    and optionally force flushing on both stdout and the file.

    Creating an instance replaces `sys.stdout` and `sys.stderr` with the
    instance; `close()` (or leaving a `with` block) restores them.
    """

    def __init__(self, file_name=None, file_mode="w", should_flush=True):
        """
        Args:
            file_name: path of the log file, or None to mirror to stdout only.
            file_mode: mode passed to `open` for the log file.
            should_flush: flush stdout and the file after every write.
        """
        self.file = None

        if file_name is not None:
            self.file = open(file_name, file_mode)

        self.should_flush = should_flush
        # Remember the streams being replaced so close() can restore them.
        self.stdout = sys.stdout
        self.stderr = sys.stderr

        sys.stdout = self
        sys.stderr = self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def write(self, text):
        """Write text to stdout (and a file) and optionally flush."""
        if len(text) == 0: # workaround for a bug in VSCode debugger: sys.stdout.write(''); sys.stdout.flush() => crash
            return

        if self.file is not None:
            self.file.write(text)

        self.stdout.write(text)

        if self.should_flush:
            self.flush()

    def flush(self):
        """Flush written text to both stdout and a file, if open."""
        if self.file is not None:
            self.file.flush()

        self.stdout.flush()

    def close(self):
        """
        Flush, close possible files, and remove stdout/stderr mirroring.

        Safe to call more than once: after the first call the file handle is
        dropped, so later `close()`, `flush()` or `write()` calls no longer touch
        the closed file.
        """
        self.flush()

        # if using multiple loggers, prevent closing in wrong order
        if sys.stdout is self:
            sys.stdout = self.stdout
        if sys.stderr is self:
            sys.stderr = self.stderr

        if self.file is not None:
            self.file.close()
            # Forget the closed handle so repeated close()/write() calls do not
            # raise "I/O operation on closed file".
            self.file = None


class StackedRandomGenerator:
    """
    A batch of independent `torch.Generator`s, one per seed.

    Every sampling method draws one slice per generator and stacks the slices
    along a new leading dimension, so `size[0]` must equal the number of seeds.
    """

    def __init__(self, device, seeds):
        super().__init__()
        self.generators = []
        for seed in seeds:
            generator = torch.Generator(device)
            # Seeds are reduced modulo 2**32 before being applied.
            generator.manual_seed(int(seed) % (1 << 32))
            self.generators.append(generator)

    def randn(self, size, **kwargs):
        """Stack one `torch.randn(size[1:])` draw per generator."""
        assert size[0] == len(self.generators)
        per_sample_shape = size[1:]
        draws = []
        for generator in self.generators:
            draws.append(torch.randn(per_sample_shape, generator=generator, **kwargs))
        return torch.stack(draws)

    def randn_like(self, input):
        """Like `randn`, taking shape, dtype, layout and device from `input`."""
        return self.randn(
            input.shape,
            dtype=input.dtype,
            layout=input.layout,
            device=input.device,
        )

    def randint(self, *args, size, **kwargs):
        """Stack one `torch.randint(*args, size=size[1:])` draw per generator."""
        assert size[0] == len(self.generators)
        per_sample_shape = size[1:]
        draws = []
        for generator in self.generators:
            draws.append(torch.randint(*args, size=per_sample_shape, generator=generator, **kwargs))
        return torch.stack(draws)


def prepare_prompt_ar(prompt, ratios, device='cpu', show=True):
    """
    Split size flags off a prompt and resolve them against the `ratios` table.

    Recognised flags are `--aspect_ratio h:w`, `--ar h:w` and `--hw h:w`. The
    aspect ratio (default 1.0) selects the key of `ratios` whose float value is
    closest; `--hw` overrides only the custom size.

    Returns a 5-tuple: the prompt text before the first flag, a printable
    summary string, the table size for the closest ratio as a (1, 2) tensor,
    the closest ratio as a (1, 1) tensor, and the custom size as a (1, 2) tensor.
    """
    def _as_ratio(spec):
        # spec is 'h:w'; the regexes below guarantee exactly one colon.
        numerator, denominator = spec.split(':')
        return float(numerator) / float(denominator)

    # get aspect_ratio or ar
    aspect_ratios = re.findall(r"--aspect_ratio\s+(\d+:\d+)", prompt)
    ars = re.findall(r"--ar\s+(\d+:\d+)", prompt)
    hw_specs = re.findall(r"--hw\s+(\d+:\d+)", prompt)
    if show:
        print("aspect_ratios:", aspect_ratios, "ars:", ars, "hws:", hw_specs)

    # Everything before the first flag is the actual prompt.
    prompt_clean = prompt.partition("--aspect_ratio")[0].partition("--ar")[0].partition("--hw")[0]

    no_flags = len(aspect_ratios) + len(ars) + len(hw_specs) == 0
    if no_flags and show:
        print( "Wrong prompt format. Set to default ar: 1. change your prompt into format '--ar h:w or --hw h:w' for correct generating")

    # --aspect_ratio takes precedence over --ar.
    if aspect_ratios:
        ar = _as_ratio(aspect_ratios[0])
    elif ars:
        ar = _as_ratio(ars[0])
    else:
        ar = 1.

    closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
    default_hw = ratios[closest_ratio]

    if hw_specs:
        height, width = hw_specs[0].split(':')
        custom_hw = [float(height), float(width)]
    else:
        custom_hw = default_hw

    prompt_show = f'prompt: {prompt_clean.strip()}\nSize: --ar {closest_ratio}, --bin hw {ratios[closest_ratio]}, --custom hw {custom_hw}'
    default_hw_t = torch.tensor(default_hw, device=device)[None]
    ratio_t = torch.tensor([float(closest_ratio)], device=device)[None]
    custom_hw_t = torch.tensor(custom_hw, device=device)[None]
    return prompt_clean, prompt_show, default_hw_t, ratio_t, custom_hw_t


def resize_and_crop_tensor(samples: torch.Tensor, new_width: int, new_height: int):
    """
    Resize `samples` so it covers the requested size, then center-crop to it.

    Args:
        samples: tensor whose dimensions 2 and 3 are read as height and width.
        new_width: target width in pixels.
        new_height: target height in pixels.

    Returns:
        `samples` itself when it already has the requested height and width,
        otherwise the resized and center-cropped tensor.
    """
    orig_h, orig_w = int(samples.shape[2]), int(samples.shape[3])
    target_h, target_w = int(new_height), int(new_width)

    # Resize whenever EITHER side differs; requiring both sides to differ would
    # silently return the input unchanged when only one of them is wrong.
    if (orig_h, orig_w) == (target_h, target_w):
        return samples

    # Scale by the larger factor so both sides end up at least as big as the target.
    ratio = max(target_h / orig_h, target_w / orig_w)
    # Float truncation can land one pixel short of the target, which would make
    # the crop below pad; clamp to the target to rule that out.
    resized_height = max(target_h, int(orig_h * ratio))
    resized_width = max(target_w, int(orig_w * ratio))

    transform = T.Compose([
        T.Resize((resized_height, resized_width)),
        T.CenterCrop([target_h, target_w])
    ])
    return transform(samples)


def resize_and_crop_img(img: Image, new_width, new_height):
    """
    Scale `img` (keeping its aspect ratio) until it covers new_width x new_height,
    then crop the centered box of that size and return it.
    """
    orig_width, orig_height = img.size

    # The larger of the two factors guarantees both sides reach the target.
    scale = max(new_width/orig_width, new_height/orig_height)
    scaled_width = int(orig_width * scale)
    scaled_height = int(orig_height * scale)
    scaled = img.resize((scaled_width, scaled_height), Image.LANCZOS)

    # Centered crop box as (left, top, right, bottom).
    crop_box = (
        (scaled_width - new_width)/2,
        (scaled_height - new_height)/2,
        (scaled_width + new_width)/2,
        (scaled_height + new_height)/2,
    )
    return scaled.crop(crop_box)


def mask_feature(emb, mask):
    """
    Apply `mask` along dimension 2 of `emb`.

    For a batch of one, `emb` is truncated to its first `mask.sum()` positions and
    that count is returned with it. For larger batches, `emb` is multiplied by the
    mask (broadcast over dimensions 1 and 3) and the full length `emb.shape[2]`
    is returned.
    """
    if emb.shape[0] != 1:
        return emb * mask[:, None, :, None], emb.shape[2]

    keep_index = mask.sum().item()
    return emb[:, :, :keep_index, :], keep_index

================================================
FILE: PixArt-alpha-ToCa/diffusion/sa_sampler.py
================================================
"""SAMPLING ONLY."""

import torch
import numpy as np

from diffusion.model.sa_solver import NoiseScheduleVP, model_wrapper, SASolver
from .model import gaussian_diffusion as gd


class SASolverSampler(object):
    """
    Sampling wrapper that runs `SASolver` with classifier-free guidance.

    The constructor precomputes the cumulative product of alphas for the named
    beta schedule; `sample` builds the noise schedule, wraps the model and runs
    the solver in 'few_steps' mode.
    """

    def __init__(self, model,
                 noise_schedule="linear",
                 diffusion_steps=1000,
                 device='cpu',
                 ):
        """
        Args:
            model: callable handed to `model_wrapper` as the noise-prediction model.
            noise_schedule: schedule name passed to `gd.get_named_beta_schedule`.
            diffusion_steps: number of diffusion steps of that schedule.
            device: device for the schedule buffer, the initial noise and the result.
        """
        super().__init__()
        self.model = model
        self.device = device
        betas = torch.tensor(gd.get_named_beta_schedule(noise_schedule, diffusion_steps))
        # Stay in torch for the cumulative product instead of routing a tensor
        # through numpy's array-wrapping fallback.
        alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
        self.register_buffer('alphas_cumprod', alphas_cumprod.detach().to(torch.float32))

    def register_buffer(self, name, attr):
        """Store `attr` on the sampler as `name`; tensors are moved to `self.device`."""
        if isinstance(attr, torch.Tensor):
            # Follow the configured device rather than unconditionally moving to
            # CUDA, which failed on CPU-only hosts and ignored the `device` argument.
            attr = attr.to(self.device)
        setattr(self, name, attr)

    @torch.no_grad()
    def sample(self, S, batch_size, shape, conditioning=None, callback=None, normals_sequence=None, img_callback=None, quantize_x0=False, eta=0., mask=None, x0=None, temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, verbose=True, x_T=None, log_every_t=100, unconditional_guidance_scale=1., unconditional_conditioning=None, model_kwargs=None, **kwargs):
        """
        Draw `batch_size` samples of the given `shape` with `S` solver steps.

        Args used by this implementation:
            S: number of solver steps.
            batch_size: number of samples.
            shape: (C, H, W) of one sample.
            conditioning: condition for guidance; a tensor or a dict of tensors.
                Only its batch size is checked here (a warning is printed on mismatch).
            eta: value returned by the tau function for 0.2 <= t <= 0.8 (0 elsewhere).
            x_T: optional initial noise; drawn with `torch.randn` when None.
            unconditional_guidance_scale: classifier-free guidance scale.
            unconditional_conditioning: condition used for the unconditional branch.
            model_kwargs: extra keyword arguments forwarded to the model.
        The remaining parameters are accepted but not used in this method.

        Returns:
            (samples moved to `self.device`, None)
        """
        if model_kwargs is None:
            model_kwargs = {}
        if conditioning is not None:
            if isinstance(conditioning, dict):
                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
                if cbs != batch_size:
                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
            elif conditioning.shape[0] != batch_size:
                print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)

        device = self.device
        img = torch.randn(size, device=device) if x_T is None else x_T
        ns = NoiseScheduleVP('discrete', alphas_cumprod=self.alphas_cumprod)

        model_fn = model_wrapper(
            self.model,
            ns,
            model_type="noise",
            guidance_type="classifier-free",
            condition=conditioning,
            unconditional_condition=unconditional_conditioning,
            guidance_scale=unconditional_guidance_scale,
            model_kwargs=model_kwargs,
        )

        sasolver = SASolver(model_fn, ns, algorithm_type="data_prediction")

        # Stochasticity is only enabled in the middle of the time range.
        tau_t = lambda t: eta if 0.2 <= t <= 0.8 else 0

        x = sasolver.sample(mode='few_steps', x=img, tau=tau_t, steps=S, skip_type='time', skip_order=1, predictor_order=2, corrector_order=2, pc_mode='PEC', return_intermediate=False)

        return x.to(device), None

================================================
FILE: PixArt-alpha-ToCa/diffusion/sa_solver_diffusers.py
================================================
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DISCLAIMER: check https://arxiv.org/abs/2309.05019
# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py

import math
from typing import List, Optional, Tuple, Union, Callable

import numpy as np
import torch

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.utils.torch_utils import randn_tensor
from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput


# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
        num_diffusion_timesteps,
        max_beta=0.999,
        alpha_transform_type="cosine",
):
    """
    Discretize an alpha_bar(t) function, i.e. the cumulative product of (1 - beta)
    over t in [0, 1], into a beta schedule.

    For step i the beta is `1 - alpha_bar((i + 1) / n) / alpha_bar(i / n)`, capped
    at `max_beta`.

    Args:
        num_diffusion_timesteps (`int`): the number of betas to produce.
        max_beta (`float`): the maximum beta to use; use values lower than 1 to
                     prevent singularities.
        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
                     Choose from `cosine` or `exp`

    Returns:
        betas (`torch.Tensor`): float32 tensor with the betas used by the scheduler
            to step the model outputs

    Raises:
        ValueError: if `alpha_transform_type` is neither `cosine` nor `exp`.
    """
    def _cosine_alpha_bar(t):
        return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    def _exp_alpha_bar(t):
        return math.exp(t * -12.0)

    if alpha_transform_type == "cosine":
        alpha_bar_fn = _cosine_alpha_bar
    elif alpha_transform_type == "exp":
        alpha_bar_fn = _exp_alpha_bar
    else:
        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    total = num_diffusion_timesteps
    betas = [
        min(1 - alpha_bar_fn((step + 1) / total) / alpha_bar_fn(step / total), max_beta)
        for step in range(total)
    ]
    return torch.tensor(betas, dtype=torch.float32)


class SASolverScheduler(SchedulerMixin, ConfigMixin):
    """
    `SASolverScheduler` is a fast dedicated high-order solver for diffusion SDEs.

    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
    methods the library implements for all schedulers such as loading and saving.

    Args:
        num_train_timesteps (`int`, defaults to 1000):
            The number of diffusion steps to train the model.
        beta_start (`float`, defaults to 0.0001):
            The starting `beta` value of inference.
        beta_end (`float`, defaults to 0.02):
            The final `beta` value.
        beta_schedule (`str`, defaults to `"linear"`):
            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
        trained_betas (`np.ndarray`, *optional*):
            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
        predictor_order (`int`, defaults to 2):
            The predictor order which can be `1` or `2` or `3` or '4'. It is recommended to use `predictor_order=2` for guided
            sampling, and `predictor_order=3` for unconditional sampling.
        corrector_order (`int`, defaults to 2):
            The corrector order which can be `1` or `2` or `3` or '4'. It is recommended to use `corrector_order=2` for guided
            sampling, and `corrector_order=3` for unconditional sampling.
        predictor_corrector_mode (`str`, defaults to `PEC`):
            The predictor-corrector mode can be `PEC` or 'PECE'. It is recommended to use `PEC` mode for fast
            sampling, and `PECE` for high-quality sampling (PECE needs around twice model evaluations as PEC).
        prediction_type (`str`, defaults to `epsilon`, *optional*):
            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
            Video](https://imagen.research.google/video/paper.pdf) paper).
        thresholding (`bool`, defaults to `False`):
            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
            as Stable Diffusion.
        dynamic_thresholding_ratio (`float`, defaults to 0.995):
            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
        sample_max_value (`float`, defaults to 1.0):
            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
            `algorithm_type="dpmsolver++"`.
        algorithm_type (`str`, defaults to `data_prediction`):
            Algorithm type for the solver; can be `data_prediction` or `noise_prediction`. It is recommended to use `data_prediction`
            with `solver_order=2` for guided sampling like in Stable Diffusion.
        lower_order_final (`bool`, defaults to `True`):
            Whether to use lower-order solvers in the final steps. Default = True.
        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
            the sigmas are determined according to a sequence of noise levels {σi}.
        lambda_min_clipped (`float`, defaults to `-inf`):
            Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
            cosine (`squaredcos_cap_v2`) noise schedule.
        variance_type (`str`, *optional*):
            Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output
            contains the predicted Gaussian variance.
        timestep_spacing (`str`, defaults to `"linspace"`):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
            An offset added to the inference steps. You can use a combination of `offset=1` and
            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
            Diffusion.
    """

    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
    order = 1

    @register_to_config
    def __init__(
            self,
            num_train_timesteps: int = 1000,
            beta_start: float = 0.0001,
            beta_end: float = 0.02,
            beta_schedule: str = "linear",
            trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
            predictor_order: int = 2,
            corrector_order: int = 2,
            predictor_corrector_mode: str = 'PEC',
            prediction_type: str = "epsilon",
            tau_func: Callable = lambda t: 1 if t >= 200 and t <= 800 else 0,
            thresholding: bool = False,
            dynamic_thresholding_ratio: float = 0.995,
            sample_max_value: float = 1.0,
            algorithm_type: str = "data_prediction",
            lower_order_final: bool = True,
            use_karras_sigmas: Optional[bool] = False,
            lambda_min_clipped: float = -float("inf"),
            variance_type: Optional[str] = None,
            timestep_spacing: str = "linspace",
            steps_offset: int = 0,
    ):
        """Build the beta/alpha tables and reset the solver's running state."""
        # Pick the beta table: explicit betas win over any named schedule.
        if trained_betas is not None:
            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
        elif beta_schedule == "linear":
            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
        elif beta_schedule == "scaled_linear":
            # this schedule is very specific to the latent diffusion model:
            # linear in sqrt(beta), then squared.
            sqrt_betas = torch.linspace(
                beta_start ** 0.5, beta_end ** 0.5, num_train_timesteps, dtype=torch.float32
            )
            self.betas = sqrt_betas ** 2
        elif beta_schedule == "squaredcos_cap_v2":
            # Glide cosine schedule
            self.betas = betas_for_alpha_bar(num_train_timesteps)
        else:
            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")

        self.alphas = 1.0 - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        # Currently we only support VP-type noise schedule
        self.alpha_t = torch.sqrt(self.alphas_cumprod)
        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
        # lambda_t is the log of alpha_t / sigma_t.
        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)

        # standard deviation of the initial noise distribution
        self.init_noise_sigma = 1.0

        if algorithm_type not in ["data_prediction", "noise_prediction"]:
            raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")

        # setable values
        self.num_inference_steps = None
        descending_steps = np.linspace(
            0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32
        )[::-1].copy()
        self.timesteps = torch.from_numpy(descending_steps)

        # Two separate history lists of the same length: timesteps and model outputs.
        history_length = max(predictor_order, corrector_order - 1)
        self.timestep_list = [None] * history_length
        self.model_outputs = [None] * history_length

        self.tau_func = tau_func
        self.predict_x0 = algorithm_type == "data_prediction"
        self.lower_order_nums = 0
        self.last_sample = None

    def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
        """
        Sets the discrete timesteps used for the diffusion chain (to be run before inference).

        Also stores `self.sigmas`, and resets the model-output history, the
        lower-order counter and the last sample.

        Args:
            num_inference_steps (`int`):
                The number of diffusion steps used when generating samples with a pre-trained model.
            device (`str` or `torch.device`, *optional*):
                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        """
        cfg = self.config

        # Clipping the minimum of all lambda(t) for numerical stability.
        # This is critical for cosine (squaredcos_cap_v2) noise schedule.
        clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), cfg.lambda_min_clipped)
        last_timestep = ((cfg.num_train_timesteps - clipped_idx).numpy()).item()

        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
        spacing = cfg.timestep_spacing
        if spacing == "linspace":
            grid = np.linspace(0, last_timestep - 1, num_inference_steps + 1).round()
            timesteps = grid[::-1][:-1].copy().astype(np.int64)
        elif spacing == "leading":
            step_ratio = last_timestep // (num_inference_steps + 1)
            # creates integer timesteps by multiplying by ratio
            # casting to int to avoid issues when num_inference_step is power of 3
            grid = (np.arange(0, num_inference_steps + 1) * step_ratio).round()
            timesteps = grid[::-1][:-1].copy().astype(np.int64)
            timesteps += cfg.steps_offset
        elif spacing == "trailing":
            step_ratio = cfg.num_train_timesteps / num_inference_steps
            # creates integer timesteps by multiplying by ratio
            # casting to int to avoid issues when num_inference_step is power of 3
            timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64)
            timesteps -= 1
        else:
            raise ValueError(
                f"{spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
            )

        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
        if cfg.use_karras_sigmas:
            # Replace both the sigmas and the timesteps by the Karras schedule.
            log_sigmas = np.log(sigmas)
            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
            karras_steps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
            timesteps = np.flip(karras_steps).copy().astype(np.int64)

        self.sigmas = torch.from_numpy(sigmas)

        # when num_inference_steps == num_train_timesteps, we can end up with
        # duplicates in timesteps; keep the first occurrence of each, in order.
        _, first_occurrence = np.unique(timesteps, return_index=True)
        timesteps = timesteps[np.sort(first_occurrence)]

        self.timesteps = torch.from_numpy(timesteps).to(device)
        self.num_inference_steps = len(timesteps)

        history_length = max(cfg.predictor_order, cfg.corrector_order - 1)
        self.model_outputs = [None] * history_length
        self.lower_order_nums = 0
        self.last_sample = None

    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
    def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
        """
        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
        photorealism as well as better image-text alignment, especially when using very large guidance weights."

        https://arxiv.org/abs/2205.11487
        """
        dtype = sample.dtype
        batch_size, channels, height, width = sample.shape

        if dtype not in (torch.float32, torch.float64):
            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half

        # Flatten sample for doing quantile calculation along each image
        sample = sample.reshape(batch_size, channels * height * width)

        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"

        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
        s = torch.clamp(
            s, min=1, max=self.config.sample_max_value
        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]

        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"

        sample = sample.reshape(batch_size, channels, height, width)
        sample = sample.to(dtype)

        return sample

    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
    def _sigma_to_t(self, sigma, log_sigmas):
        # get log sigma
        log_sigma = np.log(sigma)

        # get distribution
        dists = log_sigma - log_sigmas[:, np.newaxis]

        # get sigmas range
        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
        high_idx = low_idx + 1

        low = log_sigmas[low_idx]
        high = log_sigmas[high_idx]

        # interpolate sigmas
        w = (low - log_sigma) / (low - high)
        w = np.clip(w, 0, 1)

        # transform interpolation to time range
        t = (1 - w) * low_idx + w * high_idx
        t = t.reshape(sigma.shape)
        return t

    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
    def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor:
        """Constructs the noise schedule of Karras et al. (2022)."""

        sigma_min: float = in_sigmas[-1].item()
        sigma_max: float = in_sigmas[0].item()

        rho = 7.0  # 7.0 is the value used in the paper
        ramp = np.linspace(0, 1, num_inference_steps)
        min_inv_rho = sigma_min ** (1 / rho)
        max_inv_rho = sigma_max ** (1 / rho)
        return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

    def convert_model_output(
            self, model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor
    ) -> torch.FloatTensor:
        """
        Convert the raw model output to the quantity the configured SA-Solver variant integrates: the data
        prediction (`x0`) for `algorithm_type="data_prediction"`, or the noise prediction (`epsilon`) for
        `algorithm_type="noise_prediction"`.

        <Tip>

        The algorithm and model type are decoupled: either algorithm type can be combined with an `epsilon`,
        `sample` or `v_prediction` model.

        </Tip>

        Args:
            model_output (`torch.FloatTensor`):
                The direct output from the learned diffusion model.
            timestep (`int`):
                The current discrete timestep in the diffusion chain; used to index `self.alpha_t` / `self.sigma_t`.
            sample (`torch.FloatTensor`):
                A current instance of a sample created by the diffusion process.

        Returns:
            `torch.FloatTensor`:
                The converted model output.

        Raises:
            ValueError: If `prediction_type` or `algorithm_type` is not one of the supported values.
        """
        learned_variance = self.config.variance_type in ["learned", "learned_range"]

        # SA-Solver_data_prediction needs to solve an integral of the data prediction model.
        if self.config.algorithm_type in ["data_prediction"]:
            if self.config.prediction_type == "epsilon":
                # SA-Solver only needs the "mean" output. A learned-variance model stacks its variance channels
                # after the mean along dim 1, so keep exactly as many channels as the sample has (this used to
                # be a hard-coded 3, which only worked for 3-channel samples).
                if learned_variance:
                    model_output = model_output[:, : sample.shape[1]]
                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                x0_pred = (sample - sigma_t * model_output) / alpha_t
            elif self.config.prediction_type == "sample":
                x0_pred = model_output
            elif self.config.prediction_type == "v_prediction":
                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                x0_pred = alpha_t * sample - sigma_t * model_output
            else:
                raise ValueError(
                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
                    " `v_prediction` for the SASolverScheduler."
                )

            if self.config.thresholding:
                x0_pred = self._threshold_sample(x0_pred)

            return x0_pred

        # SA-Solver_noise_prediction needs to solve an integral of the noise prediction model.
        elif self.config.algorithm_type in ["noise_prediction"]:
            if self.config.prediction_type == "epsilon":
                # SA-Solver only needs the "mean" output (see the note in the data-prediction branch).
                if learned_variance:
                    epsilon = model_output[:, : sample.shape[1]]
                else:
                    epsilon = model_output
            elif self.config.prediction_type == "sample":
                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                epsilon = (sample - alpha_t * model_output) / sigma_t
            elif self.config.prediction_type == "v_prediction":
                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                epsilon = alpha_t * model_output + sigma_t * sample
            else:
                raise ValueError(
                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
                    " `v_prediction` for the SASolverScheduler."
                )

            if self.config.thresholding:
                # Threshold in x0 space, then map the clipped x0 back to a noise prediction.
                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                x0_pred = (sample - sigma_t * epsilon) / alpha_t
                x0_pred = self._threshold_sample(x0_pred)
                epsilon = (sample - alpha_t * x0_pred) / sigma_t

            return epsilon

        # Fail loudly instead of silently returning None for an unknown algorithm type.
        raise ValueError(
            f"algorithm_type given as {self.config.algorithm_type} must be one of `data_prediction` or"
            " `noise_prediction` for the SASolverScheduler."
        )

    def get_coefficients_exponential_negative(self, order, interval_start, interval_end):
        """
        Closed-form value of the integral of exp(-x) * x^order dx taken from
        interval_start to interval_end, for order 0, 1, 2 or 3.
        """
        assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"

        a, b = interval_start, interval_end
        # Factors shared by every order: exp(-b) and exp(b - a).
        decay = torch.exp(-b)
        growth = torch.exp(b - a)

        if order == 0:
            return decay * (growth - 1)
        if order == 1:
            return decay * ((a + 1) * growth - (b + 1))
        if order == 2:
            return decay * ((a ** 2 + 2 * a + 2) * growth - (b ** 2 + 2 * b + 2))
        if order == 3:
            return decay * (
                (a ** 3 + 3 * a ** 2 + 6 * a + 6) * growth
                - (b ** 3 + 3 * b ** 2 + 6 * b + 6)
            )

    def get_coefficients_exponential_positive(self, order, interval_start, interval_end, tau):
        """
        Closed-form value of the integral of exp(x(1+tau^2)) * x^order dx taken
        from interval_start to interval_end, for order 0, 1, 2 or 3.
        """
        assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"

        # Change of variable y = (1 + tau^2) * x; `upper`/`lower` are the bounds in y.
        rate = 1 + tau ** 2
        upper = rate * interval_end
        lower = rate * interval_start

        # Factors shared by every order.
        growth = torch.exp(upper)
        decay = torch.exp(-(upper - lower))

        if order == 0:
            return growth * (1 - decay) / rate
        if order == 1:
            return growth * ((upper - 1) - (lower - 1) * decay) / (rate ** 2)
        if order == 2:
            return growth * (
                (upper ** 2 - 2 * upper + 2) - (lower ** 2 - 2 * lower + 2) * decay
            ) / (rate ** 3)
        if order == 3:
            return growth * (
                (upper ** 3 - 3 * upper ** 2 + 6 * upper - 6)
                - (lower ** 3 - 3 * lower ** 2 + 6 * lower - 6) * decay
            ) / (rate ** 4)

    def lagrange_polynomial_coefficient(self, order, lambda_list):
        """
        Monomial coefficients of the Lagrange basis polynomials on the nodes in `lambda_list`.

        Row i of the result describes the basis polynomial attached to `lambda_list[i]`, with coefficients
        listed from the highest power (x^order) down to the constant term.
        """

        assert order in [0, 1, 2, 3]
        assert order == len(lambda_list) - 1

        if order == 0:
            return [[1]]

        if order == 1:
            l0, l1 = lambda_list
            return [[1 / (l0 - l1), -l1 / (l0 - l1)],
                    [1 / (l1 - l0), -l0 / (l1 - l0)]]

        if order == 2:
            l0, l1, l2 = lambda_list
            denominator1 = (l0 - l1) * (l0 - l2)
            denominator2 = (l1 - l0) * (l1 - l2)
            denominator3 = (l2 - l0) * (l2 - l1)
            return [
                [1 / denominator1, (-l1 - l2) / denominator1, l1 * l2 / denominator1],
                [1 / denominator2, (-l0 - l2) / denominator2, l0 * l2 / denominator2],
                [1 / denominator3, (-l0 - l1) / denominator3, l0 * l1 / denominator3],
            ]

        if order == 3:
            l0, l1, l2, l3 = lambda_list
            denominator1 = (l0 - l1) * (l0 - l2) * (l0 - l3)
            denominator2 = (l1 - l0) * (l1 - l2) * (l1 - l3)
            denominator3 = (l2 - l0) * (l2 - l1) * (l2 - l3)
            denominator4 = (l3 - l0) * (l3 - l1) * (l3 - l2)
            return [
                [1 / denominator1,
                 (-l1 - l2 - l3) / denominator1,
                 (l1 * l2 + l1 * l3 + l2 * l3) / denominator1,
                 (-l1 * l2 * l3) / denominator1],

                [1 / denominator2,
                 (-l0 - l2 - l3) / denominator2,
                 (l0 * l2 + l0 * l3 + l2 * l3) / denominator2,
                 (-l0 * l2 * l3) / denominator2],

                [1 / denominator3,
                 (-l0 - l1 - l3) / denominator3,
                 (l0 * l1 + l0 * l3 + l1 * l3) / denominator3,
                 (-l0 * l1 * l3) / denominator3],

                [1 / denominator4,
                 (-l0 - l1 - l2) / denominator4,
                 (l0 * l1 + l0 * l2 + l1 * l2) / denominator4,
                 (-l0 * l1 * l2) / denominator4],
            ]

    def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, tau):
        assert order in [1, 2, 3, 4]
        assert order == len(lambda_list), 'the length of lambda list must be equal to the order'
        coefficients = []
        lagrange_coefficient = self.lagrange_polynomial_coefficient(order - 1, lambda_list)
        for i in range(order):
            coefficient = sum(
                lagrange_coefficient[i][j]
                * self.get_coefficients_exponential_positive(
                    order - 1 - j, interval_start, interval_end, tau
                )
                if self.predict_x0
                else lagrange_coefficient[i][j]
                * self.get_coefficients_exponential_negative(
                    order - 1 - j, interval_start, interval_end
                )
                for j in range(order)
            )
            coefficients.append(coefficient)
        assert len(coefficients) == order, 'the length of coefficients does not match the order'
        return coefficients

    def stochastic_adams_bashforth_update(
            self,
            model_output: torch.FloatTensor,
            prev_timestep: int,
            sample: torch.FloatTensor,
            noise: torch.FloatTensor,
            order: int,
            tau: torch.FloatTensor,
    ) -> torch.FloatTensor:
        """
        One step for the SA-Predictor.

        Args:
            model_output (`torch.FloatTensor`):
                The direct output from the learned diffusion model at the current timestep.
            prev_timestep (`int`):
                The previous discrete timestep in the diffusion chain.
            sample (`torch.FloatTensor`):
                A current instance of a sample created by the diffusion process.
            order (`int`):
                The order of SA-Predictor at this timestep.

        Returns:
            `torch.FloatTensor`:
                The sample tensor at the previous timestep.
        """

        assert noise is not None
        timestep_list = self.timestep_list
        model_output_list = self.model_outputs
        s0, t = self.timestep_list[-1], prev_timestep
        lambda_t, lambda_s0 = self.lambda_t[t], self.lambda_t[s0]
        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
        gradient_part = torch.zeros_like(sample)
        h = lambda_t - lambda_s0
        lambda_list = [self.lambda_t[timestep_list[-(i + 1)]] for i in range(order)]
        gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau)

        x = sample

        if self.predict_x0 and order == 2:
            gradient_coefficients[0] += 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                        h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                            (1 + tau ** 2) ** 2)) / (self.lambda_t[timestep_list[-1]] - self.lambda_t[
                timestep_list[-2]])
            gradient_coefficients[1] -= 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                        h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                            (1 + tau ** 2) ** 2)) / (self.lambda_t[timestep_list[-1]] - self.lambda_t[
                timestep_list[-2]])

        for i in range(order):
            if self.predict_x0:

                gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
                    i] * model_output_list[-(i + 1)]
            else:
                gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_output_list[-(i + 1)]

        if self.predict_x0:
            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
        else:
            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise

        if self.predict_x0:
            x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
        else:
            x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part

        x_t = x_t.to(x.dtype)
        return x_t

    def stochastic_adams_moulton_update(
            self,
            this_model_output: torch.FloatTensor,
            this_timestep: int,
            last_sample: torch.FloatTensor,
            last_noise: torch.FloatTensor,
            this_sample: torch.FloatTensor,
            order: int,
            tau: torch.FloatTensor,
    ) -> torch.FloatTensor:
        """
        One step for the SA-Corrector.

        Args:
            this_model_output (`torch.FloatTensor`):
                The model outputs at `x_t`.
            this_timestep (`int`):
                The current timestep `t`.
            last_sample (`torch.FloatTensor`):
                The generated sample before the last predictor `x_{t-1}`.
            this_sample (`torch.FloatTensor`):
                The generated sample after the last predictor `x_{t}`.
            order (`int`):
                The order of SA-Corrector at this step.

        Returns:
            `torch.FloatTensor`:
                The corrected sample tensor at the current timestep.
        """

        assert last_noise is not None
        timestep_list = self.timestep_list
        model_output_list = self.model_outputs
        s0, t = self.timestep_list[-1], this_timestep
        lambda_t, lambda_s0 = self.lambda_t[t], self.lambda_t[s0]
        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
        gradient_part = torch.zeros_like(this_sample)
        h = lambda_t - lambda_s0
        t_list = timestep_list + [this_timestep]
        lambda_list = [self.lambda_t[t_list[-(i + 1)]] for i in range(order)]
        model_prev_list = model_output_list + [this_model_output]

        gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau)

        x = last_sample

        if self.predict_x0 and order == 2:
            gradient_coefficients[0] += 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                        h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                            (1 + tau ** 2) ** 2 * h))
            gradient_coefficients[1] -= 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                        h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                            (1 + tau ** 2) ** 2 * h))

        for i in range(order):
            if self.predict_x0:
                gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
                    i] * model_prev_list[-(i + 1)]
            else:
                gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]

        if self.predict_x0:
            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * last_noise
        else:
            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * last_noise

        if self.predict_x0:
            x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
        else:
            x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part

        x_t = x_t.to(x.dtype)
        return x_t

    def step(
            self,
            model_output: torch.FloatTensor,
            timestep: int,
            sample: torch.FloatTensor,
            generator=None,
            return_dict: bool = True,
    ) -> Union[SchedulerOutput, Tuple]:
        """
        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
        the SA-Solver: an optional SA-Corrector pass on the incoming sample, followed by an SA-Predictor step.

        Args:
            model_output (`torch.FloatTensor`):
                The direct output from learned diffusion model.
            timestep (`int`):
                The current discrete timestep in the diffusion chain.
            sample (`torch.FloatTensor`):
                A current instance of a sample created by the diffusion process.
            generator (`torch.Generator`, *optional*):
                A random number generator.
            return_dict (`bool`):
                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.

        Returns:
            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
                tuple is returned where the first element is the sample tensor.

        Raises:
            ValueError: If `set_timesteps` has not been called yet.
        """
        if self.num_inference_steps is None:
            raise ValueError(
                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
            )

        if isinstance(timestep, torch.Tensor):
            timestep = timestep.to(self.timesteps.device)
        # Position of `timestep` in the schedule; an unknown timestep is treated as the final step.
        step_index = (self.timesteps == timestep).nonzero()
        if len(step_index) == 0:
            step_index = len(self.timesteps) - 1
        else:
            step_index = step_index.item()

        # The corrector needs the sample/noise stored by a previous call, so it is skipped on the first step.
        use_corrector = (
                step_index > 0 and self.last_sample is not None
        )

        model_output_convert = self.convert_model_output(model_output, timestep, sample)

        if use_corrector:
            # The history still ends at the previous timestep here, so tau is evaluated at that timestep.
            current_tau = self.tau_func(self.timestep_list[-1])
            sample = self.stochastic_adams_moulton_update(
                this_model_output=model_output_convert,
                this_timestep=timestep,
                last_sample=self.last_sample,
                last_noise=self.last_noise,
                this_sample=sample,
                order=self.this_corrector_order,
                tau=current_tau,
            )

        prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]

        # Shift the multistep history left by one slot and append the current entry.
        for i in range(max(self.config.predictor_order, self.config.corrector_order - 1) - 1):
            self.model_outputs[i] = self.model_outputs[i + 1]
            self.timestep_list[i] = self.timestep_list[i + 1]

        self.model_outputs[-1] = model_output_convert
        self.timestep_list[-1] = timestep

        # Draw the noise with the sample's shape: the raw model output carries extra channels when the model
        # also predicts a variance, and noise of that shape cannot be added to the sample in the update steps.
        noise = randn_tensor(
            sample.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
        )

        if self.config.lower_order_final:
            this_predictor_order = min(self.config.predictor_order, len(self.timesteps) - step_index)
            this_corrector_order = min(self.config.corrector_order, len(self.timesteps) - step_index + 1)
        else:
            this_predictor_order = self.config.predictor_order
            this_corrector_order = self.config.corrector_order

        self.this_predictor_order = min(this_predictor_order, self.lower_order_nums + 1)  # warmup for multistep
        self.this_corrector_order = min(this_corrector_order, self.lower_order_nums + 2)  # warmup for multistep
        assert self.this_predictor_order > 0
        assert self.this_corrector_order > 0

        # Remember the (possibly corrected) sample and the noise for the corrector pass of the next call.
        self.last_sample = sample
        self.last_noise = noise

        current_tau = self.tau_func(self.timestep_list[-1])
        prev_sample = self.stochastic_adams_bashforth_update(
            model_output=model_output_convert,
            prev_timestep=prev_timestep,
            sample=sample,
            noise=noise,
            order=self.this_predictor_order,
            tau=current_tau,
        )

        if self.lower_order_nums < max(self.config.predictor_order, self.config.corrector_order - 1):
            self.lower_order_nums += 1

        if not return_dict:
            return (prev_sample,)

        return SchedulerOutput(prev_sample=prev_sample)

    def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
        """
        No-op kept for interchangeability with schedulers that must scale the denoising model input depending
        on the current timestep. Extra positional and keyword arguments are accepted and ignored.

        Args:
            sample (`torch.FloatTensor`):
                The input sample.

        Returns:
            `torch.FloatTensor`:
                The very same `sample` object, unchanged.
        """
        return sample

    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
    def add_noise(
            self,
            original_samples: torch.FloatTensor,
            noise: torch.FloatTensor,
            timesteps: torch.IntTensor,
    ) -> torch.FloatTensor:
        """
        Mix `original_samples` with `noise` using the weights `sqrt(alphas_cumprod[t])` and
        `sqrt(1 - alphas_cumprod[t])` looked up at `timesteps`.

        Returns the noisy samples, shaped like the broadcast of the inputs.
        """
        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
        timesteps = timesteps.to(original_samples.device)

        target_ndim = len(original_samples.shape)

        def _as_broadcastable(weights):
            # Flatten, then append trailing singleton axes up to the rank of the samples.
            weights = weights.flatten()
            while len(weights.shape) < target_ndim:
                weights = weights.unsqueeze(-1)
            return weights

        alpha_prod = alphas_cumprod[timesteps]
        sqrt_alpha_prod = _as_broadcastable(alpha_prod ** 0.5)
        sqrt_one_minus_alpha_prod = _as_broadcastable((1 - alpha_prod) ** 0.5)

        return sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise

    def __len__(self):
        return self.config.num_train_timesteps

================================================
FILE: PixArt-alpha-ToCa/diffusion/utils/__init__.py
================================================


================================================
FILE: PixArt-alpha-ToCa/diffusion/utils/checkpoint.py
================================================
import os
import re
import torch

from diffusion.utils.logger import get_root_logger


def save_checkpoint(work_dir,
                    epoch,
                    model,
                    model_ema=None,
                    optimizer=None,
                    lr_scheduler=None,
                    keep_last=False,
                    step=None,
                    ):
    """Save a training checkpoint to ``work_dir``.

    The file is named ``epoch_{epoch}.pth``, or ``epoch_{epoch}_step_{step}.pth``
    when ``step`` is given.

    Args:
        work_dir: Directory the checkpoint is written to (created if missing).
        epoch: Epoch number stored in the checkpoint and used in the file name.
        model: Model whose ``state_dict()`` is saved under ``'state_dict'``.
        model_ema: Optional EMA model, saved under ``'state_dict_ema'``.
        optimizer: Optional optimizer, saved under ``'optimizer'``.
        lr_scheduler: Optional LR scheduler, saved under ``'scheduler'``.
        keep_last: If true, checkpoints of earlier epochs found in ``work_dir``
            (with or without a ``_step_`` suffix) are deleted after saving.
        step: Optional step number appended to the file name.

    Raises:
        ValueError: If ``epoch`` is ``None``; the file name cannot be built
            without it.
    """
    if epoch is None:
        raise ValueError('`epoch` must not be None: it is required to name the checkpoint file.')

    os.makedirs(work_dir, exist_ok=True)
    state_dict = dict(state_dict=model.state_dict())
    if model_ema is not None:
        state_dict['state_dict_ema'] = model_ema.state_dict()
    if optimizer is not None:
        state_dict['optimizer'] = optimizer.state_dict()
    if lr_scheduler is not None:
        state_dict['scheduler'] = lr_scheduler.state_dict()
    state_dict['epoch'] = epoch

    # Build the name from its parts so a '.pth' inside work_dir cannot corrupt the path.
    file_name = f"epoch_{epoch}.pth" if step is None else f"epoch_{epoch}_step_{step}.pth"
    file_path = os.path.join(work_dir, file_name)

    logger = get_root_logger()
    torch.save(state_dict, file_path)
    logger.info(f'Saved checkpoint of epoch {epoch} to {file_path}.')

    if keep_last:
        # Remove checkpoints of earlier epochs only. The previous implementation
        # called str.format on an already-formatted path, which returned the
        # current file and deleted the checkpoint that had just been written.
        checkpoint_name = re.compile(r'epoch_(\d+)(?:_step_\d+)?\.pth')
        for name in os.listdir(work_dir):
            match = checkpoint_name.fullmatch(name)
            if match is not None and int(match.group(1)) < epoch:
                os.remove(os.path.join(work_dir, name))


def load_checkpoint(checkpoint,
                    model,
                    model_ema=None,
                    optimizer=None,
                    lr_scheduler=None,
                    load_ema=False,
                    resume_optimizer=True,
                    resume_lr_scheduler=True
                    ):
    """Load weights (and optionally training state) from a checkpoint file.

    Args:
        checkpoint: Path to the checkpoint file.
        model: Model that receives the weights (loaded with ``strict=False``).
        model_ema: Optional EMA model that receives ``'state_dict_ema'``.
        optimizer: Optional optimizer to restore when ``resume_optimizer`` is set.
        lr_scheduler: Optional LR scheduler to restore when
            ``resume_lr_scheduler`` is set.
        load_ema: Load the EMA weights into ``model`` instead of the regular ones.
        resume_optimizer: Whether to restore the optimizer state.
        resume_lr_scheduler: Whether to restore the LR scheduler state.

    Returns:
        ``(epoch, missing, unexpect)`` when ``optimizer`` is given, otherwise
        ``(missing, unexpect)``, where ``missing`` / ``unexpect`` are the key
        lists reported by ``model.load_state_dict``.

    Raises:
        ValueError: If ``optimizer`` is given but the epoch is neither stored
            in the checkpoint nor recoverable from the file name.
    """
    assert isinstance(checkpoint, str)
    ckpt_file = checkpoint
    # NOTE(review): torch.load unpickles the file; only load checkpoints from trusted sources.
    checkpoint = torch.load(ckpt_file, map_location="cpu")

    # A checkpoint without a 'state_dict' entry is treated as a bare state dict
    # (to be compatible with the official checkpoint).
    model_weights = checkpoint.get('state_dict', checkpoint)
    ema_weights = checkpoint.get('state_dict_ema')

    # Drop stored positional embeddings from every weight dict that is present,
    # each one handled independently of the other.
    pos_embed_keys = ('pos_embed', 'base_model.pos_embed', 'model.pos_embed')
    for weights in (model_weights, ema_weights):
        if weights is None:
            continue
        for key in pos_embed_keys:
            weights.pop(key, None)

    state_dict = checkpoint['state_dict_ema'] if load_ema else model_weights
    missing, unexpect = model.load_state_dict(state_dict, strict=False)
    if model_ema is not None:
        model_ema.load_state_dict(checkpoint['state_dict_ema'], strict=False)
    if optimizer is not None and resume_optimizer:
        optimizer.load_state_dict(checkpoint['optimizer'])
    if lr_scheduler is not None and resume_lr_scheduler:
        lr_scheduler.load_state_dict(checkpoint['scheduler'])

    logger = get_root_logger()
    if optimizer is not None:
        epoch = checkpoint.get('epoch')
        if epoch is None:
            # Fall back to the epoch number embedded in the file name. This is
            # only evaluated when needed, and returns the captured number
            # rather than the first character of the path.
            match = re.match(r'.*epoch_(\d+).*\.pth', ckpt_file)
            if match is None:
                raise ValueError(
                    f"Cannot determine the epoch of checkpoint {ckpt_file}: it stores no 'epoch' entry "
                    "and its file name does not contain 'epoch_<number>'."
                )
            epoch = int(match.group(1))
        logger.info(f'Resume checkpoint of epoch {epoch} from {ckpt_file}. Load ema: {load_ema}, '
                    f'resume optimizer： {resume_optimizer}, resume lr scheduler: {resume_lr_scheduler}.')
        return epoch, missing, unexpect
    logger.info(f'Load checkpoint from {ckpt_file}. Load ema: {load_ema}.')
    return missing, unexpect


================================================
FILE: PixArt-alpha-ToCa/diffusion/utils/data_sampler.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import Sequence
from torch.utils.data import BatchSampler, Sampler, Dataset
from random import shuffle, choice
from copy import deepcopy
from diffusion.utils.logger import get_root_logger


class AspectRatioBatchSampler(BatchSampler):
    """A sampler wrapper for grouping images with similar aspect ratio into a same batch.

    Args:
        sampler (Sampler): Base sampler.
        dataset (Dataset): Dataset providing data information through
            ``get_data_info(idx)``, which must return a mapping with
            ``'height'`` and ``'width'``.
        batch_size (int): Size of mini-batch.
        aspect_ratios (dict): The predefined aspect ratios; its keys name the buckets.
        drop_last (bool): If ``True``, the sampler will drop the last batch if
            its size would be less than ``batch_size``.
        config: Optional config object; when given, its ``work_dir`` is used
            for the log file.
        valid_num (int): An aspect ratio is used only when its sample count in
            ``ratio_nums`` is at least this value.
        **kwargs: Must contain ``ratio_nums``, a non-empty mapping from aspect
            ratio to sample count.
    """

    def __init__(self,
                 sampler: Sampler,
                 dataset: Dataset,
                 batch_size: int,
                 aspect_ratios: dict,
                 drop_last: bool = False,
                 config=None,
                 valid_num=0,   # take as valid aspect-ratio when sample number >= valid_num
                 **kwargs) -> None:
        if not isinstance(sampler, Sampler):
            raise TypeError('sampler should be an instance of ``Sampler``, '
                            f'but got {sampler}')
        if not isinstance(batch_size, int) or batch_size <= 0:
            raise ValueError('batch_size should be a positive integer value, '
                             f'but got batch_size={batch_size}')
        self.sampler = sampler
        self.dataset = dataset
        self.batch_size = batch_size
        self.aspect_ratios = aspect_ratios
        self.drop_last = drop_last
        self.ratio_nums_gt = kwargs.get('ratio_nums', None)
        self.config = config
        assert self.ratio_nums_gt
        # buckets for each aspect ratio
        self._aspect_ratio_buckets = {ratio: [] for ratio in aspect_ratios}
        # Bucket keys are compared as strings against the keys of `aspect_ratios`.
        self.current_available_bucket_keys =  [str(k) for k, v in self.ratio_nums_gt.items() if v >= valid_num]
        logger = get_root_logger() if config is None else get_root_logger(os.path.join(config.work_dir, 'train_log.log'))
        logger.warning(f"Using valid_num={valid_num} in config file. Available {len(self.current_available_bucket_keys)} aspect_ratios: {self.current_available_bucket_keys}")

    def __iter__(self) -> Sequence[int]:
        """Yield lists of dataset indices whose samples share the closest aspect ratio."""
        # Start every pass with empty buckets so indices left behind by an
        # earlier (possibly abandoned) pass are never mixed into this one.
        for bucket in self._aspect_ratio_buckets.values():
            del bucket[:]

        for idx in self.sampler:
            data_info = self.dataset.get_data_info(idx)
            height, width =  data_info['height'], data_info['width']
            ratio = height / width
            # find the closest aspect ratio
            closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
            if closest_ratio not in self.current_available_bucket_keys:
                continue
            bucket = self._aspect_ratio_buckets[closest_ratio]
            bucket.append(idx)
            # yield a batch of indices in the same aspect ratio group
            if len(bucket) >= self.batch_size:
                # Empty the bucket before yielding so its state stays
                # consistent even if the consumer stops iterating here.
                batch = bucket[:]
                del bucket[:]
                yield batch

        # yield the rest data and reset the buckets
        for bucket in self._aspect_ratio_buckets.values():
            leftovers = bucket[:]
            # Clear the stored list in place; rebinding a local name would
            # leave the leftovers in the bucket for the next pass.
            del bucket[:]
            for start in range(0, len(leftovers), self.batch_size):
                is_last_chunk = start + self.batch_size >= len(leftovers)
                if is_last_chunk and self.drop_last:
                    break
                yield leftovers[start:start + self.batch_size]


class BalancedAspectRatioBatchSampler(AspectRatioBatchSampler):
    """Aspect-ratio batch sampler that rotates through the eligible buckets.

    Only buckets whose entry in ``ratio_nums`` is at least 3000 take part.
    After a bucket emits a batch its key is parked in
    ``exhausted_bucket_keys`` until every other eligible key has emitted one
    as well. Once the underlying sampler is exhausted, further batches are
    drawn from randomly chosen buckets, which are refilled from
    ``original_buckets`` when they run short.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent sampler and set up bucket state.

        ``ratio_nums`` must be passed as a non-empty keyword argument, and
        ``self.config`` (set by the parent) must expose ``work_dir`` because
        the log file path is derived from it.
        """
        super().__init__(*args, **kwargs)
        # Assign samples to each bucket
        self.ratio_nums_gt = kwargs.get('ratio_nums', None)
        assert self.ratio_nums_gt
        # working buckets, keyed by the float value of every configured ratio
        self._aspect_ratio_buckets = {float(ratio): [] for ratio in self.aspect_ratios.keys()}
        self.original_buckets = {}
        # keys are kept exactly as they appear in ratio_nums; __iter__ compares them
        # with float ratios, so presumably ratio_nums is keyed by floats -- TODO confirm
        self.current_available_bucket_keys =  [k for k, v in self.ratio_nums_gt.items() if v >= 3000]
        self.all_available_keys = deepcopy(self.current_available_bucket_keys)
        self.exhausted_bucket_keys = []
        self.total_batches = len(self.sampler) // self.batch_size
        # per-bucket number of samples recorded so far (capped by ratio_nums in __iter__)
        self._aspect_ratio_count = {}
        for k in self.all_available_keys:
            self._aspect_ratio_count[float(k)] = 0
            self.original_buckets[float(k)] = []
        logger = get_root_logger(os.path.join(self.config.work_dir, 'train_log.log'))
        logger.warning(f"Available {len(self.current_available_bucket_keys)} aspect_ratios: {self.current_available_bucket_keys}")

    def __iter__(self) -> Sequence[int]:
        """Yield batches of indices, each taken from a single bucket.

        Phase 1 walks ``self.sampler`` and emits a batch whenever a currently
        available bucket holds exactly ``batch_size`` indices. Phase 2 runs
        ``total_batches - i`` further rounds, where ``i`` is the number of
        batches emitted in phase 1; a round that finds its bucket short only
        refills it and emits nothing.
        """
        i = 0
        for idx in self.sampler:
            data_info = self.dataset.get_data_info(idx)
            height, width = data_info['height'], data_info['width']
            ratio = height / width
            closest_ratio = float(min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio)))
            if closest_ratio not in self.all_available_keys:
                continue
            # record the sample until this bucket has collected ratio_nums[closest_ratio] entries
            if self._aspect_ratio_count[closest_ratio] < self.ratio_nums_gt[closest_ratio]:
                self._aspect_ratio_count[closest_ratio] += 1
                self._aspect_ratio_buckets[closest_ratio].append(idx)
                self.original_buckets[closest_ratio].append(idx)    # Save the original samples for each bucket
            # every key has emitted a batch in this round: start a new round
            if not self.current_available_bucket_keys:
                self.current_available_bucket_keys, self.exhausted_bucket_keys = self.exhausted_bucket_keys, []

            if closest_ratio not in self.current_available_bucket_keys:
                continue
            key = closest_ratio
            bucket = self._aspect_ratio_buckets[key]
            # NOTE(review): samples are still appended while a key is parked, so a bucket
            # may already hold more than batch_size entries when its key becomes available
            # again; the equality test below would then not fire for it -- confirm intent.
            if len(bucket) == self.batch_size:
                yield bucket[:self.batch_size]
                del bucket[:self.batch_size]
                i += 1
                self.exhausted_bucket_keys.append(key)
                self.current_available_bucket_keys.remove(key)

        # phase 2: draw the remaining rounds from randomly chosen eligible buckets
        for _ in range(self.total_batches - i):
            key = choice(self.all_available_keys)
            bucket = self._aspect_ratio_buckets[key]
            if len(bucket) >= self.batch_size:
                yield bucket[:self.batch_size]
                del bucket[:self.batch_size]

                # If a bucket is exhausted
                if not bucket:
                    self._aspect_ratio_buckets[key] = deepcopy(self.original_buckets[key][:])
                    shuffle(self._aspect_ratio_buckets[key])
            else:
                # too few samples left for a full batch: refill and reshuffle, nothing is yielded
                self._aspect_ratio_buckets[key] = deepcopy(self.original_buckets[key][:])
                shuffle(self._aspect_ratio_buckets[key])


================================================
FILE: PixArt-alpha-ToCa/diffusion/utils/dist_utils.py
================================================
"""
This file contains primitives for multi-gpu communication.
This is useful when doing distributed training.
"""
import os
import pickle
import shutil

import gc
import mmcv
import torch
import torch.distributed as dist
from mmcv.runner import get_dist_info


def is_distributed():
    """Return True when ``get_world_size()`` reports more than one process."""
    world_size = get_world_size()
    return world_size > 1


def get_world_size():
    """Return the world size of the default process group.

    Falls back to 1 when ``torch.distributed`` is unavailable or has not been
    initialized.
    """
    if dist.is_available() and dist.is_initialized():
        return dist.get_world_size()
    return 1


def get_rank():
    """Return this process's rank in the default process group.

    Falls back to 0 when ``torch.distributed`` is unavailable or has not been
    initialized.
    """
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    return 0


def get_local_rank():
    """Return the value of the ``LOCAL_RANK`` environment variable as an int.

    Returns 0 when ``torch.distributed`` is unavailable or not initialized;
    a missing ``LOCAL_RANK`` variable also counts as 0.
    """
    if dist.is_available() and dist.is_initialized():
        return int(os.getenv('LOCAL_RANK', 0))
    return 0


def is_master():
    """Return True when ``get_rank()`` is 0."""
    rank = get_rank()
    return rank == 0


def is_local_master():
    """Return True when ``get_local_rank()`` is 0."""
    local_rank = get_local_rank()
    return local_rank == 0


def get_local_proc_group(group_size=8):
    """Return the process group of ``group_size`` consecutive ranks that contains this rank.

    The groups for a given ``group_size`` are created once and cached on the
    function object (attribute ``process_groups``).

    Args:
        group_size (int): number of consecutive ranks per group.

    Returns:
        The process group this rank belongs to, or ``None`` when
        ``group_size`` is 1 or the world size does not exceed ``group_size``.
    """
    world_size = get_world_size()
    if group_size == 1 or world_size <= group_size:
        return None
    assert world_size % group_size == 0, f'world size ({world_size}) should be evenly divided by group size ({group_size}).'

    cache = getattr(get_local_proc_group, 'process_groups', {})
    if group_size not in cache:
        num_groups = dist.get_world_size() // group_size
        # consecutive rank ranges: [0, group_size), [group_size, 2 * group_size), ...
        rank_lists = [list(range(g * group_size, (g + 1) * group_size)) for g in range(num_groups)]
        cache[group_size] = [torch.distributed.new_group(ranks) for ranks in rank_lists]
        get_local_proc_group.process_groups = cache

    group_idx = get_rank() // group_size
    groups = get_local_proc_group.process_groups.get(group_size)
    return groups[group_idx]


def synchronize():
    """
    Barrier across all processes.

    Does nothing when ``torch.distributed`` is unavailable, not initialized,
    or running with a world size of 1.
    """
    if not (dist.is_available() and dist.is_initialized()):
        return
    if dist.get_world_size() != 1:
        dist.barrier()


def all_gather(data):
    """
    Gather an arbitrary picklable object from every rank.

    The object is pickled into a byte tensor on the CUDA device, padded to the
    longest payload among the ranks, exchanged with ``dist.all_gather`` and
    unpickled on the receiving side.

    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank; ``[data]`` when the
        world size is 1.
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    to_device = torch.device("cuda")

    # serialize the payload into a byte tensor
    payload = pickle.dumps(data)
    byte_storage = torch.ByteStorage.from_buffer(payload)
    tensor = torch.ByteTensor(byte_storage).to(to_device)

    # exchange the payload length of every rank
    local_size = torch.LongTensor([tensor.numel()]).to(to_device)
    size_list = [torch.LongTensor([0]).to(to_device) for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # pad the local payload so that every rank contributes max_size bytes
    tensor_list = [torch.ByteTensor(size=(max_size,)).to(to_device) for _ in size_list]
    if local_size != max_size:
        padding = torch.ByteTensor(size=(max_size - local_size,)).to(to_device)
        tensor = torch.cat((tensor, padding), dim=0)
    dist.all_gather(tensor_list, tensor)

    # cut the padding off again and rebuild the objects
    # NOTE: pickle.loads runs on bytes received from the other ranks, which are assumed trusted
    data_list = []
    for size, received in zip(size_list, tensor_list):
        raw = received.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(raw))
    return data_list


def reduce_dict(input_dict, average=True):
    """
    Reduce the values of a dictionary from all processes onto rank 0.

    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Returns:
        A dict with the same fields as ``input_dict`` after reduction. With
        fewer than two processes ``input_dict`` itself is returned untouched.
    """
    world_size = get_world_size()
    if world_size >= 2:
        with torch.no_grad():
            return _extracted_from_reduce_dict_14(input_dict, average, world_size)
    return input_dict


# TODO Rename this here and in `reduce_dict`
def _extracted_from_reduce_dict_14(input_dict, average, world_size):
    """Stack the dict values in sorted-key order, reduce them to rank 0 and rebuild the dict.

    Only on rank 0, and only when ``average`` is set, are the reduced values
    divided by ``world_size``.
    """
    # sorted keys keep the stacking order consistent across processes
    names = sorted(input_dict.keys())
    values = torch.stack([input_dict[name] for name in names], dim=0)
    dist.reduce(values, dst=0)
    if average and dist.get_rank() == 0:
        # only the destination rank holds the accumulated values
        values /= world_size
    return dict(zip(names, values))


def broadcast(data, **kwargs):
    """Broadcast a Python object with ``dist.broadcast_object_list``.

    ``kwargs`` are forwarded unchanged to ``dist.broadcast_object_list``.
    With a world size of 1 ``data`` is returned as is.
    """
    if get_world_size() == 1:
        return data
    holder = [data]
    dist.broadcast_object_list(holder, **kwargs)
    return holder[0]


def all_gather_cpu(result_part, tmpdir=None, collect_by_master=True):
    """Gather per-rank results by dumping them to files in ``tmpdir``.

    Every rank writes ``part_<rank>.pkl`` into ``tmpdir`` (default ``./tmp``);
    the files of all ranks are then loaded in rank order and rank 0 removes
    the directory. Assumes ``tmpdir`` is visible to every rank -- TODO confirm
    for multi-node runs.

    Args:
        result_part: the object this rank contributes.
        tmpdir (str | None): exchange directory, ``./tmp`` when ``None``.
        collect_by_master (bool): when True only rank 0 loads the parts and
            every other rank returns ``None``.

    Returns:
        list | None: the loaded parts ordered by rank, or ``None`` on non-zero
        ranks when ``collect_by_master`` is set.
    """
    rank, world_size = get_dist_info()
    tmpdir = './tmp' if tmpdir is None else tmpdir
    if rank == 0:
        mmcv.mkdir_or_exist(tmpdir)
    synchronize()

    # every rank writes its own part, then waits until all parts are on disk
    mmcv.dump(result_part, os.path.join(tmpdir, f'part_{rank}.pkl'))
    synchronize()
    if collect_by_master and rank != 0:
        return None

    results = [mmcv.load(os.path.join(tmpdir, f'part_{i}.pkl')) for i in range(world_size)]
    if not collect_by_master:
        # all ranks are reading: wait for them before the directory is removed
        synchronize()
    if rank == 0:
        shutil.rmtree(tmpdir)
    return results

def all_gather_tensor(tensor, group_size=None, group=None):
    """Gather ``tensor`` from the ranks of ``group`` into a list.

    Args:
        tensor: the local tensor.
        group_size (int | None): number of entries to gather; the world size
            when ``None``.
        group: process group handed to ``dist.all_gather``.

    Returns:
        list: ``[tensor]`` itself when ``group_size`` is 1, otherwise
        ``group_size`` tensors filled by ``dist.all_gather``.
    """
    if group_size is None:
        group_size = get_world_size()
    if group_size == 1:
        return [tensor]
    gathered = [torch.zeros_like(tensor) for _ in range(group_size)]
    dist.all_gather(gathered, tensor, group=group)
    return gathered


def gather_difflen_tensor(feat, num_samples_list, concat=True, group=None, group_size=None):
    """Gather tensors whose first dimension differs between ranks.

    ``feat`` is zero-padded along dim 0 to ``max(num_samples_list)``, gathered
    with ``all_gather_tensor`` and each gathered entry is cut back to the
    length listed for its rank.

    Args:
        feat: the local tensor.
        num_samples_list: first-dimension length of every rank's tensor.
        concat (bool): concatenate the gathered tensors instead of returning a list.
        group, group_size: forwarded to ``all_gather_tensor``.

    Returns:
        A single concatenated tensor when ``concat`` is set, else a list of
        tensors. With a world size of 1 this is ``feat`` or ``[feat]``.
    """
    if get_world_size() == 1:
        return feat if concat else [feat]

    num_samples, *feat_dim = feat.size()
    # padding to max number of samples
    padded = feat.new_zeros((max(num_samples_list), *feat_dim))
    padded[:num_samples] = feat

    gathered = all_gather_tensor(padded, group=group, group_size=group_size)
    # drop the padding rows of every rank
    for r, num in enumerate(num_samples_list):
        gathered[r] = gathered[r][:num]
    return torch.cat(gathered) if concat else gathered


class GatherLayer(torch.autograd.Function):
    '''Gather tensors from all processes, supporting backward propagation.

    The first dimension of the input may differ between ranks; the per-rank
    lengths are exchanged in ``forward`` and reused in ``backward``.
    '''

    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        # share this rank's first-dimension length with every other rank
        local_count = torch.tensor(input.size(0), dtype=torch.long, device=input.device)
        ctx.num_samples_list = all_gather_tensor(local_count)
        gathered = gather_difflen_tensor(input, ctx.num_samples_list, concat=False)
        return tuple(gathered)

    @staticmethod
    def backward(ctx, *grads):  # one gradient per tensor returned by forward
        input, = ctx.saved_tensors
        counts = ctx.num_samples_list
        rank = get_rank()
        # rows [start, end) of the concatenated gradient belong to this rank
        start = sum(counts[:rank])
        end = sum(counts[:rank + 1])
        grads = torch.cat(grads)
        if is_distributed():
            dist.all_reduce(grads)
        grad_out = torch.zeros_like(input)
        grad_out[:] = grads[start:end]
        return grad_out, None, None


class GatherLayerWithGroup(torch.autograd.Function):
    '''Gather equally shaped tensors from the ranks of a process group,
    supporting backward propagation.
    '''

    @staticmethod
    def forward(ctx, input, group, group_size):
        ctx.save_for_backward(input)
        ctx.group_size = group_size
        gathered = all_gather_tensor(input, group=group, group_size=group_size)
        return tuple(gathered)

    @staticmethod
    def backward(ctx, *grads):  # one gradient per tensor returned by forward
        input, = ctx.saved_tensors
        stacked = torch.stack(grads)
        if is_distributed():
            # NOTE(review): reduces over the default group, not the group used in forward -- confirm
            dist.all_reduce(stacked)
        grad_out = torch.zeros_like(input)
        # this rank's slot inside its group
        grad_out[:] = stacked[get_rank() % ctx.group_size]
        return grad_out, None, None


def gather_layer_with_group(data, group=None, group_size=None):
    """Differentiable all-gather of ``data`` within a process group.

    Args:
        data: the local tensor.
        group: process group to gather in, passed on to ``GatherLayerWithGroup``.
        group_size (int | None): number of ranks in ``group``; the world size
            when ``None``.

    Returns:
        tuple: the tensors gathered by ``GatherLayerWithGroup``.
    """
    if group_size is None:
        group_size = get_world_size()
    # GatherLayer.forward accepts a single input, so passing (data, group,
    # group_size) to it raises a TypeError; the group-aware function is the
    # one whose forward takes these three arguments.
    return GatherLayerWithGroup.apply(data, group, group_size)

from typing import Union
import math
# from torch.distributed.fsdp.fully_sharded_data_parallel import TrainingState_, _calc_grad_norm

@torch.no_grad()
def clip_grad_norm_(
    self, max_norm: Union[float, int], norm_type: Union[float, int] = 2.0
) -> torch.Tensor:
    """Clip the gradients of ``self.params_with_grad`` to a total norm of ``max_norm``.

    Written as a method: ``self`` must provide ``_lazy_init``,
    ``_wait_for_previous_optim_step``, ``_is_root``, ``_assert_state``,
    ``params_with_grad`` and ``process_group`` (presumably an FSDP root
    module -- TODO confirm against the caller).

    NOTE(review): ``TrainingState_`` and ``_calc_grad_norm`` are referenced
    below, but the only import for them in this file is commented out, so
    calling this function as-is would raise ``NameError`` unless the names
    are injected elsewhere -- confirm.

    Args:
        max_norm: maximum allowed norm of the gradients.
        norm_type: order of the norm; ``math.inf`` selects the max norm.

    Returns:
        The total gradient norm across workers, computed before clipping.
    """
    self._lazy_init()
    self._wait_for_previous_optim_step()
    assert self._is_root, "clip_grad_norm should only be called on the root (parent) instance"
    self._assert_state(TrainingState_.IDLE)

    max_norm = float(max_norm)
    norm_type = float(norm_type)
    # Computes the max norm for this shard's gradients and sync's across workers
    local_norm = _calc_grad_norm(self.params_with_grad, norm_type).cuda()  # type: ignore[arg-type]
    if norm_type == math.inf:
        # max norm: the global value is the maximum of the per-shard values
        total_norm = local_norm
        dist.all_reduce(total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group)
    else:
        # p-norm: sum the p-th powers across workers, then take the p-th root
        total_norm = local_norm ** norm_type
        dist.all_reduce(total_norm, group=self.process_group)
        total_norm = total_norm ** (1.0 / norm_type)

    # the 1e-6 keeps the division finite when the total norm is zero
    clip_coef = torch.tensor(max_norm, dtype=total_norm.dtype, device=total_norm.device) / (total_norm + 1e-6)
    if clip_coef < 1:
        # multiply by clip_coef, aka, (max_norm/total_norm).
        for p in self.params_with_grad:
            assert p.grad is not None
            p.grad.detach().mul_(clip_coef.to(p.grad.device))
    return total_norm


def flush():
    """Run the Python garbage collector, then call ``torch.cuda.empty_cache()``."""
    # collect first so that memory freed by the collector can be released by the cache flush
    gc.collect()
    torch.cuda.empty_cache()


================================================
FILE: PixArt-alpha-ToCa/diffusion/utils/logger.py
================================================
import logging
import os
import torch.distributed as dist
from datetime import datetime
from .dist_utils import is_local_master
from mmcv.utils.logging import logger_initialized


def get_root_logger(log_file=None, log_level=logging.INFO, name='PixArt'):
    """Get root logger.

    Args:
        log_file (str, optional): File path of log. When ``None`` the
            platform's null device is used, so file output is discarded.
            Defaults to None.
        log_level (int, optional): The level of logger.
            Defaults to logging.INFO.
        name (str): logger name
    Returns:
        :obj:`logging.Logger`: The obtained logger
    """
    if log_file is None:
        # os.devnull is '/dev/null' on POSIX and 'nul' on Windows, which keeps
        # the previous behaviour on POSIX while also working elsewhere
        log_file = os.devnull
    return get_logger(name=name, log_file=log_file, log_level=log_level)


def get_logger(name, log_file=None, log_level=logging.INFO):
    """Initialize and get a logger by name.

    A logger whose name is already registered in ``logger_initialized``, or
    whose name starts with a registered name, is returned without further
    setup. Otherwise a StreamHandler is attached, plus a FileHandler (opened
    in write mode) when ``log_file`` is given and the process rank is 0.

    Args:
        name (str): Logger name.
        log_file (str | None): The log filename. If specified, a FileHandler
            will be added to the logger on rank 0.
        log_level (int): Level applied to the handlers. The logger itself gets
            this level only on processes for which ``is_local_master()`` is
            true; all other processes are set to ``logging.ERROR``.

    Returns:
        logging.Logger: The expected logger.
    """
    logger = logging.getLogger(name)
    logger.propagate = False  # disable root logger to avoid duplicate logging

    # Already set up, either directly or through a registered name prefix
    # (e.g. "a" is initialized, so "a.b" is skipped).
    if name in logger_initialized or any(name.startswith(known) for known in logger_initialized):
        return logger

    handlers = [logging.StreamHandler()]

    rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
    # only rank 0 will add a FileHandler
    if log_file is not None and rank == 0:
        handlers.append(logging.FileHandler(log_file, 'w'))

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    for handler in handlers:
        handler.setFormatter(formatter)
        handler.setLevel(log_level)
        logger.addHandler(handler)

    # only the local master of each node prints below ERROR level
    logger.setLevel(log_level if is_local_master() else logging.ERROR)

    logger_initialized[name] = True
    return logger

def rename_file_with_creation_time(file_path):
    """Rename a file by appending its ``os.path.getctime`` timestamp to the base name.

    ``dir/name.ext`` becomes ``dir/name_YYYY-mm-dd_HH-MM-SS.ext``; the new
    path is printed.

    Args:
        file_path (str): path of the file to rename.
    """
    # Timestamp of the file as reported by os.path.getctime
    stamp = datetime.fromtimestamp(os.path.getctime(file_path)).strftime('%Y-%m-%d_%H-%M-%S')

    # Build the new file name next to the original one
    dir_name, file_name = os.path.split(file_path)
    name, ext = os.path.splitext(file_name)
    new_file_path = os.path.join(dir_name, f"{name}_{stamp}{ext}")

    # Rename the file
    os.rename(file_path, new_file_path)
    print(f"File renamed to: {new_file_path}")


================================================
FILE: PixArt-alpha-ToCa/diffusion/utils/lr_scheduler.py
================================================
from diffusers import get_cosine_schedule_with_warmup, get_constant_schedule_with_warmup
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR
import math

from diffusion.utils.logger import get_root_logger


def build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio):
    """Build the learning-rate scheduler selected by ``config.lr_schedule``.

    Supported values are ``'cosine'``, ``'constant'`` and
    ``'cosine_decay_to_constant'``. ``config.lr_schedule_args`` is passed to
    the scheduler factory as keyword arguments; the cosine variants also
    receive ``len(train_dataloader) * config.num_epochs`` as the number of
    training steps.

    Args:
        config: configuration object, read through ``.get`` and attribute access.
        optimizer: the optimizer to schedule.
        train_dataloader: only its length is used.
        lr_scale_ratio: must be >= 1 for ``'cosine_decay_to_constant'``, where
            ``1 / lr_scale_ratio`` becomes the final lr factor.

    Returns:
        The constructed scheduler.

    Raises:
        RuntimeError: if ``config.lr_schedule`` is not one of the supported values.
    """
    if not config.get('lr_schedule_args', None):
        config.lr_schedule_args = {}
    if config.get('lr_warmup_steps', None):
        # for compatibility with old version
        # NOTE(review): the value is stored on the top-level config, not in lr_schedule_args -- confirm
        config['num_warmup_steps'] = config.get('lr_warmup_steps')

    logger = get_root_logger()
    args_text = ",".join(
        [f"{key}:{value}" for key, value in config.lr_schedule_args.items()])
    logger.info(f'Lr schedule: {config.lr_schedule}, ' + args_text + '.')

    if config.lr_schedule == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer=optimizer,
            **config.lr_schedule_args,
            num_training_steps=(len(train_dataloader) * config.num_epochs),
        )
    if config.lr_schedule == 'constant':
        return get_constant_schedule_with_warmup(
            optimizer=optimizer,
            **config.lr_schedule_args,
        )
    if config.lr_schedule == 'cosine_decay_to_constant':
        assert lr_scale_ratio >= 1
        return get_cosine_decay_to_constant_with_warmup(
            optimizer=optimizer,
            **config.lr_schedule_args,
            final_lr=1 / lr_scale_ratio,
            num_training_steps=(len(train_dataloader) * config.num_epochs),
        )
    raise RuntimeError(f'Unrecognized lr schedule {config.lr_schedule}.')


def get_cosine_decay_to_constant_with_warmup(optimizer: Optimizer,
                                             num_warmup_steps: int,
                                             num_training_steps: int,
                                             final_lr: float = 0.0,
                                             num_decay: float = 0.667,
                                             num_cycles: float = 0.5,
                                             last_epoch: int = -1
                                             ):
    """
    Create a schedule with a linear warmup, a cosine decay and a constant tail.

    The returned multiplier rises linearly from 0 to 1 during the warmup,
    follows a cosine curve from 1 towards ``final_lr`` until step
    ``int(num_training_steps * num_decay)`` and stays at ``final_lr`` afterwards.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The number of total training steps.
        final_lr (`float`):
            The constant lr multiplier used after the cosine decay.
        num_decay (`float`):
            Fraction of `num_training_steps` at which the cosine decay ends.
        num_cycles (`float`):
            Number of cosine waves covered by the decay phase; 0.5 is a half wave.
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    def lr_lambda(current_step):
        # phase 1: linear warmup
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))

        # phase 3: constant tail once the decay window has passed
        num_decay_steps = int(num_training_steps * num_decay)
        if current_step > num_decay_steps:
            return final_lr

        # phase 2: cosine decay, rescaled so that it ends at final_lr
        decay_span = float(max(1, num_decay_steps - num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / decay_span
        cosine = 0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress))
        return (max(0.0, cosine) * (1 - final_lr)) + final_lr

    return LambdaLR(optimizer, lr_lambda, last_epoch)


================================================
FILE: PixArt-alpha-ToCa/diffusion/utils/misc.py
================================================
import collections
import datetime
import os
import random
import subprocess
import time
from multiprocessing import JoinableQueue, Process

import numpy as np
import torch
import torch.distributed as dist
from mmcv import Config
from mmcv.runner import get_dist_info

from diffusion.utils.logger import get_root_logger

os.environ["MOX_SILENT_MODE"] = "1"  # mute moxing log


def read_config(file):
    """Load a config with ``Config.fromfile``, retrying while the result is empty.

    An empty result is retried every 0.1 s; this works around config loading
    conflicts between multiple processes.
    """
    import time
    config = Config.fromfile(file)
    while len(config) == 0:
        time.sleep(0.1)
        config = Config.fromfile(file)
    return config


def init_random_seed(seed=None, device='cuda'):
    """Initialize random seed.

    A given seed is returned unchanged. Otherwise a seed is drawn at random
    and, when more than one process is running, the value drawn on rank 0 is
    broadcast so that every process ends up with the same seed.

    Args:
        seed (int, Optional): The seed. Default to None.
        device (str): The device where the seed will be put on.
            Default to 'cuda'.

    Returns:
        int: Seed to be used.
    """
    if seed is not None:
        return seed

    # All ranks must share one seed; see
    # https://github.com/open-mmlab/mmdetection/issues/6339
    rank, world_size = get_dist_info()
    seed = np.random.randint(2 ** 31)
    if world_size == 1:
        return seed

    # rank 0 contributes its seed, the other ranks a placeholder that is overwritten
    payload = seed if rank == 0 else 0
    random_num = torch.tensor(payload, dtype=torch.int32, device=device)
    dist.broadcast(random_num, src=0)
    return random_num.item()


def set_random_seed(seed, deterministic=False):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        seed (int): Seed to be used.
        deterministic (bool): Whether to set the deterministic option for
            CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
            to True and `torch.backends.cudnn.benchmark` to False.
            Default: False.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for seeder in seeders:
        seeder(seed)
    if not deterministic:
        return
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

class SimpleTimer:
    """Progress logger that reports elapsed time, average time per task and an ETA.

    Args:
        num_tasks (int): total number of tasks expected.
        log_interval (int): a message is written every ``log_interval`` calls
            of ``log`` and on the call that reaches ``num_tasks``.
        desc (str): prefix of every log message.
    """

    def __init__(self, num_tasks, log_interval=1, desc="Process"):
        self.num_tasks = num_tasks
        self.desc = desc
        self.count = 0
        self.log_interval = log_interval
        self.start_time = time.time()
        self.logger = get_root_logger()

    def log(self):
        """Count one finished task and write a progress line when one is due."""
        self.count += 1
        on_interval = (self.count % self.log_interval) == 0
        if not on_interval and self.count != self.num_tasks:
            return

        time_elapsed = time.time() - self.start_time
        avg_time = time_elapsed / self.count
        remaining = self.num_tasks - self.count
        eta_str = str(datetime.timedelta(seconds=int(avg_time * remaining)))
        elapsed_str = str(datetime.timedelta(seconds=int(time_elapsed)))
        self.logger.info(
            f"{self.desc} [{self.count}/{self.num_tasks}], elapsed_time:{elapsed_str},"
            f" avg_time: {avg_time}, eta: {eta_str}."
        )


class DebugUnderflowOverflow:
    """
    This debug class helps detect and understand where the model starts getting very large or very small, and more
    importantly `nan` or `inf` weight and activation elements.
    There are 2 working modes:
    1. Underflow/overflow detection (default)
    2. Specific batch absolute min/max tracing without detection
    Mode 1: Underflow/overflow detection
    To activate the underflow/overflow detection, initialize the object with the model :
    ```python
    debug_overflow = DebugUnderflowOverflow(model)
    ```
    then run the training as normal and if `nan` or `inf` gets detected in at least one of the weight, input or
    output elements this module will throw an exception and will print `max_frames_to_save` frames that lead to this
    event, each frame reporting
    1. the fully qualified module name plus the class name whose `forward` was run
    2. the absolute min and max value of all elements for each module weights, and the inputs and output
    For example, here is the header and the last few frames in detection report for `google/mt5-small` run in fp16 mixed precision :
    ```
    Detected inf/nan during batch_number=0
    Last 21 forward frames:
    abs min  abs max  metadata
    [...]
                      encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
    2.17e-07 4.50e+00 weight
    1.79e-06 4.65e+00 input[0]
    2.68e-06 3.70e+01 output
                      encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
    8.08e-07 2.66e+01 weight
    1.79e-06 4.65e+00 input[0]
    1.27e-04 2.37e+02 output
                      encoder.block.2.layer.1.DenseReluDense.wo Linear
    1.01e-06 6.44e+00 weight
    0.00e+00 9.74e+03 input[0]
    3.18e-04 6.27e+04 output
                      encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
    1.79e-06 4.65e+00 input[0]
    3.18e-04 6.27e+04 output
                      encoder.block.2.layer.1.dropout Dropout
    3.18e-04 6.27e+04 input[0]
    0.00e+00      inf output
    ```
    You can see here, that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value
    was around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout` which
    renormalizes the weights, after it zeroed some of the elements, which pushes the absolute max value to more than
    64K, and we get an overflow.
    As you can see it's the previous frames that we need to look into when the numbers start going into very large for
    fp16 numbers.
    The tracking is done in a forward hook, which gets invoked immediately after `forward` has completed.
    By default the last 21 frames are printed. You can change the default to adjust for your needs. For example :
    ```python
    debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
    ```
    To validate that you have set up this debugging feature correctly, and you intend to use it in a training that may
    take hours to complete, first run it with normal tracing enabled for one of a few batches as explained in the next
    section.
    Mode 2. Specific batch absolute min/max tracing without detection
    The second work mode is per-batch tracing with the underflow/overflow detection feature turned off.
    Let's say you want to watch the absolute min and max values for all the ingredients of each `forward` call of a
    given batch, and only do that for batches 1 and 3. Then you instantiate this class as :
    ```python
    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
    ```
    And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed.
    This is helpful if you know that the program starts misbehaving after a certain batch number, so you can
    fast-forward right to that area.
    Early stopping:
    You can also specify the batch number after which to stop the training, with :
    ```python
    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)
    ```
    This feature is mainly useful in the tracing mode, but you can use it for any mode.
    **Performance**:
    As this module measures absolute `min`/`max` of each weight of the model on every forward it'll slow the
    training down. Therefore remember to turn it off once the debugging needs have been met.
    Args:
        model (`nn.Module`):
            The model to debug.
        max_frames_to_save (`int`, *optional*, defaults to 21):
            How many frames back to record
        trace_batch_nums(`List[int]`, *optional*, defaults to `[]`):
            Which batch numbers to trace (turns detection off)
        abort_after_batch_num  (`int`, *optional*):
            Whether to abort after a certain batch number has finished
    """

    def __init__(self, model, max_frames_to_save=21, trace_batch_nums=None, abort_after_batch_num=None):
        if trace_batch_nums is None:
            trace_batch_nums = []
        self.model = model
        self.trace_batch_nums = trace_batch_nums
        self.abort_after_batch_num = abort_after_batch_num

        # bounded buffer holding the most recent frames; it is dumped as soon as inf/nan is
        # encountered to give context to the problem emergence
        self.frames = collections.deque([], max_frames_to_save)
        self.frame = []
        self.batch_number = 0
        self.total_calls = 0
        self.detected_overflow = False
        self.prefix = "                 "

        self.analyse_model()

        self.register_forward_hook()

    def save_frame(self, frame=None):
        """Close the frame under construction (after appending ``frame``, if given) and store it."""
        if frame is not None:
            self.expand_frame(frame)
        self.frames.append("\n".join(self.frame))
        self.frame = []  # start a new frame

    def expand_frame(self, line):
        """Append one line to the frame under construction."""
        self.frame.append(line)

    def trace_frames(self):
        """Print all saved frames and empty the buffer."""
        print("\n".join(self.frames))
        # clear in place so the buffer stays a deque bounded by max_frames_to_save
        self.frames.clear()

    def reset_saved_frames(self):
        """Discard all saved frames."""
        # clear in place so the buffer stays a deque bounded by max_frames_to_save
        self.frames.clear()

    def dump_saved_frames(self):
        """Print the detection report (header plus all saved frames) and empty the buffer."""
        print(
            f"\nDetected inf/nan during batch_number={self.batch_number}\n"
            f"Last {len(self.frames)} forward frames:\n"
            f"{'abs min':8} {'abs max':8} metadata\n"
            + "\n".join(self.frames)
            + "\n\n"
        )
        # clear in place so the buffer stays a deque bounded by max_frames_to_save
        self.frames.clear()

    def analyse_model(self):
        """Map every sub-module to its fully qualified name for use in the reports."""
        # extract the fully qualified module names, to be able to report at run time. e.g.:
        # encoder.block.2.layer.0.SelfAttention.o
        #
        # for shared weights only the first shared module name will be registered
        self.module_names = {m: name for name, m in self.model.named_modules()}
        # self.longest_module_name = max(len(v) for v in self.module_names.values())

    def analyse_variable(self, var, ctx):
        """Add a report line for ``var`` to the current frame and flag inf/nan/overflow in tensors."""
        if torch.is_tensor(var):
            self.expand_frame(self.get_abs_min_max(var, ctx))
            if self.detect_overflow(var, ctx):
                self.detected_overflow = True
        elif var is None:
            self.expand_frame(f"{'None':>17} {ctx}")
        else:
            self.expand_frame(f"{'not a tensor':>17} {ctx}")

    def batch_start_frame(self):
        """Add the batch header lines to the current frame."""
        self.expand_frame(f"\n\n{self.prefix} *** Starting batch number={self.batch_number} ***")
        self.expand_frame(f"{'abs min':8} {'abs max':8} metadata")

    def batch_end_frame(self):
        """Add the batch footer line to the current frame."""
        self.expand_frame(f"{self.prefix} *** Finished batch number={self.batch_number - 1} ***\n\n")

    def create_frame(self, module, input, output):
        """Build and save the frame for one ``forward`` call of ``module``."""
        self.expand_frame(f"{self.prefix} {self.module_names[module]} {module.__class__.__name__}")

        # params
        for name, p in module.named_parameters(recurse=False):
            self.analyse_variable(p, name)

        # inputs
        if isinstance(input, tuple):
            for i, x in enumerate(input):
                self.analyse_variable(x, f"input[{i}]")
        else:
            self.analyse_variable(input, "input")

        # outputs
        if isinstance(output, tuple):
            for i, x in enumerate(output):
                # possibly a tuple of tuples
                if isinstance(x, tuple):
                    for j, y in enumerate(x):
                        self.analyse_variable(y, f"output[{i}][{j}]")
                else:
                    self.analyse_variable(x, f"output[{i}]")
        else:
            self.analyse_variable(output, "output")

        self.save_frame()

    def register_forward_hook(self):
        """Attach ``forward_hook`` to the model and all of its sub-modules."""
        self.model.apply(self._register_forward_hook)

    def _register_forward_hook(self, module):
        module.register_forward_hook(self.forward_hook)

    def forward_hook(self, module, input, output):
        """Record a frame for ``module`` and raise ``ValueError`` on overflow or requested abort."""
        # - input is a tuple of packed inputs (could be non-Tensors)
        # - output could be a Tensor or a tuple of Tensors and non-Tensors

        last_frame_of_batch = False

        trace_mode = self.batch_number in self.trace_batch_nums
        if trace_mode:
            self.reset_saved_frames()

        if self.total_calls == 0:
            self.batch_start_frame()
        self.total_calls += 1

        # count batch numbers - the very first forward hook of the batch will be called when the
        # batch completes - i.e. it gets called very last - we know this batch has finished
        if module == self.model:
            self.batch_number += 1
            last_frame_of_batch = True

        self.create_frame(module, input, output)

        # if last_frame_of_batch:
        #     self.batch_end_frame()

        if trace_mode:
            self.trace_frames()

        if last_frame_of_batch:
            self.batch_start_frame()

        if self.detected_overflow and not trace_mode:
            self.dump_saved_frames()

            # now we can abort, as it's pointless to continue running
            raise ValueError(
                "DebugUnderflowOverflow: inf/nan detected, aborting as there is no point running further. "
                "Please scroll up above this traceback to see the activation values prior to this event."
            )

        # abort after certain batch if requested to do so
        if self.abort_after_batch_num is not None and self.batch_number > self.abort_after_batch_num:
            raise ValueError(
                f"DebugUnderflowOverflow: aborting after {self.batch_number} batches due to `abort_after_batch_num={self.abort_after_batch_num}` arg"
            )

    @staticmethod
    def get_abs_min_max(var, ctx):
        """Format the absolute min and max of ``var`` followed by its label ``ctx``."""
        abs_var = var.abs()
        return f"{abs_var.min():8.2e} {abs_var.max():8.2e} {ctx}"

    @staticmethod
    def detect_overflow(var, ctx):
        """
        Report whether the tensor contains any `nan` or `inf` entries, or, for float32 tensors, any entry whose
        absolute value is at least 65535.
        This is useful for detecting overflows/underflows and best to call right after the function that did some math that
        modified the tensor in question.
        Args:
            var: the tensor variable to check
            ctx: the message to print as a context
        Return:
            `True` if one of the conditions above was detected, `False` otherwise
        """
        detected = False
        if torch.isnan(var).any().item():
            detected = True
            print(f"{ctx} has nans")
        if torch.isinf(var).any().item():
            detected = True
            print(f"{ctx} has infs")
        if var.dtype == torch.float32 and torch.ge(var.abs(), 65535).any().item():
            detected = True
            print(f"{ctx} has overflow values {var.abs().max().item()}.")
        return detected


================================================
FILE: PixArt-alpha-ToCa/diffusion/utils/optimizer.py
================================================
import math

from mmcv import Config
from mmcv.runner import build_optimizer as mm_build_optimizer, OPTIMIZER_BUILDERS, DefaultOptimizerConstructor, \
    OPTIMIZERS
from mmcv.utils import _BatchNorm, _InstanceNorm
from torch.nn import GroupNorm, LayerNorm

from .logger import get_root_logger

from typing import Tuple, Optional, Callable

import torch
from torch.optim.optimizer import Optimizer


def auto_scale_lr(effective_bs, optimizer_cfg, rule='linear', base_batch_size=256):
    """Rescale ``optimizer_cfg['lr']`` in place according to the effective batch size.

    Args:
        effective_bs: the effective batch size the learning rate is adapted to.
        optimizer_cfg: mapping with an ``'lr'`` entry; the entry is multiplied
            by the computed ratio.
        rule (str): ``'linear'`` scales by ``effective_bs / base_batch_size``,
            ``'sqrt'`` scales by the square root of that quotient.
        base_batch_size: the batch size the configured lr refers to.

    Returns:
        The ratio the learning rate was multiplied by.
    """
    assert rule in ['linear', 'sqrt']
    logger = get_root_logger()
    # scale by world size
    batch_ratio = effective_bs / base_batch_size
    if rule == 'linear':
        scale_ratio = batch_ratio
    elif rule == 'sqrt':
        scale_ratio = math.sqrt(batch_ratio)
    optimizer_cfg['lr'] *= scale_ratio
    scaled_lr = optimizer_cfg["lr"]
    logger.info(f'Automatically adapt lr to {scaled_lr:.7f} (using {rule} scaling rule).')
    return scale_ratio


@OPTIMIZER_BUILDERS.register_module()
class MyOptimizerConstructor(DefaultOptimizerConstructor):
    """Optimizer constructor that emits one param group per parameter.

    Per-parameter learning rate / weight decay are derived from
    ``paramwise_cfg`` (``bias_lr_mult``, ``bias_decay_mult``,
    ``norm_decay_mult``, ``bypass_duplicate`` and ``custom_keys``).
    A ``custom_keys`` entry is either a plain string or a
    ``(scope, key_name)`` tuple; the tuple form only applies to parameters
    whose module prefix contains ``scope``.
    """

    def add_params(self, params, module, prefix='', is_dcn_module=None):
        """Add all parameters of module to the params list.

        The parameters of the given module will be added to the list of param
        groups, with specific rules defined by paramwise_cfg. Child modules
        are handled by recursion.

        Args:
            params (list[dict]): A list of param groups, it will be modified
                in place.
            module (nn.Module): The module to be added.
            prefix (str): The prefix of the module
            is_dcn_module: When truthy, the bias lr / bias decay multipliers
                are not applied to ``bias`` parameters. The value is passed
                unchanged to every recursive call.

        """
        # get param-wise options
        custom_keys = self.paramwise_cfg.get('custom_keys', {})
        # NOTE: keys are visited in the mapping's own order and every matching
        # key is applied (no early exit), so a later match can override the
        # values set by an earlier one.

        bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', 1.)
        bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', 1.)
        norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', 1.)
        bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False)

        # special rules for norm layers and depth-wise conv layers
        is_norm = isinstance(module,
                             (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm))

        for name, param in module.named_parameters(recurse=False):
            base_lr = self.base_lr
            if name == 'bias' and not is_norm and not is_dcn_module:
                base_lr *= bias_lr_mult

            # apply weight decay policies
            base_wd = self.base_wd
            if is_norm:
                # norm decay
                if self.base_wd is not None:
                    base_wd *= norm_decay_mult
            elif name == 'bias' and not is_dcn_module:
                if self.base_wd is not None:
                    # TODO: current bias_decay_mult will have affect on DCN
                    base_wd *= bias_decay_mult

            param_group = {'params': [param]}
            if not param.requires_grad:
                # Frozen parameters still get their own group, tagged with
                # `requires_grad=False` and without any lr / weight decay
                # override.
                param_group['requires_grad'] = False
                params.append(param_group)
                continue
            if bypass_duplicate and self._is_in(param_group, params):
                logger = get_root_logger()
                # `Logger.warn` is a deprecated alias of `Logger.warning`.
                logger.warning(f'{prefix} is duplicate. It is skipped since '
                               f'bypass_duplicate={bypass_duplicate}')
                continue
            # if the parameter match one of the custom keys, ignore other rules
            is_custom = False
            for key in custom_keys:
                # A tuple key restricts the rule to modules whose prefix
                # contains `scope`; a plain key applies everywhere.
                scope, key_name = key if isinstance(key, tuple) else (None, key)
                if scope is not None and scope not in f'{prefix}':
                    continue
                if key_name in f'{prefix}.{name}':
                    is_custom = True
                    if 'lr_mult' in custom_keys[key]:
                        param_group['lr'] = self.base_lr * custom_keys[key]['lr_mult']
                    elif 'lr' not in param_group:
                        # Matched key without `lr_mult`: fall back to the lr
                        # computed above (bias multiplier included).
                        param_group['lr'] = base_lr
                    if self.base_wd is not None:
                        if 'decay_mult' in custom_keys[key]:
                            param_group['weight_decay'] = self.base_wd * custom_keys[key]['decay_mult']
                        elif 'weight_decay' not in param_group:
                            param_group['weight_decay'] = base_wd

            if not is_custom:
                # bias_lr_mult affects all bias parameters
                # except for norm.bias dcn.conv_offset.bias
                if base_lr != self.base_lr:
                    param_group['lr'] = base_lr
                if base_wd != self.base_wd:
                    param_group['weight_decay'] = base_wd
            params.append(param_group)

        for child_name, child_mod in module.named_children():
            child_prefix = f'{prefix}.{child_name}' if prefix else child_name
            self.add_params(
                params,
                child_mod,
                prefix=child_prefix,
                is_dcn_module=is_dcn_module)


def build_optimizer(model, optimizer_cfg):
    """Build the optimizer for ``model`` and log a summary of its param groups.

    Modules exposing a ``zero_weight_decay`` attribute get ``decay_mult=0``
    custom keys (scoped to the module name) for every listed parameter key.
    A ``paramwise_cfg`` already present in ``optimizer_cfg`` is merged on top
    of these defaults.

    Args:
        model: the model to optimize; if it has a ``module`` attribute, that
            attribute is used instead.
        optimizer_cfg: optimizer config mapping. It is modified in place
            (``constructor`` default and ``paramwise_cfg`` are written).

    Returns:
        The optimizer returned by mmcv's ``build_optimizer``.
    """
    logger = get_root_logger()

    # use the wrapped model when the given object exposes one as `.module`
    model = getattr(model, 'module', model)

    # set optimizer constructor
    optimizer_cfg.setdefault('constructor', 'MyOptimizerConstructor')

    # parameter-wise setting: cancel weight decay for some specific modules
    custom_keys = {
        (module_name, key): dict(decay_mult=0)
        for module_name, module in model.named_modules()
        if hasattr(module, 'zero_weight_decay')
        for key in module.zero_weight_decay
    }

    paramwise_cfg = Config(dict(cfg=dict(custom_keys=custom_keys)))
    given_cfg = optimizer_cfg.get('paramwise_cfg')
    if given_cfg:
        paramwise_cfg.merge_from_dict(dict(cfg=given_cfg))
    optimizer_cfg['paramwise_cfg'] = paramwise_cfg.cfg

    # build optimizer
    optimizer = mm_build_optimizer(model, optimizer_cfg)

    # bucket the trainable param groups by lr and by weight decay
    lr_groups, weight_decay_groups = {}, {}
    for group in optimizer.param_groups:
        if group.get('requires_grad', True):
            lr_groups.setdefault(group['lr'], []).append(group)
            weight_decay_groups.setdefault(group['weight_decay'], []).append(group)

    trainable_flags = [p.requires_grad for p in model.parameters()]
    learnable_count = sum(1 for flag in trainable_flags if flag)
    fix_count = len(trainable_flags) - learnable_count

    fix_info = f"{learnable_count} are learnable, {fix_count} are fix"
    lr_parts = (f'{len(group)} params with lr {lr:.5f}' for lr, group in lr_groups.items())
    lr_info = "Lr group: " + ", ".join(lr_parts)
    wd_parts = (f'{len(group)} params with weight decay {wd}' for wd, group in weight_decay_groups.items())
    wd_info = "Weight decay group: " + ", ".join(wd_parts)
    logger.info(f"Optimizer: total {len(optimizer.param_groups)} param groups, {fix_info}. {lr_info}; {wd_info}.")

    return optimizer


@OPTIMIZERS.register_module()
class Lion(Optimizer):
    """Optimizer that steps along the sign of an interpolated momentum.

    For every parameter an exponential moving average of the gradient
    (``exp_avg``) is kept. Each step multiplies the parameter by
    ``1 - lr * weight_decay``, moves it by ``lr`` against the sign of the
    ``beta1`` interpolation between ``exp_avg`` and the gradient, and then
    updates ``exp_avg`` with the ``beta2`` interpolation.

    Args:
        params: iterable of parameters or param-group dicts.
        lr: learning rate, must be positive.
        betas: the two interpolation coefficients, each within ``[0, 1]``.
        weight_decay: weight decay coefficient.
    """

    def __init__(
            self,
            params,
            lr: float = 1e-4,
            betas: Tuple[float, float] = (0.9, 0.99),
            weight_decay: float = 0.0,
    ):
        assert lr > 0.
        assert all(0. <= coeff <= 1. for coeff in betas)

        super().__init__(params, dict(lr=lr, betas=betas, weight_decay=weight_decay))

    @staticmethod
    def update_fn(p, grad, exp_avg, lr, wd, beta1, beta2):
        """Apply one in-place update to ``p`` and to its running average ``exp_avg``."""
        # stepweight decay
        p.data.mul_(1 - lr * wd)

        # weight update: only the sign of the interpolated momentum is used
        direction = exp_avg.clone()
        direction.lerp_(grad, 1 - beta1)
        direction.sign_()
        p.add_(direction, alpha=-lr)

        # decay the momentum running average coefficient
        exp_avg.lerp_(grad, 1 - beta2)

    @staticmethod
    def exists(val):
        """Return ``True`` when ``val`` is not ``None``."""
        return val is not None

    @torch.no_grad()
    def step(
            self,
            closure: Optional[Callable] = None
    ):
        """Perform a single optimization step.

        Args:
            closure: optional callable that re-evaluates the model; it is run
                with gradients enabled and its result is returned.

        Returns:
            The value returned by ``closure``, or ``None`` if none was given.
        """
        loss = None
        if self.exists(closure):
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                # parameters without a gradient are left untouched
                if not self.exists(p.grad):
                    continue

                grad = p.grad
                lr = group['lr']
                wd = group['weight_decay']
                beta1, beta2 = group['betas']
                state = self.state[p]

                # init state - exponential moving average of gradient values
                if not state:
                    state['exp_avg'] = torch.zeros_like(p)

                self.update_fn(p, grad, state['exp_avg'], lr, wd, beta1, beta2)

        return loss


================================================
FILE: PixArt-alpha-ToCa/docker-compose.yml
================================================
# Compose definition for the single `pixart` service.
version: "3.8"
services:
  pixart:
    container_name: pixart
    image: pixart:latest
    # Image is built from the Dockerfile in this directory.
    build:
      context: .
    # host port 12345 -> container port 12345
    ports:
      - 12345:12345
    environment:
      - APP_CONTEXT=1024 # accepted values: 1024, 512, LCM
    tmpfs:
      - /tmp      
    # Bind-mount host cache directories (read-write) into the container.
    volumes:
      - ./docker/cache/gradio:/workspace/gradio_cached_examples/30:rw
      - ./docker/cache/huggingface:/root/.cache/huggingface:rw
    # Reserve the NVIDIA GPU with device id 0 for this service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]


================================================
FILE: PixArt-alpha-ToCa/docker-entrypoint.sh
================================================
#!/usr/bin/env bash
set -Eeuo pipefail

# Start the app selected by APP_CONTEXT (1024, 512 or LCM) and forward all
# script arguments to it. Any other value, including an unset or empty
# variable, falls back to the 1024 app.
#
# `set -u` is active, so APP_CONTEXT must be expanded with an empty default:
# a plain "$APP_CONTEXT" aborts the script with "unbound variable" when the
# variable is not set, and the fallback branch below would never run.
app_context="${APP_CONTEXT:-}"

# Check if APP_CONTEXT matches one of the specific values
if [ "$app_context" = "1024" ]; then
    echo "APP_CONTEXT is 1024"
    /usr/bin/python /workspace/app/app.py "$@"
elif [ "$app_context" = "512" ]; then
    echo "APP_CONTEXT is 512"
    /usr/bin/python /workspace/app/app_512.py "$@"
elif [ "$app_context" = "LCM" ]; then
    echo "APP_CONTEXT is LCM"
    /usr/bin/python /workspace/app/app_lcm.py "$@"
else
    echo "APP_CONTEXT is not set to 1024, 512, or LCM, defaulting to 1024"
    /usr/bin/python /workspace/app/app.py "$@"
fi


================================================
FILE: PixArt-alpha-ToCa/docker-readme.md
================================================


================================================
FILE: PixArt-alpha-ToCa/environment-pixart.yml
================================================
name: pixart
channels:
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - ca-certificates=2024.7.2=h06a4308_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.3=he6710b0_2
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - ncurses=6.4=h6a678d5_0
  - openssl=1.1.1w=h7f8727e_0
  - pip=24.2=py39h06a4308_0
  - python=3.9.0=hdb3f193_2
  - readline=8.2=h5eee18b_0
  - setuptools=72.1.0=py39h06a4308_0
  - sqlite=3.45.3=h5eee18b_0
  - tk=8.6.14=h39e8969_0
  - wheel=0.43.0=py39h06a4308_0
  - xz=5.4.6=h5eee18b_1
  - zlib=1.2.13=h5eee18b_1
  - pip:
    - absl-py==2.1.0
    - accelerate==0.34.0
    - addict==2.4.0
    - aiofiles==23.2.1
    - aiohappyeyeballs==2.4.0
    - aiohttp==3.10.5
    - aiosignal==1.3.1
    - altair==5.4.1
    - annotated-types==0.7.0
    - anyio==4.4.0
    - async-timeout==4.0.3
    - attrs==24.2.0
    - beautifulsoup4==4.12.3
    - bs4==0.0.2
    - certifi==2024.8.30
    - charset-normalizer==3.3.2
    - click==8.1.7
    - coloredlogs==15.0.1
    - contourpy==1.3.0
    - cycler==0.12.1
    - datasets==2.21.0
    - diffusers==0.31.0.dev0
    - dill==0.3.8
    - einops==0.8.0
    - exceptiongroup==1.2.2
    - fastapi==0.112.2
    - ffmpy==0.4.0
    - filelock==3.15.4
    - fonttools==4.53.1
    - frozenlist==1.4.1
    - fsspec==2024.6.1
    - ftfy==6.2.3
    - gradio==4.1.1
    - gradio-client==0.7.0
    - grpcio==1.66.1
    - h11==0.14.0
    - httpcore==1.0.5
    - httpx==0.27.2
    - huggingface-hub==0.24.6
    - humanfriendly==10.0
    - idna==3.8
    - importlib-metadata==8.4.0
    - importlib-resources==6.4.4
    - jinja2==3.1.4
    - jsonschema==4.23.0
    - jsonschema-specifications==2023.12.1
    - kiwisolver==1.4.5
    - markdown==3.7
    - markdown-it-py==3.0.0
    - markupsafe==2.1.5
    - matplotlib==3.9.2
    - mdurl==0.1.2
    - mmcv==1.7.0
    - mpmath==1.3.0
    - multidict==6.0.5
    - multiprocess==0.70.16
    - narwhals==1.6.1
    - networkx==3.2.1
    - numpy==1.26.4
    - nvidia-cublas-cu12==12.1.3.1
    - nvidia-cuda-cupti-cu12==12.1.105
    - nvidia-cuda-nvrtc-cu12==12.1.105
    - nvidia-cuda-runtime-cu12==12.1.105
    - nvidia-cudnn-cu12==9.1.0.70
    - nvidia-cufft-cu12==11.0.2.54
    - nvidia-curand-cu12==10.3.2.106
    - nvidia-cusolver-cu12==11.4.5.107
    - nvidia-cusparse-cu12==12.1.0.106
    - nvidia-nccl-cu12==2.20.5
    - nvidia-nvjitlink-cu12==12.6.68
    - nvidia-nvtx-cu12==12.1.105
    - opencv-python==4.10.0.84
    - optimum==1.21.4
    - orjson==3.10.7
    - packaging==24.1
    - pandas==2.2.2
    - peft==0.6.2
    - pillow==10.4.0
    - platformdirs==4.2.2
    - protobuf==3.20.2
    - psutil==6.0.0
    - pyarrow==17.0.0
    - pydantic==2.8.2
    - pydantic-core==2.20.1
    - pydub==0.25.1
    - pygments==2.18.0
    - pyparsing==3.1.4
    - python-dateutil==2.9.0.post0
    - python-multipart==0.0.9
    - pytorch-fid==0.3.0
    - pytz==2024.1
    - pyyaml==6.0.2
    - referencing==0.35.1
    - regex==2024.7.24
    - requests==2.32.3
    - rich==13.8.0
    - rpds-py==0.20.0
    - safetensors==0.4.4
    - scipy==1.13.1
    - semantic-version==2.10.0
    - sentencepiece==0.1.99
    - shellingham==1.5.4
    - six==1.16.0
    - sniffio==1.3.1
    - soupsieve==2.6
    - starlette==0.38.4
    - sympy==1.13.2
    - tensorboard==2.17.1
    - tensorboard-data-server==0.7.2
    - tensorboardx==2.6.2.2
    - timm==0.6.12
    - tokenizers==0.19.1
    - tomli==2.0.1
    - tomlkit==0.12.0
    - torch==2.4.0
    - torchaudio==2.1.1+cu118
    - torchvision==0.16.1+cu118
    - tqdm==4.66.5
    - transformers==4.43.4
    - triton==3.0.0
    - typer==0.12.5
    - typing-extensions==4.12.2
    - tzdata==2024.1
    - urllib3==2.2.2
    - uvicorn==0.30.6
    - wcwidth==0.2.13
    - websockets==11.0.3
    - werkzeug==3.0.4
    - xformers==0.0.27.post2
    - xxhash==3.5.0
    - yapf==0.40.1
    - yarl==1.9.7
    - zipp==3.20.1
prefix: /root/miniconda3/envs/pixart


================================================
FILE: PixArt-alpha-ToCa/environment.yml
================================================
# Conda environment for PixArt.
name: PixArt
channels:
  - pytorch
  - nvidia
dependencies:
  - python >= 3.8
  - pytorch >= 1.13
  - torchvision
  - pytorch-cuda=11.7
  - pip:
    # Each package is listed once; `accelerate` keeps only its pinned entry.
    - timm==0.6.12
    - diffusers
    - mmcv==1.7.0
    - accelerate==0.15.0
    - tensorboard
    - transformers==4.26.1
    - sentencepiece~=0.1.97
    - ftfy~=6.1.1
    - beautifulsoup4~=4.11.1
    - opencv-python
    - bs4
    - einops
    - xformers

================================================
FILE: PixArt-alpha-ToCa/notebooks/PixArt_xl2_img512_internal_for_pokemon_sample_training.py
================================================
# Training config for PixArt_XL_2 at image size 512 on the data under
# /workspace/pixart-pokemon. Only plain top-level assignments are used.
# NOTE(review): `_base_` presumably makes this file inherit from the listed
# base config (mmcv-style config inheritance) - confirm with the config loader.
_base_ = ['/workspace/PixArt-alpha/configs/PixArt_xl2_internal.py']
data_root = '/workspace'

# Reused below as the `image_list_json` argument of `data`.
image_list_json = ['data_info.json',]

data = dict(type='InternalData', root='/workspace/pixart-pokemon', image_list_json=image_list_json, transform='default_train', load_vae_feat=True)
image_size = 512

# model setting
window_block_indexes = []
window_size=0
use_rel_pos=False
model = 'PixArt_XL_2'
fp32_attention = True
# NOTE(review): `load_from` is an absolute path while `vae_pretrained` is
# relative - confirm the working directory the training script is run from.
load_from = "/workspace/PixArt-alpha/output/pretrained_models/PixArt-XL-2-512x512.pth"
vae_pretrained = "output/pretrained_models/sd-vae-ft-ema"
lewei_scale = 1.0

# training setting
use_fsdp=False   # whether to use FSDP mode
num_workers=10
train_batch_size = 38 # 32
num_epochs = 200 # 3
gradient_accumulation_steps = 1
grad_checkpointing = True
gradient_clip = 0.01
optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10)
lr_schedule_args = dict(num_warmup_steps=1000)

# evaluation / logging / checkpointing
eval_sampling_steps = 200
log_interval = 20
save_model_steps=100
work_dir = 'output/debug'


================================================
FILE: PixArt-alpha-ToCa/notebooks/convert-checkpoint-to-diffusers.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "2878bb5d-33a3-4a5b-b15c-c832c700129b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/workspace/PixArt-alpha\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.10/dist-packages/IPython/core/magics/osm.py:417: UserWarning: using dhist requires you to install the `pickleshare` library.\n",
      "  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
     ]
    }
   ],
   "source": [
    "%cd PixArt-alpha"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "7dd2d98c-3f8f-40f1-a9e1-bc916774afb3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of transformer parameters: 610856096\n"
     ]
    }
   ],
   "source": [
    "!python tools/convert_pixart_alpha_to_diffusers.py \\\n",
    "    --orig_ckpt_path \"/workspace/PixArt-alpha/output/trained_model/checkpoints/epoch_5_step_110.pth\" \\\n",
    "    --dump_path \"/workspace/PixArt-alpha/output/diffusers_trained\" \\\n",
    "    --only_transformer=True \\\n",
    "    --image_size 512 \\\n",
    "    --multi_scale_train=False\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: PixArt-alpha-ToCa/notebooks/infer.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8b2458c4-c461-4ddc-af94-fcd837357da4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from diffusers import PixArtAlphaPipeline\n",
    "import torch\n",
    "from diffusers import Transformer2DModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81a5bc0f-682b-4ff9-92e9-43b68b3df8fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# for comparison\n",
    "\n",
    "orig_pipe = pipe = PixArtAlphaPipeline.from_pretrained(\"PixArt-alpha/PixArt-XL-2-512x512\", torch_dtype=torch.float16)\n",
    "orig_pipe = orig_pipe.to(\"cuda\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efc07821-5479-4ca3-a2c6-114ac484fd1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "transformer = Transformer2DModel.from_pretrained(\"/workspace/PixArt-alpha/output/diffusers_trained/transformer\", torch_dtype=torch.float16)\n",
    "pipe = PixArtAlphaPipeline.from_pretrained(\"PixArt-alpha/PixArt-XL-2-512x512\", torch_dtype=torch.float16, transformer=transformer)\n",
    "pipe = pipe.to(\"cuda\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "57da873b-2c13-463b-b558-ee69522ccefc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d69c7683773c4c25914764800ec1ef4f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/20 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAAEAAElEQVR4nOy9daBlV5E9vKr2Ofc+b3d3i3VHOi7EiZIggcECBA06gcGH38DA4PINMDDIEFyGAEkIcXdvSXfSSbv36+fv3nvO2bvq+2Pvc+/rJMzAIEn6nQVpef3kyjlVtVetWkUuS8lEABEBABQgQBFAKFCgQIEC+x8UQiKSx/4CBQoUKDB8oNwo/AsUKFCgwDACMZSAguopUKBAgWEHLkJ/gQIFCgxP8HP9AAoUKFCgwHODIgEUKFCgwDBFkQAKFChQYJiiSAAFChQoMExRJIACBQoUGKYoEkCBAgUKDFMUCaBAgQIFhimKBFCgQIECwxRFAihQoECBYYoiARQoUKDAMEWRAAoUKFBgmKJIAAUKFCgwTFEkgAIFChQYpigSQIECBQoMUxQJoECBAgWGKYoEUKBAgQLDFEUCKFCgQIFhiiIBFChQoMAwRZEAChQoUGCYokgABQoUKDBMUSSAAgUKFBimKBJAgQIFCgxTFAmgQIECBYYpigRQoECBAsMURQIoUKBAgWGKIgEUKFCgwDBFkQAKFChQYJiiSAAFChQoMExRJIACBQoUGKYoEkCBAgUKDFMUCaBAgQIFhimKBFCgQIECwxRFAihQoECBYYoiARQoUKDAMEWRAAoUKFBgmKJIAAUKFCgwTFEkgAIFChQYpigSQIECBQoMUxQJoECBAgWGKYoEUKBAgQLDFEUCKFCgQIFhiiIBFChQoMAwRZEAChQoUGCYokgABQoUKDBMUSSAAgUKFBimKBJAgQIFCgxTFAmgQIECBYYpigRQoECBAsMURQIoUKBAgWGKIgEUKFCgwDBFkQAKFChQYJiiSAAFChQoMExRJIACBQoUGKYoEkCBAgUKDFMUCaBAgQIFhimKBFCgQIECwxRFAihQoECBYYoiARQoUKDAMEWRAAoUKFBgmKJIAAUKFCgwTFEkgAIFChQYpigSQIECBQoMUxQJoECBAgWGKYoEUKBAgQLDFEUCKFCgQIFhiiIBFChQoMAwRZEAChQoUGCYInquH0CBAgUKPA+hjT8IAALTc/ho/kYgVf3fP6tAgQIF9lMoQM/2N1EoYAElNAGiIAL2+eQXPIoTQIECBYYL9o310PAhVSUFVCCiBBJrs0xqA0lPpbqpd/fDm57q3rNt7sRJ55586qi21v0pBRQJoECBAvsfnkZsqIZArwKCqhAyZzOX9vf1dO7du7e7b9Bmu/r69nR19+zt42yga/fmWn//rv6eQU339Fb7Ogc62kfA4LVnn0P7Udjcf55JgQIFhjfygh4kgKooAFUnNsuy6uBgd0/Ptm1bN2zcsHHLpo1bt23Zvmlv556+gZ7UZQmpaY3QwqaJS6Wm5hLHTTqyo600uW1kS/vc1qlpf+2hu7Y0t4wCSEWI9xP5TJEAChQo8AKEQmnInwFRslaVaGAwHaj0Pf7kkw89/PDGTRu3bl6/cdP6gVrfYGUwc1lc0lJTXG6KRoxqGTW9deGkRa1t5XJby7gJo5qbS83tho0Kab91lSSt9Gra7TY/unHl3Y+9530fPOekY6HUiP76gmeDigRQoECBFxBU1REZECngrHPOCnSgv7Jl+85HV619dMXK1WtX79q7fc/eXQNJf1tH85hxHR3TW2dOmjd23KhR49rHTRw5esyIEaPbSsZETERaLkeaSRSpMWnV9leT2t7+/oGq7e7q7N5ZefKe9ZtX9x5zyClvvOiNJTI8NOS/wKM/ChVQgQIFnt/QIXpMqAJEA4OVvv6+zTt2PLzy0d9fc9WGDU909XYOpIMwtqM9njl34px50ydPmzxz5pTRoztGj+hobm2Km2LDTKzKKioOGRROkIFt5khSRmq1P+V0Z1ff5h3d6zZ3DQxkj938eLKlrz2b+PtfXzd58sRSOdpPqJ8cxQmgQIECzzmehUxRFSJSkKhasdXqYG9Pz7on
Nq5c+/g999/x6IqHd3Rud2Xb1BRPnDlm+XGLFyybN3nymOmTx4wbPbLJlGImA2YDqCg5qFWCqGikoo5EVGAodgpRMDRTV0mT3X3dj2/etWlb75OPd9kB6u+Uvs3Vb1/+xQmTJzSVo/2A83kaigRQoECB5xyEwOQTAAEECuLe/sH+/sHHnnjy/ofvvfnm6zZveXxwsJdKOnnqmOVnLZy78Pipk8eNGzdy7JgR5XIUl0rMEFUFjJKSgkhYoaRgKCmpA4FUVKBkGE6c1qqsLDEPWGzb23vPg2u27awO9IpJm3t39/XtrLzqNf9w4vEnNJdM/jD3KxQUUIECBZ5bqNZnr4Aks32DfRs2bnpg5SO33nnXmpWP7tq73cXZ6EkjFh884/DDlixaMnPsyPZSOWppiSIiJiFRNhBhJYif2VUV/12ZBKIqpCBDDlBVEmWoQpIUCrGMXZW++x5+5IEHVw70ANoWo61v6+C2NVsnjxj/u1/9Ztq4sRGzKoFoP0sBxQmgQIECf3+oBtUmKUiBWpLu6dq78vHHbrjthkdW3Ldp65ODyUDr6HjBAZPPWHr0vLkzZ86eNnbsqFLMTKIkCqhaYhaAmEUVkQIEgUKIxBCDoHAKgYANFEpONVNjlAxltmRI1CSbNq+/6a5H1z61o6eCOGpvx4jqrqTvib2lbvryVz83efTYiBng/S32AygSQIECBf7O0PArKSgVu2PXjgcffujGm25e+cQjO/t2SimdOH3kWSceu2DJ7FnTp4wZ0dzaFMcRKTlyTo1V578DMZjEsOeNlCBKgJABhJTDWIASgcgX/FaYTBSTqrUZLKjCtfseuOvW2+/evdsqt5fiEca02ordvWVHz57et7z6jUcdsbwUsyoB2C8TQEEBFShQ4O8AH2hIFKKwgj1dvQ+tXX3V9VevXnFXV9fOUoeZs2jqIUvnzZs7ecrUsa1tzaUoVnJEyJw1xsCAlRiqIsxEBCgYjJya0dBEIJCqECmUVAlK6pwzxMRGbAYi50xNaUd351U3X7di9VpDpShqMlmriUeiqjvWbtv51M7D5h/5X9/7/sSxI3zlr1okgL8LRISYaP/rthQoMBwRpnN9/9QquvZ2PbJy9XW33nL3igc2795Q7sCM2aOOOvKQRQfPHj9mdHOM5jLBpVyCOJiYnarhSJVBGpGSgIwBhEFQIWUChWwA+IpfVVmZAKHcAQJECgWpUwvqyir3rlpx1ZU3DVSzUnszwZByE1rKWdve9XvXr9zULM3XXn3t/LmzSsbU4+N+GZKeXxSQAk502/ZNI0eOLDW3C6RsfPcdtJ++AQUK7I8Ikn3f3XWgzr09D65cccNdNzzw8G3b9mxxpWz6/AmvufCQJUvmTJk6tlSKYoqIUyiEQBFbVS5BCaxCbJkNKaCWDMCqqqJg4pzmyaO0+mAPL/0xgBAUFhyJUJpJprx66+bf3njTxk2bI+oodZQQOWthuCWSlsFttd3rerkWf+rz/7Jg7sy4Ti/tp+U/nm8JgAAR96WvfHVX584Js2ZB7bjmltGjJk6eOHvCuI6O1qZRo0e3tLc3tTRzVGIiKAwxQYsTQ4ECzxPkFD9bla6ePSseeeSGW+688+F7N+7ZhLZs8qzW0884cvHSBdMnTRjTRE0RCE6NgwgTAQwlEEOJlQlKZFQ8lwPiiFRViAAiBgjsiR8DUlIV8mx9iNoAlKBMSeoypq29ndfe9sD9j65UF8fxaOayEDmkHEdGSq4f25/auXfDzgsufOkF57w0MqZO+++v0R/PtwQAIIrjKGr6/S23xg+vGDeyxGmXEokZxRxFqRsxcnTTyI6pM+ZMGDNl8qTxk8ZNmDxx8vgx40aP6WgqlcqGGGr25/erQIHnKcSX/YBT9PQOPLhq1W+u/8MD99+ya8cmjbLJc8edeeKSAw6aPX/O5JaO5lLcBJIIIhBwrCpMBKV6s5UNBSJYKPxRCT70hxRDgDLAxAL2qqCIhBnOOWJwzE7UOmQa99rsmlvv
uP3BR/oqKDWNYhdRbKxmqsxoK3PcmsZbHtvQvWXrkvlzPvmJf2lpioZJTfm86wEAevv9d5/76ovi8a3vfv9rJ44t9/d19/bb/sGUUt25q3N3d081zQb3DMKpIVT60vETJrd3jD5o/pIFc+Ysmjdv9pSpLU1lo8ZEjCErHP4qQ3z73SRggQL/Z2j9NwVlont7e1asWn3dzTff+eBdG7evR3M2elLzwQcvPGjZ7FkzpowZ3dZSMqwOsUKZSckz9IaIwAoONxcRMRMxAeoJfvUqTCWi3AKOiKFCALFRJVEQFKpMCkMQdcoOmoHvW73u2lvu2LylS0yzK5UMlwwrGU3VGuJmtHRErf1PdK269aFmW/r+t79/ymmnGq8ZHQb3+vMuASi0rzrwyje86prrrznrlS9+1SVnzl88JWIoIamaLE3Zqh1MBnt6e3s6d3Xt7e6vPPnklt17Bmt9g719yYiWMTMnzVs8d/FBiw+eMnnqrOnT2pqbGAookbfx2P/f1AIF/nbwLpwaSn5NkrSzq+veFQ9fe+cNDz16R2fXTilh3MyOpQdPnztn+txZU8eMHtVaKisrEQkRkXGwzDAKEmGwRqQKhjIpgQle48MMINC7BDDyI0HIAYq890sAMYNIRACyMJGVOKOos7/vR1f8/pHHNqhEXG6OuMTEFJFjcWKZTHNUbkqbsj3JQ1femfTWPv7BD733Xe9taWkaPlHieZcAAHWK6274wwWveQ13YN7S0W+77KULFs8tRcrUUlJuQ8wCFRuVOVHnJKokNklcb9fAtu17t2/vfPyxjbu37HWZtpjmGZNnzJ4ye/7c+fPnz582fUpTqclQCYo8GRQoUOBPhYa4rwKtVCvr1z9193333HnXfWs2rd5T3RuNwuRZHQsXz5q7aObUSePHtpdby3FJoQxWgiEBEUIPl1nrjA+IAK/mAYFZwT6eE+BJfALBEEDgMD8gMACYVaAkkTHOOd8ZIJZUokpk7nz40V//7qbu3jSK25hbyUhsWKHMxrIA2sTlFpRbspa7f3fLnnWdpx570g9/9F9jRo1g5iIBPJcQ6EBl8KJXveq2R+9rmkKzDmx7yctOOuaE5c2mXMqoRQw5iHOGQcgIBiBmFscOVE3TWmb37hnYsmHr5vXbtmzZvXlDV1ePi0yppWX0CceedOKxLzpg8YIxHa0MJuAvyQMFHVRgv4bCx1qwiAooFe3p71255olrb7v5vgfu3bb18cwlLWNaZiyYuPSQ6TNnjZs6bUxrW1McxQpf76sJwZ+IILCRiUmZVH1gBxHXy3gChSKfDIjBDFLyLA9IDUEocLlMnuoheFNosAN5oScyV9rVX/vZddfc/eAKE3WYqMVoZDgiVjKkClWCQYSorKUObnngt3d1bdo7bcyUX/36lwvmzC6VSs/1y/53xfMxAfjAesNNN130ltdjfLk0pjJ6NM55+UknH3/suPKIks2Q2jguEyA2M5EhYYUoEZisE2LORESpWkv7+9LtW/ufeGrnhvVbtu3o2rlld+fO3jHto4496qiLznvFsUcvbzJNTmD4z4vkz/qSFcmgwAsfQfcIv0SRWKAC9PT1PfDAI1ffdMMt9926vXOHRW3UhI4Z88YfcvDCeQdOnzh2REfZNJdAajUSVVb2rL0Rr9MTMUQUEYhIiaAMgIi8Zt//QCYmEJigDDYhI6gQALAyQUL/Vw0BRPBqfwdlow5klBPLa9Zv/8lvr9y4oztuHalc8ttbYsMWwmQESswRTBM3tVHrY7c/uPWBbaWK+eV///KEk48tRfxX7Be+IPB8TAAAFJpZ96a3X/LL269a9KLFlcFOW9p9zOGHXHjmmTMntEWZRKZdRdWRiVidKoGZASWQgh0pEdssI0Jay6ppbe+u3p079nTu6uzeXVm7dtvadbsGu5IpE2e89LyXnXve+TOnTy8B+PPaPsPnIikwXKAQ33n1kp6evv6HVq/672t+c8f9t+3euZFapH1CadaCyYcdPG/evJlT
powul01Eka/QlUhBVjJlIfIjWazgiDiCGmVQpEogZZ8giHJ+Xxnw9I0v9xnEAJHR0BhQCjY/3vPB+OQAFuUMbJyyFTOYuD/ccc81N9+pWYmiDjIxRWAGK4hhQVYdxxKpaZLWjqh94wMbV9+9IuqXz//Lpy5+y5ubm6L9ZtHjn47naQIAVIAn1j1+2NknTjps5oyDp3b1bqr27Jo3c9JFF5x40Mx5lLWRapmNnwtXBVhVwCBRkjAKQoYAsUROGbUkqVQG+7vt9m09nbsq27Z23//AyrWPbWRuPu7QYy5+7T8ce/QxLXErwOYZ10ER7AvsN3jWsSYNJb8SuJpVH3vi8Rtvuem6m25ct+XxNK6NmzVi9pKJixfOnrdoxqSxo9tjilkZouydlSGOlViUvAenQkREFUoUs4mYjTJgWEGGQELeqpMYQM7lgInzHKDsNUFhwktZVUkVTARGJOqiyDhywlaFE+Gtu3v++8obVz65Po5GMrUwxwpFLFAXwTgRjWNicqiWqNTm2qUzueXn96Y99h8uOPtLX/7C6HGj/1waYP/A8zYBANBU7Zve/aaf3vb7A08/pmNSU5p0dnZuHN9uXnrmGUcuPbYVUs6sISGKRFVZCGxgiMhaJWYQiwipMhklIQAMB0lqFZdJb1dl187uDZt23XvX6kcfemr77sG5Mxa++eI3n3/2S8aNbCeVEj/vhiQKFPgrItfzqIAVcNAduzuvuunG313/+yfW3J+kfeOnNh+4dNYhS+dOnzFpwqRR5XLMMEpghkAQqCLx/4FIvde+qoiEwEJgMmw48vemEhmIChERgcIGAOV8nJPJMEBM0GDjQzDQMPzrK3RiIQOnIoatmP6KPvjYE7/5w419g1mp3EJUYjKkzExOPJekKiT+OMHUxk1Ng+aO396x98nKsYce9Y1vfXnRwnmRAcDDsNB7PicAKNy6jU8de/4ZbvqI+csXR+X+xPX29exoYzrzRaedefTy8VwSyTiCiqeAiJVVAZCIkjcVInbWgUBsxDmwEglIVcUqVbOse2+ybvPu2+5a8dA9q9ev2zalddJrX/YPL7vworkzZ0TE/MwGwfC7Sgrsj/A+OVBQxaYPrVjxiyv++/b7b9+xd1v7pPLcxROPOPSAxYunjR8zorVkIiOkqgZQA1JREEhCAzfEeiFAgkZIFaoiECgxQIYjNjFMoHz8DUThRqprMZjYgJly5x6QkgbVkCiYoX5SOFMiJUqVd3QO3HL3w/c9+FhmI4rKbGJlS6REBCElIiZRZWaBEBCTGW06Hrl69aYVG8Y3TfrKV7583stfXGL/fYfjBOnzNgF4BYKz4r7y1S984FufPfrCF5tWrlAnRVVbqbi+yjHLlrzmjLPGjWwXVyMVopiIoaSqXselVuF7S0San3tVVdn5bx9gKMlSRLxh3Z4V9z9++0333f/AU+0ds88682Uvv/C8g5csboWJoA1p2LAYECmwX0I1WPR4WkW2795+w523/OoPP1/1xMpUq9MWjjn+6EWLFsydPXNqa0s5YhYoEzkIgQTifdcg/htQfSRAFQqGqoQjhYpfyaJqQGATGcPEBjDh1iYlsBeCslf2gJW8XghEgDQeNJSZADgVMpEjscoDSbryyY033/Hglq1dbFoMl5hIWYXJc0oMcuzJIyEYoyhxqYxy17qdD165oikrf/gDH3zne99VLoPZDNsb+nmbADxUIT29XfOOPRKzOw477fCB6m6LPlat9iTsBpZMGvPql7x8xuTx0AyAihiKCQAcAApBW5lYc4MSkNe2CYiIFSIwUJeRYUtswV29gw8+uPEP195/650rylH7iceecN6JZ55y7NGjW0fFiKLCla7ACxWaC/mpu6979crHfnP1b29/+I5d/Vtbx8sBh8479JilC2fPmNTR3FwiEhVfITFBTcgbfhQ3mK8FRwbNlzmGk7eGsVwnImEZI4hNxMzMBjDepjnU/hwmu1SM1/4D8AofFcpvXgchAwdh5gym5uzO
7t67H1j90KonBvtBpoWZ/bEeRI4ARaQEgiNVIhhloRJFJRfFA003/Owa7ecjFx9x+c9+OHnSOArzxs/Vm/Ic4/mcAHy0Fqvy7R/+4B2ffN8xFx49aurYfttbS3qNGmROB/pGt7W+8sILDpg7LZKKIYpMOXCSREYZBFXlELUVyiDVoDqAQry/uDFkrcBAGDZLTak0OCAPP/zkT//7xrsfWBNh1EGHHH7hGReee/pZo0vlZnD8HL8yBQr86dCg6lEk1q7fvOUPN//+uhuuW/fkCtNGsxeNWX7kwiMOXzBm/OjmplZlYYGSUyBYbPpObmDsxVvuwys0c0sezVWjCHW7CtR5H2gFEzGzIWImVjAhrOsN/j5QYoWE0V/fOCCFEikxERkr3tOfySn1DqaPb952+wMrt2ztsZZBTUwEY4mUxEAg7CcQBMSh18waU9xkSy22vOKGR3as2zGuffz3vvf9Y45eHoXFAsOX130+JwD4t0XgBqqVE08/abPdccS5x0k7Kmk/skxsBotksFLS7KXnnHLCsiVNMZgMgQwZ9leXejYQAIPqQ+weEsoZFf8hcYCBISg5QFOJdtfsjXev/s2vr1+1el2JRx9/5MmvPu9Vpx+9vIPLJcTDTjJW4IWB3MPYc/xEVnX7rh133//Qb6++6tHV93dXdoybMfLgI+ceeeSyBfOmjGwpR2TBVsEwqg4ASSjOVV3uwqD5xpV8gTtp4OilwQKFB5ALSb0zIzHBMMEPBHiLT38CIN9JJrDPKArJ1aFgQwZQ5VQNRKnmsLu3775HHnt45VPdfRamhVACCaloLMF1GqxMBCESBYtCWQ1zE8otSfuOlZsevv7+Nm7753/71JvfdEkpP33sd4t+/ww8zxOAhzrIjdddc/6bXz7r+IUzDluQIRHNsiRJrZSoVBvod7b39KOWnn3qSSNiNipxXIYLlxSTgYRTJcK8oH+/NdwkTAIhsO8bizLUmrimECuCUqmzUrnr1oevuuquhx7dVG6afvxRL37LK199xOID/FHAPMcvToECQ6EKb4zGSugb7F+5etXlv/vZ3Q/csadz54h2e8CyGUefuGTeolmTJ4yNo5iFPA/vIJ6SV+sMsxD7kE5e9ONrqeCxTEqNbKAgUdH6EcDH9rzCZxAp2Fu3h1+UlPMZXg1LvIg0zA4zkUK9I5BTUsdqlfursm7bnjseefTJp7arlMlvg1eNiCDqWJWUoazkQutYAN9rMCVTbqPWrjUDD137IPUn73rbW//xIx/saGkO7tP7tdvz/4oXRAKAQNM0ueg1r7zq4WuPfcWpLePLVlMn1gE2FXGWYDGwd/HcKS8948wZ48YYUSWYiGDBahRKMI1jnobTHsGLy4acasVrjYWQKWxEBILAOMRd1fSmOx795a9vWvX41nHt0846/ayXnfXSw+cuaAFHMPQXmUoUKPBXQB6RVcAbNm+87sYbf3/z71etecS1JOPnjjjy6EOPOnTO9Mnj2lqjOFbjnLKKMMg4JQDMrF7SoyFaayj8KWQJaH4PkaqKbyoo/J/zEeLcvI0Yub4zjHuF2S9DflQgWDTCE7MUVP8s6iJEIAJlqXMZ0a7eyqNrNz782JNde2ugFoFRODLWwJEQKTuGQAyYIAJSw0rWd5KNiWLXYrronl8/2LO596TlR/7w5z8eN7YjNsY/oeHJ/NTxwkgAvvLYvHXbUaccl46tHnfhGZZrFRkQQ7CSOUdQOJek/RM6Wl9++mkHz5naZOCnxJkiUiIxOfuT+5CoXx7qV8VB4EBEogwoqacyfcMYEKUIStJU6k8rt956389/esOqx7eNGD3nrJPPf/35Fy2ZOceAymAz3C+nAs8BPAMvBAL115J7H3r4J7/+2T333NLVtXHslI7lx8497KgDFy+cO3JUeylWC0tgJTFKAlEvugcF8Sbys7K3XPOGnAjVP8LtQJ7xCb+K39XSeCQgGDYcCnsf1sNMgKogDHP5b8UAWJmhIFU4JsAwSJ1SKjKY
uie2dt636olNm3e4RJRKpExEyhAVUiFiEpLAGoFASuxUNEqIjYEpm5grzY9c+9CutbuXTFv0jW9964hDDzT+KxqK1OGLF0oCgEKc6m9+8YtXvueS5S85euKBEweywUxr1jkTGSsimYhTSapjmnHmMcuPP/zg5sioqiFWq4YiUi9fUKY6W0l+16iSqOeJQkLQkCIAf7QVkGFvSshiSjsHq9ff+vDvfnvjunWbxjZPPe1F55592kVHLVnYQRJTxODhflkV+HtBg0cCtu3a9btrrvnlb3+15slVKNfmHDDx6JMOOPzgA2dMHdfRGhtAyYpYRCzK3n6ZQNrg9sOlHwp58dL+oeyIKogRVKCad8+cOPWObBCtMz9EOfvvg2ygelQlEC6ihpkaCYCJXJgpBgmhkrrdXT1rNmxd8fiWzt6Ks8SI2c+rAQCcF5DmmSUXe7A4QgTiRKGxa22V8lP3bXns7sdGNXV88d++9IqXv7RsfKob7uSPxwsmAQBQSJLUXnfJW//77l+/+I1ncmtUc7VEqmqYQFnmDFitE1dr0+SE5YeefMzyjnJkSNmByUC99hn+EieVMPwRLnu/OVoJ6isjnyoUKl4z6n2sSJQYzGzivqzy4D2P/vq3N9316GbTPO+0Ey98w/nnHDp7XiviyC+tLtJAgb8JNHduIAd5ZNWqy3/5o9tuu2FX7562sbrsqFknHHvAkiXzxo0ZbZggRqBsIOJUlBhKBCHAASKob1jxpEzO86uGYVxP3AfalOqTXKoQVafqVBVB8h/uLCYi5sD/UBBeh+8afjHwds8c+raqIFViq6hmrqdSXb9916rHN2zevivJCABT5HVHnD9QZ3I9h7+Hc0WRH0s2LOWoyVTatq7csvr2lc2u/KF/+sClb3t7SzlmMvvxjt8/Fy+gBBBGwzZt2nLCmSfLBHvEWcfZkh2UAVERhTHGicIpaSbVWiTJsgPnn3XicRM72iImUoGCEEFVxYC8TgEA2Bf8pEJg9aq2QAH5M64fTBc0IrohGBAZI4Z7s+yOFU/+8re33LNi/biWqaccecL5x5x14qGHtiA2iIvLrMBfF+rDNmMwrd1x1z3/9bMf3P3A7VlUmTJr1FHHHXrkcYtnzxw3qpkiFrIsTE5MfbmVaDjmKgAWyfX77G+CoPn0TL9nRvNR3Xz5ih/TUn8sVlXAioqKk9zc3zM/FKx8hsTZ0J+goMZThTIZUTAYgACJo76ksn3P3pWPb3xqy87+fgvEhkPlBYWDEsBCgEqQcxAgvnmgJAQGRAmRchtaBjebW6+8OetJXnLmuV//96+OHTPCUCHa2AcvoAQQkDn73e99+72fvOyQM5aPXzAx4cyRtZKCmdQRGcksKUOsrfXOnT7p/BedMGfKxJgzEjDFANgZEMQ3oAQgIWVAXCMB+CvUZwgl3yyGMpMvmZgjWCJSYgMRbTIDWrvj3ke//8OrH9/Y09Ey66Rjznzry1934OQZJUi5EAoV+IvhY64K1FBP3+BVN/zh8p/+eNWqh6lZlh027tjjDz3siAPGTxpVLkVKNiIn6gBDnm9XFXGBepewLd0NEW3mPL0v48kTQARiXyl7qig37gF5V33fLUYmIip+t6PWx3ihnt4JS9opby8QSAXhXAGFkiHrkAkGK7K9t3/Nxg3rntrc05NaR0wxkwFSIkcaEUjyB5prkpDz+KFroQCRxGzipIX66NZf3ZX1ZYceuPQ/v/PtOdOmxFHsX8riZF7HCy8BCLRvoOf8l15457p7T3jpi1qmtlVsTcj6QoDViIoK4FSylCWdMCI659STDpg3o8wcCasKq/HD5whUoLfA9ZoHJfJXM4E0l7eFHEAUTAzh/QrBCopUwQrWxNCONP35Vbf+5orbdu0dWDhp6WvOftlFZ5w9uW1UjMgUMqEC/xcooCKBWN/d2/O73131Xz//4ap1D7WMbVl86OxTzzjhsKXTJoxqM3BkrCqDxFfUykxM6hzBl8YILL83aSAOKszAffof
pgoIlHK3fqoX9qp5A5gCAaWkUAuIqnjSlPKxKg01O7zcM3wTBohhRZWYhMiJy1SrSdY7UHv8qe1rN23dvqcrzWBMEwkRiFU1yphInQlSvTzw54d2AljhXyElIoaUyZQHRt171V27Nu6Z0DLhu//1/VNedDx5FVIR//fFCy8BAFDoqjVrTznvdB1vDz/35KglrUrFqgMROSaGqLCFqooVSLW5TGccf8zyJQs6mqIIYkDiW0XC3oiEIIAIsdf8AP6ylXBsDcdm0mB6yAohYvGeo4ghGnGmrA5OSmbjnr2/ueKGa294aNdeXbrghIsvesOLjzthFJVKGH524wX+AqhK6E+R2bRj+/d/cvmvr/nvrdvWjpnRevTx844++vBlByzqaG9mFmElIieOjVGnrMowjklFiMQQiSOAHSsJAMdB4uALHIiP/76uz+v3YM8PwAdezxeR1jWc/qBsxY/eE0GCbiJ8N9Ig9vdnbTLqHf0FIMeSqVYz7R6sbtnVuXbT5i3bOgf6UmPKZIjgGGABhWF98sPDSs6rjBp6paAvVSFlEkZU0lKzLT9xx/p1DzzeSs3f+MY3zz373LgccejyPRdv5PMYL9QEYCHf+vo33vPpDx1y1lEzDhpflWqCTCSLKAKROGEYVVVxIiIubSJ71NIDTj582biO5shvJxWCI7+0zn9XCaYRQ1Rr4g+bgrx/lT8AkNc0g8TvsFZLgCEhZstcI7N6w5YfXXHtbXesddR89vJz3/yKVy6bvaQVJVMwQgX+N2hDk0kbtm/6rx/+4hdX/mzX3o2j57YsO3buCccdcciBs8a2tpWcg7c8Z2/FHGQ33vTHF8UIIhlShV/UAnUEGCUNzp1BCZFf93XvBwBhptiffTUvvfNmMQl8H1i8Z5DPFErBHVQp6IN8i5aF1AkzZVZqcANJumtv37qt25/asn1v7wAkYpT8KklBLSJiMaTs/PYwMCAS+hLqJT8CAgkRqYoSDEsJ5ajSMrCp59bf3MtVfPL/ffRd73xXuVwKr0QR/Z+BF2QCAKDQ/srgq1772utX33LGq0/RVk4pSZHmI4cMJ0SAaCYCkNiMJFs8dco5Jx8zfXIbO4koNiCyDIILs8IMWH+NiRc8i+8Q28bx0ZdB3sDQ1zee5gzDjewHGv3S6wFj71v5+I9+8bvHHt0ybuKCV5z1xte9+BWTyh0leHuK4jxQ4OkIgVhUmbfs3vX//ee3r77mt3t2bR47o/2Ek+Ydc/yyJQfNa20tRUDYkwvxXdzc/SEQ+t66JxfmhEzgzSHyQXgf9OtnXglCTT/K5fkerhf89Q4AEFhRqIq3/BQRqFfUGSH2oqCYIBCnjggckSrEKoBMMJC63f2Dm3Z2rnty6+7Orky0ThSFqRwmUoVwECIhtCZCC8BXbd6nlMWpEGesJtZSs5Z7NtQeuumRyp7qG1/zmk9/8pNtrS3GRKFyKxLAM/BCTQAABPrUpqfOuPCcWsvAEecclzRVU3IQIYhXi0FFFETsRNQRCzgbnDSm9YwTDz9k/tyyMpwYYnHKbES9/UlQB4VBMPWXoPMfIZJgfhgEEvlccUgN8G0z31swpIjiGsuOysDvrr/hiqtu3bHFnH3cue96ySVHLjqoTEyFIKFAAyFEiQgz7+zZ893Lf/zj3/x8264NY6a0HXHU/FNfdPTSg2aOaCsTRMkvQfXB0LMj+Z2seYtKxct98pkXX9zXjwcauqh5ad/w86wrPYc8tiETwCFteJWoiCjI/05goljA4rl4tQJRUmWIiBOxVtJU9vZXN+/eu37bru079iY18ZYVxKqwgABMavKEQ/7wTfXphJCglNSvnCdVp6xMLkYUJU2y1973h5V7NnadeMzRP/7Rj8ePHRlxpMED5u/0Rr6w8AJOAApkyK644oq3vu9t4w+ZuORFSxNKrKSAdf7KFvhLiADnhJVUXZbVxrTEpxx95PID5o8ox6RCApAJ8wGhCSyAyW+mIADVoDbzGranXU6BHiKqM6tE
bCAgIo0oMdWntm/+zuVX3n7XxinNiy59/WUvOeXFI+JyDCmawwXgqRgBM3UN9Hz/Jz/5yZU/27rzyVHTSkcdOffYo5YffsCi9rYykzoSeGIcyA+jyDer1G9kyv8UpnsDsZPT9koqon4QHgj9XBFP1/glSkCdM/EHEv99Ke8ThIRDXv/jVDRkJCMEcWBWglNC5pwVcUA1k56B2vbO7k0792zetmtwMIOqgd/+4psQyv54ovnzCcO6AhDyif18IAHwlRqlBDZqmqkJPfGDNzy4Z333wXOXfPNb3zxw8aLI8PC2evvf8QJOAAAEUstq//KpT37tx18/5uyj22eNqnFqtaZEqo5gFGAJ9I0TpxASTWvVFoPjD11y4mHLxo7oMETqREUIkZLjBhEa7i8ln0xECeHec0RhX13QBWlDYoD81M0qiNiQOI4ki6gzG/zNzXf/+pe3bN1izz7qle98zRuWzZnVFLoCxUU6HOHX8IoAzP21ym+uvOq/fvrd9TufbJvetOigSccfc+jhh8wb39FRsqrsRAj1KXYf/clLeHKJWj3U56RJkOvkPS0o/KZe9QqHoNgRbxzKxJ7HrF+L/svrwmj/dyAMDWsw0yUnTgC/R95bulmxRJo5l6lUreutVHfu7d24Y/e2XXv7uqrQCDBEatj5l8AnGIaycP2+ygNTSEU6lHSlsOeFODMwURaXk/ipe3euffCJCW2jvvT5L5533tkRc5hgK+6tP44XdgIAoNAdnXve/M4337vyzqNfdgqNsRWXgpxv4YKUwXCOiKHigqoTBKDat3jOrDNOOnL62NElIogwGQCqjoMHlm9twUEZwgoldSSkIGFPSQ6pkii/yYQoSIiIoGpY1Te+TDlOIlq97fF///Yv7rl548Sxi9/z+nf+w9kXtnG5uRgZG27I624hZILrb7vlW5d/Z83aR7nVLT1k4lHHLjv++IM7RrSwKBkYQOAgPkLnGsw6K5KH8vrfNBTO/mNat+7XOgDJc4aXcALeuJ+CnUNOHOXNYfH0S5gBU1aCUMgkDlAigTpYUQdRq5KJ1qz0VJI9PdUtO7u27tqzt6cvS21EhhSG/EiCI5AGGYYy6n1qYMhTyo/i+RMm369wIDJEZSppX9S1fu/DN61p0ZYPf/iyt77pzeVSFG7nIYej4hZ7JvaHBGAhDz744AWvfiVP4uVnLsvKaQLr95ESK4QZ0NzOwVkHYnWi4pytTBnbcs5Jxx4wY37JEMFChWG8KShBhYzm3BBBPKfpDUIBhJqfJJc9U16ThauOCKpBbS3KEbMS0lh3JNVfXnHND398TXVXfPbJF77/kvcum7OgWSNDxRr6/R4KqLrQYwXTqsef/Op3v33TrVdmcW3qvLGnnXnSqScunjS6g52FcSJERutLVLya5xl6lnCN5t8fOTeUMzx51siHt4KBs4fkS32JiL1ZP/mrOrSVw4/IWwq5hSI5iKoQOFMIkXWZlTRzqapmTvqrdm9/dUdnz5Yd3bv39FWqCRlm8tt5XZhJo/p38021MIhcd5sIDQ0EOjb3lFYhEDkoSjDNtty3Ue67/t5ql73oJS/7yte+2NHaxBSFs1ER9/9HvOATAACBJs5974eXv/cTl80/ZOqi45ckcVKVhIj9ajt4Z3JVEiWosMKReJGEq7SV+IxjjjviwEVtTQKbRhSrEAuBfY0TWgCEUDYB6iUT4Vr1wwHwU5P142YghoRUVISZ/eiLMkFdRBWu3b9i9eU/uPquO9aNbV/yT+/66KvPOX+UiWIwF12B/RgqEiifaPue3d/58fd+ftUv96ads6bxCS9afvrJx0+ZPDYmq6xMxqljZkggfpSG2LMN4RqBIbR//efkCp+ha7xyOogU6sSrjfKLO6+Q6ttfNGeHKARnEmj954lCBU6dgDMVqy7LMutSByTWdfdXd3YNbtm1d9fu7v7BQXA4SoPhVBnMwoC6XObDQV6Hhkln/jTyv3gTIAUp4BQwoJjiKCm7Lnvf7x8c2F09
8dgTvvkf35o8frThopD6U7E/JAAADtpbGbjkzW+76pbfHXn+keMWjqogcWoVDmxUlARMhkQ8exh0bCIiakhKNjnp6MOPPXRxRymKxCjEIBZ1YBZ1/t7z9uFhDEUlmEkg9KoAhJkAYEgVJl4r5A0MScFkmKAOHHFC2Ny96/s/uvK3V9yfDERnnvDyD7/zHctmLygXR4H9E6oSLEYSMr+++nffu/w/1mxaGY/SZccecuG5y5cunN9KRJyJCAwDLKLGH181X6JYL2hzpSfwtIDZ+D1IPP1yRnizcwmfn3duJbBFqgrv3EmU93qD6afXRJPPFl5xJAoRQGBVrGotSTJ1zrlMtL+Sdvb3bd/dtW1HT99ALbOqKoYc+4YxUUbKYOOYCJYABYeJm/qKeWo8g/rsbtAs+d61IyWjVHKluL/j/j/c07W1e8rIyd//8Y8OX3awKSj/Pwf7SQIAYIH773/gwosu2FvqOelVp5ZGGStJhlRAhplBEOXgRAKCKPkbgNSRUUhWOeSgmeeccOKYptioiyhWgnNKfkeRCupug160AZVwLBfKZdMCAwgFsTKFo7hPFRw8c6FMyszkIBTxIOxtN935ve9duXJN16gJC975+ne/5cKXj4ybS36rZYEXLvZhH4IIJyOs37nzS9/4+jU3XSPZpvmLpp//0lOPOvqQMe0lQRYxhKwTZmYi9ttNPMdfD4oAUBfCPMvPDPy5Br1Onf/x/y75b6qqoiJhbMCfBcIqXg2eQFAiFSJlUUskQmRFxPOhKjZ1mbWpuNRmDlFidU9PdVtn97bde3p7+2tJJk68HSiHgwY7qBIxhLT+hJ7xPOqyVVL2ev/cj8LLjpg0QhS5KKpFT9y5cfPqLe2m5fOf/+JLL7ggMly3oCjunj8F+08CEMDBfe87333bRy8bv3DkoWcfFrVI1Vn1x2aBIVIhDmJiXxyJYWOdQkidZRpcOGPyWcceNX3cGFYDUoZRtewnykKh74U/gRtVKOcyCSj7+1VJWf3p1X9OzmT607T6ZXYAGWYlEznI41u2Xv7Lq664+h7XM+olZ1/4sbe8a9GU2TEiQwUd9MJHMBRHJRv86bVXf/Py761/as2oia1nnHLwBeefOnPauAgJqYpxfsZQyG/I9bJjqosi95F6/lFtS+NAClVRqTcAKHyRBiG/wIVOQCCGcjsgN1RIRGA4EhKCONJMXKqSOZvazFpnnROlJJO+Srars2/zju49XX21LLPOMSl7ZZDfUUmsfr/k0H1LVM8BecxWaONko1DiYCnh638HVqMUa4kG4l2rd6y8Yy1XzQc++L7L3nuZn/hVBReh/0/G/pMAACi0d3Dw3Ze97/Jf/eCA0w5asHxWgjQlUbjALKoh8hPs5PsBSmBQ5hyBnE1J7cyxHWefeOLsaRNiOAb7nXaqqjD5xepPA6J5saX5vZPfQn4s3vcGgDDDmIsbvIOtKBkGG3VgApWiXtt9+933f+O7N6xb0z138tLPfPBfT11+bDMoghZHgRcoQhEOqOrj65/6t29/4Q933FDlviOWTLroFeccc8QhLc0Ra+bUMbFAocrEEqbZBSR+whxDI38eK5/OBw35LM3bwHXVfqhfwuaUBsUv9QGwXDfqt8IPPTKIwu9LytSmWZaKTVyWWmcdWSfVmnT3Vbbs6dzVubdSSdPMAgQiOGcCdQOXG8/lyyDzo3RAo3G27wf9/ZYvoVElcowolnIpizvX9a64dXXSnf7DRa/4zKc+NaKjPYpi5FmrwJ+I/SoBAGqhG7dsPvOcczb1bT3m3ENGzR6VRJJKFvlChKJwVQVPEb/JQshQlgmRsABZdeKo9pOPXX7g7JnNJiKjJKKqTHFwjSPJe1X12RQ/yZI77SqDIGFYwBc0QTEdhvRzkZCAVE2JDakzRmuRPrRl039c/vvbr3+s2Yx9w3lvec/rXjttxKgIUdEZfsFBFArHxJ29fT/89Y9+/Oufb+rd1DoSLzrlyFe85Nh50yYZB5BTsSAi
mCCIVwNlgoDdkIsFuXgZCDXHkL/to3Wsf0b9EEph02/e3ZV6Dwy+uSD+RBxEniEpiAKicBDnuX7RxKWJtYnLrKgVVDPp70927Ozetbenp38gTVMK/kOsChXHAINV4cJqSM3ZGR2SAKjxNMKzUuRpwjuuKDGpAxGTMxpHSVOyPX3k5sf2bOo6+vAjLv/h5VMmjotNVAh+/g/YzxJAKLx/c/Vv33jpW9E2eMwFJ5lxUU1Tz2eGLXjqnQvZi5pJRcnLIBRKEFZx7SV30pFHHHbg7LaSifx+CxdxsAtVABzOAarCSiocmFPKJdpK4duGK79xeedGWwQvr2aNSRExrGrWZHbb6m+vuvEH37+mazOOXHbcpz7w0SMXLC0BMYrO8AsFeZBlvf/hh77+3f+4a8UDA9I9a/7EN7zqxccedWBrkzrNmNmpsFLExjkhVYoA4WC3kHsUYt/Wbh1PD3b7KIKonjAofzC+caAQUXj3NB/6pXEgIP+o/WJ4B7WqVjVzLnNZarNMbOrEilYzqdTcrr39uzq7u7oHKtWqN1JU56IgmQbgAm8DkSCKftqL9MwnsM8/UW4OpACRJeJIo9iadI9Zc/uaXRs6F0yb96WvfOnYo5cbL1/C0/ouBf537G8JAIAANal+9MMf+8p3vjlz2bRFJy7kdrVqhXzoVYChQsI+PMO7RhCUSKyIt/m0tZKxRy6dd9QhB4xrHxlHETsip+wvSxBQnxfzU5EC1Osq3/jN98n4E4AfsMmdWZCLlJnhHDMzA06ImWxEfcbe9vCq7/3Hb9Y+tH58x7wPXXrZa859WbtpjhEVl/fzHKri25Z91d7/+smPf/zLHz+5ZW37pNFnnXfcy1922rRxreyqpI5iOCL2VYhzgDHMfhOL10Tmgk/Vp0dKHVo1h9+GBr59FKIhqtdr7nACyHU+IlBVbwetAoH6UWGnyNRl4mrOZS7LXGadS61LrVSq0jNQ2b27Z3dnX99ANSib4KDeBAJO1JBhFYTDcVgvU3ehG6LzGYr8aeSpy2/rVlGNwGSNcmyb4n6z5vaNTz66fvzIsV/47BcvvPB8w/4r8h5HgT8H+2EC8Bf3Uxs3vvHt77hzxe0HHj9v1qGzXJwlsEJgZhIiOFYDeHEPsThCGH0Rhljx2h8jgwctnHPs4QdP6OhoNqbE7J2kvfUuyFCYyZT6jVov2Hwe2Hdwn3Nbq/wQTLkIj1TFGKIIEDiJ1Jpo4+5dv/jJDVf89s6B/ui8U1//wXe9b+GkSS0wcVHkPF8h6pTgrL3v0Qe+9p//fvMdN2bNtcOOnPnq155/+AGLmmJm40QdHEUmFlKxlgyIWCVWxERCmqlaHiLz2ScBUFCiDS0j/D80PkcRvBXCIUJFXW7M7+94AUIHWAQKsYAqiaj/n4Vm0ExcmmWpuMylicvSTAbTrHsw2dPZ39nZ0987YFVBpC7zU2EgBRnx37LB9/jjMlCnf+iZoX8ocuqfAvsDqJBGTCWUqKe0fdXOlbeuaTMtH//Ex974+oubmiKGKer+/zP2wwSAnAi64ebb3nTpW7or2444f3n7rLaEMkeqcAaGoH5tO4XaxI/LiD90OlGAWEVExFbnz516/LIDZ48b3xybEjNISQ2UVA2C0bhQ0FKT1tdO5BVQeEBh6TzqUoyhgo2wi8zPxTMBQmwkMnv6kyuuue5Hv7x555N2wqyF//T6d7z23LPb0FwqjgLPL6iE4tp19/d/77++85Nf/XRj5/qxk5rOevkp519w/Ixxo2JnlZyoM2xEDSk7ccawEsQJU+xUFWzgmBzljDyAXHcQCJ1QRe8b8yhw5/ULShsMY8POYUhTGjnP4zzbA1FVp1aciGbiMriaOBFXTZIks5U07c9sZ8/gzt29/b2VLLHOQWABB3GsIPUiH5Lcrk6Dj3/oYFBjNPl/BvnDj4alBOwTilHT4pr3rqk8cPODaY+9+DVv+Myn/19HexNTQf3/Rdg/E4BHNXVf/urXPvOlT8WToyNfsjwe
EaVwDqkSE4OFySt71G+Y8Jeo522cMsQpawzi1PZOHTf25OWHzJ82oa0pJqeGS0RgjUQkdIMhuZoCCt9mgOeC/c0M5LeC/7PPDo2bulH1EAHEqkysMJFluX/12u//6Ppb7nmM0hGvOPeVH3r7e+e2T2rCcJCIDr046Zn/UFfC1GPlcxELVL2vDdy9D9z32a98/r6HH9KouuyIRa9//SnLDp1TLhvnDaYgKkxU32erAPtLRVSU/F89ie63ndAQ5ianHhX5YG94CfKHgIaJj1KQJIS6X+qKB1EVkBUnIPHqGhHnnFNVJ367S+qshaRqa2lSqWWVVLr6azu7+/d291UGa1nNQnxSCu0DViJl8akgZByw5go4Pz+jQxmg+jOov78NAssrRUGq5FQ1Io4RRdWWwW2Vh254tNJZO+3kU7/2la9NnjS22PD+l2N/TgACdPX1vu+y9/70ql9NO3DikpMWoY0TTeFXzJEh9dv0QjkV6i7UbxvykmXrLDSdNKZ09LJFS+bO7yi3lGAIYijyCmeEUYDGS6m+uZAXZTKkAxZWzWhY0+Q/7pvDoNCkBlgIhsg3CawpbRro/vGvr7vyqrt7dw4esvC0z7/vY8fNX1ymeP9iPfPzUkO0jmfkgJClRTRzjuufx0SkTOyJN/LZ8W/+2iigTkDMu7t3ffcH37/8Z5fv3Lt1/LQJZ5934ktfdurkiQauAlIxCoo4cBp1yQtI2St2csIjbNMiIl9BB5scFdQXIDaKCKA+KeAPlLnQJ99gp+EoqkJghQpUFFbUqrNKwiqSQURcyA2ZOCc2cTYTO1hL+qvpQDXt7qvu6Rro7q9WK0kwE1KHsHAgbEeC+E5Y/SYgqh9Scu6pEfr35bHQeC5hZAdgMPyeSRZplibZFd/7+3u6dw0umL74e9//7tJDFtf1UShOAH8B9ucEAECBNRueeM3rX//ImgeXnH7Q7GUzXGQdnIMFU96YUhYmSH3Sl+BUiZmcd7olBtTZ/hGtTYcfdMDSefPHd7SWGYZYRZmisHggXOj+HpGhnblQgGleq4a7eh/9czgH0D5nZaUwNSYxVYD7V6766S//cOedW2aMOvj/u+yTpxx+dMQcPV9PAs+s7v7HT27UqoCKIs3sQH//jl27O/fs7u7t6evtySRL06zcVM6sG6gM9vT128wmSTViU2ppcqkbO3bcvNlzx42fNG706IljRpfLUVyK/TpDQ/yMPSd/+fNTvyvugRUPfebLn7n99lu5xR162PzXvOYlhy9fWGqyhNTBhsXlBIIjcEPar/my9PDn0K0lzk+VnrbJXxxqhM88yIPrtI8Gk7ScbA8lNpGKP5GKqkCdinXi1FpVR8453/OFiFjJErECrWV2oFrr6a/21bK+anVvz2BPb59NRazzvlrqlP00l4ZjBITDo/B5zT+cp0X5ISNqz/6KEsLUr0KgBmCKjGXpliduW79j7fYx7WO/8Y1vnnbKyVEc+dehCP1/Ifb7BKAWcvXvr3n1xRenLdmyk5dMOWBCFkuKTMhBlWEIZBwxYHMelfPxGa+ZIyZPlopN25uipQvmHLpw3vjR7c1RHEURCef7w4TrjGs4CjfiuJ+DQaOWCwUfoV7Ged6TtXHP+54AVNgQwFSL5PHde358xXXXX79ybN/Ed7/2bW+86B/ao5bn4Z5hzfPfH79FNS9lVZRFUatVO/fs2bDhqYcffXjNujXbtm/v6u3p7u2vJbU0TVWsU2vVqYqIOOeEFAo2KJfLxnCaOYW2tbeQxi2l5mljx02aPGXhggMXLFgwd878OTOnx1FcMjERG/4rvFzeTafXuu/++Hs/+N5/bNmzoWNM6czzTnnlRWfNnDKKURNKvbuxCoiMhl5RfuTbR8vTODoOec1kSAcpBEc/eKKhpdQoN1TVEamKaYRZQnCo9Ttb2Hllp1hxDhArzkIyp1YU4pyIqEs0G8xsf3+tb7DWV0l6+ivdfYMDlTRLs+ClSKHYz+t5/+DUe/l4+TPqyqV6
7Y+hXQt9+pMe8ioQQMwqKoAhjTV2vbzjkW1r715fds2f+MRH3vaWN8eluBiL+WthP08AAARazexn/u2zn/7K59onlI44+7C2qc0Jpc4IEdSCiY0w+/KJnFNiCn+DAhA/jiJKRCRZzZCbP3PikcsOmD5pdAmmKWplgIVUnd8EELxCwyWde+gC+dJ5XzkF/p/UazYoN31h1Ft3DAAOSgBzpM4quTQq70Xt+tvu/9Xlt+/YUDnrqLM/+5FPTG4dU6Lnd1u4MaApAJxTBVmSgYHeNWsff/CBRx56+JEn1q3t6u4eSPoFKbHjSIVJVRhKjFLJNLWWOka0t7e3jh4zcvSYkR0d7c3NTa1tzeWmUpamLW1NAKx1g9W0Vks2P7F9/aYdu3d0VWuuOWqaPnnGvJkLTjjuxMOXHTp+/FhDbGCYzf8pkKjXem7Z0fmFb339V1f8rFbdNnvhhDe86cJTTz+mqcTOJcziJwENsQuGtDk9Ei6SeoYHUI+NfhbFR0y3z8+sSwf8xrl8n7tI6F45v5U6/N9bx7GCnDoFWYWFs5I3ep2IFQexKpk6kHMqSWYHklr3YNo3kAxUsv6Bak9//0ClkomQsoqy99ESQjjghg15+fPKj8GN6J8/y30+0vho429K+cSv9/4RhjFZFGfxtkd3P/XARhmQi1/z6n/+2Mfb2lr8xO/TvkWB/xv2/wSggACdPV3/8OpX33jXLdOXTV183Mx4VJONnCPHjgggjT3fn7fL6ptWw4yAKphZRFUc1JFWp08bc+hBi+ZNmToibo2ZDNjfvXXi2u9L8qM1LtyWDAVBcqopfCYNuUG8+EE1d7QKv4mSUVFiTS20HPXB3vzAI1f//sE1d21dNHneVz78meULl0Vget5VRjrkV6+xhUbo6u+//vpbrrr+D3fefXN/X49zKUgQaxRzeXRpyuSxU6ePnztvese4MR0j2pqbSx3trU0tTU3lUmRMZCIiEBOTP5z5Jr4wE1SdCEVGBWpN6rRSSbZv79m0fvOdt97/+Mp1PXv6Jo2asGD2whe96PSTj3/R3DnzWuMSEfhPPhMoBCrk3Ion13/kc5+57rprmjtw+pkHXnLpy+fNmqKuAhGw+tWKwWsnED2SE1zkWSnsUwcHm9kws0X7MP3wn+tFx+y9hSAQERWR3CjO+GuOgzKNoSyAQC1JJs6qiDrnnHXirIpVUacqjpEhq2ZJ70Cld2BwoGIHU+kfqPb1Dg5UE7HW20SLn3iE73g1+rh5By3v+Ybj69Mq/GcMMwz5ECEcHZRJRZTFQEqIqb+pZ93eFbc/VenMTj/lhG//57fHjfE7fusvaoG/FPt/AkDIAXrPvfe/4W1vemrPhvlHzZh/+CzXJCllngLyigsWZVUOXVtW0voEL/tGmyoYXi/nJJkweuRRBx+0ePqkkc1NJTZkQELqHLEBIEHOoeEOobp/hKuT0Y3bJJynG6gvbAWHKWUHIsMAQa2QVCJ5dO2ma//wyD23rW2vjviXf/znc089pwkxP78KIw2eCGCxApg1a9Z+5Tv/ce0t1+/esYmbyMSutZ0nTxyzcPHMJQfNmzpr6sTJ45ubSnGkyiKxdxTzbRjyByo/IOU7JeS9AohUhZiIyO+2ZWLNlI0RAbFxlpIk2bl5z4r7HnvozhWrH91YqWhH29gD5i0554yzTzjhhPkL5sRU4mDU9D89HVWr0Ntuv+2yf/now+tXLZw1+vVvuODFZx/VPiISTQTC/gRJpE680pg0tzILuyWo/n5T+KjmjW/NB8lBqn7/lyJvGKt43tCJSpjd9TIzH5ENUd02M8ykCyiDpGSty1ScWOecWFUnoipKLM5kkO5qz96+/r7BWs26wUrS1zfQNziYpZbVRGpUoCJCYXlvLmEGDeF2/L0S7CfCqsh9r4JwodPQv/jf2Z+tlQVCJlWgJKWyLfWsq625Y+3AruTwA5d+5etfO2DxPGaf4Ar81TAsEgAABZzqz37x80s/8I+2qXrIqUsmzR+T
cOZUHTkxrAoWGFAkzFAXump5p8lf86TELM4KQ1WyNB3T1nL0wQuXzJkxqrUtirlEgKgKExPAKg7wDrh1DyLK6R/N/+YLPp8LpH4eCExq+FRhZgcIERNYBVAhVJVXbdl2860r77lu1cCe2sUXvvldr3vL+JZRudfEc4zQRiGoaC1LNzz11De+8d0rb7hiV/eOljGlaXNHL1w8b/EhC+fNmz5u3Mi4pJEhGEdWhRxBxKgwwUBDG4YJDFEirnfN66N2dbtkf1ajsD1FVMkYoyoRlURALqrW3JNrtz9w94q7br1vx1M705qMHz1+6YFLTzrm9FNfdNK0aVPLpSYQ8TMkhv4SsEl65bVXfPwrn96w48mjT136lkvOPfTAhayZaE3VMbPhWKDihCloC1iBYD2I8LgDTa5ELCrBNyInDInzPsAQw7eQQER8Izc0m7xOVOEXuSiUvA8PIKKqZElT2EydtVbEwqqIOIVTAcOCBit2b3d3V6WvYm3iXP9AOtAzkKRpBoVTcspCEJ9g4TX+aAyxhARACDPMqGeFZ/Z5h8SZxnMKRwcikH/X2VRjirUao0sfuXZd99bBaWPGffUrXz3ljJNi5vAjng8X9/6C4ZIAPAbS2gc//JHv/+Q/W6e0LT3tkKbxTY4yx9axMhE5IihLxCAhF8bnwwZIX1rle/JAorBwpGiGO+ygAw45aNao5lKz4ZIxRiKIBTgch4nFkZeBB59HL5/Tes1Pfj2efztybgDq7YaUiYKo1BKIYEBgkCID1SKzYXvn3XevvvKqu3Zt6Dtz+Ys//b7/N3viVCLhsETpuTkQKMQfl3p6B2++/e6vff2rq1bdN1jrXHjw1NNOW7Zk8dxZi2Y1jSwrG6vWsTg4JSg7dhQWjOdREABxUEcRyK/0ITCFT8ojAu2jq0Hg0zhvhhoVIUTOOgFHHA32DD65atMd1971yH1rtm/vTQYxYeK0Q5cuPf/cc4478biJY8YR4hLHwdlJAYWDXv7jH37iXz+YlAbf8PbTXvaKMyaMak9cQv686NSYsN0EREYJzH51kFLuNIUg/lVAxesyVXNn2aEL2Sn/my+5RQGIKETyPS3IzcVBCmUVJf9jIAQnkjmbqXXWM0Uk4pfDWLDRGH3V6p6u7u6+gcFqasWkVgb6BwYGK4m1NpNQ4CtYPemvrn50afA/ucOQEvx6JKrnh32vhfyXxnuUW6OD/NSYwjgSisSUtVzZiXUPPLlz7d5xrWM/9JEPXPza18TRs6TkAn85hlcCcMDmLZtee/HrH1736JQDJ84/bh63cyqpshKJOmLASOw/1wddH5bDOdd3CIgIZJ0fVCHNsiiSJYtmLD9w/qSOkU2xKStDJDKxN11UqN8nFuigOuUfiNq8lAp7OQik0jgU+PpOGQh3om9PkIDYCRBFFmZbte+mu1dc+7t7tq3ZuWTEQV/8508vP/hQCuvJ/s4IT5kJvf09V19zzec+/9UntzzRPC4+aPm8C151/LypU8d0NMWxJC61ZbUKMJwTP/rARCriN3gidNL98/ADejnFETIDPUNJQ0MTAOVyFaIwoeS3njg/leQgIM1MT3d13ertd9503123P7Rn4142mDl56rnnnPPS8y46/JBDS2j2KxwrSe3b3//yv339C2PHj3jXB99w2umzmyLnRC1UyRAZdX4YUAlKZAAwmTAllh+FRJWYfDnvxI/f5poZ8tu48iZSPtalCvEObsG20wvE8qQXagX1amHrVKCZOutc6jKFSgYVwBlVopg1wmAt2dm1e09XVyVNrKgVrlSz/p5KUsuciMvDOKS+8y5E9XyjUZ4dQjOg0dIeInxD/Vns2/zNw79fBObbyKowAJIIxtTKcT8/dsfObY/vKim/5lWv+uQn/7mttUQUFbX/3wLDKwH4p3r7nbe/+R1v39G3+YBTlk5cMj7h1JEVzfzeMHYmfC4pS51u9rFI8gtaSUUZTkRZM+fYpQtmzjj8wEXTx40a0RQZJQgz+/vZz8oE
jtc/klDcNYYj/fdVgOtGvoH1DmE/TK8Fy174Da5Myk7Flajbpg8+vPam61Y+csuTLRj9ics+dd7pZ7YSR3/Hm6YxC6248qqr/uVzn3hy06oZs9tOOe+oY05ZPm7SuJZSRBCIOlJLyMgSlIMclAWqQszwpS7q7cHQPRFfytcrx8YTy1vm/gtUiSjvoEoQ2gYqCggLdlnF+entWIWVS+TcYHfvqvvWXveb2x+5f01flxs3ctzJR5182bv+ceGiA5XSL3/lk9/53ueXvWjJ695z6ey5k0eUegQCihQsAsMEziWX+fsYrh7/8L3Al/KTnap1TuH3w4cukWH2DWKpXxC+PPbsvwben4jgt1SAvJ+5KhFTps4661SdU+esSNjHogpCzJHpz9z2nq5de7p6B3qdAxFnNqtUqwOVmkudComQhBe7XsrXtcxDq/v8tW/84z4ORX/k6sgvdICUmMiFAXwHkFGUJOa+5nX3PbFtZTcl5txzzvzs5z87blQ7m+e3wu2FjOGVAAAoYNV+59vf+cDHPtQxdcTikxa1TW9JTWY18e1W4ghKrEJgrxR1DB+aAD+oyYAFQH64hiwIWerE2umTxh5xyPy5k8a3NzWVEBs23uXTMEGEQi1oglYCyG+khl2EJ4LzjzABAgllYdAlCUBgf/OLIQbYqUUc94u7f81TN1374B23rs32lt/+2rf+4xvfNKbcxvirz0A9C0SFlIhp1drVn/vc52+4/ZqW8fTil5/44jMOnThxFCIVdmJFCSpGhBz785GK39JZX1dYz5NhSy2RkpCCBUTsY2n96RAQhE95WyCEXE+0UENuFag7VbATqz6PgFKnRJEyomDDibSqG9dtu+3399x+9b27dvSOiMctWjS/FCe7dj1+zgWHn/8PL+6YNlPVxZwwVCXyuq1QCqsQwYmru/D7xqxn/YIuH2Ebr/PBO/QwiABmDpaxwVc27w/4XKB5gZ2fcvLmgThRUcrEZmpFVR1UxLtdMRNFxlrd09e7Ydeuzt5eK2qisqSoVpK+an+WZdYJBEZZ/fWm4cnk15t/gRsDZqE0aRxknyliyL8o/EL12iAMbsO//XAEQmaUjbIZ4F0rKmvueaJs4/nT5vznD7+7cMFsJi5q/78dhl0CAKBA78DAhz7ywR/87MfjZo868PRl0WitaVWMAmJgfIBhgVFvzAaFOMqjSl6VB+qT1akDiMBpbXD0qLalBy1YMGPK2Pb2EkzMkaGMVJiIvD5FYhB8TAvRLlD8+UR/PmUTlmgA4XAdPkcIrKSiYgyp8y3iKLUO5bhC9MSmjTdc9/ANV6/o3F47ffkpX/3kZ2aMmRAh+tutFROIWuXI9A1U/v3rX/qvH34nS/Zc8JrTzn/1GSPHdBBniVTJUL5YMCcQmCDOKVRFfVekHrrrRsh+E0Pwx5dc5EL5wDbymN+oOnO7zCF9YhUCe6mXd1gTqFM456xXtxjjxJo4surNeqKylpqpqbpt8JY/3H7dtXd37u2cOX3CP77/jVNmmaY246IyxaUYpYjInzCUXJiBDS4MYd+6eNmPf9CB/AlVfGj2EIkIhRmq8PD9XKBVp+GMKKE40CHDX0GDL1acc+JERODUibh8GzUBkaqpOe3q792xa8+evs5EbKkck0ZJ1fb31QZqSeKsim/DciAjneO8c9QI8PgjOk6tv1VDvqDxCY1PRH6kITDlD15JlWCUIhfJAPq39D9281NRWp40cuRX//1rxx57jDGF0f/fFsM0AQh0/dYNb7j4jfc8+MCCYxfMPGISOshypqykhiFQNgArSEnC4C5yWkHrwy9KyvnIpmf8rU3bO6IlC6cfOGvOuPaOlrhkyDVFrCIMJjEkkebkZwhkngjdp0ryN12eF/KaV/OyKwhK/UYnFTAE5EAOaiPdsLP75lsfvuHaFduf6DlszkFf+NinDl+4NMLfYsu8Oieet3jgoYc/8IkPPvTIrSecduRb3vGS2bOnWJM4JKl1ZAhkAInIqSMLqFfJ
ixP1Hgjk1FfDSmRyrkGFiUB1m/xAkTdmJNBom9YpoPDnPJEE9Tzlk3XkxDmoU3Eifn+hZ4msqIthIiOixmpJo7LGMUU9tgoXlazEJalUt8fNOnL0+BFNowzY5MYddfbLT8R6aaaqOoV4foYA7wxI9aga7jzfqc5zvI/FUIULjD81Jh1CzoA/CoiKc86GF1HVqVMBlJmU2KlmmeuvZVt2de3q3p05GzdFTJzWXLVa6+0ZrCZOlYkJzNY5hBkDUJiEx9Cgq89MAE/n+huJImTxQJbSkK/0l60B/HJ3QB0RIhfFtbhnQ7rq1sek17Vw+Yv//rmXXXAhM/t37/khats/MRwTADwRBHfVlb9/+zvftSfdfeiLD5u4aLwrJZYSITXEJAbQfLuvI8A4Y1QBCMPmtzEIJJqHbN/Molpaa2riBTOnHLJgztRxo5ojU2IyhAiGYEhiDZ8vQRUaXIRyLrguF/Q/QYVyoiMvAUkhYPbaEWVkYn3tSCAll0K293TffttjN1+3au1jOye3zfzy//vc6UccV6bI/PVuJYU6cRFHfX29n/3C537wi69PXjDikvdcePAhS9rKpcylGrtMLCM2YFJ2LkPkoGTJKFid8wMOyH0lnToQGFGDV/Yew0EGD2JDwf2mHmL2jQ3hYKBQMHnNOgcGBhAY55ySJtam1qaaWQlbGaM4hpCyOhKoRCYicawaUSkTtaxEqNXSpFYj6PixIya0j2pjikCqxuuRSASo28oq/CpdiCBkAL+hHb6nj7rXCKmqYQ6nndz+QaHO+4vktYaX/qj4nrkTVVHJrPMsEggK57cRgLiSaddgpbOnp7N3b6WWxOVyk2mWVCqDg339lf5a1TkFmBwEXuXmtxtRw0cOQN3Q7Zmx/tl+p3qv+GmHsvA8/LtFIhGgxJmQsMJoxNVSZXt17e3raztsE0Wf/rf/9/JXvKypKSY2/qcX8f9vh2GaAOAXh6Xp17781X/+/KeaxpaOOOfItmlxQomjDBCDkpA4kAETCYsaMaxKgKPg7wUguPyoMrMT543bMoWqi5HNmzV56cJ5k8aMaI+jplIcc6wOhkqAAk7hOHim1/nUcKrI1R2Ur9UISiFt3HUqBCVRJQcXblUVE7HLnDJZoj29tTsfXnXtdStX3b9tjBn1+cv+9eVnnVsi+qucAwSq1pkoevCRhz700X969MkHzrpo+Sted9bkqSPJpc45GAWTOKgQw7CSkrMkvkgGVNV4YTugfouI+vAWFoiHJriS+tYLgUOl7Lu4mrM99WhDoRssUCYmcaRQGP8OCSCAgyZpkjqXOmvzOV0wFGwAJvY8vkCJYJjUETESdk7FOZclKmnWFPOEjlETRrSWyDAMERkhalj3hAXoouJC9PfJW3KNr2rjnfTtAS/+r1NX6vvh/gtURcibOfvCX0XEirNOVMWpsr8OWFK1SeYGK9nu3sG9A301myCSZtMMZ9Ik6e0a6O0dTJxzxkBh/CMKdLwlApTz3oOEdbxDdP15DH964G9An+VP+ZNE4zyrRokICaBG2WRxsptW37xmcHui/XjHO97ysX/5aBxrxBFQ1/UW+Fth+K6ZZaClVHrjJZesemzNz3/78ycffPKAMYui5piMOmRWHYMjMqSB5nEkru7HoqSeNPUEKpOI5GObYgypGBWz7qndtYpdduC8mVPGSKZNBhFHUAs4Y8THBIIR3yAINb/kwz++WvRWYn4sWZnCmI/m9kFQJYGQhBFla4lJBIYxpq104tGLW1qbW/n+Ffdt+sePf6AcxxecdgbB/IU3ladklOmnP//5Rz79z+VS7z//6yXHn7aUIitp1THYGBBsZg0RxQbOOREoWJlARsWJKBkFk6+cNbRdNETMXCyr4fUmMgINvWz1g9nIc0Cuoc/DDkNzv0p/fPDrbSVzLhOX2sxZv/oTxKwQCEEteQlA3cEDsAoRMWBYVXVRzGTEGRLVnspgZGRES2tTFHxko9C8DW0aBQAm7/4k/huGhdH+FQz/
+beaKe9n+KODijrA+PwYTgCkAhWodVlmnSiEVOEE6sgJaX8l7avW+gaqA4PJoK2RQUtcMiZymevp6d3b051kmVOGGhJVqFXl8FRDA6N+8MoNRRpXiQ7NBEPTQP2KyB98vSXc+AaBycrbPOQlTsIambTJ9cpTD23s31EjR6+9+FXv/sd3l2MwGww5ShT422H4JgCP0aNHfuyfP/LkpicfWvtI29iW2YdNM63sNMjxUGc/A9ngl+xxnYj3NhL5PIyv5eC7AgpWRFt3dqqk1WTm3KlTRpajmLJSxICSeDoXCiGONIw9he8TKtMGFdRw2fVVsYQZAW/wjhAlFN4IgZidKCna4/JRSxePbO9oa7n7tt+vevdH3tMa//uZJ53G/9e6ytegpFpLs89/+fP//o0vH3DEgg995L1zFky06Bepgsj4c6VQxIaI1AEgQxQSGqCAYeM8EYz8oTdK+qBwQSCUc8or74XUmyH1sn9oQM3ZCxUJZFDmNHNZKtY6l4lzGuIpKO8y5IIuoF6Fk5KKEBE5EYgaitQqM3Ecq3VpmnUPCDGhzKUoItJMxYBUNVwcjZk+T9145S78Y8x5/7y5H5Kcp4Ygos7/FggVFlXrxIl13glVRUmFNHM2yZIkTQarSU9/MlhLUydCiOMSE5HDQM/A7q7ugUpiLZGJiQjsd2zljdj8vBmC95DY3vjj00yJ/mfKoP5u5FNsQcnlvwkbFVESIoqsSbvN9hU7dq7pRM0cd9Th73rvO8eM62Bi3yzB0wm+An99DF8KyMMXPrfde8eb3/a2zbvXH3nu0aPndWSUOBaQiL9JlfMZYG/DQCwNzVveImYCERyF7S/qvM+ndUzS0Vo+eMH8hbOnjCg3laNyOTIxC5MwRVAFG1+sciOQBQWL1kdrFMhXZAPsRPwOVwBQdQjNAT8loCQARI0xnFmbqW7e2nXjHx684ke3jy/P+u//+sUBc+aXoj97Skyhos5QtH3r9re//13X3vn7l1109JsvfcXYMSMECZFTciCoX7QcJPjsU6gXxYs/wXgneYFVdTTUCKERfLSR9vwHBGRC3qrbPvjf8iBRr2F9KsxUAXWimbOZdZmzTkRBVrUxbeWVpsReaRoma0O8Ch3XkKHATpwSDEMzB4Uia2tq7mhuaW9qKkURKQzIic3XVAWaSnPj5tzBH3nrPxS4UufZKQx8hckAIe+7JlDnxD9+UaeAg9Rsreay/ko6UK1WUttbqYqwiESmBCicpDXX0zPQ29+XWe9sayhcqGrEKDTMXzytnn9m9N/34/8TBdT4eL1waXyM8gwHWFITu7Kpxevv3b51xY5arz384KWf+bdPHn7oQVFMBFMwP383DPcEAEAAC/vDH/zwPf/0T6XR0dJTDx41szkxLkMixh/gIyW/PJLDviLHAMCiCpcbkkGJQnuYxY8SqITpGWs7mpsWzZ2xcOa0Me0jyyVTMlqKUDJ+QpjJ79UD8s4bAH8GkVw44jkEKFRATlRgvY2Mv1e9JZioQJRJmdk5FRE2EQiZ4217un7zq7t+88O7l81b/otv/WTK2NHRnzMp7BkrIqxcsfZtl136+MbVr3jzWa979dGjRpecAAYQFWs59+mB1qML+3jmeStVFThVUlWr6iB59Kf8B0lOQQRvgaAnGbrjS4MZRk6pExGUyL8gouIgiXVOxIp1zjlRH/0B8n2GuoQojBN7Ct+TTxjSb6GGMlWCEF/C6iB1hqg5jtubm1uamkomMkQqwnVxTx73FPm4rK+2cz6F8sfjtQWaR3+/XZqVBCLeflbEihPn53tdkqX9SW2gNjg4mFZtVlE4oogidQqntaqtDA7091UqNaeisSn5I5QKvHyZhX1Txbs6h9ezkVH37eE+MzzoPl/zbKnCv3T1Fo0qACavWWOWSCLtbela17nurs2De7MpY8d/+atfPvXUE5pK8AYqRfT/u6FIAKEo6x+sfOYzn/vat78xelr7stOXlcZqlRNnLBlvzwAjHCgfVd+8
9fILCSyEL+1s3sfzH3OOPa9jSBEbN3fG1CXzZ40d0dbRUiLVmKNShIgMqzKY8/GknP/MDYJyqzgh8tHBivOhQYI40A8FQ1XJubprlnUAhQZ1Jrp9d+13v7zz97+494SDz/zpt7/TUY7/dH8VUVGi66+/+e3veXtWtm962zkvPmf5yJGp1VQ59v0JEqsMztmPvNXJPnRC1SnUWyAElaQK5QEjf+FCnK7X+lQvpvMmS0P6GXoCoqRe2aniRKxzVlzinPhBKh94/NMILF0YJPbh2SuLlAH4cp1V2R/tiCACJnhmR/IKXsURQ0UI2hSVW5vKreWmUmQMMcQFkiofAwu0Va5/ryc2BCOgfCxA82QYBELqxFmoVZtZmzqXplnN2sFarZqklSSrpWmWqSPJfF2dps5qpb9WrdaSJEsTxxwxGSgJi09CpBpUtAphNA45Q+6Fp98bOejZP6l+XgVC0m+odD1NCar7KgqIYjVRwn3r7cM3rMAAjWwaedk/vf9Nl7wuNspBAbzvYyrwt8Rw7wEgb1e1t7a+9x/fs2nL1l9decXj9z654JgZpZFRatSKzQO8L+VCEFGg4TjQKJbYk/L5PcAMdk5916CWYc3GzYNJ/4HzZ08bN6G5XCIFE4i9QJoVauqGK1D25gW+pvZuYDnJweQNBzSs5YayshcG+UoLIKcCYjAlWcqRAevoMc2nvOTwvX0D1/366m9+/z/e/7ZLVa2h/+Ua8C4EDu5b//ndj3/q4+OnjHrHu99w8mkLyk1Vp45M5AeIKPApIQoAoZ3qGR5RgZJT50ekcmrLh48htWYe/cP0FAKhE96oeqr1YSUkYBUVJ86KpDazIkEk4wUteW2bsxLBRMKHZ0Dytez+gecckt8GQf6l9qE6lPFBhBqa1qwiqc2QKIBMopKJ/crH3LoOvtQeSmgPJa3qrs7+tdAgeYVCU3Gpy1KxaZYmma2lWS1La2k2WE2TLEs9gQakiXOpy6wd7O+3mSSpdVaYo9iUADg/EKzeTDt4miggXF9H/fR7Qeuhe0h5/8ywnItx6++JT9LIj06aH7F8VoeyEtSoQcLJ7mzd3U+4Ptds+E1vfO1r3/DqOAb5/lrB+/99USQAIL+Cx4wa+eEPfWjL1i13PnovlWnR0XOiDnVqwV6u7xSAMtTb0KtnjY2DD73CsEQajMfYH+WhGhkEjSPDkVm/dWeSunSuzpgwES0RlMTAsEYMA84PAGAKc0A+YNSP0pSTyGHXTG4nKWp9ZDEMdeJZBgfrHCRCqilElGTM2Ojs8w/fs3HnJz73sRlTprz8nAv/51dGAILLnPvAhz/0rcu/uejwme9+36uXLp1Tbhpwxqn4vVpQ55SViTSokzwX5We94LyjATRPnHXqJrz49SBIeeJAENj6GJqLZDQ0BzXM2ZIV58Iqc5c5Z/1IrKoSnPoRUp+g698gL1DVF+ZDzOVD9KW6OV8okMOkWm444ckmMiKu3uRNMqualKxtaQIr4sgYIDDe+a5EIce5J5E/1/ljkbeabag8RZy4TKWqrppWa0mWplmaZbXMpS6tJGkttaJIM0mtTZKkVstcYm2aicJaESLiSAX+BwurNyJkQX2wXEioEWYb19WQe6ExO7fvx4feMlT/NT/ChYZJ/mfOXx6BcQCMRpHG1U7Z8OD2vVv62svNl779zW9/99taykQNWq7A3xUFBdSAAgLcccedr7vkrZt2bVx62oHTl43TZrGwDpYMqyhJpAxA2DvHCIwSqwIiTBn5ydVGYRsGOz2Fwl6couLs+BFtB82bM2fqzJaWUrnEpZhKFBtoZBikpBr5+0/AFD7itf9hEaBKpqrQTNSpiHjzFxBgIL40E/iWtCpD4NI0IWOy1A3WsG7tzu988de8p3TjlTfPnTwrMs9KBAVT0p6+vZde+u7fXP+7g4+ee8m7Ljr08JllqsGIEoBIHZG3rWao1uUlEFEHJ4rMiqjk5xc/BpU/t7AbERKOTVBf/hPUT3D59rofcFNy
COFT/IJgkczZ1Fkr3vO4XsCH6M2EsDPM7+cK1HQQW+WplHzLQdgq4H1XFUz+P6+VhDoYQLySx2/0qQt4fGuEROMoiqOoHEWlOI6ImE04HYZ5Qh6i7lLVYNXGQYKpTsl3ep21SZb0ZbaaJWlmrdg0sUnmajatpalTSWq2VknT1GZZ5hzUkjrnD0NK/gUSJlaoBqkwWHzTJNef5VvqG8cr3cc2qXFL/LFb5Y9+VPNur1ECNFOjRI4QcRLHA9H6e3Y/9dDGqKYvOfusL/77l0aNaDNeBVeE/+cCRQJ4OhIn3//eD9/3T++vlipHnHXopEVjtJRllFoSJmKJAIVGua5NAkfij8jKIGI4P+jpFx2FEUsE1wbAACyu2tJcmjdz1uL5s8aMbI5Um6NyZGCAKCLDRKoRIvZfR6FAFlIBfKvTKZTUDyg5EVVYiKoYAoMVCJZnRkV9LxRW1MIlie1P9JG71//8K1cfPuuYK37867ZyyzN343oGZdveXW9665vvuvO25acc8Ia3n7/kwOnl2AplOSdm4Byz33LpKRXvHcZWJHOZqlqRBovjOaLgaIS68kdznj/PEGBv+s915wQVsFNx6vxCKy/ssWGbudaN5Oo8Efmf9nQ5IeUVfr6d0WcXwLEAMEoUDJ+c/wYqjgmAUfiQLaxGya99Rm7QoFBihiETGY6NiY0xxsTGQJSJvc01RKEMkjDgR54aYVGkYi1cLbO1JE2qaeaySiaZuExcmmW1Wpr4f0oy61yaZDa1zop1pEokXvzlwOQ8PxVeNFIg9DYk/OA6p1YfPKF9gwANOf38UTztH582AaAgMqosBDIVhRqNShq7rtKWR7ZvWbGj2pVeeM5Zn/7cpydPGGuYiVjzwb8Cf2cUFNDTERl+9Wsv2r5t8ye/+NlHb1kVtx4ydmYLG8ORimfZgZzI9a01/3UNW5icNQ0Sdg1/aLCcgMCUBhO7+sknara6cM70CaNHQk0MjhnixGhkDEOVlRhhaXyumvdtYG81Cb/0VSQv5ZQsFCQQEVEh61mFzDmxSGxaE1ut2QyYOH/MoacccM8Vd3z5K1/58Ac+ZPa9+Tz5sXHb1kvec8nt995x1nlHXvzWC2YvGc9aFRU2JI6C145hhYMSEWsYeVKBcxLsCjTng/PIXM8GGFpIBy4iiMfh1Dv3+KgiTtWpeC2/dWKdy1wI/SHoB8+wITpRNMaEkTdvck6aQic597IPAYxAoeIXIjhSVRcxq6jQkCVcqgALHDV+IPulJgJJrTgnNpLIuYw4YkOsjDB9l58+yA/0OohYl1lXzWpJltWszazLrKtlaZJpZm1mbZplg5U0zbI0s1lmnYhYJ85PguXXo0+c4ak1avn8POY7sHnoD699vfXin3hdVIVnBvinf2ToB/YhiQj5oUtFYRRABEJqODF7nujZ8OBmW7GHHbjsAx/+0JSJ43yGblCBBf7uKE4A+6D+WnR2d7/v/R/40U9+OGLumCPOOLh1QtnGaYYU5Bevc70NRr76pGA1ibo6sbH3O+8J+1zg5aEEZnIuU3GTx409eMGiaePHNMcmiiiO2LABNI7KkSKCRCQNMSJUFZnXBRJ5V8vMOfGrZCBK5OBrZc+9iIQ14FpJk0FrByoW4MRKpcte+5837lzZf93vbj5k4SJjcqMgBZzs2tt58aVvufm+G8582dFvvOScWTPGRpEVEjgyzF5eA1YEr84QeZx4eY9Ya506EDnNDZsRXokwzEaNFoCG0bkwQuu5ES95ciLWSSYuc5Ja66xzohL26MJLbAwREXPePR5iFaT1bLCPohQY8i/+DVMJzpRMwbEHnpYyRkRUEBN77s3l3W1+2kCCQplYxXnnV2ZiIsPGnwBi4ymg0LW2EtJYZjXNslqaptZm4jLn0symLrU1l2RZktkkSZPMWSfWioj6wTSIAH42nXJpkeR9WD8wqIFuCxek1jMwDXnMTydeniX6/2+3ytAuSqPtCwCOxJDGKElf3LO+d+1t62yPmz9j
1pe/9qXlRxxmDFMYuiii/3OGIgE8CxTqgC3btl188RtvvfvOyUsmHHTywvI4k3KmECVRMoCSmnz4iwLxEVyi8yKX6h1NqX9v7wdAxH4DmULFpiNb25YuWDhj6oSW5jIbjYhjw4bjiDkiFwMcBpYI3ipAnFM4dU5Z1CUq1llPRjBB/GIQp9Y5EScuTW2WCippOpDZWmZdKgLDWtr8yParvn/ThSdc+M0v/kdzXCKQCoiwc/u2N73lDXeveuDU84953TvPmT69LSYLOBCzxmLBsAAkSPxDVesDpIg4Veucp4PyZ05Djk3eeTh/jfLGhhKEICJCPoOpVUltljqXOJtmznd4QRxKfmICMbMBKLBFOeNDyDntPCDVtZdDfq6X54e3iyEQk3fcnRPDRkispgCBo6D44rD3i8OsGkJXWusLvCSfIgeTXwlEJorqnSEFEmt9RZ/YzGbiRFIrmbVWXZq5apZkqU1rWZpm1rosvLUKYYWKiL8CRAlKnJfQElrYhCDvD5V9nmL9i7NvoNUhb8s+H/8fYsLQDJr/2nh/8+8b3mhlZ8pZuX+jW3Xb6t6tAxM6xn/uC/963nkvjkuGC8n/8wBFAnh2+JLpznvvfvNbLl371Jo5R85dcPQsGgFl6zhzrIaYnKEgCfR6diXxMh7JD9iUW0PuM+vEpGHDn+dAWAGNxS2YN3vB/GltpVIpjiNDpBQbYwxFxjBRTCiR/1I/tQQrzjp1qhmpU+eLfgTHGJdZZ5211tmslmbZQJLVVPpraaqZOpckrsm0uIHokRtXrvzDY7/87q9POvKkUilipg2bN77jHW9/dNUDL375Ma99y/ljJpc5skSelWEIQ4jZL5pVVZhc2wPfnxBxEiaYAPIsTS7oCdJKVVH2RqB5gxuezVan6kefnLpMXOpsYm0q1lpxwb3NMDODvV0wgcKWTgDIp/LqYTBgSELI3986HRRiuH8H66c4gTFgQ2ADaCapWAiBDXnPTQwRsgaxKXmle/25MoUSFwqoUOpsmmXWucxa65xzcM5Z0cxlqbWZy2pplqQ2SbM0tWLVp28f/HMFj4pzfmFaUCKD/MN2DUpLNVx4hLy720h6z3atU96QqVNHf8Z90mgd14fdVEgNNFZjKu3VHdW1dzzZt32wzE0f++iHXvPqV7a0lIzXHRS1/3ONIgH8UQjgID/7yS/e+/4P9WZ9c4+aMXv5FG6RjJz4qTAxBCWJCFB2ef019AUdetoO4hT/NyJvb0+ZdcQCIlVHcFOnjj94/rwJI0aDHQiRAQmZyMSRKTGXCBExfGuZRMVlIpm1TqEEFhJI4pJE1FqXOZtlzma2llZrSTqQZhVns0xTmzi1mdiIm2Jq6dk8cOXXrjxy5rF/uPKGmPmO22591/vftXXHxjdfeuFLXnnSiAlNikTZESmzUVFyhkBKGSBKBgIfET2xI/k0loYFjGElShhW8Dy4kgKOXHhd4Hu8UCaBqCIMuBE5DX73mXPW+b1XIuKnIyi3evZDeTlJ1ojoYZNanYir0yCaH8zyPjAQ4raCvDpGSBGDnXXrt+8eMWLUxNGtmbqEMhVV7x3hRanKAmn8eB98BUzEzD7Jq7jM2cFammZZar0jBUQhFjazaZZZa5MsTTNXS7MsyzLnHf5zDwnNKRywiojzmwKgwYE5nDi9zKcuscoZH3+8yVmWIS2AodcoGlet4v8Uk0PFD09WKpEa1bKNdXfLo9c/3LVjQPr5rZe+5aMf/1BLkzFsQmOiiP7PNYom8B8FAQZ8wQUX7Ni1+zNf/MLGhza2j22fuHCkiaAMhQMscq15cB6rl05Bjhc6Bf4W9ZrzMDmsDIWSjYwIwTlrODIcb964J6vowtkzpk4c01SmLLUqkWbOGFMyJjZkiExkyPfW4M0L1FkrTlmZjVGhJK3V0sxaZzOXprY/rQ6kSSUT5xylTtUJkXVkKbWC5o7S7IVTH7n/kcv/8zubNzz1g5/+KG2uveNjLz/33GM6Ohim5r0ZGOSs
Y7CSBu06DHm2nsIT01xWmE++ksBPRWijw5u/QIzgqOTJaobnr1jZj75512qj8LlCrYgoWRHnJHXWiWRO8tczt8tuLFFj7z+WUzN1hqJua+k/2HhEQ2bYiARMTEqGS7ffv+LG6+/8xDvfvmjhVOU0NarMAiEHE5bEBbt/KPlpPmYG1ErY2ZIkaS1LK1WbWit+gsGp8x4VLktqaS2xfoBLBM6JEsAmN6cLhlAUTlfOv775AKLAj17k3RUMIbzq088Ir3L9+eajEPn1mnM69VfmTygKh9JHRPmP9o9ODGJjI9eLJ+5d27u91yR481sufs973tncFBlmbTSMCzzHKE4A/wsU6K9W//nj//LN732raULLQactGDG9ycVqxRIrE5HGAs+dB6KhHvEIRBDNlUJDUoT63bZCNlel+6UerCBV19pEC+fOmDF9QjmOoZEX1LNBySgrxVGpFEcRERMbJiayWZYkSZZkmUgWac0m1WparaU2tUmW9blsMMuyTCTLWCwTV1IRBoGMmpLEXY/23P7T+6IqJYP9U2aPe+W7LjjzpcePbFU2VnxHFiZUtw0HmMYAb+AdxNe7nqf3tL5XvyoFD7sGaSzB4x4h7tTzBteDyJDQLKSAgzqoCjvVzLnE2kSsE2d9A6Hee8/DHA+JYtQgOeqxp94crv9VFEowyEc5mIgM79xb/cilX2gHffrTl42aZAZKWWZYnEZap7eC1USwlBAlMqouTV0tS5MkTdK0lqZOjCpZkcw657I0TdNazVqbWucUzuU2T2BAvCMQoCREfleAekcjq6GgCMRSvXZvbBPwr2qebrVOcGn9xanH7MZF/ozmwL5/e7ZgTXlC9SMXngoVQgTEruS64+0Pb3787g1UwUvPP+dTn/23iVMnmIb6tMDzAkUC+N+hwPZde//pwx/48W9/NXpmx2FnHlIeSxkyx+oVmqzsx2CDvsVTyMgr4ry0yu/YeoQJN3LOnJIIiJiECY4iO2HiyLkzp49obyvFDFDmwpYCZBoZE5dMXIpjE5fiKGYiaFqzvZWBvsFakqWpS2q1pFqtVlI7CGctaQomgWYCscRpljGM0bictnSv2nvnz+/GgJ05Y/TFb3vpsRce2dJhYqNCGZC7WTaaG/X2X6Ah/JP1k8sKr5nJp36R214PITN8R0Rh6sHLvwIRON+OkkeJvMRWb7tN4d8ykSRzmbhqmlrfE/VZQBuxvh7hqKF49GGyHn9Cks7JEgGCxZwyQ8FMAEWIV9y++l8//p/nnXvGRW86NWkZpNYIjiJ1qc1ETMQRhEVEIE6t4chzVpVKbcDrN62fVDBOXJIl1Wo1S7PMOWfVOXXBOiRIJyn0+b3hHAUaR+DEWXGE4E1EWrdcGvLM6kkun+ENxX9OQO4beZ8eh+mPhvpnR547/SHYCQlDWE2UNce1+Ml7Nm1ZuSXpSV/y4rM/8clPzpgxJYpyYW+RAZ43KCig/x0ETJow5mMf/9CmrVvvuOfux0evn3fUtNIYVoLzdDCJupxUptxwII/1dWlEOBk0AiqQhybxYwSsTCTqlFkcb9u1t78yOGX8+MmTJrY3NYPZWUklUyeaCKowUWSYWCmOoqgUGSLLREypdZVaVkvSWuqS1GZwzoIsKVSATJ1vTTOZyDLVovVr1qVZdfaMSZde9ppjzzjcjFJFknuyN9qbnnSmIbV2oCfCE9dG/GmwChQif6NVmH/lM9gYCi8HBcUr8vgMEqgBJBcPRUQwhhTKUQLJxDamfD1tkvcB6hR3QwIafmT9AFA/HAR3NP8zyCgpMxmGLD1yybEnHnPTzQ+ecvIRLZMq4uKOkWMkdaVS2TpVayElZoIymyYRl2bJQKU2WE2SJM2sOqvOudTWsjSrZUmWZpkTJ0IwuT1P/lgMQeuy4sAh+naAX1VDQl5M6jcC6T6jDvs0nxoxVutv4ZAPhfcll0LlZ9Shx4L/CeRTDIXRFO94SEqCskau12x6dNPGRzZXutJjjjj0Qx/76IyZU6Kw
T6/g/Z9fKE4AfxIUsLB33XX/6990ycbO9XOOnrvgiFlRm6ZIhYQZEOODiGdjNac1GncnfAmWswVD/4l881H8mD4zrFMiIqMEjY2OGNE+ddK4sSPHlEola62IS7IkSVNxzrABqctEGXEpNqWIRQcGKoO1aqWSVm3qrKpAXajWHYtTRw4GJtKmFm1adcualXesOHDOjLde8trDTlzc3BYhSoWERIlln8tjCGdQZ29ypgfkN1lBnXjXT/8JJCr5sqwwOYFgqxnsN8N/5Ae/8ni1T+70aVIE4QQggFOkVtLM1pxLrc1E6jZvQ84VlPdFqfHAaZ8XP7wX5AfP8hOcP+aQITCTll15x9bs/33kc0cdtuh1b3jJDbf8oWVa+1HHHp1VUmHnXKbi3YHYqSZp0j9Y6euv1FKbOWed2kxSmwzWqja1WRbWb0mu1Hn6DZg/9kbyBFTgnAuK2vyqyl/h+hMd+v4MfSEx5Mt0yM/Ijz5K9RSRS6X+BPjOCxFgHIQ5IzVRFpuK2bmia80dj0nNHLn0sE9++l8OO2yZMSA8Y9a8wPMAxQngTwIBBtFRRy3/4mc+ffGllz51x1MdLW3TDhoTNxtL5CRjXz/6UFafTsXQsozye5bDP+QcLfIgBL9TGDCGxQlpRNDM2r3dvQODAyPbOseNHtvR3maiSJiFuJrWaukAOHxd1mdFVFKnThObeXmoOkSOSeHgQCxQVhOriVzMSXntg6tW3rl6yfxZH//4u+YdMEOjTKPU18DMftBUhx5onhZEGuV/Xlc3Bh5Ql5/k7AzqhFjjO4QoRvUGYv21QuMbqz9W1ed4Pe+gEbEYEyms10D6bzc0+ANoBP96TV3/98YfhzQHcg0NiAwIDsJWdPLUsRe+9JRf/PK/jzvqmANnHvX+f/14y4fHHHTIHCdOI4YLgp80Tfr7B/sHa7U0yzJJJUtqLknSmk2TLJPM5W9zMCca4oFUvxZCEA4dIw9psFiNyd/AyzUSwT4BH0/7S/3Eo0Ne5fytzT326rT+kLPTM74oaF3D8clLsgCNlKUa7X1892N3PUFZae60qZe9/x8PPWyZ8WLpQvPzvESRlv9UMFAiPvOMMy97xztjF62+Y/X21Z08EEcuIjEKIvLlf1h56gmNPA9QHl1yZgL1FgGg9dubiFgd4CRihnUiAEeKeDDRHXv7nti0Zc2GjZu27+zp609SlzodTLLunsHOvf09vZVKLR2o1AaqtcEktU6tI7WG1HihijKsOnVkJIrSuDlr2vzg5odvfeyYww/+f59816Jl01BK4pLzc7gM8i5z1CDV/SMcGv39Q84fuv8w5V/BUNZ8NPVpt35O0AyRsiPnYiiM9+Zf4wmP3L7Nc+IMYiJDFDH7YMp+I1seDPed+qJ9E8vQB0ND/+rnf8mf5DgfTgMpq+rgsS9aNmnquKv/cMOI9klzxy344me+2bm3s8nvdiMiImeTWq1WqSXVNKumaSVJBgZrfZXBvsGBajURJ0SGlBV+0ME5Pw0y9CE1Xte8tPdzg/n6+nARDWnA52xXmHv+E7BPmqAhvzY+Vn+T8z803gtVgm9TESk5EVASqS1lLaXBpu41PU8+sNlVdOaEqR/88Ide9KITDeuQG6DA8w5FAvjzUCrH77ns0rdf8kYd0JU3P75tTadWECGisHLJM9m5ApQa8RHwbbo8PgJD6lXypV4eB5mJVYUNE1gVIAZFzCa10tPbv33n7qee2rJx0/bOPb02U+eQpW5wMBkcqKU1myaSJjZNncvEWefbjFbFOlWAQZyhWdq3Pbb1vmvvmTN+yic/+r4FS6YJW+JURCgcYYLRQr2zGuJCnfwf8poEpkLz8B+Gsxrx+pn3vq9th/xLyAchztUZ6SGfH2pnMJM/8xATmOAdF5jyzJP/vPDThwj+kX/TIf9e/7f8BOM/JwRhyvWrVtg2dzSd88oXP7n9qcceW/3yF5/Xuxk//8GN6qJIWFUTl/XXKv0D1cFqWqmmlWo6MFjr66tU
q4nzLp3C8CMDQJgE8KudKThP+Esk/JpLpLz16z5pKuip8isnl1c1wvQfR734GOLe9oxg/2x5RILPkHfBCCyb+nY0NFbmgbh/Q/+aO9b270xGtYy45K2XvOSC80ysTKbgmJ/PKBLAnwcGWqPmD3/kw//w0le4/mzVnY/tfLKbanEkhsXbGlN+aoc2LG+0Ee69gVeDykAkMAoD8lI6Yh81GWDDpE6cs7AKNQBDjRPOrFarSf9Atburr9KfpjVxGWyqWSqiUDKiDAULq6oja9WqCAsbG48wHbvW7b796rvHtJc++9n3TZzZRmwNW0BVvIG085nA8w20T2zcJz7UCRffmoX3xt6nnkUgkYZElnqi89+4vtQyhDyEfWFDdgWHmt7HSYbvmFNwHVYlnxK0ngFQ3/f1LG/h09JCo9Xg9UcN0ZBPhKqqpI5Tp+nByxYtPWzmzXdcO3rM5EMXLbv+t/fff9dqdrGmbrBS6R2odVcGu/sH+voH+3orAwM1P7zmtzN4D1fvBELeDhS8DyeFehAn+BhdT0dEjZjfeEPoaVF7n3dmnz8PPb4RwN6UCmoCfbPvq/MsL12uR4ISEXkfUxgL1cg1ldOWwS3V+657UAdkZKn5be9468VveF1klCiqP9wCz08UCeDPg78Rx4wY86lP/uuZJ5+edqerblq9d30vW8NqWFmcqxehjfGnenU6tILOb0eiuhgRgXFXIvGOkS4ijlCKyJBTWKgF/n/23jvekqu4E/9Wne4bXpycR6M4yhrlBEISGUnkHAw2xgFwAmy82NjGgAP22us1Tmt71/7B2gavyZiMEMpxpNFIM5Im5/RmXr6h+5yq3x/nnO6+b0ZCxlgSmlcf6c19993b4XT3t6q+lYRIjDhSBasJWKC+zxxEKRcrQtbBilNRB1+jwGy5YRvdA+1bv3SLtvIP//rPn3PhKZQqG4GCYzlWwaVU4KUKEeW7EZkKDAUiunqYLYCYys0Vln6kv8q/UQH2lQh64Usg2ul+JcOMGRBijUK5j5mUU/EvVTYSkJWqfwjRUyVSIg0VaSAViFPX4OSGV71srD2+fefm177spX2u8U9/98WxsU7WpfGJ7uGpyZHJ6SNTU2OtqXae5+qrthgmxMAr1JZXAsUlnyGiKqJOQ9zDE/SCSJqFDVTO91gnfawFeJw3e5y7sNUZAYTY0Zw0Nv9ThdSINTMTe6bWfW+9mzSJ6Ote86qf+7mfa/alzAn1bGNWnokyqwD+w+I56GWLF/3Z//gfL7ji6s5I64Eb1x3ZMZXaeqopg5zCUZgiq6HQK4YZCSFZ3hvGSj7DXUAW6kBQw0oGMIXGUGZBbPVLTMQqEDEg31hSRUQdxPnW874bpfpJMgbW5RAFMSNtah0Tyfe/cEdnbOpn3vmK51//HKROqOvbicI3u/bHBUR81hKkZ8JIZIYoRBliezTvx4S/MhOxx1v2hAeDfMmT30bIGELElcgmFXx4RDoJQ3N9zZ2HbtaKi+E5uABVxzhgqhrKPdyHllY3CvfBM/DQMGeFUljbnr9k/uUvvPimu28antf34kueu2X9oX/6py/sn+jsOXRk58FDB8eOTHdafjaZdwKJfMNuxCB09C7iQSlmwiT5Fv/iF0qhGob9lOmt4QR6CLlYd1fxt7TcSfW1Vt+c8bEZEugmUiJJSGuqZMWpyRiuZptpu9HZY9ff+tjUyGRfrf7KV73iv/3Wb8ybO8iI09hmdcAzW2YVwA8j/oFbtWrlX/71J6+6/Ir2/qkHvv3Q4R3jnIPUxJQWDUl+BCVwaQ370mDy8wg12LEg+EkhIRzgpwVTSC4SEBXzC0MtqM9XJAaIlaOJJgRWR8xkRcQhMUzKcGpyk+a1B2+8f9cje69/yYXv+vk31+ckllq+i4zClVZ5hbaP6fTRWJ1pJ5ZuTKS3gGhec/kfc4iQh0wef9i93HaFn4/brx5S4WkoYuUXVL1OQKF+IsOGo9GH4oaqTknxezzwEl8JvrWE9wMUvoUG
FBc/7yLXnz/wyL3XPO/KlQNLvvvF++9d+9C+scOtTsc6ESUBhKGsobA57M3PmdEK846YHVtZbZ/eo9GtEY/+VYas+E5cGEUMOvWcYGUJopMVSaSC5KGeBZ9xiWOKUPABVVhVlVhVJAVzO7EH3Kbbth/Z2mrWamevXv0bv/XbixfPZ2KNWv2J3JJZeQbIrAL4IYVADD3lxJP//JOfPP+c86e2HXrw+w+P7W6lWS11qRFWja0aPU4Vz2IkmgsaWsIgLW8eCuAbbvoYZBHcREjYLrI+YoolFUyMT2JXQ8SANSSsFo4YaZOadTu07ubHNt738OUXnfjhj/5S//xajpaQ07AfxCycHtTUmK7aa8mVZE602LXoTMAgPzDBj3L0QwmJQvdOH7alqFaiwimRQov9oogB+F34McCll+CxUaUgg0pFdBSu/UDRarCatHRAvGcg6luwqYpbMH/O819x5Z3rbp3oTF96wXPcAb7v2+t0gjljsio+d8q3to7uFAV3oiDrywKL6HUEV6rgg0Aqvr9GL8dWKIfo65RKy6s+mom61PtvxfcIP6tfqn47qkxiMCvDqVWTAWpcmnTrmDAP3b5xdNdog2trzj7vf/71X65auTy0+ZzF/R8TmVUAP7x4eD/7zLP+4n/++dJlqya2HF737Ycm904mlowwg0RcsNDiHEIGKJAjfgtVeyu8WaBi4Iy9KxBNyGCfxyCDz9eJOAwAqo4UIiCQIUOgRLiR1fc/tO/hO9adsnL4w7/zC8PL+yXN4CcBUzAxPU199BnGw5qhGOJBRmJCe7/HFCz/HleAS+Y9muOleqlapJEIKv5Y7J8AiIgVyX2iUzkS8hhUxgzw69lXz2eo8kdooUmC7exdMSjUkbLigvPXnHrOqf/2tX81jWR+38DW+/a3R6whX7ILVacaeZkKYkdIjdkAcfGCIxhoLA3j4WMKUOUwC5WJHj+m6DY186SK468cwMwVOGolgOgJVUPSAIQIAklAidTchGy5d8ehbRPG4eRlyz/y8d8768wzTcK+nVPpYc3KM1tmFcB/ShhkwJdfevlf/PmfDw8vGN8+8sgdW1r720mWkONI1DApcUgp17LoaCZFG7kB9U0eA5iEagKvDTRAfhyjFz1tVQCsREpMpI58WwNSk6ppuMGDm8fv+vZtS+cN/MaH33vqBSdYyp1adapOPPxTsJ6PMvRjRPdosqYEsohe8QX8iBfW6AdQNTIcabCg/7TcVzyhMtIZE4E0KFE/cwYWyFV8pqvrdQB6aJyjjrvXmK7uuSQtNP6IfkAMjACqKlARmxC97V1vG1zUf+u9X5u3eBBjnZ33705dwkIAM5lA9pB6ZygUKAe9EpVAOQEzQL+Ii7PRfL5QgfdaaITSVj+KXq8mgh6Le6fwraBji51XTr28GkrRBoGSiIAygq1JPe2mPFbbvX7/9od21Kl+0ooVf/Anf3DFpZdAZbbc98dOZi/Yf1YYRMDLX37DH/3hH/b1De57bN9DtzwyfbBtHBs1oT4g0jZaPP8hBcSbw8zRqi8c/ZjtXbVMtYDPXhgown+igBPys1YIgEMf9U3snLz9azcPNeTXPviui55/nksyQQ4mVQnp9BrxqKqYtPRQAhz2CsWwrf90cExicicVcBKT3AtHBihSQItdROa7yIwJhxDQX8owrzqoFWdd+C9OiETcoWImwleO+Vi4eDQ5gsiFxWQkJpD6OfbkrKpLMuqX93z4p1/9M9ddePUZA3OTnRt2tEdsDanx4A1lgJRjSn9InPen4SPhgQbzkXxPo5WEWTDYCSGiVCjX6FAcdbw9gWWU2cZa+UxxZbX3VwrtSyopXgi0HgGqfs5Rokhy0+w0d99/aNNdO43jPk5/8yMffv611yYJGzZFIces/LjIbCuIH4EwoKCfeOtPTE9PfOT3Prbvsf31vvrpzzm1NidxRpRFyBkKVIsGpSuAeEc74B+0bNqL0B2B4NMFC0OcSps5Ouj+a/AD5wXGM88ENlzT
wcO7p2//8vdSyd/3wZ+84sXnaiN37AwlHB9VlZBJGZFm5gN8DEOz/BmQQ70aI1U4769U6BuKxAfiV4CQIuuPgeJ+ffv+OF/Ln52GzwkcMavCilonuXW5c1ZDNW085LCkMw53Bu19lARyJLRp0oCIXKgxjaehSkJgsSRMndoCft7rLnXTVBvif/6Hbzx2584LX3yu9k13kKsSF+xcqc7C5mK4JtQxqPhKbQPRcCViEZr4aEBwBQo2jqpqsjiLSFpRtSNScD8qSVUz6TBU6alQR1FcK1UlElUklNRcmrYHtj+447F7HmFrFsyZ84k//eMbXnY9sVLopDSL/T9mMusB/GjEAHVjfv7nfv7d73x3nfv3PLpv812bZcolMP5ZVhFQ6AMU/QGq/ESRih6BT6AivpG+VnjkmAIECsMUAYBC93hfR+Ztz4ZN7CF75zdupWzqve998/Nf/pykX4Vy9kHiYFFr3GmwKwsCpTTWQdFB8QY+ozxmUKiXKs6IQySymPoe8l5KdPDMT2n9R/yNnwnLoqqiEqx+FQWsk9xZKzZ3zoo4VRfSgYByRXUGLh5DjvkJKvGw+Fx4wxdtAVBWIcTO1KpiGpoOyMve8vzTzljx2P1bjmybbmjdUMqAig1uGYmUVxYF1RbDNgF3VRygRFy4huWVQcEBFi/C3VJm7MZrFI97hhovIsSFB1D5TC91FDQVk6eclNQQG2eSTrr7wUPrvr+RcgzV+j78kd9++Q2vSGrGsDlq6Wblx0NmFcCPTBioc+2DH/z1n/7Jd6SS7tm4b9/Gg9yhmqYMVkIOEQYTGfXjYlkRsoO8u80h4Roexv1jLM6FGlIVwMHXGBCFrHefTkqJumiLq7KiKf1uNL31yzfaifGfffdrb3jjNaapAmtMAivwXDMFaxOosiBRCxR4H3/GhH5wUApMZILmYoBE4XxqqidqHKnA1/UWdEPMhWWWaBCX9LMW/wUNIkxCcIAvc3DQXNFx0nGuI66rzkIdVEjFFyIUiqWg0o4GpcehiCqgqUoihLDZ8GcJDFysDPANVp1Yl+T1efr2d7+8Uac7vnu3narVXIOQ+KGdYCZNWH2EpjCSqUz38VV6EoLlBVOkCh/d9gMWAs6zR3KfaxqTjONCFusXoxYhkQiikdISP77aa4kyplwEG8jrcgZIRJUtIImamqbUru3fNLLu9rWp8PBA/8f+6Pfe+IY3JqyGZjHkx1hmL96PUgiYM9D/8d/96Ote8Zps0m65f9uuDXupqwmxHzDlk/YVIn6sOyMWSZVOgS+YKoMAVFQDRGufVDx8Q4hUVcUpGfa5NWSpDw13yN3yhe/l46O/8PNvfe1br0/7Ich9BndpeZd1SYWU2Zw+pZQiRkc/pOR/KND7ARUrNIL/Y8h7j4hXBjM8uBVMhFbQJ3gi5R7VqToNxn4uNhObqctVrB87U1rzlXgEyndn8kHFMRzbTaim2Wg4hfCaoD45lwH2A+dVRB1U1ZE78/LVr3vLiyYPj2xdu6lm2TCB2BCROFUHYpLCKwocWOTOIteHXl0sPqajkZ2aEQWofPQoHe4POdYz++tSKdSITkWhiop4RLmaGphNAxjLaSc5uGHs3m+vt62shuS3PvKRN77hjY1GatiIHMPfmJUfF5lVAD9KIYChQ32Dv/exP3zDa147daS18Y5NB3eMUkapmEQZDg5WjYbKrch2BMAEQu2X1xdMlfzwgLcFFSQAmBwJSJl9HS8Mc537pg/L97/wne6R0ff94lte/YZrkz6ybMn4giahas5ggec9OZfV172nV/BDhSHrKxiouslqAmdglgoaPfyMiiHia4VS8tglKqJO1PqMT9XcqXWaW3E2tNUpVFWxE4931Ujvsc39+O6xtIASqLrNqApR0CMCUQjIeRsdCrVgQ694ywvOP/+Eh+980I5I3SXkr7ELtna5sTDtsTKogDzbXkl/kmigRwouEjgR+Gd4OhRuICqcIKVQW1JcPSXf/Kc865hWXOyIiJVI
oGqcqjNqalJLOo29Gw49fNvD2qKhxtAn/uQTb3vzW2sp+8gwVa/9rPy4yawC+JELMfGyRQv++BN/fPn5V7T2tx++bdPejfs401QMgYVUyHEYouoJFo4/TexsWebOF5nzQCS6hQzYZ6X4VMLoQmjNcXskv/mrN2t76n2/8o7r3/BiHtCccnCglSgWD5e2o3oKP0BxNEXLlMWodmZazooq0kvxMtYj9C5KBXG1YEKKzxS5nhGbBSoQF5P9c5Wuc13nMiee/Q9J9MUGKukrvfvFMVVA5a8zZUYubJh4XCniihqYfTs6Ayioa/OBBbWfeu8bnM0fvmujsWkSSDwwKyQXEfHpPwjuisYXcdWiggnd8MK6EVDmBQU1Uro9Pdk+/ijFx0YKb63C+xfnUJ5ixUcqfQcAasDGMbVoZNPYg7dsyCbzwaTvt3/nd9721rfXGjVjkqofMSs/pjKrAH6UUvjlDFqycOmf/NGfnnTCSeO7xzbfu2t81zR11eSAkIiq+DRBZd/wK9TiYkahMEXDrDC0CEiUjXCClFzCIFUhaEJa12ZnRG/58u3Syt//qz91/euu1YbrwpIxEKg4+O40CCko0QIPqqiKlSET5Jj/zTjbYLFqIREXZgQYfV581FbEoBI7FIh5jj6ZVUUlzFIUtaK5Rddp5pCrOG/+Fzx29bhm2P8/8FJp9ddqGCQ2gtaYIYsIjMGU5gjHGvqmssu1e+qa017x2hc9un7D5PYp02oaTZSQuxwAwaAk/mMMt0wR9YehRVAoKkt/hlVd1/Mi6NFAnIUVDDeR+kgBEHpJaQ99hLjHUAlOpBAIyAHOKKeamHZtdOvEQ7dvoHYyt2/wY3/w0Xf+5E8lRhM2T2qRZ+UZL7MK4EcpFfAhBl1ywQV/99d/Pbd/7uiOsU1rd4zuPsI5JZowDMQxA0RCCvJcrwoVTeKCOVyYah7jYkMhVYLzOaMsbKBOUk3aB/Ob//02abc+8Cs/++JXX631LEfOJlBMKuLJF/FjXmLEoQBiLRj4o6zgGWeJCmAWwIWSwwggXskoQgx0gIiYTElvxYSiCMhFSSz5k3dQKyLinIhV0VAn64mZqgoCKv88Gelhriv+SLSbC3yN74YjIo20uaoSjDqCCEMz2Ny0X/2Oly1btfDmb9zRPWxT+H7IRU2XP3aKtQGowDHFCjCUJH3cdcwULjRdcfm4gv5Q9SWExaZLrw4zFUj8SZWDCN38QIAR1GwyuavzwE0Pjx+Y5sz89kd++x1vf3ujmRpO4jV6EiMIZuWZLbMK4EcqFZOWQQR93lVX/9X/+OS8/jn7Hjuwbe3e7kiHLZEAgIgKq7IHy0C5xCewIOpLugYI/LEY3wDIKZzH1Rqa04fszV+9LZvufOg3fuYFLz9fGllet5oQgWCFnItxRj9kRo865IDS/ggqaKGRj6gk7FS/AniGokryA2WCqjf5maovEFlq3/BMZ8BakTKkChXx3oALtbDhCIqy2qJ4ye96hoH7RNIbNp7xF4oJNZW115i0H8CcYo8PIvbWdoasfyG99Z3XTU9ObX94s7aIc2MoJZDAKnvvQTUGsMMA+ML9Crk7oQBEK3mf5aWqsHfxhgjVZPDxHYlVaGWVYLhKlchBmVJMIXSiyqIQBmpqaq5vbPvU3d+8MxvL5/UP//f/+UfveOvbU0OhzSeCLn+SKz0rz1iZVQD/JVLYbobMa1/z2v/+8T9saPPQptFH79nePjiVSmJgPLCKOigITKpMZYw3bkU1NgdAbCstJGIERgwRuaTWTfNRvefGe/PpqV/9tZ+9+rqLqZZZOGJv+zNisj9iugeVxnmJ9lQa/jGbvPRAHvdEKRRLaYSbeOYzKaCAXN5XIMCPxS1DiMFa7TF1fd5plV/ytnfV5kcBoJWD1fh37Xn7cS5T+Z3qb947Oeq7lXQtViY1PtYqTuCEmQjusmsvvPKq8zdv2DK+YyLpMsN4mFVnEed7URH/jwfpT6HM+ew5lkobqGo8QFChjGJu
QKSDgm4t1yGeUBmm13hdfMKrssIo1Ww6ub1z57/fnY3bJhq/8eEPveXNb6k1ksD7V67trPy4y6wC+C8UH29Nid/81rf9t1/+ADru4OZDW9fuyickQcLKChAbUoKI54gR+Fjfm98bar4ylgkJaWI09c3QmNQQ1fKmG6vf8fV7p0am3/9r73zR9Rcpd5UlSRK1voFmBuMcIdZMBTTRgiKoWITe6AYVGSjQHvwM7FBZ3uqLTFEBckLZqCiy2JF5LnkNIk2IEja+UzTHiY6hb45nISh0yGAwKYkfokARH0N6VKktKoqsVMBPLFr9zLFBjXpea9U7QuHokIKZAIizCm00kjf/5A39DbPhrvWNrM9Y45xqtN0Z8GfEIb0nrKkTJxrUW+wNASBmcoIJxEoch9WEGRC+VsAh+JAUisqr517G6lU09iWk0OGOnIqQBYSVDBJktYk9nXu+dbcb1znNgY987Ld/5qd+upaooVne/1koswrgv1wUSJPkfR/4lXe9/Z1uWg5uGtm3aZ+d7hphBosoSIkrJHSFjI5FOwBJTAMHKSXMIpoI2XHc+a07pw6N/PJ73vmS664Et4RyGLVivcUngEeJOE2gsCHLHcUqX//nCttQtoKuYHfg88tDJRAXUWTETVb+HrUJxa/HQDCRARkfD1AgtkqiUqmQQWgwGZOECtqpJGiK0EvMaOw5AHocaC8P8PH1xFHeQ0E4eV5eg97yXIqGwLGV7spTF7/qddceOHRo39YD1IWhhBAhn+AJm3gC3nKPfZ8UoQ2sV4aR6w978TUgDr4GOvQTESqZfF8CViSz9pyJ+mTTcrgY+X5QIAUrEjF1m0ztzu74yt1juyfr0nzf+9730z/1082+mqEkhqMfT1POyo+lzCqAp0IY2t9o/tZHfuflL7nejnV3rt05tXfa2ITFQOCfaEuiHHnZQJEXProSlEnAVtQxMVPCarotufO7t4/s2fdLP//G62+4xNRypNGlcEKq6respfaIbEOVVQ7IUcmrL2iZGTROJRgRvhg1SYUw95jvTdGy+V0lZhjL3bx5H3uFVlVKJIkohohLGqkC2BTDCVWGfCY80RNRQE9SKopZCycgqshy/Xyxlqhaspq0X/Sq55166tL1ax+UMTS6tUQTOBWxcUiAqC/r1XLSQXSriguiZWWDnxrhpOjhWgQjfJyg+Epg2XoOnjQ0X4oqBxQKlcmpOgZSmLTbGN/auvfr93UO6dyBef/tQ7/23p//hVpqOPb5mUX+Z5/MKoD/cgn2rOrcuXP/9E/+9IKzzps+ML35ru2T+yeRewRVgRV21k8fRMDVYHBpeHj9htgQFMYqOrV1t60f2b3vp3/yda98/Quo2bXGwhARQTiwPB4ofO68WN9aRwu+XAMXXSQFRSj1zgNVQbhq0lfOLLD/RMTFEEhQbGWG8p2Cko8h5UCj+/kw0SWQuPtAGxVx0Miaa3XvYZMROGcQH0cf7xPIEzkBR/25zHcqrxIoJPt4LM9NVp+X/sTPviGz3cfWbuY2pZyADTOrcwDYsAjEl26FthgUWgxJeE1KKmUltFhV53tfQCP7X9HJxSEV/SvKoDnFjnJlh6YYR2eQETJdOrx1/K5v3NvalzXQ/5Nve/t73vveZn8jSVKUemlWnm0yqwD+yyW4zsSGzPJlK/7gD/5o+fxlI5sPbFm7szPWIUuszGr80xhiqRKQQOEQI3wOSlADSl0dU7WHb9q8a8OhN77+pW95x3Wu3s2NJZOI9fyxU4IQK0j8AEUnuRXrG8+UdniJ/hHEIuTPAFst/IMg1a0AMbexh1gqPgjVnjKx6DNEQiuqF6Xwv4QzFytwqqIikCItk0rl1Gv7V7Lle462MNIf7wJVPzZTZkYTojtTiXYXNnv8iKg4FTH5GRedePU15+7YtG18T8tOk1iCMIFJBE6gJNaJEyq0XLxgvtYYDijQ34kvgIvaothb2dihYMeodJSCRldVUibloFegCgdyBpQoUzvdu+HQ3d9e
m43QnIHhn/uZd/76h/5bf18j5vvPYv+zVmYVwH+9VPJcFHrlVc/91V/5AGt6cPPB3et3yXROghibI8R0cEHs6U8KhkBDdNSBO7WNdz66Zf3WF1979Tvf8wapT1mTU0qi6hu4EEEQWrMp1DrJxebinK8xLazaOGkeBdVS5nGWJHr1l5lnVpBIkXCOXY38Nv1Uc40hg+BYoAyk+iRHAsAU8iMFcOqcOitixeXihHxVQTS0iRi+eJqrvIRWfs6QMhvmceQHaofiVYUcK92OwIX5pRIwQWEcJGnIK9/44r7Bxn233z890qmpgQgzASQi1ubiSXwHqO96SnAg34jOqTgVC7UqceiBSmR5IrWmRRe86kWKvSuU4jxhEp9N5i8VgYjUKBmHtIsd6w7c8411nf0uxcArr7/+1z70/gUL5hg2RS7ArDxbZVYB/JdLb2yUGOZtP/lTb3/bO6jjDqzfN759SjNiB7Kh+Zf6mcAGIDVKJEasMpQILCln9Q13bNzy0LbrX3zZL77/9TyQ2YZoyqSM3CUaUvJJDITFeeMZDpQ517XOOhfol5hfFBPqfWy4zDxCAOuiT12pEgqmOb6g+FkAvnRBY6JRZdhN6TaEoK+GlqLECJjlRKxzubXdPM9snovNxVr11bbkN+a/U7TRi90yenLlZ4BWZNUf9wKVn67EGB5fp3h9pATh0Ie0cIIIDOecU+tIhPJFJyx449te1m6N73loO0+mqdTEqhMREVaGQpRVQ+8jdSJO1HnQV/WknRAckWMWQ0JQVjX+kmmldx+VAQNijb1klUiUQKTsu7h6I0Ihhkzqamayb+Mt29d/90HTag72D7/1ba//+O9/bN7wEM+MAs3Ks1NmFcBTJGX0DWg2mh/+rd+65LyLOyPdnffsHt85aXI2ltQG0lfIScjMVlY1SqRkFImt73l0x5aHHr3ysgt/5Vd/sn8uHDuTGIJRIQMGICJaIcglps9n4jo2y52LoUx/QEVg9qhIqgaDG1qU8RY4HqKfWhA50WmIiMwMjtAcin4JMf8nkkVFHqePkYiqEnJnM+uyPM+czZ11ImHATRy+VpjyCgl0UJnX2BOKnrH48byfwBXQx3k9Q6hove3D1RrXEkKkjNA6W3PnYOTaG6689Ioztj26fWJPizJASK26zIXlc+J5fxWNTcFDDmjVX4tBAo4dWn3fN6/DfZVBqFQOjKES+bwBEGslws5C5FIQO9Wp7vqbH3n0jq2mXW+mgy96wTW/9lsfWLBofpqYMOBlFv6f7TKrAJ4iKWECxITFi5Z+7Pf+6MQVq8Z2H9l08+apXZ0GagaJOg5zpEAKdk6VckNKjuu2cXjzoXW3PXjJBWf+6q+/MxlWYUmTmuakThMiIperKAyibR6IIyIHVUJXbMtmmQiYNcZqA/FMDuxZZ0WgIWIlWOChCtK9yOuHBMDx1UeWyLE6VjFAAjWqBhpsdo/8IcQpZTRTRMUpBAxloyaxRF1oR6UrkgOOVVlAagpfIkab4+wUxLBnmekU1cRMAqei+7S4Lr1XqWTVj5ZobfsQLDninDhnOBahUJUbO12LiCMml+RmKH/5G5/fbKYP3bNep2C6CZNhk2Q2EyeJGHIAmHyNQ1AGRdVD2cUHgf1XFRGIkFSLqkFQ1kIRR6onAYzzKw0BO4Imkqa21jrUXfvd+7fetzW1/YNDg2998+t+/xMfO2HJ4nRmRu+sPJtlVgE8dVLRASDg0ksv+cM/+qOFwwumdkzvXLvj4JYD6AoLkRAJFOLUgVlFrOs2qNE6NHXnd+48beXJv/GhXxpclDjucgJnLcBMLCrKyh6ffSp4aF0QYFGgVlzbdru269QG21+LRjGhpism3gDR4i4O3v8T7MtAIFHBBcVCsB6bv1ocHN/jwlVAzKcJDAqBiBOTGjYgCrVREoaXx3Z5paNC0JCWT4WJW5zGE2PYk/iM/iAQpMqHNG6yiHWElRJVEbYnnn/Cda95/uHR0b0b9icdJiERNeAEBHEi
8A1Oo3MVXIvoJcWeR77yNyRPFQ6YCHx8AKqicAInECUoq7BzsEhIWcmAFakwtXls2+gD31y3d/0R6tb7k4E3vfq1H/6tD5588qoacygW1sfRgbPy7JJZBfCUiscLX0vPoBe9+GUf+Z2Pzmk2R7fv3b9+d7avRblRy2rVU8JKTsk0zEA+obd8444l8wc+9Os/O7y4IcZxAkDJdxX1lcMa5gYAkJAWEiz3or1Q5mw773addT7hqDCci3CwDzr76qCSOK9k1ACxM0WAvZhoUmF0ik0VWYe+w088f18moD7eCYgQAFbUiOpJ0lerN9N6aowhYvUzrDiMYynM0wLxi5qxGGuOKfFPLI8fEyg/cmxXwPsfrODQdDPCfqzkqkRUREScOmJ96WuvOuP0hY+u25gdcSYzkgNi4MjBOnIhpdN38hEfZ2EKXL8q1JE4EmGncKRCEoj+0A1OldT5rC9GzJISUXEgK5yLccpKrqbj9T0Pj6y7aeORx8aR1xfMX/S+9/3ix3/vo4sXLjJQgFVj5GdWjgOZVQBPgwSYVK2Z5E1vftPHfud3a9300IaDe9funtgxqpkyDJw3MW3KnOS89nvr8mn55V98xynnLrMmp0SZDIG96e5ZAZ+EHguEpFIqqwCYmJisSDvLOnlm/Yj6mAjks5AqR1hY29HGjaQ/xVY04ZMFQRSTi4pOERoIn2hRKmlki8QnIQIxqV1UlUGsqJMZTBpDjf7Bel9/Wq8xs1Q8hXhCFI67fAfVVNRjrfmMK/DEFcKlVHQAVd+lGIgFevwFv2hCvusPSEUgkKF5zbe96zUW7pH7H9FxbmgdQk6gBA1zZQAF+xI4H533QYWiSJgE3i0LTlHBx2lZnxbEKfyEYYSoscK0kO93W27b+sjNG8e2T6BTO2npif/jz/74V3/jA0Pz55okFnzNIv/xJMnTfQDHsRAztJ7WX/+Wt+/edfBv//bPDm8aabU7J6er+1bWQZwI102SZn1bHty5b/uet7/tVRddvaarXSQ5AFHh0OjdQw5JKB72iZWBD9cCKIgIrCxdsVNZN00SkzIIXHZXAGZgXHwRYp2R/i8JnxBs8FFRoeBqRE4nNoEIdInvQqDwBQ5Ww9grlHEGJcAAYLAxNdPoGNNWalvr1CkEYZh9KJPrIXECvxb3+CNrWBZ9lvJ3QrXRQsgH9ZpR49HFWKz6dj8QciLdMy889ZWvvfbz/3zjkkVLF61eKn2aIc9VDce2bqIQKIcVhCj8OSspg5QB3wDa009clg0SWIGocmI/URVisjWXiUxmk7tGDzxycGTX4ayVk+XhoeHf/b3fee3rXl5jFueY2R/trBxXMqsAnjaJzIk0Bvp+/aMfWrZizkc//rHRHWOb0y3LuguXnryoyQ1jaWoke/jOjReee+Zr3nAd1eGoQ0wiGq01qTDaAa1i+WeYPRh2p2AiAlvotOumuUmYGyYJyBXhS4sUdwqxaFAs3iqoldBNIqI/EP+lkhmKWCwa/AyNoWlVOPWIHoucA9+tvkaJCMZ3TDMppQBsK+84iCLOntQyIF0Q5+HY/LISxRBFr53fU4XwpKVnK1pcvMp1rLxT9MwBkUJECY4NxFhu8Itf99y1dz24fu1Dz5m7IF1OecJ+mX2pFoViD1ZyJKWXVeg1IvXlIUTwU9iK+ggVB9+STsgQkSMSshnyiWx079jBrYcObj/g2tblwjAM7RtKzr3ojIQI4owx5e0zK8eTzGr8p1uIDcFw8vqf+qn3vO+DjObuh/Ztv23nxKapWjvNDts7b7xjIDU/9bNvGF7U55DB+F6dLqCqsipHBqaoJwXKB9oHDX03MTAzDGcik1k2nWeZc5G5iV8JtUJl6kmoSwuGetn/Mn6k6ANUBogRtxjYHlEralWtaq7IVS3USWX6FQEMJRL2A9NCaiSD6kla44RjP7QC5GNZGGL8GkXTtBC7DsGBIh4RM1n/4zhXFHkVUokwFCHugm+jyEeFJnkKFXGi4tgOrUhf+aar
DbmtDz7KrSTN/XSgMNYtpmSJhuNVIgX7UZJK7OIF9y1Aw9VRFWF1CVyNLCfQBjp1O5aMPdredtueu7+09oFvrdu9YVc21XW5MINIAJo4MvGXn/izkSP7Z1QWz8pxJbMewNMswWCF9Nf73/ue947s3f+pT/3f0cfG7ht5eNfi/bnrHtl/+F2/9OazLzxN0IYJ/INPtCnaKlRMcYrIGyfKFDPCi6/COaCT51PUNkRpLaVg9ftQbFABFPl9ibxP+D5Y1PmDL6q9IsdDRKHZg9cFQhQGUar6bhYa3Qq/ARBUYxjZG/hxchmgICPhaCjwLCjP2JMxkQ2aiWBUWZAC9H8IG5dipLvad4fKP2LGrn3UPVQKCDi2ZVCoE5cQXXLtRQ/csvWOGzcs2bpw3mlzJREr0Vnwg+LV640QBRKoL5cjqKiS54RUyJfFESDEjtnVBMg63enRqZGtR/btPDh1aNy2umV0gAkQNawOgEy1uv/62S/Onzf8B3/034v1mekwzcqzXWYVwNMvokpkANfXaH7kd393/uDCT//zpx/btnn6cIdTc/aFq1983bVIrDXWGKhVIhYw1E929EwBldAY6oE8X8wKcmqBgnICwMwgoVaeMZmE0maS1rhI6QSR8fFkAkJ/eVH4VHJPPQcSSD0uxsmzTBTQXUIFqwpFrh/wEeDIFmmY5gUKLYpVAFKICWSUKEFUHMSRC8cQh0AWzTWKQrSQxhQ6HRTuTBGE+OElRmd7Ygra60doQf8XnFRRYOe1Iqmq+ChrfSB51dte+tDaRx6778HLF15r0jQzmVEQhYYfRgxEmVj82hIxWJwQEZwSRBl+qqYRk+SmaVPbktaRfNeOvft27R4bOSydqJUNuGYazXqjr9E33Ogb6nPWjew+3Bpra9dMTrX/6Z8+85Lrr7vquS9IjMEs+h9/Uozum5WnTSJ2QNURkYXs2Lb7c5/93O9/4uPNObVf+/13XfmS87o0jcSRCgucTxhkQgBNFJwE+cnewTwmQETVqnMQpyoQzxwroOIgWjNmsFYfaDT7kyTxQUUK+OX5Fh9WFonJPNFoj60nix7GnnD3m+BAeUAtxIXAbwmZCuWCmwkxzkjNE7F4qkadIhPtOtu2rmNzqy62iwskTxlpDX5OYMK0J8PnP2vUHst30Oq5ACiqEZTgU3AZhQ/iXSelwNCbVFPq1L7w11/5p7/51lnnnrv8slW2X5SsiiqDAN8kHARhVVIwwYLARGLYqIqIGEoNJWQlm8wPbTqwa9Pu0QOjIq5vTv/w3DkLli9ImmZg/tDQ/KGkmdQaCRE4gaowG8lwZOehtd+6//CuCYPk2muu/NSn/mnBvIXGzI58Oe5k1gN4JkikKYgIksKsXH7ipVc+r3+gcfHzzrr0eWeq6bJxTpw6ZZ+oQRwzLJViKiJFnkJjDRYpkSrHLpMMVoWBioqCHGumMpl1lJlIG0masom8TkhijF3HqCwvi1mHVLRF9sFeiHdERJ36Sl8VBVxBCQXVEltBBIcFgV9BrBojBXxGuzoR55y4MDVTo0PiTzxEhDUkrRdB3xKcf1jop9BA//EciKPej2GPno5zUa2H0DipCMBiyTXq2TXXX37Ht9c9+vDDi05c0rdsoFNTIadwCiUlJnJUxHMIDDLkhIjY2NqAabq2O7j30KaNmw7u2A2HxcvnXvjyc+eduKg2ZwCJMQmxCQEasHbFkq/uAKmTWl+ycHjp1UsG7vjcnXsfGfv+TXf9679+9t0/995Z/uc4lFkP4BkiZTq5ABPjeMvb3vzghtt/73/96uqLF1maViPiBEpGId7QL+On0Q6N2FhkoUBVVKyKI3W+144nI1REVUgBZdW6SfrraX+t2TBpwmwgEKfE3rSOLoWq+t71MXUzKIDYfNi/TwBCMUKoR/BHRsE9QZgBA4lFveTTZcjXP/ugAzvVTKSb2651ue+DHBNbKegeUfLzEYGKJxDX8z8EZVVy/6gvHquNqFZaAEWHoOCnirBwdfOh
YSoRpSZhq6Zbv+NLD/7Zx/9h3txFl7zoimye61BXWUglEcPETonUkIphw2ASAxjNJZvMdj62Z/tj21tHxsxwetrZp5x21sl98wfzpnV1K6QwrKox6B1XFwBDICqamoSVajDtHdM3/cudrcOd005c/vl/+8IpJ59qeNYiPL5k9no/Q8RDIhQkIl//xjduv/OuV7zx2hNPW+7clNaEVH2DxgJlKz3fe7dTRn9RaoJgO3PUNARSP4xLVHLRVp4rjNZQU05DdilFYinWCxMktqMXT9Mr/ExBrxs0BgUKtPZfL86QQl4MAPVFvhxDtRQjuapkIbmTTFzmxDeDIxQGf5mAFAzrmBjau5g/5FWoKlIUfouWybHVHZRvaeW3YnpPjyfhTxmikrs8TZIE7vxrzrj0xtO+9/VNJ2zbt3hoWW5EYAmsSkJExIYStmQ6nOYmm3ajB49se3TrgX378k42vHDgzKvOXXT+qmR+A6m2UgtWJ55bEyZWEd+TzoeQVVVFATCRiFVQpnZw5dCFL1hz59fu2bxzx9/977/73d/9aLNmZrNBjyuZVQDPFPEUh8B1u61PfvLPFiwfeu3bXpHUrLD4lEnmQH17vz5+z5MeFBsRAJEF9zFXb3WSctEhngElsHo70YeU0XWiyABxSSpMTMTsC7YQR8iG7zsNrWeKojAVQEUq8NcDo1rB6BBeCO9zyHdBZIAUQK4uF3Rzm4mzsay5zPXRGG4GV7dXwf8nwq9jZAtVoP3ob/ZEegOJpUd9uLqBHq9Ai0B0pLB8+zqXu0x1YEHj+je/6NZbN21Y/+i8kxbU55gcztcCs+OabRAYue1MdLc+tnvHlh3jB8dhMXfF0KlXnLvyjBPrixtTtZatZ0LqQ/gkpCxwUHIMVn9ZvJcvgO/uEYu0CSaDLD5z8fLty7av3/mvn/vcDddf/5wrn5fMOgHHk8xe7GeEaJiaQozkuzd9f9Puh6979dVD8wWpECt7Jh+kIgQiDjaphBhAbOJfGpvwSfQCBgTkKwWKuKnvxQ9RCq0riUQ1V7QyK0KasGEY1eCUKIK1TwF/JUSSoxcASMgw1TJPh1QDtQOEWoRoTEckJcSUJUABgWaqmdOuc13rBOo8laHku4UGur8sf/KnwxUnI4YnnlBmqIEn/kIMT5efib9VdHD4mxRKjuJCxLpp9Qn7YcSWMUps1Z15xTnPv+Gyb372riPbjyw/ezkbtYBTqgtjLN+7+8C2x7Ye2j+iFkNzePVlJ6w6f/XAsnloUsZ5x0wRROE4nLdnfNi3WRIV311PRWNkRGMYRRks6rrops3kzKtWT41OH9g28ld/+clLLr6cG8lscdDxI7MK4BkhMZwJAb7yla8m9fyKa85rzDGOOkDs6xMjp94ajtn6HIMBQQmEDQbdAMCnWfo2nHGMbIGc4Ng0mlTEEnXFwYph1Dh0IQBFLkhCmzIfQ44UkJ8qo8xMiDAUs1I51F6VzcUoEiO+CY6qglihTjRXl0M71uY+juyz/kPg2PeJkBjujnUPMbuoKCY+dtQ2ilZ+Vt6j6rrN/IrXnTP+UO4dlWhADycUrijFELcfhKDk1LJhkHZtNyF66Wuu+f637nlo7Ybly1bOWTZnQqYOHT687oHNI4/sz7raPz897bwTTzvrlKH5c2QAeYNy7kiiTsV3DTKe9SHxKQQQJUIYoOPgbYJwNUo9Rj6EDnFM1JzTf/olp903OnnTbbd+5zvfuO5lr2AzCwvHi8xe6WeQKHT/wcM333TbOReceso5yx131SiJhuhpSJ0vp0VWiBWOpWC+aaYvG/J5md7eF6Fi5HxI4Ve4IlgcjD5V57QLMWqcCAe73cdxw+487+9RMdjuRR/LcjBXQJ0yjOBD11WyJSARnCKHZiqZtVYldyGbCMRF1Dc0Air7TwcqLIZFQsZS6Rc8jjz+X2fwU0f/tXihsZ9EpX1euBKV3yvN6rSyEa8pRQSqbAyxO/2CZS96yUVf+ed77rnxvtpQbffInunu
9Ny5zUuuXTPv5EVm3hBqDHKtRJXEz0Zj7y9SDLSETI7emHQvfaVl9hRiZw4HsEVuarzw1Lkn7l6+/b5tf/k3f3X1NS8c7C8mgs3Ks1xmFcAzS772tW+Nt8df+KKX9Q2xGqvkKFZnQTWkQgIFxFSZicL6ZQJ8l+gQlA0EDYFDyzgCAAaXDL0qQiswdULKISJNnkgIPoLffA8ZTrE7KMOrgWjpF+AdVZXGfKAQytUwsN466ajNnBMRW0JZkc1ZmPsabfXIAkWG3bs7CCrgieQH4fsPkKOoIJ2hUUIjOF+jTVW1EQarAepEDTMRnLMEbdZrL37V8777tfu3bdo6uHjggsvOWn7WCbXBPjF5t5Fbto7LaEtQP8VyIFRuU1ymyumXYYmq39Pr74iAc9han1l13gkju8fueeCBr33931//mjfwbE3A8SGzdN8zRRSwar/89S8vOXHBeRedRqyAg7rAeSj8xQqsjjeSe5sQM8KwFQWp70JJAhKEpjgUtEB47ee9cAVnfRNiKFSUfDVvGDqi0Bj41WJzTGyIGIYpYRjjh0AWh+EpaeIKL+MLuVSRi81VOiJt51qSd3NrnbNOoCTiT8dHCHy0m2Lj68jFlDmOBeRRoTPiJ3+IKzBTExQ7paLtUM/uysUspwKUtn/00KKaqLZREj+6LeGOdFadt+LFr7186NSha97wosWXndJdXh+bm030593ECgnUAf46hjpnLZOAFWEucFADJdiH//zOynY/VJ6lv7IiKhZZY1H9hLNXosb/88//fGJySmazw48PmVUAzxSxzu3bs+/B9fdffPUl/fMS4ryc5Vqa3ihgrsyAjFBbwKV/M76P8Bk/rTe+KNIxS0OxaOYW4B4ClYLARpi4Urzy6fzMTGBiUwCiT82XAFpwCqckIX5ADpKJZKJtZ9t51sqzjs2tH2QcghKBK/cN0FAg0Yy4bTXrqQS8H6UcpULiLo7eTw8ZVOjo4rMaDzKqLk+iKURdDqU+felbrj75kuXdQZnut1Pc6nI3Z+uHKJAwij5/xZ4UVQoqvqdU7K2iAbzaKIYqRGcCPqDjJM+dQ0LLTl82d8XcR7Zv/spXviTi/lNrNys/JjKrAJ4pQoyvf+M7lMill52XNCAsEBdy9o6GHG92xrJaZQE5b3sK/Kwqb3dytFGVS3vWI0RUBEAsQo4SggZwos6FQTNecSDMfC/2HouP4Xcdkno89FtVK7ACq+oAK8hEMie5c7mTdmYzJ9ZXp3lXwXe4JvL+S/QheEZkt6gi7nkHM+YD/NASVu6oaIEea+O9PkHsAxpfU9EtL4YnPIgHh0YVTkXY2aQ7uLx24sVL2v2tjNsMYfVz0iT0/uSwSF6dlE6PouoMzKgMmXH8WowU9j5dqCcM9YEucWY+Vp6x0tT5L/7yL1udlshRd92sPOtkVgE8I0Sh1rrPff5zF19y7vIVcx2rt5v9XyP1XJq8sdg3gHlI8yDfyscPYFcghAu50nqBQgRT1ecGhbrcCF5xf8VkMVFxTqUYOKbF/JiA1D5+7C19K2r9T9HcqXWSO5c5yZ1kzmbWdvO8+M+pOBG/m8pJFfR/CcRUxgOCl4GZXL/ODLjO/Ot/+HJU6yoA9KxPz5Yr71a5J4rbiV1JY1O5kEBbKAwVsXmW9qVLT17coinLztP6Sgiz0kqrXmfsqvJm1ezX4rCKldVYzKdliFhVlUFkSFQza8XIwlPmz1k2Z9OOLV//8ldF7H903Wblx05mFcAzQpzqpm1bDhzed8nl5/QPsRgVFngaJj7FFXgJpDP8OxqYF8AgUPBaoKYf78tKnppnAlH4lyMjzApW9s30KSbuK5GAnEAUzokV3+Ct6PDgoZ99bwnnxDnKnXSt/0+7uWS5ZrlkNr7pJHOSi+YgRwxlImZmTycBvs4tHBXFozEEo2SIQOQ/S+w/Gin3QiEdZaFHDv6HEKrGcHs3GfRuNORnRBwoRoK9Ji68Fa9ySYXIZ+34km5WEEQU0l59xuKBYRbp
quYqElYknEKvmqn83huZqEoI4BTqQEt1AF/hQeoL+BQEFeTOYlBOPGcF1fP/7x//T9bOHqcV0qw8e2RWATxDRL7yxa/1DdROPmNFUrcw3AP4QPVFxZRHGRb1bAlMwCcGBcAJnRyo5Ho8zrL/j1BCf2nnejtRQszWKcSpFbHOOhEralWsqHPijf3cSdfazLrMucy5rrW5SC5iRaw6K86KuDDn3tcSAwixTFLP+YBCAKA4J8Q2dwr1rTRjkCGgYJyN+zhYrz0vtbSSe8IJvbR+JeJQMi3V9df4ZnxRBtg1Bl2KF5H4Cc4S4MuzKcbRfT8mMNikCZvB/r4kUmDV1J/S3EfZADsa+zOCH/H6afVIj1oijQ0E4dOFVSC+YeycFUPzV8xb/9j6h9evc9bOaoBnt8wqgKdfFLBiv3/brWeuOWvesrmSCKmFOpThzQJvPDVNVEEXFKmY5GkaEpBz8AFaw8RRGzAxMzGRIWKECAAFBA5bR2V+ozdbPWrnikzQFXSdeou+7VxL3LSzbee6ql0nVuCcBqfBNwSNXoyEwTJVm10KP6IkNCjyQRQSmPxEdJ+iVEagQ7iAI77iKFboWGHhsolnAZ5VLA0fp4JbL5Yk+Fxlak/YytGhGSVSMiDjjXsikAFYAFc079ZAAqlCIVBlIhXX36gtmTsXNhMnxMYHCWJM/XFunKNOuaSACscjgH81khQiw/FrEn0RdWx5kFaes3zaTv+/z/1bbmdDwc9ymVUAT784tRs2rD9wZM8Za05HapWskkPso9lrgc4IRcbSJwrPvLByAmZKDbNArVAY7cIUEjKjsQ8PehTKACgkk8dc0YDJkcMgAF4TBIpfxDrJrbNOrIgVFYRCM0XVkSiguMLnayWBMtZyxUBweWwF+19JKi1PPFJVFStZY65MhQTvET3qZ88Kh9daZNeUvH9UitVlOSoxtFypEmkJ8MV0vqgOiPyR+jWPUQInYpjnzRlMSFRUVAhV6NdeRkpn/Hus34q3NN42veoiVpGFfGJVQEVFWBasWjCwcO5XvvG1wyMjblYHPKtlVgE8/cJEt952G6V21ckLTV2EtWJbarTsCg+gKhGDlBTMlCQmtc7meZ5Z58hw2lRiTaAGRGA/Fh6RCAqZnCWzTdGGjeZjUexbwqbPGxGJY78KSzrCiUdhH6ao3F7VmgUtUxgrzEp0ZwK4xjOs0OwFiFGMfhSZNyqlijw2J/7EUj2cGZwPRT1bsm/lMZTKICxA5LT8gE0APpRfUQ/RnfMKs0jMTE0y2NefMMMqueLUKuV3xSH2rEa8Q452gGbqjKNUX+STKByniIqoM33pqjNX7Tiw4wtf/MKzOxdIIVoYWselzCqAp19E9bbbb1916uKhxakmFqyqJK5q5RV4WL4ZUCcy+wZkBKyaps3+wTnNpA6oak6qRMJcoHFAzWhfMxMzsQGxjwSUgmIfJRqHf6GFBRzejzEJ9dmg/lOkcTdl2VaIMT8ORvcyXlV8q5I6IXmpeqz+ozpjW9WNPh6Q9Zr/1TcLxO5ROxVvJmrHeEFmbJcIzLGywZBhMlSek6+xUD+sDUQieX9fOtDfdM4KyvDGUQdePc3Kbive1owF6Dmp3vP1PwvlpBBHaknmr1zYN6//3774b9blz+pQMDmg67LD00e2794m7rhzd2YVwNMsArd7z57HHtl05XMvTPodEqccul0e88GLj2xpAUYyBYCQYuJgZ9ODu5yr1bjGmiOxyiBVYiawz92PWfYaGgZppGZCKXGJCJEMqXLiPbhYGOhc0M5+m1Iaw9ErCFZ7VBQzTPqqeCVQsivVD1Te9TlNlTTRAMvHZkuOipf2fgAooL3YQ4/2Q3QFqDgRihRK+ZFCMYTDUT+1xkATwMQkXYq1dRycO2VVSuo8OKcPHCi90LePquRT1AmVii6dCfEzpWI9RFey2h4ilnl4b0RUnXG1heny1Ss3bnnkwYcecs8uWFTAqVpxVjE2Zddv3PI7H/voVddc+/O/+MtT
3dbjGwrPTplVAE+zKHDjd282DT5l9fJaXQW5OBtwrfT6ew3TKm0TLWViZQOTpOvu3v7+d/3hX378X/btGDOSJkxkrLCATYj5ImSs+BqrgvSPf0FBbZfmZS+wzHRHIssc4JeqbEnxsUDNRw+kiv0z+P1iJ9XtoyCRylWpfK0MHfRwQCV9ohX07KVCqmqmV0UEaC6WolJrVnXJoj8SPaO4vuV2w2LEMV2EULOtIsJgH99WRS2l4eEms6/CrhxZOHNV9EydPCoE0PsG9fzhWBxQz7cJoYm1I7UNt/i0xVnivvyVLzp18iyCRVGbU7b30L6/+bt/fMc73/ra173o3ofunLKd+kC/WufHXx8/MqsAnk7xz+RNt918ynmrhhc3KHWGC979GB4AHeOJJ1+c5bv1s5gje8f2PzT2hb//9p9//FMPPrCPu6YpJhEBWY3xVw+Xyv5eZxQ9+rVMUSmzG6OOiE0aCsiOqFsa8bHSgMrdVEl8iud0NEU940QrQQH0KgNUkLogsBUiUCH/X+hajeh1hHKH0nAvthKOpWfThZpRKvC/SgHFng4VFSzaq5YRmjb4RFdSZWVSsDI0gbL/I4MAoTDek8gYAobn9CU1gWZRZwChh8NRbX5m3gsz5eiPFS7asb6kiH6AqOSUDS4bHJw38NnPfebQyMjRG/8xkWB2OFWrbrrb3j+655vf//rb3/HG57/owk/+7QdXnNr51Fc/9sHfetPQILpTh5OaoeOsD+psN9CnUxQ63Z7atHXDVS+/JKmLSq4sxsdPY+/LY1nH/rsFSBIUAgHbNIVIPtBf16xx9zceOTwy9aZ3PvfFL76yXue2ZgIBGxXfsMFbk7Ek2ONciGXGptFF5+W4U4omtNcgHvdCcWlPFJJim048HpGl5b9Hn5/nJIpYeO9RUJUiK4LJxbSrsLDFEHn0kuEUFU+xuaom6IHLooo3kjtatjvl0F+HY9KsarnhXiIIpLFTm0hBJRFYYZBomNZIpKRO62mtVjdZywVAPibaH2O5HvftY3hSOOYtRQVB5NtMaQ2LT1qy6Y5H773nnhuuu55N+oTH8MwShRAgYFFxag8fPvzQhoc+/7XPfefmb46O7j5l9ao3veeG615z9ZzGUCOlnXbCoTM8fx6bhIorfHzIrAJ4OkWhGzZsaOeTZ645xaQQVohTKUhtjcVS0fikKmRqYf2GTxPnznWnWrCuv9FgJI/evffP9n5h1+7szW+6oTbIgpZJyQLilIhUmEjjxEI/+f3YN38ZY6WKTihjwOFg4w/ETvWVfPPYdI6i8V4uwbGfuMpQ4bDhALWVtBilOJuy52CBCjcWjqe6oZkXoTTwi2k5AcOp5H8I8OO1WMkxwEJNkzpCTtJVp5AA7aGTQ3mRhAB2vvGCgBAiMb7XKTtYIlLVXCwMc8KDwwOHpse1cjGoOLwnJ/o4rytL5C9NXJ8Z7pZ3puoy/4T5j93jbrzpO9e95Do1z2hgLE5ToAoIMN3KRsYO37du7be+/rm1a29tTe4/4eQVb37rtWsuPWvFoqV9c1LwFCWjHZiR8fGJltabc4wxpT1wfMisAnjaRIE8tzfe9N0VJy0antenpsJpeIvatxGO3MKMHvMlAGtAcVYiRWu6Y5KUOBkaHLIT+aEd45/65JfbE+6n3nNDbbi/oy3hnCjxPWkAUFAfFUZdCZFFOVqKZB/0IlSEfqrCe+WjKKib4ui1chI/QCo6BohOQEGLhHho5LC9R4TYbKd0T4JlrtFKjweoxSkEVcG+TY5CiWCISZ0yG1JlcqySaA3itm/ZecTK0hXLh4bIGRWCE5igqore3XEhix2EMu0QeFFhQZioTEBC3N9sHtKxyJMVZ93rt8xc+N71Puai9X6kNDFmYH/wq4xAanPSgYVDN95y4/jk+Px584meoYxxsTRW3cTk6MMbH/3+XXffetft27eut2Z6+UkLr3/rNZddedac/qFas7+/P3V515jMoZtxV7W2e/eedjevNwfifL3j
SAfMKoCnU0xq7rjjzjMuOWV4uOa048tii55oweYvVUDvfVnlppU8C24cjoxMESdEmudZPW3M0WRs/+T/91dfHutM/ML739w3nIrrOhZllZCsbzxN7etyNTTtjyyGUqwmLVgXIKgBKnyGwp6PBDlVnkrqNcHLc3lyz1mE5QpT4zWe+hEzRW+14hDLCYilkRut+Kgyeoit6GuVGik21iQGWA0JO0lA2jCsadfs3DT+jS/f+I2v32zb3ee+9ppf+sANSrkCxjBZ3yJVK6qSCo+CVbzvRJFgIiIiAygxIdOEuVmrU4WXOuqqz0Soikv2BNrgWIuqYdhcuUaxpZ6IU2Ye4CUnLd119/bbb7/jhuuvf8YAo7/koeu4QjOXHzpy6L51a7/5vW88sPb2kdEDSR+ffvbyd7/2qtNOP3HhvMW1WiOhlAyJAmRNPbViYQyjMTbd3bpzt81tYkjFgo8vSDy+zvYZIuqfNNU9+/aPjB886aTLag2yTE5R7YsA7XnUlWYYayWAUWy3YDM7fmhMhPKsS8K5s0zpykUrDo4f/sJffGVydOzdH3jt0lP62ppn1qphIxBmjdPlI9BWqSZVLWAhuCCFkd+bq+pJ6yI7nqITE/725Gz9o+2v8pSJwlQxP8cYkU6ageYkYbRZyUeVR1RcAgWOokAICiFi+GxZZWICZcSGU1LI5OjUg2s33fiFux+5fUd3+ygaKVxt132j2eFuc4G6xG/LROVWnEukv4JaiksR1Wro+aA+kCHNZi1NEkiMYR/bE4vn0bt2M9fy8aRwiCq+kd8ccQBXESWjc1fO3Xz35tvuuv366657JtjGBX8ooHanvWPX9tvvuu27t37v4S0Pjk4eGZhXP/X0Va+88oVnn3vS4vmDfQ2ppwxrFCQORMQQUSJmZw10YGKqs2nXgUceG6mnzWatrk+01M9OmVUAT4NQzOz+3vdvBGjZqmVKQiRMBRWvqOQRVk1T9CCf35z4jgkibNtu7579UHEsucpA//BFF17wM+/+uf0HDv3dp//h+1+9fe++iff+t9ede9EiIpurKpGSIzaxkrTI06kmw/U8FVR5z9MuJcaip9vwEyLX4yHJ4yNMqXMUEiZWVtimCtNEUigwX28VAtrBcAwpTMFG94a4hnw4BZhJhXw3HhJNmaXVnjg8cc9tD3zz/925ae2YsUPLTzj71R9489jYvn/9zP+Z3rK7fXhyYOGg83lVCgUTOyXxXL9q3F+8gkGdxjVXbxEgUEL1NKnVanmHVCmM8CxpoJK9Kdbrhwet6MZVzAot1aZREducX6/Pad58x60T09Nzh4Z/6F39RyQuVMU3CX6JAkSdzO3ae+A7N9/43Zu++dgja3M71bewce75J11y+YtPWrVy+dJlzf7UkAgrs1gnhMTbNIZIWeAAdWli2ra24+Dhb3173dgIpWlteO6QMccdHh53J/y0S3FXC9yXv/TVxSsWD8xtqnYDoUHsWwQA0UCt+AEBwEqiNvxFYhJ/d9odOTDhBLV6krj09a997a/+xgfmLpinTi6+/MKvfvmbf/WPf/XnH/3UO375uiuuOadGrqUZjBgnyim00genxHeUHBBAQJgp3MPnV7URgjVbJS8qIB3JoQgzPUsyY51mQluRGUtV2ItMi4YPROLMMxleGyj7EyKCZ9s5rqLfDBNT/CIryBmWtJakU+OtLZs2ff/rd95z4+ZDOybrZvi5a65/3uWvOuXUNbWB5ujYlv/3mb+bao3v2r53wWlnOWQgTsiIb/mpXFwylEddNo0AxFcGsLJPrCKFKpKE6oYzEeVwVFWvK5xZhRD7z0gV/RGOKlwQESVFvZEuWLF46yPbN2/ZfOGa8xN+ymYFK0Kjat9UD1b10KGRW2694/P//u8PPHTvVPfwohPnrHnhaRdfdM7Z5502f3i4lnJqVMURW3XqhyKRM8yGASZRcSBjDAmpk2Rk8sj377jjwL7WSaeevm/T7hrPTDw4HmRWATzVUgTfOll72+6t
L7jhcuIMRoh8C7AeBoioQAyqbiLmpwQLEhAig5z27hzptDIVw8Rvessb3/9r71u0aJ5CRXTlkmWvecWrmrXGp//9U3/xiS9NdWrPf8E5iSGLrhJ8LJhAEEikeQgx9ExhzHnYeUS0guFGmbOp5TH6f4PWKkzXo7GejvWyUH5xMYpNqFYxNH48KIzIsvh/SMIRCYVy16hUVQisID+aVw2pCBtAataZuvCebQceuHfzbd+8f9MD2+2EGRo+8blXXPGcy69dvnw5ad62o9KaqtfN3EVzDo2O792+/2J7HpNlZn9lRBDGpXkT21dje5+Fgqr1vbsh4qeyda06UXVqVPv7m5OT7RjGKLA/rkVJWpXvPK48oY9Q3lta6nuEAIWCyCWYt2Lujge3fO/m751/3pon2M+PUFTFdydUKBGPjY2v37ju//7bv9x567cPTx6ev3zospeddtVVV5148qpFixbV0hRiFAKyAgGzQo0BlEWVvWvllI2FilMVTSyS0Sy/bf36h7fvSRv94xNjzMnCeQv5OAsAYFYBPF0iKvfdd+9E5/DSExbW+g0olwJhS3grqGEgsEKRL55ByxAUzCbZsXmvOlKnffXB173+dQuXLfDbYcMDQ00IXfO85050Jv7qn//uf338M5MHX/qKt1zdYHS1AyOwCphoeOsMrRNQiChyU0XMtAeQSY+B7zEuUGwpnE7vxsO7Fan+QhT0UdFjDSSIg9B7tE6oaItmuEJ882ghjjlBqqJg8n0qRKxRMppylkxPZ48+sOO+79x/z53rR7ZPMw0tWnj2xVdee8HZly5auIzJtacmVTOT8EBzaO78eatWrtp3YPdD9z9y3ZteVKvVHDzHDCIGyQwqo1gHv0oCYTCTceoANcwiIiJJwvVGTbUF9uFkjfdCReM+Gdv/ydFDZTavbz8UNQF8E2+m5twmpXTPvXdpyBH4Lw0EiCqUWFUt3M5du2763s2f+dK/rt90X3OeufA5J73nha87a83qucN9zdQZoxASEu+V+hlGSkpgUlUV5jheM/i2gfmbEHvbYw/c8eBGa2qNYcYRNzk1lXB63Nn/swrg6RImuuPOuxyyE1Yv10RgFFJh/Y/BigQUiMVVRRYlPLNBoqTpxg07cytM9RWLV5y75qxgaGoo52Xp5O3ppYtXPe+Ca778jS//3Z/+2+jo5E++6yV9TXS0A0PwGSzKoYXlMY4iFkepluNjovVdxieCme1fR42lgPclCmA6yiPQ0NY6cPhF9DkawoE0F5/s43WRllUKKN5RIjJCAsCo8SumXptCCcwQVUewSUJsKG+3pw601967555vP7Bl7ebxkUxyXnnSmisvfOE555y/YM586WYuP5iLE+eY2CBRa+tJ84KzLrvrrnt27jx46MCRxYPzwI44BVwwpKGF/q74cSSRyfJVy2CGiDjxjQgE0uxLOBFRq9EDqHJux6TMjpInBf+FbkG4rdQ3CJGYJqxG+ubU5y6ct27Dg/sPHVqxaIn5L6mVLVrKkhK12t2771/7/770mZtv/ubY4d1nX3T6h37zDWsuOnPp0vlJjVWMQo1CnERfyi+yMrOCoBLoQO9IiGUiInWAqGmpW7dry0333TeV55bRYTtvsNFs1hYtWfi4PQqfvTKrAJ4uodvuuG3psgX9Q3U1VtRRQasg8trhpX8+A4oU9q9/dGNqtoqq5LJ5wxYoWHHepRfU+xqBQPVfEcfkaqnW2Qw0+gfTgUP7p//5z75k2lPv+IVXpn2NDuXEMAJRn3DkU+m93VkGIgoHpEwBKn6hmH8ZSIqyCozQazhW0Kli+0f7uHhZ6ImiQUbI6KG4BsXHIvejsT4ZrGrIOHHCZBTeuPQVvJJADSUiJhvNd+w48ODdj264bcfe7eM6rtKuwQo7PmXpuZeef1Ut7XamR7Nu7rUGKalJfB4qCa85c02jVj9y4PC6+ze+5NQrSR04BZhUwP68g74rKKoiTh7XM7BYhpjBTEziEoZopmTiUs20u38Q
tB9FGz3+58pN+qzgkrEDg0iQpMmcxfN2rdt+z333rHjZDT94o/9RUQGIiLt5Z+uuXV+78ZYvffHftu1YV5trrnnpZa981XtWLlo60J+aVARWod7eF4Gq8asGdeIEBGYKHQ/VwTCIxSFJDUBORIgm83ztzk3fvfv+qcmMDNQ5p0kn6zRqjRVLlh9fRcAAZhXA0yICTLUnNjy88ZXvuKrRr0q5kpIUES+g8mSW5tlMgjy4BEweZujwofED+w6lCbNg3txBJjDIqRITBJrnedZptybz9kRncpxsXhfutviz//trArzzl95cS1sZug4+MAlRpTI3iCqjpQjoBfPghvQcc9HPODoFJeZT71lVT6mH24rKo2hKgSKZP2b5ICAsqYYAcIiPkIpXjipggJwKkSGCI5cAJre1I2N202P777tl8857t9rx3EjakDmStsnY6c6UgCbGDh46sL3e1yA1hhOoIyZmShIVUlHbyVunn3rSyiWLNh8YXXvHw9ded5npJ1VSSAxYkA9lFoXJ1TavhUKVMEcHzMzEIkiTOptEtKR+UF2WJ0VVxM0/OV6jPMbgtgT0J4Kkbnjp8Kb7ujff8v1Xvez6J7W5J9pT7G9aRI2IDo8evvueOz/1xc/cee8t1kxfdvHSt/3CDeevuXDO0NyUU8dCal0I6ITe5kxKpJbZt9HgoCtBcEQUeDiIMezEilJHaGR6+t6tm25e98Do6BSblFXShFPmsUPjanV4zvCTda6eRTKrAJ4GUdVHHtk82RpbfeYJtaZzHM2vmYG9ym8zHvsY7iQiqPi2NzseO9iezpNaqqDJrKXsVMHEEFLrsqwzMTY+Njl6eOJga3rU5Zkh1DXJR2uf/YtvNpPht/zSq1SOdLlDJrTsLJJqime1sOmDQR9s7vBKy79V7HQq+0FUW0oUp4WSPqq4ONU3i/MnxA5tKCKX5DtnFryTZ5x8izYOQXRRqwmxUOqSvINDeyfXrdv6wK0bDj460rADC9NhbjYAM51NtDJxZNkkksrW3Q9v27N62ZITa9SopTCUmARIQ5jBqetm7b6BRZdedNkjX9i8c8vB6cnWQLPunQ2vM4HC3q9cuR4KrHgRBhuIKkBprd5s9k23coWEkWiVc/6BUlnVJ6kutKLEiUL5X2DulGVgfh83a2vXP2Bdbnxf8R9KtOQQyYqb7kyvX//Qv3/nm9+/83s7d2xdccaSV7/r+VdfdcHqE4f6G3XkpKTOWSEJY8tIfbICkZIPnWtJiwIQVUMkPg2NSQlO4chNdbub9x2689ENj+7c1WrnRAbqg0B+KzJYHxzsG3iGFjr/V8qsAngaxKm76aab++f0n3DqMiVHcEWLHaBA/hmWyIw/+twcDzAqSMjxYw9skswlxlCzvn3Ltm07di2ft6jZ7FNwa6rdak/uHTl0YHR0++4de/fvJNiEjTqtc9Nl7n9/8p8G5vZf/5orkprVmhUR8jkqKFNR/FEEZj7s2h+Lgooa+mrPZH/AP9ikKlGqqvXKeOdRHy71SOCCIq2iIflHyDAJSI0jKDvSDDLtdm06+MAte9bfufXQ3qn+tP/UuacM1fpT59pZ3upM1k0nq0OnXcLsEjedj2/dsXPuwLKkv5nnooklsCHEhgHiJO/m2XOee9VnvvKFQ/s72zftPm/BasD3DAqeC1XOLCxVeZo+4BGWLtYqOEDSNO3r65ucGmUu9eqM5Xn8t6rUWFBXP8gRoKM3FG8zp0T1Oemc+UMbNjy4ZfOWM04/0/yHkVJFxa8L2IxNTGzeuumr3/ja92+5cfvOzbU+ufDys9/7gV866bSTl8wZ5LpV7XS7loJ1Dyaj5HyiAwMqDO+hqhjAEIV+tiriAz3EAlhxmctbmT04enjjjh3rt+/ePz6eWzWUClliUcskSg5q3dlnnZ2kP07d7n5UMqsAnmrxj+Jtt91xzgVn9w8ZIFe1qDLaj/e9o9WCKpgEUCbq8qNrN7MYkyYK3vDoo3/6J3/8smtecN45a7jW
aLc6Y2MHt23fuX3nnr37DrbamUkT7VgVIkIKI+3mn/7W386tmRe+/qoJN56ZjNVbW0VijyIyLQVgx56fvrPZ4wF9mchYsDdlEKOsMCgYo3hqpZTehUaMr4SS44FEjIUnrxnCOcgklu3Y5N4tex68bdv6O7Yf3p43zaIThk9bNH/x8FDdSXti4rCmEEeuC0cwSUKgBEkntwdHdrZba2pJX61GxhhVP+CLmRLAKDTPsrPPOnvBwkV7j+x+8O7H1lx2lsLG1N24Nl6VzTghb8f68Ekw/4OrpVDA1moJVLRY42NC/+PdHcWihhWpMnCPJ+V3i2QzHxRwqrVaOn/pgo2P7Ln33nvOOP2MJ9xOzyGqT8EBMZnM2p27d333xpu+/p2v3b/hvpy7y06e9xPvf/Xzrj5/2YKFSQ3MInba5U5Z/BghqBrf3IOCS6XiN+tnWbDvlC2hPEWdIgc61na73cnp9sjYkT0HDm7bs3/v4dEpp6JMGgct+JEMTCSYODy2+vLTkuOvCgyzCuApFgUEmju77+C+l7zoCpgc7EJxUpkkWCX9K3ZZkVLv4Y980wJ43mDq8NTuzbtZAWvIUHt6fN29d/azO3jgwNCcearamZ4enxg/uO/A1PhEt21tBmaTKKkK4FLUSPgv/vjTy09YcdLlK3PrKBEixGJURMoJFXQJzWw4HCbFwh3vqx9tUfZEAcJ69OCSB8qC29GK41FqBh+goF4dEQ4mRCqUiAwrpD420j/y6MGHbr1v3dr1+7dPJOhbNLx01cITlyw5adGyRZpADHZs3zKy/xBRTsjUiQLMxigSsVMT+8fHR4YH5hMSQsKUMiXglCgxnECNy+y8ufOuuPSKz3/j8xvXbxkdnRxu9AcQZYIgxFHKhKjyRD3I+kYcni/yrxwUcHnWhYLKiHrRY+OoqIAWP4vYwgyL/snw2lrVAfDQCu+ikGFetnz+I+xuu/PWt771J3x+6xNvVCGq4vX9dKt9yx23/9Pn/uXOe++Ynti77LT5N/zEmsuuPG/N2af39TeNqqVpp3DQpA6IL+YykoAcFAIVglEYQMiQivWOEnxDVRWr2rW242Q6s9OZHWu1xibGDxw5fODw4YMjh7vW5eKIEkANyICskqpywqQgq65jzzz99Hqt9iRW6dkmswrgqRZV3b1nT9ZtL1uxglIGclVlPx0kfKJqzXmmPdQAxCff44U3eiVhR4Z3bT1w5NAkUc2kSZa5mqFGszY2NXFwbH+XctbEZd2RsSOjk+PdLM9yWEcEYiInKiBxSJN0ZH/rF9/7kY//9QfPvugkka5vcQxFaH8fwMxrKi7ik56dlfiHor+YJ+Z77M8e0hsleROBkWayFR47S9wMnXO8aio1ZlEzCmKoqiGtI5F84NufvePWf7kxH2/lmVo3ODRn2YKhJatPP+3k089Jmn2m2bA2W7R42X133HWok9s8SziFZJxybnMm7rTa4+NH3LIum74kSZjZmCRlkxg2YCa2DtbSxRdc8qVvfvHAgckjRyaGFveDvW4mosJT6fFsCgincFKeLGIgzOgRp41GzTBKB6xnTZ5AZlr6FLRpjwdyrC32aIuiA5RAjcBAB4eapsbrNz7UybrNGjPPpIF8azkAChUnYCXC4ZEj373xW5/6l8/e98g9GMjPv/L05z//+WvOPn35ikX9jTpsR2naOYgfqyCh3RWzUbD3fwQgsIg3R6DqAFWGiGYOWa5dm3W63el2Zzzrjk61Rqemj0xOjo5PTE5NdzIrClETqtlFmBjimMgBECTGkOVsvG0zx/wDldqzUGYVwFMqBIjKxoc3c4plJywUGmcGuZgXER/eaO+Vt2NhescGNj7XJfj3JObRR3ZlmdZrxESJMf2NZn/S1580h/r7SUDQls2mOl3XdZyLdJ1hcgSBMDGJJGSsczVKOvu7X/70Vy64+AMd1yVD5bHA651Kr4g4iz0aqPEEAq+hkbcpIF3LOYfVeWclMlFxqqHeOKZ7KsCAUtlo35vV
vsoWhfvkd8pKLKLa1+CpHVsPbhuppX0wdeL+GtcWn7j01AtPX7J4xYJFi/sbfXm3a7tZPjZ+d+vwZGuckSaUpqaWSW7IiOq+AztOP/0sgk1SrdWolkjKtpZQo5bUTZImtSxzF1140YK58yaO7N384K5VZ65gViJyYn2wtAwG+wuoAIWRj+o7T0NV1HcAIRCUxGozrYu1SE3sDlL1C3tf9YiWrkDPq2OEnvF4W6j4oEwKlizvDAwPzF0wd8u2rRsfeeyC89YcvXcin3kvCgLTyFjr7/6/f/70P//Djh0bhxfxS1592Stf+bxTTlsxb7gp7GyOPOsSQwS+ZRIRM3n1qOJZHnGiSgyABSJqncKpdrO8k+ftbt7K3FQ3a3c7rW4+3WmPdbvj05MTE1PTnU63m7lcolr1BgsTE4QIyqwq3rcgyTSltL+//3FX9FktswrgqRbr3I03fX/h0vkw1o/fg7drwjA6QpH6H6WankwFIvscUFDOpJY2PbyJyOc1cKNp6o1as96/bMnK4cE5QwvnjI+Nj3ezgQGeNCId6zvjOFEBjDouSAOiGnjegkUu9KKUeEwzIhRFTwUqTflweOHwQ6xYoRXoA4p2cceyR2OoALH4tTxtjTkpMfBd9DArQ9KeoOKAdtZYSfMlywfJiMI4ZaNZ3yAtXb589Wlnnr569UB/XyNNRbU93RkeMIcndh88fMBMpcakKgoFGbK5HBkfydy0qc2HsZwkSY0aDdOop2nKaWKcaG6z4eE5F1140Te+t2ffviNZ7mpJyKD1RWcKKbHFF+VFQs03TWKi0AvIr5QSM6UpkpRtoNQqPCCOevW4UrSNOjYl9DjXoCD0QASBQKCGUNMFyxdt2LXp4Y0PXXDueUfBpaiqkgHx3r37P//FL/z9P/zjI9sfHl5Rf/XPPO9Nb37R6lUrGokSdW3WkQQOLErsHIMpSYQ0VJ6Rgtg/CwxDTAKXi+RiW91OK8tb3azT7XY6WatjpzPbyrPpbneq1RmbmJ5stbvdzFrnb1BDBhAVNUTs489BIbOKUwUZAOhO5XMG5p58ysnHJf7PKoCnVgQA45tf/+Yr33FVo985Z9k3KiPubXCGY92Mscyq2JpCWcik3VHZvmlnkiBNkr56XUn7++prLrrgqmuuXnXyqna7U2/w6NShe753y5FHt1Du6mltstMCNGFQ9K3ZkFXq5u60M08hw3CxriuiO0UbO5r9MXDpfw2wHunqIjAAhDACgePApkIKGI/Wf6h29kxAoRAi512mqhchh6i64ktf6AUQU1fzU9ecyvVv2W6biWuJzlu46LTTzjtr9VkLFw4yA8QA6vVkaOj0617ysr0H9rWmpjpd7ms0O+22QDnhTndyanqUsLTeMI16rVZParUkSRNTS6hmKFElm5rm86587ndu/Maux/ZNjbXn1htkwMSxuJUAPwWehCgyJeXJ+VOiCrOvgPpcdxFV5ri28Y8zVejjAVdYrvDRWNUxc1u95E9xKX1HICIliKolt3DpQrWPrX1g7Ztf/ybjLWv15bb+usmhI0c+9dnPfOpTn9667aEFi/Vnf+WyG1519erVJ9W5nrsWgTIRNSwQYjIKJiFITlAYw45IBaQggROBc+LEdaxt59lUuzPdbk9nrmNtJ8u63azVtu08n2q1pzvtVqubZZkVpyKhEkyIlJSSsJR+5E40Uoh8JQElzK6T9yXNhQsWHo/wP6sAnmJR1UMHRzsytXjZYNIQMkbUGV+zEgw1/7n4hWCQlU9lESAgMEGFwIKx3a2JfZOGKUl4oJGqk9NWnvSiF7/wwksvqyV1wAFYma8czuv7Nm55LNlDBmqMgMgh9CmDQMnmMrSw//xLznM2I+ZQolniBhDzWgILhcjxzOAleqj8aJdT6H5ZKQWqnKT/ks/3KNI5iYFYgUaRXELRmK7YI/kPiMYOzKRQdbaz8ozlQ4v6RndZl2WaNE4+6cznXvu8RUuGq1BKTKZmLr/yioc2PLxj2/ZWd9q2
uNHXbLc7gDrknWyq0ag1mmmjWasZrtXTei1NayZJEpOoSShN0gvOvWDO0NyDu8eOHBibv3gFyHrauTh6it5PcR2VfLKoVwhUhFN88z5j2BimohNoaR8QzVi48FIr7ymOUhRUXiNFr/6oSuVjcYtKIDh1w/MHkPAjmzdZscb33vGtN2y+99ChL371C5/+v59Z/9jaxny+4e2XveUtV59z9kmNFE7zLBc1LM4wG0dgSQgEEmUSgTooqZ9GJFALZ8VmNu92s3aWT3U701k23e62Ot1W7rrWdfJup521Wlm7nXe6WZ7n1kJFmMK9EyLngpjFULZUhK8N9IvIMMSuk/dzrdlszoisHycyqwCeOvG2yNZt263NVq5eosYSC4HEKXrvvuCCH1VDVJDrYWY8w0jCjndu3pN1xSSmZkw9x0Dad9Hp55615vw0aQLw4wk5TVasPv+y51yzddeRbYe2sTJLmE8lquJUIFazsy9bs2D5nBwTTBJKbFFCUPUAS97naF6iKMvV8vMB3JXh5zLNJCW8IRzgkRCznQjEHHUGe/SSGIhQXwdWKKdAGzGrsKjCDi2Zc9LZJ4/ufJi1MTxv7gtfevWqZXMqYFkAJSW19Oprr/n2d785MTXemW7Xa2m30zWkBmZsYrRvoF6vpfWU+5qNet3UEmOMIUNpatJamhpzwooTz1p99v1b7z20e/SUM1cyQyl0ONDY/z+oc6qO9olcGgCOhrSIKpjZJIacAKzqqMTxHvs9rnmxoscgMrSE9HILxxQtkdJXdpBSyLsUk9eHqG9OY8vWxzp5xoYMmURpcnry81/88l/97d9s2Lh2qC970+suft1PvPyUM05oDKRW865QJkbZkBgQsSEVF1R0bNrBIlCxJJlIZvN2ZttZ1s7sdCdrdbtTedbOs3Y373bzdpZ3bd7utLNOnmV51s1FoQ6EhNS7lyFKxcq+lk1ZcwigLN6wEAnhMyIHVupMd5f1HafmP2YVwFMpBDhnH31sS/+8gaSPVZ2LZsvRD+QMogQxwSbSLAEsjLgGma2P7W13aCDRWkr9XF81b/5zrzh/sG+gdwOU9jUvueqaRx7dfM/9j027LLFKVFNSH4IUUtRp1fmnuTqLE8MxBluU8oY2C4jdfkoCvuB+NOytcA5QoIqveGKfw0lFlOAYkMUInEeZORpsZJ+a6LEpppsixBoAIvatfjyHRA4uGUjPuvzM+76xTqR72rmnX3ntpc1GDB8UbS/9x5lOOOGEhQsWb9my1TCnaa2W1jqSqbrRw4cHBvqH5/Q1G321WmIMpUmaGJPWEjYmSUxiuK9/4AXXvPjO9Xfv2ba/O31ms+aNTVWtdNAMKO0Xxfs6CO3uKWR8+qZoKvDBfII4FY63RDzYclV69ULPLRPVA5NKLCQo7qRSdxythivankJ8hdWppINJ30Dt4L4DE2PTjcXNVnvsq1/44t//70/ftf7e1HSe+6Jz3/ML159/4WpTMybRrmQgtmKYmIhUjADWCrOBgToQs4OKiojrWDtts6lut9PptLvdVjdrZbad5a1u1s5sx+bd3GZd221neW67WabilKBOmBgMqFPfg0MhTATyDYO8CqDQca+83iAQkYioRdZyq847ua+v7/jUALMK4CkVIX1ow4YTTlnRGEhBHQ5mEAPAUSb/MVz94rGlojmO62h747bttaR28iDqjLmUrV46fNopJ5JSQc4UBuPgshWrzz1/yYKvT4+NwYoSKXPXKkity9JBnHXOiawZB+zXMgmz1x0gzw0HUJnhqlDPMSOWFUXuhskoNExN8cQPCqococ6/ynUFlaJBH3mtEb0hBUAhgF4eiSEncJQK8tUXntrXb9pZtmzp0sGBfhWnxDF4XaguJeJ6vTZv3kJjak5barVWq3eznIgnJicUbniwv95oJoaTNDXGpIbZEJskSbnerNXr9csvuWLuwJxH1m9+3pFLm3MHCNB4XAonMcMnZC759SUl5p7UV1XPhCRArZ6ilUdnCOVI
ySJ0UMpM9EdE+tDYtbqDcPK9HE+4BkHtlvgPISiEQKopzZ0/tO/Agd3bDnzvazf9z3/8wwcfWA/C5dcu/vmffOtzrrlwzvw+C2fhOiChlJCYBCJW2QFCRIZCEW8O5Lmbdlmr05ludaayfDqznTzvZt1OlnWzvOtsN887eW670s263Sxz1kLEWT//jkSEfBABChUiaAi6kFcDUHjvkHxs2jdm9TebqEINcz2t5227aP4S1ZkLepzIrAJ4KkVVZPPmR1ZffCKnjjk8yFpQtr3mWG8goALkVIQVnTM8MTY5vmffipqeAszpS+YN8JLBtDZ3Ptj4B6DaYM6YZOXypacsHmod2DM5gXY3UzI11q7BVNedsHLhuWedBBHyiXLFTgubsQz7HkX7zHhZHH8wcRU+0q0QhGG73titfLUHxXrXQcqKAh8gJlIXhpoHRUDl90SVwMQMcStPWrJw+fwdo4czznLVXEw9icY5+Q0ogaGwzkhSm5bcgkFKjLSWZpmdmBjLsu7c+fPSmievKTGmliRgpIaTJDEgJpxx+ukL5y44sm9PdzpXqwqnpGEgge+f4wumY3oTASp+YglCppC/H7xT5lcp1AhDK8tZLk7Zd6NCa8V/oosV1q2SdyuRais8uKBHoUrMWknkjR8jwFBO5595zmA2593v/pmNGzeItpefN/T2n7jqTW98ycrFczlxbWnnTIKaU1aqiSUyIsSefHe5dQ6Zc1Pd9nTbTnWzqazTyfJ2lrUy185tZm1m8zy3xX/WWpuLc05EVJUBUhb15BQXT4N3AwMtKtG7i+5o0HUB+0EgPxhCFZpD2nZ4cJiPywAAZhXAUyY+gimkh0b3X716jWki+PwUDeDHsUCKO1OrkKgARARIahNjk43Dhy9eNHzWQNpUrklnmIFmUwPrUkELEANzBvpX9kPng4b6YGpTmYyqbh6bOtKVKy86d8Hc/kmZNjWQKyhlKjcR7NB4wAE7iI5x/BGuA/Ht+RtSwPh2Lb4/DCoJ6kXSPMEpuEyMUfgOXx5D/XFATVAnDggjFWM8FT4hlpSc2oEFzVPPXbVn8/jWTdu27T2IxcNDfQM1JkMEQESVWBRZBxsfO7hz76F2JkpgSihh0wCcs8h37d35ypWv6nbbYi1zWktYIWw4YUNEaZomCc2dO/zcy6/656//w8iuIyecMV+JvR4RCQhNyrFswS8umEOCq1aiKn56mYgQk4epYCj05IOipHSqv5YqUilUi2jVX/KZPQKNix0uD2Ixmki457yp7KurlblOzQWNhet3rF+/7sG87eYODz7nJZe95k0vuOY5q02zO9ltdfMuakk7k9x1lQlpnlvJutY5dPI8y/K802l18uncdqzNcpdZlznJnMud7bo8s846l3Wts9bmubPO5SIqSlABHDy/D8STiismJZeHSD+GJ0rV+JWjyJuGmxhsDKnVqfF23pWFC5ccn+Y/ZhXAUyYEEsjGh9dPtcfmzhuOfG+AIVTjb0f59kcn20TaWJyzozv2LzF8Qi1ZPJj0Q6XDJjVI0uLTcQchKNtf57l1xRxTp8TUkw4aB0CPPDCazO+//KWXuSQMKSm/WrHLKbJK5XsaUz1LIzTsLlhiMXtUC2UWc2GYWPwA9xDaFcQnNDoa4SQqCYyFnYyA9DARCqhIl2RlkIpAidKaOffyM2/97vqt27d99CMfO2Fx39lnnXvK8pPPOfvMgaFmalIIso5s23Xwc5//4mPrH6JuzkScsChYDBujTjc+tqHZV+/rqzm1YokgDCIDBjE4bSSkmqb03Oc89zNf/b/7dh8RB2J/iYVK+Om5IPE8JDBbIeKhvoEOA0X6Z5jLWb0oPUsdt1Qp0uu5hSKeR12g7JuIVI+iV8n7+jQCjJIx9SY1aALf/d73Nt74SJrS2c855eWvuu7ca85evHRg59iITLccHCXoijjl3Elmu1bzTFwmaq1mos6pOues5opcxDrJxeVWurnNXN7Jsqxrbe7EiopTb+QrAP+S
2LcmDMOmtfBKdMZt592/kjz0d6sQeZIIVDxnSsbw1PhUgnTxwiU/dH/TH3eZVQBPnRB0w4aHNenOX9wPzdWbdyWzA4+mx/QFingn4nPAMAROBQc37lrZ4FUDZrCW9zmrtUSycZme0Maw35AE2xoASK2bGmugO7+fTeLypMssQ5QKt0887YQzLzoxRwesTlxSFByU7HH4QTMOrOdD8YGMz2URpqUYUQw9xgIT7nnb8GSWVUuhhxjFpkB+Z7FVGoqsyRAR5lA74H0KhKiDEkEz1znpvJOGVgxP7hs/a/7Aiv7OxMYb71z7vRv/La8NDvXPWYCkeWRsYvPWjUd27+nTzpIhl7ddK1cLAiEXUqbHHnv0yNjoqhOWGmabi4ozxKKOiAlkEs5zC8aF569ZOHfJlo17Oq1Oo1/BQQuFow+KrLe9Z1y04PAEV9FTMj4xqKDHZkB/9UWPUqleCSUIERM7KymMy3IhTRvm6AHong5jw86332E2qDfRMK366O4Dd3zz+52JzkUvPOesC85ZcNrC/pXDm6a37tqVNgVKAmJi01XrZ7qJiJJ1JFYB+HpHcioi4nJk1mVZ1rV5t+OyLMts5kRUIE5jRMhrQhcy9n3jNhSdSSiuofb0nCrrKoqwk7c6SgezzDNTNZx0Wt1G0jj55JOfwAV/dsusAnjqRBTr1t2/5KShdFAKG0XDo14wuV6ir19RDQVFQH6cuSosWctH9o3Nr1F/U+oJsSEyppOPZft2pnOXwKToua8Fkk+N7KolebPJYtCtiSZuero9pfaa51/eN8wt2yYmKmO/5VFoz7ao55+jHx6tvh8iyN7UrRhuxCEmqQU/G7DPH66UcYiI/RVNE0zq0gTU6CP4tWMicZxrPn/1nIGV9YlD7auvPPU5F54wPXFwZN+BHbt2b9m8c2rPVsm12e2e3B0/YdCCeKxjxigfy6SVq6XEkebg0fHxdQ89fOKJyzmlRmoUBgKiBKHqTFnZMBYuWnLW6ec8vPv+rJ3VJCGjIApJTV45xVUMSYoxFyh6TKShiZqCwBxiACgGdFbdqJkLXlHYFaePfP9SAZMhR63x6aSe1JsNJzaQRQEpFeKrcNUIamgkLq1xbezA2EN33HVw554Vy5aeds3qhScv5aH6ATc6emicEmrW6k3Ua0lKKYvC1ANjRaSGIYBTiKiqdU4ya7Mst5nzGZxZbp1TiO8U7QlRUn/5Q/erkLhTlFNUrnD4rbiRgl9Z/iXcCF4bVPlTr2dZCaLTE9NzBhb09/XNegCz8l8rCiiZ22+958LXnJbUcgWIA9gBqGDsDCSNOBvslh4iwMC4rH5w8/6zB5uNxNWhKRylQonuuPNb55x6ChpzgVpPhWnWPbx/a427zQagtgtTqzd3bztCcxvPfdH5uWRqlFFy7yW2HmUvRtuqqrSK7gyItrxq9e89//j5wIVT7n123+cz7Fnjcxx3qYrq9hxFUkpUisefQKriaWIGK3E6qKecvXzv7XfpoV1D+fx5A3bZclrVrK/qYmrE5S3bHWuPdvPMyGQ3m6jlh2r5EcJ0Ql1HfXnt0Fh3Isu+e/Mt1730hUkaGKeC1PHHzkzEqDfql118xT2P3D12ZGJo2SL4ob6+KaiYaJNWObVyKcPpIZJfPqBBocspeuK9R4uWm4tbJF9NERaRCIZB40emBuf2D/g8KO9CxaQgZVUSFtTRTPIGd+iBu+7fsW3T8PDwtS+7qm/xQhpKRnXK6ZRJeHJa6sZ0udtN67VawjZlkxiHlEkFIg7qnIhVss52bZbneZ45m2c2z52zvuI3Fm4ogSHSkw5FPtirrMTw/Z41TnwujP3qCR+1GjHNoLCvNNbZsRrmhLp0eN/oWSetaQ40jxXEOi5kVgE8RSLQkUNTubjlKxYkNQE40CxU4GyZmx347ao5AwBEJKBIAhkLy90RW580Q0OcGscqzJzWSNGe3rJ293c/u+LqV1L/MqUa/NBBRefwzrEdG5pGGzVyQs4Q6rx7auqk
S85ZcsqCDNPwliCUYFA+aMegWsNBFz8QYaz8eOUcKt/yh484MSuEdSPtXLR5iIRTWJpYORTg0W8/BiqjwigtXw4ZRgR2lig5+4IzbzN37d2xU90ZyCdqdmwOj7WSCeoeanctO9fOWgwkCdcZdcODlrqubqU5OUjrWvtbubv7rrtb7Xaz0Uc+raWo4CIA4JQJqNXo8ksu+4t/NPv3ja44b0lkdByUi6SnaMHHAC1QcFsKFX+WvgLOK4uC8tIqTPW+rt4miLeS747tHUdSp5Y1aU+0B4b7AN/OtZwpDfhQOlgJTnc9tmPL+kfI2iuuumTOqmWuwS10BS1KSRWu49ghg3OUdUyHCAmZmkko6HRyIiJqnXNirYgTp6LOxfo+YgY79vaBkk/QIVXtte2VfbsJEMQnART3U9Uzrt6bke8paMMwVqDkIBWkIpIqUZuy8c6ShYvouOwD6mVWATx1sm3Xtqnu1NIVCyk+mwBQVLzGNPjSLCwkUichsMoAqbBNkvr+3ROYlub8OpM1CYFTlyTMef/k/vG7v07Illz+Gh5cgiRVZ7LpyYlH78TYrv5EayDrkNeSw1PTh/PsNdc9r5YqkZIx5Aq+payrrx5PQcP716TFzEINoBWedFNEHv33Cvu3SMsID2fcPsMzFmGco8TgRVUPUeEISGTJAjfkK2xLA5EIgGNDVvMTTljSHOzfuG13K5vuT1vkcnSpRk2yabvVyTMlTr37kRoaTLQubC2JQ95gXjUv3zl2YOem+9eufcE1z0WiPru+cp2CPW0YZ5x2+py+udsfG1nzfE5rBBJRYlZyQmCKDS2kOMIC5aM6UN9ZjUiKxMXyv16joMeBrOpXqnyWVZQShhXNJW9lqZhq0BcU9A/BAFyrmUcffKQ91jr1/JNXnnJCN3WTacexGCUVZQ1kXAImgrPIxRKBJe/4BFZAQ8Mnr7ElkPEKGC7C+36Gi68L92dSIDtF/y8kR7EKIF5vxNsnHvexTj4sDHG8QI4UYGiY4wYiYoF1nVHRtq5Zc15i+DiF/1kF8JQJgQ7s36cm6xtqMFSVON7sMbVSq89k+U9PhjIJe59ZyEhS55HdB2u5qbPWWDgJrRZSdim1JEv2r7tldOzgsrOfP7zyTDGNkUfuOfTALTXb4VQNJ2TTpJbu3HU4a9bPOP8MNoAaOIsKOswkfuJhzWQxZrztn7wwtkaLfFeU5n5ZMwzEMDA89e3Vnfpp9t4oLoneIgUkpjYilsSVfkiwnn1LT1JF4mjZkvnzli18YOvW1nS7mSJribY44X517DpCwsZQYlicGCcNp3VAEkHiVNG/au60w92HRv/ti1+45pqrDIAQUizrhwr13exrDg3M273zCJuGSg7NFWCkBJCGmiwJx8vqk1MicEKhSqKiqgIXFGlMH1LtdaiqEhRt1IhBCVOINEBJJQWbnKSdJZxQBWljHCCcTKeVnXjSickpRtO0lXQdORUYNYCwC0cDJacKghKRhH5NLmryqLR8j47Y9iIUO4dMTmJf2Ec9XyDfLrE4AxQPhc8fKDXkjJXo8Z17bkqNlQ5M4YYECUETMSMHpijjk044MTZiPB61wKwCeIrEuXzjow/OXTHQHKip+sZo/gYv7r4Z6D9TvFHFoTiXhFUlO7B3u+t2TZ6mnDALpyQKgmEmdGUgP6jTRw5tXz8xvEhMfWLsSD5+qM4Q32kGmjYaDx85cvLla+YuHBTqGoKIqgSNFBXUzGdjxu+BegnmfagQBgmRg5IR78dT6O4Z3IQQ3Q0hvxAippC+IUjAAnVFbqz0OCER3Nlvp0C+oEYRiQ8oIAwWlWZ/87xLzrz5s9/dtWv/wlWLKJ/MwS3X6mgHrJwQKYlTGD/1MWFiZeJEEkYf2YtOnN9m9+2vf33fb3541eJ5GuorCsgJNBVA/YNm/rx5Dx/Y0Z3OmzUGiNgHQn1eKJQ8IcQ+B1Z8Wz0CSWCoEbpziEIgMY1WVQvkO/YdUvDhEWJD2qcmhtVJIiZrZ5JR
YhqkFMpmg54Gk7cslGoApblR0cyTRIkynGhRj6YFOGvRiCOop+DYaInplSLEEO9HYe4XTnCvWzPjtgegcb2rZXBVM2SGEeI/SMUoOyKAyYX5qaIMk6oZPTDR1xg66eQTUX36jjM5TmPfT72wMTt2b5uzeJATg9Ljhb9hK3cfRXM4/C28W8kDBymzJiKJ5lMTByfaUw6JMwlISDRJmdmItcZ2a7abtFuYOJjt3tDZs4EndqaSM9QwO0OcajvvHARd+LznpmkOyVWViJlizk3kUX/AuZEWR01a5Su8ViiGuMAbjcU2C3DQGAf20FI1G4MGirUGFQqA4r793sqD1LidQhIxrO7855ztUrrv3keN65dpm0+08+mWZJlhY7hGwuzU5KgJp8QpoZYgZVejbNB0lw3wpScvwsS+b33920XQoYpYChGFc9i+Y99Dj9y3cN5cVuOciKqhRDVSJwr4JB+/tuILWAP9LaoiPRhbQbbiFojzFmJmS3mvaO8RFf/6hs1Os+kOhE2SxFADEFrmEADyuWlshER89ZoSOR9RBxCa7UCLtSaScieBwSqOKVzh8ioUp18e7VESUn4Kd8jfAlT8qXLCR39dqy9i3DyYFf7QhRjMbIS5g8lDk8sWL583b16RKHocyqwCeIoks90DY3vnLRlmE0zraLRyBTE9++G/4dM/4tNd4oHPomeSxDEGFqf7UtmbJdO2z1GNE8fe8ktJDByp/92CYLuJE4gzhATMbCip7Z7sdAf6zzjnpES7oiJOy+fmiYH/qL9RJaQZ4nVEQiqswiIsakTZKUvxpBYhAV+TG1Ev/BcYfo0fDN8g+PQR9ZRzSZ6UIYZSCXnAUQVZsavPX7hoxeAt9z2SteskLNl4d2KEXSdh8VQ1EyfGGMPGwKTECThRk4Co228mTxziy06a/2+f/uR0blFgMQgg5wMhRA88+NhLXvF8Hjzyxnc9P20KEQjG5RLij+G4vLEdWRotzjMo3VjtQIGZJ9942QdIZtKCx7pGcUli9ySVkILa6WTGmLRelxkeXIHOvjxb1QAkcfkqn6zejD1K0CO7X0gtYLy6jzgGIdjxGq9Pz8+qSivq5uJUTCrYsN7T7nWd48WvGPUEsJI6CICUUmPZtVx7sn3Ouef0NRvlvXL8yawCeIrEip2YGBuaU3oAJab3GCAanwwtfgZbmUJjLgAQIk0kwYUvOXv4wuXfOXJkG/dlSR9MYtSaMIyQACERKJMjOBg1RmAoMTA1TqRee/jw2IkXnrZw8QCp88n/IuqtvxkZPJUD9D+LZ7K0QIPhFZnawkhVimgWf6lsjnqf+rCHYA9XEkOOEYwuuYTqkVVf+r2LqDrVofnpc15wwf079+4Zmcggeaa2bTVTFZO1RZ2QgMGJ4TQlTpRZDSmRJMY2tTU3za49ebnd9cjae+4QKffh4JMn8ed/88+v/ulXLT93zgc/8e6V58wHtRGYLiZwKPUt+HwVpdCXLeb7hCwnjYlAsfdDacv3wn2h8GZeohnry+w5N2pNtpt9fZyGbqoxhZXCdaGK7e3KLdOMvRS6tvi98BRpBjQXny+ulvZup9QmM26DWHJe6v5jmBzR9SOt3kYV7Vg9Hn+qBAJSTdrjXZe5008/yyQpiiaHx5/MKoCnQhTIul0VOzDY54t7fKQS6DWV/P0aPOmep5hiYg0I6sFd1HXtaWtO+M2//OUL3/WqL+/f82iXp7ihUAPlkGZNhtloypQACZQMQE6ZmYhHsvoDo+7aF7yk7nKFEBNiQFJ7MvF7pdc8K0Cp0FshkAlIAOjYwFIqCB6sTl8iLDFBsrQMw0e0ZxlmHEvQN/F1JA0iGPlgQsArIYY6e+0Nl0sz+febvp113NSBKTclNdM0jutCKVMjoUYiNXZMeULSANWUE5gESc2gZtsnDZmXnrHsn//XXxSl1Q4wIHH46J/85fs/9v6XvPE5H/z9dy44pb/rjiDNocrgEgALvYkKpxUTF1XJtzzr9V1QBL+LeGZl
KSL0VeLDJQ4GRUokzDBi3eTExMDwIBkqcnDLGy5QOEErUPXQKutb6tpCTfVeEK0UfsT74piGREWlVFR/mSfQe57VTZT+Xakuom9V2WKkF0MZHimHZB+nRmujIy1Wc/rpqxNDsZns8SizCuApEdXc5kilb6gp4gggcKzRQaQ4/Scp4mDZ0cSjQCREIUoqrDC5SzuWuamXve6id/zNbz48D7ccPDRh+vLE+LxSgkKExLGCBCJKYEOsDGuTdfs644OrTj31pFS7KqriSNUfGAD1w2KOSTGgcsDwKCxV4iXCeFG46pRE/cA/37axYpcV1fzVAF9pFGo0m2fiCiJZXHhTRxmloeMSqSoYzumKUxetecGaz9xxy7797bwLq9LOunnecbZbS5Mk5SRRTiStJ7V6UmNTZ8NsGCRWUuf6pLNm8cKxh9ft2bQdAht399E//cRH/+5Df/APv3rDmy6f5nGrLTIG4rtUgJkpZjhpHPEVYa6ntE8iAVRQQZWSvIqZUDnR4t4pDeuKW+STi0UcCHlmO51Ovb+pfgBc+FZMuymCuaWhfgztP+NYgqcS36q4rYVGPtpB6X2LClUX0rpKq6PnU1WJLFBcR6Lqb/6PPbxQ0AFEIHWilJvDe47M7R9auWRZzyEffzKrAJ4SIWR516prNBsgqEqo7SwqYIuxMKVhWPG8PVaEWKkahU8kJTVOkRPBdOYuT2740C92zjn1lgP7xmt9kjZUAWaFr/AkASyRISXnEsEYGt88OH3uSy+cPzQFOJ9jLRqMVPVdemacQ0nwlAn9gBBcbNfseScGmJRJmbWuIZEbhDIzXABVFrADSfQQiEIbZNGiG11s7+mPIOoBVSlcp5IlK6HFBwYo/EcKqAgsqNNxb3r/DSP1yW/cc9+RNlvbVK2ZpJ40G9aIsjKhkaQpJSm4Blcjl7AxxhiTkLjUtppiL1019//+7R8qh1Fkv/eHv/8Pn//LP/vH961e02w3xtHUnEDEhslDjogowGAEXt2vhE9UDVFg8Wk/KlrCL4X0fN8VAhwWP9T3UnFnIJLjFPOJAF8z4nslKQgkaE+3c+uG5w8LaaleqgBcuBylg1a1xqs3Qu+LeAHoqDePRQr1ft/vpcw6K/f7hFZ5YYTEJODqDgtui+KCeOh3loBamkiOw/sOn3ryyavPPC20TD/6yI8PmVUAT4V4my7LbLfVNsxEJL4FSsUALMxfLUyaUkJ6DUDwLYX9f6zqVDx0qVDTXPPzb5k8a8V3tu4Z0SFJ+gUsRn0/FhCDObMWoLx/8KaRkZ19jWuuv1bZgQ1EQxKIlo9UZCJ6rP3yWIvWy/4lxcafUIQcDIaqCJEaUiZNWJgcGzCDwsg+CURN5EFi54IKdxB8orD3imVXTd/ubV6Ego5WghBASuQk6QgPzxv4qQ+9+cuPrtt4eFrcYIJ+pjoZTZrgmqapL6YAAwwxoSsnJymnUeEluwABAABJREFUCVKVppGThwYf+M7Xdm3fScBXv/3Nv//CX7//o+9ZdeqSnNua5DmFAIH45jbBMQodVj1GB8euEsImZu9GCZUhVCeqHGC6+GIPMs6gZRBDJnERRURJmZkcWlNtkyRcN8Tq28yhXLIYoJ0BgiEuoTPfDC9m3BnRaz1aM8wUmvFv/GKpDJ6cPU5ex0ZfuZIqVWwt3B5KRExMgFHTnbCdqez01ac36zUmc4wTP25kVgE8JaKa59ZaQWpEHEiIo33iH6jYCLkw7opxLBrT63w/lEC1kIi68ASKaOBrVGrNF3/gl6bOXv3VDbvGzLycyZLAdBTtGgs5Q1Sfv/TEmx4e+fzDm9/y7retGOjA95cIWIzCqKxk41XwFPGPhdpSghpVBoRJiSEEp05JDVnDkmkNZoi0j/I6SdPogJFh2EbWyZ3kgKiEcR/qYwEQIRWIRkM+8hWxSU44gmikBqNRCw7bL15oteorGQSkKiQ20TzvPP+VL3rTr/3clx5cu1+sSQZNLpzniUo9oaSmaaKGchJL
qkaRqk0oNyRJAuK8Lq2lTX3peee85y1v+avP/suvfOJD7/3dnzz54kUtN+0goqJO2A/58o4UxZHvRcKqIlryPr+FiYyPjTsV+OB3dGAQgVVLy74Mm1YBFJWLV6huMuQHMdbRmDg0NTA0mPYllvKiTs7zRAVvFC/50eir5QtfrVHde3ljHOvmf2JsnZHQVNXiP0hoxsve7CMKSxDuCigZZhZmaw7uP8Jizj3vPKou5nEps4VgT4kQsiyTwEkI4CIMMCBEXLmXoyddxIjD5FhV+KHqUDgoMRJVqDoGE5ELSgDK/PL3vvnr2Re+eP+Dzzlj2Yn9C7U9leWSoS4mSRYO/Z9bHv6HRzdd+otvO++yExI3Zl1NySoQ0y5KlI3H7nG+aC9Qmo6BQiD28CyQ4HCrEis7bXXb37t9w4DOXT53sXa7R6ZbW3fv3bTlkb5hXHPt+WeefVpST1nL1sQ+WxDlA11SuiXRr5WGcNVlK2gBCq2EikiCQkVBJE4sYEYPT5z+wkuWnLzw//32751Tn3fpipPTTo1UNW/BaMKcNlJRB0sAG8PKhCQRAtVFBGlNhvrTfQcf/e2PffCjf/FbJ57cdPkYJU58y3n4HjfhKqoPqxTr6+tf/YH5xnca2qHF/MnAiaiS+vbIRMUZRRUYlXJp+x/T6FYGCUhVjfLU6OSiZctg4sCxgjjXGQZwxYc4+k4uVY/OVBNaPQ76QSgeLf7CvtDyAj4ZmYn+PfRT9EH9L6EEnQjKRDU1+7cdanLziisuN8F1PU75H8wqgKdMnErX5lPTLWNIRRQgEwYTFlPhe+zsALEUrUXPymtkhsXPWAcDECdKYIEIQV1OfbWX/Orbv/m//vEz377jmtWXnjp/lUlp38jh7QdHv3PL/Q+3D//ER3/m4mvO65dxB3EwgVyO7ZULBCCfpNP7XHltUBQroLAXSclPe/Q5j1BiY1A7vPfQ33/60/V2vZ70jRwez5Gfd/np173qurPPO5nY+Zoor3rCEEotHt2i9LSqeCr54+VzWz3CGEz2Bq6W0OBUCJ6Uspm26quH3vGFv77p7//PZ7933xnNhafOWTyvb2nNtbXTISJjRFKffG+QmrYxyixwIjKZH167Z9tFLzjl3W9/zeBKbpsJQ13xNaY+/yocXZm07i8z4ohOqqJVGNAoAidwUpaIS1DpiC5XCb4z4KpyzxT1dCCARFQZSWqmD0+1JqeH581xJEqiscoiLFbU59ASD48F79XvYObfw63xZBG8d7+Vo/7hLPJiU/Gf+FT5lwQIBAwjuY4dGjt79RknnnQCF6UJx6vMKoCnQnwqSifrdrOuqDVUNDAgYlYRPy4X6qupOBi/RVAMgM9kiLwCiEVDraYr4FKVyQlxy3abdXP9e95858pln/0/X7OZG0LiJltukOpXnPrhd//msiX1hnSdgzIrrBY4GczTkJxSYG3PP2FkTcUPoFhmCRAM1Imf2cvaHKq//SdeP0BDn/rkZ0ZHJlecsPTNb3vNtS+7lOZwhmkhS4WTowBYEbqD9aZl/P/svXeYZdlVH/pba59z761bubqqq3NP98x0z/TkPNOTk2ak0UigDCIYCWz8jA3YgMHvAfZz+gzm2WAERjbI8EBkhISEAsoaTdQkTZ7pnFN1daUbztl7rffH3vucc6uqJwjB9/mp9teh6t4T9tnnnBV+67fW8pdd/BpCu0vsxdJjQKSUgBAtbF9dHoDmDAPTtguG2zf+8Jvk3ste+txDX3z8xeQQT6TN4UbDqGZ5VwggyhRtdFsJLOc83OxfNbZm24Y7v++tndHxfCDtmIWcM6tCKkogMLyWKRatBFQiyhIFE0XKl/p8DRUrzqkTCAgOzsKJV/ykBewSvIslBrpSNKQLH1IJBIGwJvMzbXXcHO5T4xCgdhRSO0ynUFzloQtFS70fLf+UUw96VB7wrIMieNWjwV97LFGBld0rhyKKbxlIWQlEzsyfbmcL3SuvuKJeS2npXt9hY0UB/P0MEiVn
tTXXgYAShe+AqMSGFTD+1WGoKlOoc+nfdyHfDpvUBR3gESP/LcF3E2dAVISInQiY2m5G077r33XvBbfd8cTDj6m1W7dsHVwzjFX9pKep08ryzBiTEllxQeiHw4cDBxtyUTw6ZKZWPP0CbqaAXMB7AaoCtlmeIH3v+9+6qm/gxW++8v7ve9/45pF5mmrnrSStMUEVhlggCM2diqijlvKklCQlb3apmFr2JS6JQv5iVBXKyo6kZlJnu21Xo4lztv3gBTve0Z09enLqyNFjJ063Wws1w5ykDdMYHhmcHBlpDo80BhuuL6X+OgSzkoPQlgUylGgg2CL2N0CpMr1BTKQMKCspCZUAlu/Ly1BAjbhcnfhSEEysCgiRUNSsCKUYyFfoqUjawnAOjkXEvQVsSJRY+PTx6YGBQdOXWM60Wka1fEKrgEn1hvfo/0WWdnX1l679WT7v3aTQAa97EMXKUa9yzGJjAFAJNU4Sk9UPHzzMTq++9mpjuHDRvmPHigL4+xgEHRgeVJO2Wl2lkB0Khlg1ojUFCwyngiSps+GUlOHgRETFsrOaq4CYBBAfExVRQ0FMwJdZExBcKCSpSjSfZR2Z5770qnuuJko6nWwBOXVOsczXkSRpH0Gds8RJbMEqMWnGzxmMYHwjEFHC5wEWCgZ3ASJwKG3j91cQITFJbtuOs7vffdPd77k177ZnMZVRK22mPqIdPPMg56OPs8gWjZa7apEsFr59FdkTlVk0AcN+AVMzSk7EoK9LmiXMmteMNjas2bBh7WZKmAROLamqOnEKUuU2wYFs5ogoMeRsliTsBAplqKohBokviR8bqxdAWgGtlSEWL61JfakGQFSsc05ECQLxmYKhIGr0vQq8Wwv/pwJzxMNH494nAwqleXrq0Knx9euRKkgK/6T6gMaDLZH+PXpm0c+0+IMerbLEQzvbrXpdowR3gnf8WqOKXhKTwKqYmk2O7joy0By46eabjTFLFuI7bqwogL+fQSnXTFo/dXoOqjAgESaipI8cC9dnZrITu7qvPPGynj6aLbQM583B/sFVg6vWTgyvXTWwerQ+PJAn1jWEuGuMqFUwORFSQ+xEHIjUkKgiFwKJgRMn2iXt2FYOFicuMSBhQt0FSUiWYNRVWhGEl55itLLMzlLfSdy/Mq6Q+lrUdhZ4sJXIRGKJOs01UWXpykwuQokRFaMJKzvnvBnLPphBpYBXrxfgw77RW6IC1Y0pcnG65eTDRz3vPlAoKSZWp0ogC/GGZKJwCuTCDBWfrZZDfFqDE6NqSEKRUBJVw5G8adTCMQxp4OiTKhUZDN4Q90rKezog5VAUiKhYQAarwtdbUyOSKpgoy6wIuZwN4IQgCNV8IBUVWfECCq4AKGxKSkzkqGFr2Zmss9AdXzcCY6PriKp+jKvWM5aATEvH8qjQYteMzhI1QOUeViGgs4QBSsjxbIdaNI3eyLYBGaJ81s4en92544bVE2MUAmnf0WNFAfx9DAL19w+MDq86dfqMY9RYNJM61eDkxef3f+7jX5l6dn96oHv1udvX9Q8ODQwmtT6XSef4ialH97y0MDedt3Wg3tw2sfGqc865aPPI2vFGbTBH5jS35IxvfkWSixJ7CiLZzAGwQuJckrCX9wk4ArWaWwlIDZniPS2seoEHJgqmv1aUhFRlRWFaw5OZPIKgsXiwACHFlNQ5JMTCqhCJHV/hSw9VX18lIqm8zBrBm0AEoqqYWE4aVISIFtg4EPWI/5CZWNUBSC2gQmBHjhjio7K+SZW/eS4clgp0XYSZVMWvksZcovgvQZ3Cc+1JISBmgJVJKVE2vjqQ38GJsloFpbRmYE2fzM21W23Tme92c2eFjEfzCzKsz2pWEPkKzRoA+mLZQ/0gZRCYTJonx4+dguG+4T7fSqiAxMrlKv8ptUgVVzrreI3vX58TQL1y/yybF18uf8LiAmjxxwT4CoxJZs4cne/OZtdce02tZlakP1YUwN/PIHA9qZ236ZyHd325NTPfHGv2
m7EDT+z/09/+9MGn9127ZeP92y7beGn/gKqhJK0xsZIycZqYMavkOD05Ozs913nkw198rD2fTfRf9447Lr3zimZ/ynDOWpiaiFVRUutUSQ0xGGwFbFjA6jLDCVQFjgE13iQV51mIoTcrsefvkJIQMcdKDuyhZwnQhI/ThixbKBFYqxTSgECreitM1UcxDRknAlKwFlkOQISYvQADopT35CApK6JpiQi9ht8eZHxEjigIPAUQW68pPB0IROS9Md+LXmMyhAIeOBZfNI0osHAVoACiM1jV191jQx5bSbwOZGJVYlanIKWaMSmSxBmSFFmKLG11cObM/Fx79siBY4cOHZo6M31qZmqg2RxZNbJm0/rBiYnhWidruDPzc4IUbASwVlNiYhJnQWBiEY16NkzSryPDWGtRIwNTR3pwz6HBsYGkny1Zr8CgS41yfwer/M+l8vgs6vY1H39a4l8sOlhhXSzSO+FXisco+WcFL7YSz6BqtCLQABgQX+kqodwc2n0o4eTW224lold/iL5DxooC+HsaCSXbt2z73CMfz1uSDDae/pvHP/6bnxuYqX3g6jvXJjIqmszNNurEXGOOOfzEmndYkZj6xjpvqI9dcuObz7Tx0vEjj/3eg5/5H7//vT/x/Rffdm2Wmrm8I0YoYaiqGIEQsZUQsXWixImI76xHTgFfSkFAEEfMUA/4O4i32dm/tUQcI5sOvqS9pn52GjrjAgTlkOhaCN2iG20kr4owyHCoeqDsCxSEegRAmXdGEcJZxCfU6GzErcO5zyJ/CoGgMWtAi7Qq79AAvv+thqzlwBYCUYjNhpkzIlU8TCvwtKBMPu+hCEYQVMT4Pl+JAUjVJmT6KDWWuy07dfz0K88dfOmp/a88f/DksZmFM52806HcJZoSm9yplTzLHFJggi+6css1t12zZmTjgm3Pd1u55oZBAg46C8pKzJ7QGfMk/NoDCjYMUrGuvZBNnTpz3pVbKIEWCV+I3sOiJSsC1Msv6/IYzCKN/Goa4fUK3XhjvfQvPo1zXhSrWDQFLY4BeJo1CGTZnZGp/afOWbvp0ksv9Q/9ig5YUQB/T8OQuey8Cxqozc/0vfjykd/7+d+5Yf1F111xQZNcw1rAijNiaqaRqlOFEDOYwawkDg5Ok4S0szBByZrJ/hvWXvvC9OY/+uU/fPjrz3/3T3ywPtjM8zmXUO4ECgMIqYP1bd29Te8LE4iSsoZSoQr4zlAeiukRIOI1kIF4sIIISgxw4JAQKVgDOiQSSzFwhaAoXjIGZRB76MK7FP6kBR1WYymJCN4rF4mqMWhZynw9u+TvGctDz1p+E5wC/3sV+kAQPCWAFJN3g09ErCyEPFEIWNjXvGAkzA6ASBfGKGrs6p2W/frfPPknH/nsqZeP6wKGmxgbbm7qH5zctKlhao1arV6rG2LD1HG21c2OTp0+cPLEc3+1+4UHjl1688Wbrzy/b2w4wYLVzKpYqDGkrE6FocQQ8WsbPB2PpzEBoITMiaOnWGR0cpXjQnguk0gX1/Z1OFi6zG+v42ZUdlgK/fdOofI5FRv3AHsoKJ7+WzrbEUTVMBJKkrw+fWy2M9e+4213DA40mcyK9MeKAvjWxrdgOySgi87fMoLhRz7z8vOf/fr16y68YcN5w9o1kvt2d6ZRz60YS8wgY8gpKUGEiJTJwKg4mK5zll23j9IrR8c33/Y9H3320d/4yV/4xz//j4Y3TpzJ264OdiQi3cxyQkwqAnIKkJIRhSMHb0UGl9/EtCMtnGqPfAuBSR2UIUxM6o32aN1HLEa9tlBlMgjQQqzgoAUWEymmFFsbagE1VKo/hrV9ldWtGvYROTrbKPHnReZhVZ0U2/gLA4iLbQhV+UOAb0wY1BUH5URCwiwq3pAnITVEiVIKk1Gyb9fBj/7nP9r90IF+Hblm7bZNk5P9tVrNCMEZ0YTgcsuwaoUM2MlgPZ3YsPaiDRtOzWUvHjj60mcef+EbT199+6XnXLK12azNYb5DzsZu
u9GkJzB5UzdqX1WGYU7EHNl3on9wqDnWFLaecOV1WqxMob0L8Tr4NVTQkpZBcapDl/lSF33R8+syQYf4u3+UejV6BfxB9Ucq/gGIWMmpaK2TnNx7sm5qV195XZLUvqWX+P+Hg5YkdK+MVxvB3qnyJV/3jvPt6fve8/bnX3zugnT4H157+ahbqLFNwH1pPROb9HHNoG5SA0Hiy6UREakoJaQkAuI0EUFKSF0iXdGk0R1IH5re/buPffkn/su/Gt+x6VRrhut1cRCnpOqLTyZgVaswjhRwnlYe3mCEtrpAlMlBNHAgLyoRwERMbMgQwbAEqN8ow3g4SCAGiWjReVsLymKkrQR4WrRo7xvqvsVScEpCvoRaWOOiPk1psheGe2F30hKBHqXIWR5srZaKqKSOLRGE0eOgkndYFvBWeM2ZJXCEVBlOTaJWhBKok37TZMEffeRTj33i4XW6utZOxoeHm6Y2UE+hTjRXCBRkyM9H1KmqYQOoy8VQwsQiyVSePXtgz0unTg6dM3b5nddOnjOWmfaCtETjA1g6SN55Ig/vMEyiTGfsl/78sXMu2Lr5xnNsX0cko1iwAqhCQIWeLPOYX+1RLrIGlhuExbekPOCr7bf0GMXP6PUCK1to+XPI+/M7BGSOwGoYNWmYo7Uv/eEDq2rDn/7C5zetGze0YvsCK8Xg3tBQwKlMz55pa+YEvszv69+70ei/6tJrTp9YmFi1PnWSkLJRZurmuRpSJefLgIFVDHzlIAETQ0HCiSbaBazaTNvOcZ3h8npn4caRje8479pf+8n/eOLxPaPJoMvzLuUgYmYIKVTI93dSKEgilVIhvolwcW0I1RQDVz7gASq+5KhILi63LrM2E2uhVtSKy0UEooADRJ0Tn8jq/QBVCNSBVNQSRMSKOA3SnYAi45lAKiRCqqFWQUXix7JolURfP6o2+mLsoPe3CG54VKpyzeHroiQALd03EkuooD+Rr/KppOBQ1ZWYiITJkXMNbhzdM/2hn/utp3/ngfs3X/v2i2/c2r9qvMYD9RzSATKosDKI1DEJwTG5hDXRnMQZYxIwxFkmO9YwO7dvv+eiy/uOu6/9v5/d/eVnajM6qM2UjCqU2KcKE+BJugwQkTIrwI5PHZ52zo1MjCvDiVUoc3zly0UoUite1aNCcT8QCaTVP5VtFi3ukvVcbix3+5bcgyItsQciio8HQmwqqjZVCrkTSrnMHpvN5/JLL71q7ZrVTCtyL4yVhXhjw5H81p/+rw/81D/8wtMPtJic+iSe1x4EMOj2m24xad+Bo8dzhTpRC+uE04RAIspgElLxYpoF6kgs1CmLGBFmIp/+hdR1yNokR55xK7v3/IvfsubSX/vpX5o9ams8BEsKI+J7lmimzpJCHatlDcWRFSQEV+Z/At5OD9FY/wMR+cLNnh8iArWiudPMua6VjpWOc12VTNF1YiGOoQxHcHBOnFNRVksigBpi5jpMTcg4sAOcr2sNVahwrP/sUwFUEdiXETSqjCh2SihfEcIQVfGjy0j0IC+DsqPygBRLjJbSiErN4Y+mHOArJUewRCKcoKbIyVgI+qneZ1Y99eCBD/+fH5576PAPX3X9PRu35IcPDKaZMapQB6uBSKu+4rRXsqxiFERKKqrqIJJSZqzTLIFsHhq8+4Idlw2vf+4Lz3z5jx7oHJShbKDPpXAea2MJDBkmJYhCJDFct8nhXUf7BwbGJoeJHQBfFhUaLy5ca1gIej0AUE/lNDqL4H6Vj2jJN1T5T3vlu1YmVplivHexnC4CTSvOX4sjskLFKNc43fPi3sTybbfcwiuQR2WsKIA3MBSYbc199jN/9bmvfOL7/un7/tP//OXjrdMKV6Aor7qvEvj6624cG10905q3eRaMF0AUDlADCxWBr/gvGgU2q4Q0I0B8PzBlISfOsSJBCsH8zP0XXXFFes5v/dx/wZQOykAipCJpUmMiJwokCC+ThugsVAlC7HNRQb5vjBSd3QMXSYMlBfVKQZRI
iJyqqFpAAOuQO4dEhNVCMmRCDgwhceosLCn3JQODjeHR5sRo/+RoY81QbcygrkwudCtwohIqXoun7igACaWRF6HJ1T/L+AUVqaQVo7YUCwhhiorIC95IcYDKIbU8G1EhglRYLEkCTgBmA4JRSrL0gT/76p//h49snKYP3HTntWs2LRw9vrAwp8YAREKs7Bed1OdNh9rPSr40iPpb5AvrQb0bp5B8oMbXnLf1tvMulkPzD3z0M6efOzpoGw1NIFAEJ0xFWZHAiBMSdOc7J46enFi/2gySJgqAmEWKHL4CkKma8K+N/1SAmMW2f++GZ/tYl25IYdFL6b+4BsmrjeV1DcH7ZCRd5GfszInW+OjE29/73YmhlQyAYqwsxOsd/rHdtXv/oYNHLr10w7nbR/7b//hPP/qLP/bIC093VPOzJjuGQSAmXjW26t577pvrtDPN4nMrzlPj1RE5DyarMNRHGUXJxfR9361XABJfdEbUQXPNUwG3uz94y62DLx/8/Ec/XqeBJAeIujlE60w1VdbKi8UKBrNyKglrQpKypCycKqVKJgAbREocSxr4ZKSgpZiImIRJCJJAE0M1J3DqlH1VHOfUgg2bgRpN1O2qY3uyD/37v/rAW//tB97+f//Gf/rY809NOzfRSMeHasMNTgyEYH2Qj8hXxCOGQSA9LuJs90orbx2Wlugi3IACrWkZMVGohFBbhgo5FM+w6LQa/AcF+ch4QnAER4CKuHTok3/xtS/++p9dz83vvWz7ef3JwvzsK6dPzyXIufB1iqhbkWMdj05AaBTn1YAaBTmoWLAKHEl386qRuy+9aG1S/9pffGX3N15q2lq/1IxlA/axfSdKihonxvG+XfuE3OR5a7N6brlr2NeeI4CjTq/+WxHrVLnm3hVYYvDT8lue1TVY/MGrCvqzKxEU2FXvZt4PIAoKFZRSmmrj2IGphbn2zltuGWg2eSUBoDJWIiGvdxDggGdefmW203rH7XdefsNF33zp4Cc/9bkf+8V/8MPf80+++953r+obScB81qeZABjgPW/7rs/+yR8cm5lbu7q/BgKR2NzAwMApSQlkKJwApL5XIwmr8QJIPLlZQSCxQiAlSWH7MveP7r7vZ//0ry666frzLpqwDsQs6hhKrCKBEk8ASESFyEAhEookM3txGbwZz03yYCoF9g4UapgUIiKGA+3HkQMZkFqoIWElEU41YWrML5gXH3/xS3/x5acefc4u1OtmKOHkM7sf/MQff2FkTd+VN++49fZrN5+3eXB02KHdzTo5ulZcoCGp+lw0EQ29CtArmJV6JVVvcWNUN69WTdACUNZyz2Vt4OB8+JIOPi7ig5FOhCkB+SxhK5YIjc/98Ve/8ut/ee/4mnu2XTiesltoHZjvHrcdRwnDSLwMlCWTl8Qi4tn89KIDQwKvwZUkG2rUbjhv+8DBI89+7un507Pbb76kOZR2JAOYWHM4MTCW3YIeePnQqrVr+ieaypmoLxYOw1SE3RHLeFD1giszeU3kvrLHq6FAtOhulIuw7PEXo3ZnHwHwocA9QLwWBcCGoeJy13CNQ7uO1NPGjTff2Neoh8yAlQFgRQG8oaGQvYdfpGZ3/ZZVmzZPTKweveryrQ9++cu/8Yl//YXHv/RD7/7RnRddN5A0CUjPdgClnTuvveii7XuP7Nkxur1ZYxgwGVX2lQ4cKcUW6qRKSqIGxJUMHQJKeroChkkAFZtCJurNu9dd8me/9tFf/M2frEvSZo+gWwOGkoAU3vxXJo8IW2dAyqQ+McCzyw2RjwAoR2g9WOKC0PyFidiLXAeCsBJZNkQgI32pmvnTrSe+8fIXPv3Y7kde6rON9cPnNIf76qaWEPleuKcWZh7+iye/8rFH15yz7sY7r7py5yUbN61JTLdLLasZoKEkAwgM8ikDosGcL1/2uLD+3wIBLkVr/IiCzIm7lckEheVbgBE9ToMvvhn39NsYkypE4VhdApX68Gf/5KEv/erHb1+9+U0XbJlQ
pq7MUXZsbtY6SpUcnM+7KCpBR+9lKeQSrjEqPfWVJxQAQ1lJpE585dbNI0N9jzy+a6Ezf/GdN/YN9bWplbPVBKpSw9CBfYdnFlrbrruU+yRjV5j4PvZeSGSf2YZFS9mjE6nyRTni18WRFjlfPVsu2duvxNJ0hGUdtSWb9BzOL2ClGHZIJgkhA1Vpz3RmT81uXLvlrnvuMKy0EgGujBUF8AaGQqamT1IN6UAC2IQxNlS/+803r92x+aGHXvyJX/5HV6299p1veedN19+8qm+EQClMz/sBAmGgr/HBf/CBD//8j0/n1EyozoiES4iQkihDApnDI/bkkYFI1VcpmX8CwImAwEYNnHHdu7Zf9PVP/PlzDzx37k07Msk0VVUSsQaxUE9JfxQVQUj7jXm5xDErCl4+ecKOAixEQoZVFJwY67WVIcCSMYZQM33GmNljC1/6zJNf+euHD7w03W+GNzbPadJAvZYwQ7KMUxYFq1nbnFjTXN0VN3Ns/lMf/sLn//jL2y7efMs911584wVDw6vy7kJbFzJYJ05VjWeuB2FeEjn9IMCXzax+omGzyt3r0RpUlXBBlkWSZ4WtWNjHRUiEfP02ToxKztCGNL/0yW9+5r987M7R8fsu3DZmkHRtl/iM7U45KNU1dL0pJlQmy0Wdo4WY1Phf2XZBEYuCAgQYR2AWe+7kRGqSR194+YXWI5e96ZL+Nck0SQKqS8oLtPeFfcNjo+MbV7vECYl63E4C4qPx6ovEgWUe9mJhlhPLuuj/cEmFfl4K9fRsfRYx//qk/9J9on4ubrhn1xIjFT564HQ2n19729WrV4/yCujdO1YUwOsf6tTNz8w3qG66cE4sO7hun+FLtm7cuGb1zst3PPipx37ml//xurHt737b99xz272bVm80qnUyVVzIEO54090f+dWNu04vjG8YqjtLhg2xFYLxtrSvjq8KJmGfceAplRKN+FhqB0QUyymTJTKMVQb3bD7/r//X5378yotrdXScE0HC/ijqY77E8DXFSBnK4kRACQfKPwDAV3rwBBwQwAF0Cia1BJnlyDe0FU5RmzudffULj332T788s6c7lI6eM7w1obQOTpmdZAJO6knunDIlTN3M1RKTMo83B0YGB+YW5p9/ZP9jD78wsWnwlntuufHWy8fXDxvTzpNWLg6igVQamxZQhP4XSXhEWzeWcyu0QkQeNLJ74jclw2iRS1DsVORckRrmwI8SxwrDI08+tvtT/+aPd6aD37Vjy7gRwHVTZGqmZ22WASTCsVAGSvFffa7Qe85o5Mac6ADUUIjBs3EiZICubB4dHb/sqi8/8fSjf/PEle/e2aizUTdAjT27jkwfm778zmvMEOfUJQgHt8LHPyMGVNzOZSRrL4a/5HtaXhr7r84qx6nn/zeIwyzvghRfUbkJKZRYwZJoRgf3HKubxm233sK8gv0sHisK4A0MZ13edaxkxCSgDE5YrXW1hEf7+oa3nbN+3ZpL9xx+5GvP/Oc/+A//9cP/z5tvect73v6Oay69sk51VmOIARBjfO26m9/yXY9+7KNXbhjkBB54UZBoaCfrVIjIqFENlE0q2oKAAOKicxcBxFAByDeMabC7csvWv3zg87te2LX1qnWkEsrVCEVbCcH7BzMlTMZBSK2owie4FuYnBCE0KqpMPu6sPrvAgQkOBqau9dYZ+5UvPvT5j3/t+K7Z8cG1W1aNplpLiQ2ruFzZeRNWnGOQZ6LW6ybLcwDEDOtG+gYGBgayzJ45PvNXv/WZz/7Jp6++9aLb7r5+84Wbmw3T4VbmOj4pgUt0mqBK7C+/yE4FgJB+XAiGSoGKCAAtFl9V2VGUuA5GO2lEgMipMDOTEGUNW3vxhan/9Qu/v71L77zyovX1OqntJtxN0OrKqVYbSInIhmZh4Tw9oj64FktN42CaF64H+R8oVIkCRFTY5v2JuXrb+V87vPv5Lz12+R1XNU09X7CvPL17cNXIpvPXC7o++66S2CGhGl48Q0GG
LWGzJSuzVGzqIhEcIba/k7HoNEu/iDGAynYKQDNkM2722Py5a7bcdPPNJmZKr6iBYqwogNc/1BgeHOl3hNOnZ1SMgSIh1UQEhjRJadVI/brLNl+8dd2h26588rHnP/OFP/jE5/7i+qtuf/fb3nHHjbcP1pspEkNg0Jve9PbPf/T3Zlp21WDKlMNBwRHtZYIJFqd4ES/qa+koAUi8G0DqO0h66qAIlImhItn4cN+59dGvfPqhbVe/K3HqTDxa7FXixAHGdm1n1o4MDqVpn5BTzS1y+ELNauHLYxKI4NSX7LeUkGhXjCbGkNb66s35U+4rn37mc3/5tfkjs6PNsfPHNxoYBjErkBPYsBERGAKRiDMgMkZFcidkCAo2UBBYybqmMX2DY+PNwTOd+Yc+/vwDn/nmhVfsuOvNN557xbmN/rrjOYGIqgtCWmPZagBCsfy+QomCr+BvnI9rh0zRGGONIqM3kNDjFWhF9nk5KapKDKPgPOm25c//w0f7D8+89dqL1g/WAVEDOAA812q3MquJrxJEKCZWniosb3FOf4MEpeqFomiBEJwAQNT55GSkpArJ88lVw9en5zz63IunVu+78rrrH3r8yZOzU1fecbkZpC6rkjJERZR8+W/fs9KfPFyfagURqz7xFTCnKvEJ5acUC/+9btbmGxxLpD9VBX+YZuFTkHrMkpUpTbR+dP9h17X3ve2tE6tWsfeDV6R/ZawogNc/mNmMjU3Oz3dbCwuSW7BnDzKxusyxcSlxLU1qA2Zo28aN68cvu/rC5144+PDXH/sX//cXt67b+o573/Hu+945MTZpKN164fZkeNULxw5PDpw7yEosxHAiHsYRUWM8/9/jGaqhKDOiIAieu8R6O0QgIRgF2SRNL51c84knduVzjhsswZIkB+9QkAKGkXCyd/euxx78xoYNG2646frJNaMp9+V5R2FB7NRyqDjGZFJ16jQnI6xZzdSb9Xo2q1/9m8f/5mOPnnxlbqw5vnl0nMSkMMSqYok94KCqxMwOPuvABFnM5JuhE0icGCYFjPF1Q6Vu0omB0fH+4bmsc+gbh/77I78zsXXs5jsvv+6ea0bHh3PtdLWTs3WwrL43DTP5zGmDGC4prD0NLM/IwfHgUCntF4m9ioiOixxTxISZWFmc1hIDV/vMb3/26Nf3vOOybReuXdVwuXAqhqywU3Nqtuso8eW3PXuXishE75nC3fUluReBUSXwH6YacG6CT+MmEhiCzTcOjOTrth5++tjR+qHnn949vmZ84wWbrLGhR5sSlClWcC2TsFExnHuWoYCHqJCtizaJAKQiYj7LOgrfhrHE0Vg6z7CNFnEi8pU2km7t+Csnh/uGr7/mxiRJVwigS8eKAngDI0V6zuTG8cFV33z86TvvvcUoibocYkWTWqJCVoSEOYEhHa03B8c2bt22+vLLN+966dgjDzz1S//93/yPj/z6O+9713ve+YHV4+uvu+uWz//ehy7etLneB5PnSJKk1rCqgCaqzlkG++RQaNkWVomEfEcQR94nUADwwVxVVQOIPX9yNH3qlRe+eXDz9VtF2w5i1YB8dXRHgHNiTHL55TsuOH/Hgw98/b996DfSvsbNO2+/6orLB/oHQN0k7cJaYYFaiFVQLUnYoV4fnWvrlz713IOfevToi6ebZmjT4DqjSQKYhMQ5AhEbQMVnK7CGQnIeePDV1nx/MfWlC1iDt0GAJmyIoVaIabDebJq044am957+ww995pN/+tD1t15x/d3XrNu+rmHa1rSgWS6WoOR8bQtfubTIJAtpr0ylmEUUar0FhcJPQUEWQlEpUEhJvfhwhFSY8mTf86e//McPXDLUuGbTqhoLQdlZuLRW65uabx3vZC0YB/KqOSiliEj5yRQ1mDw2p1piMdWWt8X8QlUiZlH1QWErMCBAyOLcybXJXONrX3qwJe1r33INDaZdaiuEJR6iTEFAZBr1XvuryvCq+R9npiVUVvgQ33YZq4t/C+yoimYsLiuW
bAKpgeXWmc7U4dlrLrn6uuuvYQhgvt2T+99+rCiANzAYuGT7RZS5fQdPLOTdRiqAGtSVVZxAhcRbtAQiZiWiZrO2/YLNazeuufja7UeOnnj+iZf/6Au/9zt/+Ntb1lzYmWsfaddenuoOrakNGzYMkZzYiM/mN6LRnAk1dQoeSgEOBzs18NR9B1tiNcDEYN8k+OGvPbXxxgvh5hEiuCLRdlMmJ9akpj6W3n3/HVfedN03HvnG5z7z15/8q49fdsGlN153zdr1qwYH+7meUZoZVRV1WTY/p5/64vOPfunp9r75Jg2u69vEztTAYBFyog4cZK4K+eZkCoAksNpDFyxwtM8lRlkjSk8AVJCYRFRVxZi0DqwdWTMmEzNn5r7ypw8+/IXHtu7YdOVNl2y76tyhycHBpuScUTd3UCvOKqAact0841urMfhCzlbk3qKfohYo62NU3AeQaKqUDT74sWfyw/NXX7ZlXV/KImyMtaCE26onW+22wiYGkJDRvPhcXhnGu6jwpqvnslexlHjSGLUgOC2Be99VTQVIWDI3kDRm5turL984es6Ypa5jR6FStGeVBvYUFc2eF4/ivEWFkGUkuiIGJCqxlb8rF2CJ9MfiexZUUFA/PmhFysQ1Sfa8cpQ7etPOm4Yade/9rLgAi8aKAnhjY/vWLeeu3/T43q8e2T21/aLJXFqiyhzyprysVkhI9XfETGTc0GjSHB5cs67/8ku2vvX+W5584NlP/tkDu144BjQfOnho0/Bko5nUSY3COnCaQslZRQqQMlwiwmAKFZWdghypD/5S4HUH155BJOwyqdeSjQNDX33ilfaZbr3fFwpTXzUgMCFEhEgpt9IhMsPjtXvevnPnHVc8+/jTn/3Lrzz52BOrR9ffsPOWdRvHJyZW9dWxf8/Bp5564ZuPvNQ5pSP1gbX1CaaE1ZAhIZ90acQJcYixRt+kCHF6rMpEUaGxjxi02DLYbhBRJd9/i4hgiE1iEk3qQzximwut2We//MIzD74wuXXVph3nXnrdtsnzV4+PDyR1y2me5bmG9GFAySgzEQuTb//om6B5YI2qPkCpTquYSzUE4JEFNgzF/NTMS1//xtokPW/dZI0JxL4DpJB0xZ5udUPFn4g5RwcE0WIlxOZiFLsfhxOUQyvk0CIRwaMcvhUNEVRdcE3E1A4dPpH3J1uvPc/VbJdyduIngeIaKpkSFNVt9YwozheWYnmBGQG28MzHJfw7HxQXRXvuTiAFx4pASkpw7HI5svfQ5Niqd779bexL6/7dz/B/u7GiAN7YGBsZuv6q6z//2Ff37j5xwaXrnQiIPdG7qBNjKERmmTzVXhNlYzgj1RqNbxi8+/03X3z7xV/+ykuf+q3PP3/w0ONHublxw2RfkgBMZG3ORtMElhwRsW/LJRTfS1USy8SIqalKCHlSUAcYynNxoNHB4bmDu48fOr7lotEcuYiGEsRgkE9LJSEBQ8jmKnlX643k+luvvvr6aw89f/yrn37443/28fkznZSG8pbMnZqvcW3z5JqtzbEG9xlYIdd1VhMVHxsVLZoUMgW4VVUBDtnFxCEMG0xdlEIf0cMBAJiERRSqzIaYmFlEVDVJE064rzE+MjTabnVnXpl55NmnH/7zJ0fWD5978bpLb7hg68Vrx9aMga2jTIWdSKbqxHriERuupQakINEo/70O8lQrAIsMcMQYAHyira8Haujk7kOdw6cuGByaGBwwyJM0ESuGSUxyerYz37XB5VIgxN0D6BSRFoVn5PZEBHrboFFw/1AY5FTuGW88ETOYrGLe2hdOHNyw87yBjUNdbvtIeRmqrVj3iGb7ciM6cL1fL8Lhy3lWci20svG3ZyyZwzKTpsqGGoJKpATLpw/Pdea6N19z03nbzzN/t2GK/43HigJ4Y8OYZOfOGwd+Z81Tj+1983dd5UusSURiIrAc6BZsOLI5oA5JLcmsIzgRt3pt473vvWLn5es/8u/+8KuPnRysjV+3ZniIXDO1SS0R
qIPRTGsJQAiJuj4/mGNVgCD0PUgtXl5YUaewgpa4NG2Yttv9/EubLrzW19lXAiuHejAAQUSE2DDgxDLIukxEROTcS9adf/77Tx2f2vXc/uce3f3YA882GmYgSWzmuibjhOpp6iyM6bOaw0lChERI1YmEdQCph1GIVJkisq4xrTayRwK1iUJwFlU4mlRJKAZQyVpnjKn3pXXFYHN49eiEip6enj91dOaBl5994BPPT2wYetPbb9t2yfozC6ePnzx1YurUmemZjus6YxsjtQ3nrb3y8h2TE8PGhAQ3DxaQFtWpewr0VLigfor+F2OUTuw7mnTbG1av6UtrDFGnhlmEusDJhVbLF8NjUnFAlNblQ+SlUeSGVjyDOCqURvWtFQqopeihRiJiGGBWTjhNn3z2BYzVL7jhki5yJUmINJfQJRjFRVXjEMuOakGF8t/i2/CYo+DdUuX7s+iUb9MoZqKLPg0AIgGwziUJJ1zTPDn40hG27s1vfUuSmBBdWZH+S8aKAijH67APlMFXXnHthedd9MhjL50+uTAwnjpPrrEeiDQxpkYEqIA1ZKiKiHZhODWqyip5Ls5t3jrywX/5zv/28x/72p7dDbflktWDCSkSC5MQm5QYBFHJBUzKJAKCLxjss5IIECZmIYhARB2oZW07d9Nz7a6VUdPc841XbnvrdVKDkCvs3SB4VBNOnEBVIMaRJMSOISRzMs/SHlrXuHbDRVffetnb3n/Xy08deO6xZ154etcrJw8n4GZtqNno62v015JamqTEIGIRYTIBMRdPxTEetgLYN9sqIBAABZAQTGSPm3mmPxOUJZCciJCkiamlLII8F8OSq7XWZt2uI5cCg1Rrt7qHnj/wP1/+yPDqZnM8aYz0rV4/MTYxvnlyS32oMTTWbIzUGrV6bn3lIsDnQPuUuoiRxychmtzVQaECB4vOnJxKgLHBJhs2UFVWJhhqZZ25VsfBKFSljNNUkowL/YZKrBnR9FbEWE8JPlXiAAT4djwqYGMAFWZnkmOzM/vmT13+zp00bJy02edoUJEqUurUytUtsuvRK2CXeRVCMlqE/gUsIUrub22oElI5yN/d6IHt4uzUpOxUEqsyk08fOLVm1eSb73trLUnwut7u78Txna4A/DsnDoZf1wNCoP7+wQ/+gx/88X/744899MTd9+90MhOQSQXBspcT/g1RUkq81GY2uWQEp2QkkyRNyJC47raLNv3jX3zXf/vJ//7VA/uazR1ESGxGqfZzvclImRNCwhBiozF+6fl/ML4Qj7Nw0Eykk+cda+dsPjPfzbrqYNYMju5/+WjeUdQTpVxFDCUgUOhiwKpC/qU2GgpHO8eGBYKGdNSSOqolQ5vNdZvOu/z2cxYW5o7vO3Z416FnH91z8tCpY9P7bYcalDZqQ8PDY/19zYQTVTDYgNgkqs6pI0++EIUq+YhsLDhR3AUUUtKYAFkwOGEnYsgYg27X5lmn3em2OvOt1ux8ezanLieukdb6m4Nr1/Zzvb9/cuPg2qG1G1ev3joxuXF12kjq9dSkzASGKIuSCodUNkZskiu+9ZpWZCPgIxMKJWWwQlXFK6lE0JqZJWBosGnYk2ZVxVgyp2bbC7kIJz6H2kSfJpbuQCHHo6wkofLRistQkPKJQrjXgRygSqwwKpKoJsTChk1yRvKHXnlpzeXbVl+4rkttQFigKmRMLBsSDtxzfcsI6V5fodRMxYJEHD0Y/+rrJKkKgblX3Sx5mapOxbc+KoenGBUJFCsCSJGAjcXxPcezM603/8C9Q80m0TKlKVaGH9/pCuD01PT0wulV61Y30UdWmJLE0NlVAQFg8D133nbOh9Z++W+euv3+25nSXLs+RBvYcAEFUoI6dQoisCoSk/gHldn45ldGa9LFhRdt+KGfeed//ZcfffLkScjQALmaceZMp7+W9vfXm31JatgYNkwEmICZsEJFIM5lVjvWtl2+kGXzWbcj0smVJbHEjSRtn5hzjnKnlIiqeKISIBxi
aaH+j3cJPGJALrjUChAnVsWxsBEdpYGxgYF15597/bZb3ndv3ranp6aOHJw6uuvwk4++vGvP3vbheWMpTerDfcOD/QONepMYtSRREyIPHA7MEGUyGkRvNOaISNlJpkqZzfPcZjbrZN12t53n7Xa3LciQYGSsf/XFqy45/9zNO7au3zyxamy02T9AdadCnIASdpw738eMHcGCPDBl1OfxRgkL+AqnHjiO2He40TGzqUzIhZfYKgpHWaubMho1hioZn5DGjpLZVtc6KFOo3sQFmh5DCRHkKtRBaceGpKoyPhtSEPxxgoLQEGgicsJITUZ4avee+SFz4x1X5EnHqRivzZRFVFUYJrpdr/k2BIyu8ntBByq0UkgphJIKGYU6gJhMoeXikRTfLqFfTo4WXUXpskQ4SsixafGJvScbWr/zrjfV6gm8pl9RAcuN72gFIIpPfvrjP/vLvzC+cfK2m255/9u+95ILLlVNObRDWX4YpdWjI+++795f/8Pf2vP86XO311RyZfKixCpATAoWBPAbShCBsmMiVVhliCqDmJlATO6GN1375KP7vvi7X+PO1gsmJvqoS5yddinnmZlBPU0SwymzIWJf7FxFVcWps5qL7QpydRZwogIlQ7mIg+lv1NzRzsFdh9ZcvQmUxaQCDfRM/y6xQoXEfxaqjpnCIHY+ssbqiHz/KiNq2NS7tT6zYXR8w9Y15saLvvsH7806+fSpMy8+t2f/3oP7Xjg8dWTqwOmDtp07C/KNCGMZMgYDSmQ4lhdSVWJy8BUzhYnEaFpPav21gcnm6tVjk+tG1m5YO7F2bP2GyVXjQ/X+urB1iYoRJuR5l42KdQQShNyvCFV7kj2LqhIMs4p4QD+UuQgNWRRUxGo1RhyL5F0FQAwIVNRl6jI1hEZfQ6M4FuaZrj3RlTxKfWbSgnaLqHHQG7eMioGCiBbvSQYHiRRwoeIGG5/AAXEMw8xWoJw8e2D/ns709e+7iYeRkYIJvucLkS8Qstjsf61REHJR3ZNQSFBlCBRKiVDSRW5Ra9YsOSFH5cbV8a1K/4riCJpkKfwPCumGUKga1NglM6faU8enL7/88utv2OmJZCvS/2zjO1oBMGHtxGQ7z/bNH/r1v/jNP/jT3779+ts/+K5/fPM1tzf6E1/tmLWngGB8gfk973j///rox/74dz/1r3/l+9RKV62q7/ahKp5oTd6yDhALDEMUIGUVz6PzvQ8dq4Had3zgjgf/5pFH9h5tNkc2D5Kppc5Zb54uiHBMYvFSGiIAqy8fClWQkBdgLE7UiRLDgBk18PSxqbW6SbzsDaRD9dlRBB/BiJA8SpodVcxUn0GkDk5zJg4n0cyChASJJpyIwWj/wA0br7iRryFrslxac+3WTKs7lzkH282ybp51bbvTtpntdjpZlnc7mYhjNkktaTQatb5as782NjrU7Gs0h5uNgUa9WTM1wwkpOSEIWxVRsi3tCqs6FXGGGRCPdagngUtIlTNkBIICdtJo2fu7o/EjoLT2F41Yf0cBOJASk4EV1xVmpKlXYkRkHDDd6S64YB4LQs2jInaMuMT+tGWwNVJ8IrIRqV7EEW73cKIAysxkSZksFJTsO3HimeMHz731kvHzJjtZR4woREPZJih6qe+vZohrKWMXb1PsFvRCSCtnMZK4TufUkal1mzbSELyxEx6gYK6jR/T2lO57fSOWvCqeSv+hFuWqqnYMqbMuaddO7DnJubnlpjv76g1asf5fdXxHKwAAV1133bYLL9inu9/7Y++efunwy888/OP/5qubxrZff/1tt9x67yUXXTY2MGyApHg5lEBgMhs2n//B93/wV37/l19++obN29f5hlZgMpQQEcHX6xEO1D/24pUAZQ5MdBLP3HfMZGl87egP/l/v+0//5PcfPriref65q9NaLeHM5TkoMQkxuk6YCOT7RgKkvrBjAKpVALAEeaVMIpomSS2p739x78X3XikJhcz+2D41AtSlRVq+6D1EzeL103gN
Pg4NZoYSmBwAVisOxgKWEqWa6W9yc2LQWE44gVPyzYUDJdtjbSG5wZczFVZRJ+qszZXUiQWyXH1GlxKzQGAEJBAYioXhfBOsHuyeovyRYL4XgdQiy7eC9dCysj+mtvqDFwa8IZAjySVlGA65ZgRywOmFBSsoqegBptbFhy2miTI0WcDmFMVnsRsRxXbJCXJNWS3gOD29kD95+NDYJWu337S9K5mwEqk6V8D0HIV1z92tTqVHOaCkPfWoiugPUGRwAaTwNOS01pif7s7V54cHhoUcfJ0iVI4akx1KSf2GRixcHlGxCrYEqpB3Q31TQ4a7fGLvqZHm0P3337/S/fE1x3e6AhgdGfnB7/mef/6r/7yepvd/8J3dzj2nj0699OTur+763Ef/60f7O33bN2674dKbrr382gu2XTg8MFDjOmAIaJj0vrfd8+/+2y888fQ3N21b1+1kVAN72yuY/ghAgxedUARaCKgQwipCUCepMaJy452XvefHj/7Bf/zrbxwZuP6cDeNJwomSOFGX52Bm5xSGPX1RJMYZEDtcaKj8ApA68VZ+I61PHZliEIGYCQ6AEhOkUvIA5AGQiFEDlUyfOKiCV0cajYDgy58JgRSOlAQ5gUQtEalRNWwBShFEaQF9R4hJYzg4dMCh4HKpiXwYf3GFRPNmpgODQMZD+R6AC3weLSzHYO9HBaG66Kp0EZOkWI5CZJbZQ0X5BBISK4aZjRcvrKQd66Y7eU7Gl1wKhutyNdZimKiYHaIaqmwS8fegCtQzi4nIqObEfKrjnjh4INk4tuOuS2yaiThwMAKKRYvRh4oOWHyVhemvPTd/maFQLudGBBEhmzSawyMTxw6dGDtnnFm0UBflxVQ9AERlerbzlPqwUCTRWkE0+6Ph7xOAQ2krKMAMtpg/Od+d6Vxy6bU7LriQ2ayY/68+vtMVgCrecttb/vtHP/QXH/2rddvWrR0b33jexsnN49fnV+TZwvFXju1+Yd/nXv6zP/n6b9fs0NrR9ZNjq7ds2DIxPrFh/frDR3atmpSubc115zu2DSFDppYIEQwnRD5BOMQGoAAHaJqLd5sZJCC1IoaNyeUHfuxtAv2Tf//pWoqrz1k3YJhJ1Tr1gSwFiZASMamEJOAQJyx6NsZ4pqgS02Bz8NjRk5RbSgDjtxXAVGVxtPFQQBSVFaoiADEGFwATJY0GliKyGrVAnX3jSg1hT795TGyKejD69BUz0evMQogCqmDiGLGN0qG06zUmXEWrO045lisozOBCnRS0kGgiR+xFK3uVjJbKdvEuUELsk4wBOOGZdredOee7WDLFCfREVKMojpGAct0puBpx08pKhuslVeMzKziZ7din9x9d6OfL77wqGXKiuXJMBSxvlwa9EnQm0dnkezDv/Y2pTGrJoPKJ8EklAMv4xMiBl3a7mU46bPKaUbg4a6VSw1Us9/JmLDpP5TGLBTuWmwhpMEaK2m8AyCgbMeSSfbv3qtWdN96aJAkVpUZWxlnGd7oCYKK1o6t/4gf++Y/90k999s+/+O4fuAfNOhp5M2Vq9I1cuWXbVRfkQLednznRmpmePXPm9ONTD8w9PZU93DJIvuuH7738tkvn8zYRnFgD6eS5ITJsmJPEmISMAZNhL+0oxGEp5j+Z8FYQHGCFbJ69+4fvnD3d+tSHv5K5fOfWzYMGxKTirPggMvne30Ag8RCK6jaKUE84JHsxUV+tfubkkXyuaxomtHcBSmdeY8YqlbKoVwVU06GCAI7gUNQDCLhHkRpUZM9GQ5qKg4XrlRKWiV8VtqX/hasziogUBW+qkNdaTqQqQUKWWXngChBPkd6J8uhxy7B91EsaPQ4/dwF7S9gwUCMwhU67Djzb6uaOAAY7raxgj0DGIpGm5Q/FapUGPHzwhSjgbKoQxVzuHt+792TirnjLrfV1NcWsC4q1ZBlVVGoUqGeT/lROrpqGXC5CcWeCXKdwOwyD4GDrg/31et/+lw6cf8151tfjAwRVhyacKK6I
Vs57tgkVX8YHWxdtUXhH4teJiDRXO9udO9WaHF/znu95T8IUUxNWxlnHd7oCIFCN07fd8Y6Pf/JTn//8J7dfsvH6nZewOiEfWbWGMlbpH6TBwf4N2u+wltQkCrUKlzqbI8kzWJ8NLKoKOAWsJThmMsSGk0QMiDhJWLhPKYGyqBrYGJJVsSCDBDbLlOgHfuZ9eZJ+8Tc/30zdVRs39zETia/eA6iIq1CyQ1JrVbL7h96XLWg2mnbaTp04sXZ8fSYWvuGvC74CQgpPIWCjBRgdguKXqLEK1CiiKsEMq4o3WgKsKFB24g4wgDdJe/SNVjeIWidaw0G+hR38XApZoJG3CR8xiFopchP9XlzqFCooMhUztTLd8j9vRhd8SCKhlI1hlsBpIrQdTbWyILHjjrFCTQ8GEueL6jVT5QJjUAExBY4AGIYQhE3b0Td27zlCrUveunPg/Jp1cyAlZYUrnofiXGU4trjms9n21VEVzYUWCwrCB2SCjycChSSNfGLNyOFXDpx/4fkmMdZI7F2ki56A8uSLjYyetfdaLJo1flF7Ng/eJIyCAKfkmNTAGEqmp87MnGzfe+9bNqxfs1L7//WMlQgJiGh4oP8Xf/znBrrjH/9/v3J412lDhgDV3ACAUxXnXGY7TjLJOyy5cxmTKDJKYlw0dPAlUTgVgQo0F9d1tp115rqt+W5rtt1ayDrtvNPOux1n29bm4nI4hVqgbdtdyayydSyu+wP/7K23/YNrHzt8/LF9u1uUmqQBIhFnnfMcVVFfIEdFA+4tFSPOv0JEqNWMzXD66HRKSSGCCzubqAwJV9ajBGdAIXhdYD9FkABxX5SSp1fgBDHv/3DADXytIDBp8IdKr0j9h8w+8dl/4neP4rE4XDx8uJYyQUkr/kb1LhcfaPWT5SVE8RXFtfIgW1FrJkmJWUmVlUV5IcdCps4fmSkqZe+dabw/Xg9WJH+EsKJuKaSsAlAJOo2ZHaCUzDo8vm//QW1dcN+1YztWWV1QVmiRYV29j1FnV5Zgeemv0e/wd6H6SY9aD6Eaz4oKk1RSotzko+tHbO5O7j9pxDCZIuF78UnjjetRf4GIVaxGMR2/S29FQUB9IIdCWA2hiamqKLo4vn+6kTQvu/yqJDHL+hcrY9FYUQAgUMJ00fYdP/uBn84Ouo/9r090ZzklGCNgEjgruahTUhVLIBFREStOyQIWCl+KM9q5SiCBOBFRdSJW1TqxVnKbtbPWdD5/KmudzBamstZ01prptKcXWgt5t+PsQrebw+awoi41/EM/931v/cnbnzo59bmnn5jpZElaVzgmEYgDBBRKAgWZHF9TggfnfSnIWmIG09r+Vw6xlJQUhYcyQhkcUKDOUJSkBIRK9nGJKERAKwXGNJKiKtY/9fwprV8vSQvNQuU7TuUGiC98KSh6ty0PBipw4FKvxMlq3L5UVksAkEqhy8oJyj39n4icaAHfiTjTgOlPLFTBCs7UnG515q0DE5GDSCQiFTBFPGVY+Mqf8KWAJJTMA5RKuoBPViaTtoSe2ndk9+zsxXffOHnJOqddIk3ALAoJlLNoPUffSaPyrqzb0qe/vJ0ViVma2xV9H5kBUuh6MpST7Z/oGxpo7n95H1kmxz7wgAIyDKtass38k1qR91r8TiiouRXXqfBv420jgEmI1DdKJrBBahf45OEzo0PDb7n/zWbF/H99Y0UBAACD0zT94e//wPfe/e7nH977+b/6SoK+GmpE6siZNCUmiKoDKamIinqEhSrkiRiQIoRsJ40CShVwUBGxKl21Xc26kmcu79qsm2ddl7W6ndw6J2Ild5oriXU5p3rPD9x20wdu3Eetzz7z5JHZhbrpN2CIqIiHPSTEflW8EUUhMOBxEZAyYbAxMH3oNPvpcZwthVyFCs2ikLMF+loMDa9xj6cQwZOqvO8R11r8WqRTLRqF/4Qe21ArB9TyT6lqgnSOXyGUnlv8p3L20qoPXy1nH1LP/72eBIFIiEQl0WS45ggkCjWZ
o6lWJ9NEQ8qpt4C1WC1afIriLGWgwidzqAr5RC4GEYFJCCBecO7pvXv3njl5yT071126waGbq4UQQZlJFIH+E0VuYUQrqqqvxx0oF/JspvIivREeCiWNhWih5LOeE55Yv2bm9JnudNc4f/sK36n60FTO2rvS8YdKBANa6sdCK/vHLRaiCK1SfUWnHPPHF/K5bMcFOyYmRtlnUayM1xorCiAMJmrW6j/7Yz9760V3fOoPPv+VTz6MLE1YQbkonKi6opu2gEnhRbBEYmJ82IO5E+3z4PVrACq8QgiVm8WpWHFWRFRycQKx4tSTHCFwWX9/864P3n3NB24/OpR85uXn959ZIBo2rm6cCdmtXgrBW+e+/UhhWwY7c7x/eP/LB7sLlj08oRFkCKJfKxKqB/pdMqoWfLH9khetOEblJT+r8RmVaO95qbIRUWHml0cvz/pq5u3iEy6dbQV/qKoNVA3nqG9CuimQoDbaZxnOqgjPde2ZTmahylXUvWKM98wtqF8tXZ14mQTftoGJQkcJApmkK/zUvgOvzJ/cce81629Yl5s5UZeyIVVxEtsdczXfrHctlrtHVIp+XXaDyoblj+Ho4RH3aY+kEOjYmjGn7vShqSRPIpRXqvOqs1M56rJqOLCXlrvdxVwYygIViEKJiQVk5fihk2Rx7913psZEPGtlvMZYUQAoqCTMZt3qNb/2i/95q7noL/773zz24Au2S8akFk5UTcpgdSJAIPsH67tEMQNAGR7WKtgBBFwm0GhUKFQJCF4CQmKwr1CmTkQtWVOTxsBA36V3bd/5wdv0/OEv7nrmiYP7lRpNrkvXiViBE4LAOdVgJBeyHwDAREONvvbp9tzpBYKB8Vz4KuaBimnmp15Y84VNHz0FhFTaJSKmgpxQcbTCVEfVHo9/CmlZ3X05bVHF/eOXvUpLK1tRddUXHa3nhhSOQXHuxU9Fz2fezlZAIUMTI11CO0eX+HSrPZ9nAY8I+r2ELGLago8hlB5h8X2Fn0Mq8GR/5lTAoHTe0ZN79+1pnb7wvmvX37ChjTOOMoIoOX8vJN7yyNXR6hJR6Yf03q6K2qSe1Vp+ULllz13yFyCqjeFm31D/wT0H2TKFhgXxXL32QuX/ZfT9koku1p0xyB7y6JWUoIkyLeipg2cmhtfcededafL6KjuujBUFAJSeOoMN89YtF/z6f/zQoIz//v/4y0e+utu1+vq0hkyttVA1xCQhQAf4F67Xs44MF1RkjUbz0Rt5XuhQ5NDHjYPkCulOBCUHZwcaKZPw6toNH7h3431XfmPhwBdefH4WSd0kiWNS45wvLdFjTRVKicH9tYYsZIf2HyVKVJSEWA0JkYtlXyL6X5Rm65XXFMLbATnx2iru0SNUS5+/Io6XhJmX3IC4V2/MYJntqKI2yj3j0hdOTylgChGMxY5Oj3yPEZyoxRfDJr03Wd3Y5KhlzGeuJZjqdrPgiZWyP/JmQrwyrEevTgxbhugFxZ4OagAiNUmjbfnxXUdenp275M23rr1mU5cXHOXECjjfRVkWydfKL+XlLTXul2rvINzP6gdoXKTid4oXSUSUAE0dWzs+OzPrWo7EFDiYRoO+CNFUDlk9XRkLLk6oceVLbVU2SiBEP8o45rw2O2UXZrLrbrx5zZrVK82/Xv9YUQBA5dEmcFJLr7th56/90q/Ws8E//MgnvvHYK2mWNtIGG8o0D2iQOilN3dKiLP7TYBhF6UMAqbIUIEMMqKl/jWLXD/KFngECqaiApWYx2GjOdRemdWbypnMufvcNB8dan3rukRPtLDUNsT5WS059pbJ4Su9ogEhRU65ZPrjniAusfzDAPpEntspbbD4XQU8qXP6guoqobyHWKYiCGJMu5FyJ3WORcKkojUIQxB8qpvtSNVCKESoUZ080orD/NRbWLO9y2FErWwZDspytVxnFh1H2R62izESi42tWoZGcyd1sbmfbmaVYWqkymaosKw6DYg4azVhST6oJ2cwmESIlM2/dIy+9
uHv++BX33zp5xcZM2pl0CarwydP+6ri4d4uvEoVEX06sVx7Zypav5gH0uBKlolYAIurqOrJupNPNzxybh41mRRnML7jES85ROSgtuV3as5VfouIt8zF4ERF0zMn9p/q475Ybb62lKZW+wsp4jbGiAIDKo60AA2lCt+5803/+579ippt/+ZG/fvqRfWj1JbbBlMCkhtPU1MIb46vz9FjOlSIvFAx9jv/6343CKBFYyUeKDZTUNz/xrXABFQiswrFoHzglm+eteVoYvXjtld9z2+wW88Cep493u6lpsPOBaBshqYAthYpgIo3EjDUH9718oJs5wDBCl0IlwDjAeQ4moAQh/6+XSwpSsCpBGcoKVjWB9VQ1t6vkolIVVMzsxe/iYgs8iIsiklKag1o5agF0hBdcI66CUEaitBajxb/oxBW0jVRZ1Iga1URhFCxgBQezvfBeQnGPeFkKgoyuGuX+5sGZ2enczue5hQqIo3enUGESX/M0yOhIyw1PSWREeV0s8CtsPI6U1BYcPfDS8wcwfe333Dx53UiXpoSyWpKABFCwUKBo9ay/X7tyAXp0wdKHvrJ1j0+1yJ9D7+6FVtPADVMIYMkOjDVraXLq8EnjjEeB/CoWqSW9ZZHCWxLNoOIJBEolHpRpnE1pkwSXSRwp1ZIUmTm+98SmybX33nOXoUBwWPECXs9YUQA9owAjaml63z33/5sf/7dzr9BH/vMffeMLz9CCadp+zhop99dNmlKijtTFalUVH6K0SqOlXLzupDELqrAyw7MeWv+qqm+DRRSKpymUFXVJUqQiZrbd4vG+a95xK++Y+PyLT+w7M00mdU6LM6vPR1OVkCIgJBisNU8cPNHt2FCKGUJwFNNy4eV5CfvE4UntFKVNgeYwKFQX0gpwFFmehWHe81spWhb9QWUSxZtffLko7Fz5r5hjz/2LYqdk4PS4NkCF+aRxBQrfpVfmqULFI/8UCPoqcCpucKh/eHL1/ump6U6365SYEaqQhp2DegumdeEH9ThDgbPrg0FE4NSSEU5PtrOvvfT8yTS7/r13TFwy0XKzYvKUiFT88wblwq06q5R7Tfv3NTZYpAN6dwvLFhO0WB0k6a81hwbPTM2yNexK/VmoberV+v7u9qx7sKYqofHl5oTwuIKZCGRs0jrdXZhq3XzTTRMjQ8zmta58ZZTjOz0TuDoqiAEZoK/e9/3v+YDmjX/zSz/12T/6/IF9e+qDAydn8vpIdu75w9dcdSVToqrMxdNakkIDzECFQxBFIEg98hKQ9WB3K4HK2r0BxmGQiiixkCh8RWRiMp3uQqO/ftl33/Ykf/2hbzzfPWfrprFViaq4XBVkjBeCoQ6RqDE8VGvuWTg1PX1mcGRYVX1DLAdiTSmGQsuEgFDSWIl8s0YUZquqr8CpUITmJRE+CFay94EKMzF882rLXhygMF570SJCkd9WmvfL3r2INvRAc9ojN1C5WeqvyedHKMAcrsoHVVVDklGRQEsggZCSkMH4uesefPSrl7XPN5JaEjVUNBjwWJ4HePzpGJASZClSp0mdMhwxExKlREzt0PTs4/v32mFz4zvf0txU6+YtGGeUVSyIjGEJxJ9oeSyvAsoaQJVlPtvSvbayKLfUMmQVV0l9fRLTTIbHR469fNzNZ6bBkpCqqE+WKIpreK8lviWhTXRw+CIrtpx2ufDlJ8UXCqaExWjO+3ft66s177rjzsRQebKV8TrGigcQhn+aRX2zxECrbiR05123XHXZVcf2TX/tk9/4+me/fmT3XtvOIanNHIGMMUUANR5oEeeNNMbCECweInCwl0uIwX8bPvQdpfy3qprn1pIImMX70pw5O2/sJW+9Zmzn1of27XnpyAlxtTpqCacgOIFC1ZNMnbBqf6PebnVPTZ2hBCCnJELO9xkGOHISEXMZiLw/UYnxFgYdR+ckCvYiTlBh4gfXpgxuFGtcsbL1LH+W3pnq+99zhDJDqOdX9B5t0c9hEYu4BSCRlqmqtrD3
Y3d7je6aMkCk7EidW3/5xukaDh+bIk2Nb7SQsIaFKrKU4DVC6POCkLARWraIMpiIRY2jREyy+9jxB198RsfqN7/nnoH1g7lre3VDodiqqmh0JYtHrserKJ/l12Pi09IfX2OUYKnndhEjRp6IMTw+7MTNn1owjrUSsY0TXF4JxbMvB9pTRcsVRYQK/8/nindwfN+p7efv2HnTTlqR/m9wrHgAYXgMA4DEIsezrdaXv/7FX/vQL7+w+4Wrr73krnfcvvH8tY3+OtVVTcchd+K8rUiBDgFv4Bc1bkobrTSaStc9NqAiICCmKI1gAqmKMqBW2622U0fkCMxCxCSkXelqwjveehXX6ekv7pbc7dhwTpM1t21f50aUDAfqasrGZHTm1Bmbr2F1vnl94sAkAoBIyHq/pDS3i3WpGOhlykOVZx1QgYBHF4a2f09jNR4UucQVeOH1B+oUKKCBQh1Qr+DwbkvP1BbdYvhbFIqVUbgTFTNIUVFtCPwoUHErlUjBEJE1567lUTo0NTu5bkK0a+HUGVKmcPQyiZjIZ40ENeoRf6gaKIiEmJO0w/WnX9nz8on9G6/beundN+oQd92cGN+nTaEgBknhm4Ul7o0z967W8oOW/a7qJb3GqLrJ8LY5FfBW/1CTGccPnRjdPEwNIlPg99U+M1XosThq5dclz19wwkrZHp5SFkq1Nn3iTL6Q33ffW/oaCcc67CvjdY4VBQCFisAwOejc3NzRY4cffvThL37xSy/ufXbBzW84d82/+KEfO+/8c2oDqmQtupZyKzlImIxCqGgn25PeqIoKQO3FV8D2URQ2LgQhhbqP4dUIyLsSgbs2Pz09R8LETGpI2YAAUZgcuoB8x71X9Jm+Rz/37JE8u2XL+ROadl1mDZFhgpAgBTXSWpLx3MlWu+sSypMECRIRFViBUYKBECCh+SzCW1qVrMH2il47wn8AyEtdBNkUPfboBESvPZiMhWMfwCMqqN1x917Mpnqjip8Ui4REcRoKyrf4rtQRPZvFA1bo89DY1WqJL1L0aASgIGNVRteODq8fOfD0zKXsb4qX+yRVFql3f+L6xNMRkfGN5BVGkuR0p/Ponr2HO2eufctVG687zzVdrguaeEJYBHwkPGFl4KhnEeI1R9laoCZL5P0yOoAq/y7Z5awS1Yt+QlHnXOsDadIwc9PzjJQ0BwgqUYn20JSXBWoqiFZ5eVT5LvyoxExKFgpkfGTPyYHG0O2331GrpWeb6so42/hOVwAKOMVXv/7gr/zaLx06eKDVPt3hvDleO2fLuju+94prbr22f7gfqrm0uq7j1DlxYBAHIJOjbVI83VRxksuzVBAiwJeLVwCxQD5rkWVMHmDxAVgjglMzMyfPnGE17JNeGHCWwQIBUabZbG7Pf9OlgxsmvvQXD5x87vHv2rxt48BoK5t3ImCFU2apMw1RMj81O7eQE2eNmjYMPPyvCpBzKkQG5Dt8e3YTAiGJwJGrHuYYXmB/wUHf+e7HlZeYPDjguTFKUZRR4Ql5YaiFxxETbYOnsMRIrPxYkeVFlDm6HsUMKhIwHhFUSPGKpAlmdKgwQNF9UdWic4N6/EVhVbvkSCip166+ZtuXn3js2ML8qqGaFWvIIVJZVEshFs/nU/WEmInIkXGoW01fOnrmiUOH8qHk1u+/Z/W5gx3TEu0KhNWEPPMYE6ouRNXUOMuj/SoOVsVvWORTnVXex7tNxW7FsqnHMsUoN7l/qJnN5W5ezKDxze+cWp/dHFd/GSVPPf+VdkN1MuF0RFAVARuT1hr5tDu69/i1F9+w/YJtdNbJr4yzju90BQCACXNz81977IFrbr/80uuuGN/Yv/G8jaNjI2k9yfKsg2m16uBgRK1yYlRV1JWufZBB/jWvGJVRE5SGY9wsaAQPkfs3IgaAvfT3kQJV6dh875HDC508gWGweHuZFFAmCIQktYIz3YXh80bv/JE3Pfixr//5c9+8ffP2C0ZHa+JPbclJLjTUqHMnnz7dtrxQ
S6m/0ddf66uZWsqq7NSpN149SoFo78OXj0CF7kqG4Jsdo4hkcHh3qSKY/NJKJBOpMsoYohccMUiuwStCCXEUwFpYw4qFGyU3VRUGYqyBFpuWpYgLWWzFbQnfc2mbxvBs7G6rnlIlEJXc2UxsZvNWN887to9q26684MtjT+2bOTHS3JCyappJqFFcKjUi8rQuPxEmCLFTdmwWcrywb883jx4avWjbbffdMDipXcw7tWBDCiAyzAIQFb0loopCKK5ikag/Kwa0zKhs+yoCNKjYqo6N90KhpBAV6qP+sb5TJ6bnp9uNyUQDh6riO2pxJZWzFatVzWNGPHKcYHAavB43pAp0eOHYjJvr3H7zjY1aQkSV52tlvK7xna4A/IN40807JzdvuPe9916xcx032q1sNrPtLtSxWCsEMgQIyACk6pQM+ZAxBZYIEMrLF6ZrxIzjm0Ph5S1afGgBoERpqsQMhYgaMgxa6HR3Hzly6NgpowmDScGhnERCPkKgIBaQOpJM0RxJ7nzfLbsfeP4rj750am782i07+iBsUU8SMa7GVMttu53NaJtI6qbbn3SajXqzhnotSTklgNlPMDJ8ijWKmA2BSA0iqzJ0vQd7PijI5zoQB/3GTGRYQMpkAg5A6h2MMncginARZYaoMJOIEpdkqmBLF0EVj7GjpCFRIZV63YaqQlJv5ROJioZuylHvAOGAqk7UORHR3Dqr4px2u3nmbCfLW3nezvNWlnW6jtu6dnjNuis2H3lw/458dX8tyR0pOSWmQJKCFuXefJd6YkMMNl2p7T126tlDu88gu+F9N268egunC23tSMKqTCqG4USIIT48pFrIwiVs+qBoCu6T/7zAg17r8Y9x6mUOutwORYG5KMKJihQMohqaw33AmenjpzduX2t9owJvQvQcP6ZqxRvrb1bwpEsNXqBAod+Dxq7crDBqqOOO7ToyMTT23ne9mwvHbWW8kfGdrgD8GBoc/K577/+ND/3mf7z0p8b7QDVS53KrUOLIxxEIE8Ona4lnt0jFEKsYrKTxDUThlkZ4Ab7iV/iKWH0VXwITBwiaWEVbWXbwxMkX9+7POhYgCu3tYiEfAcNz+lXggMQSWadJg86/6+KhDWN7v/CNo898dcfqLdtXb2RKc3RbyOsGmXMdVYXt5nlLO7WWaTD66mlfvZEmaa2WsiHj5yK+B32V0ymAR3uDKe2znKJv4zODODgLCt882XDMegrgFiLOUgpHoEcUFyCNly9V+FiLIhQSbx5T5d0PdmrVHwuEGM9JVBXAiZfw4nxDBV+424V/rXN5bvPc5lasOuc0y5wVzZzLVZ0gV3ECuyCUn1m7Y8PxZ/cdmZ9qjq41lCp1VbuKJJJrVVWJjDdOFYkYzHQ6T+3et//MqdqG5u333T5+zoSttTKxlBC8BgGs1ShcyxrRy0E6FVeg97uIzxSe6NJ9zyosl5X+lSc9BlYQTXSv6whKrtZfT+u19lyb1TdG8BSmyB0uD+YfrGUuqDi0514FXe8tJI3JwEScMS3wzIn5nVfcOL56DXNE91bGGxkrCgAEGOgH3/89H/307/32b/3Zj/7MO5v1PmOcERtQa0OkjpVYVcl3rOXwyKN8s6LEp4hLFGBpeaIQD4zkzwJBEqgEu9Yo6UK3s+vQwb0Hj8+3MgUZY0Qcw3e+C/RqYqMQZRiw+A6RKZyqVbf64g1rN6164evPPvjQgX2nZ88dG2sMNo+1un1q8hyGYaEimjvnrOsqz3WFqJOapJ4maZr01dIkMY1aYpgN+StV8tUjoFAHeLKsKhiqEpchegxRw/lgnToiYvIke4KvPaG9HkZQBqoKZg4XWGBMQRYU4hyoIDMU4ZpCxAeBFJp4BUdNRBWkTh3UiVjrnIgTiKqIWhHnnKqvtC3OOSvinCggAufK2k/B7RA0mylptvrcNVuu3777b3YP9fWv6xs2XWVSYlWIZ3opITaHTrtqXjp8+oXDhzq1bMudF553w8X14SRHJirEiUgA
TOBFJiJztCJqC2O9FJyLpWivLV+mUhdqnCpbUsXboyJd6zXeluppgmMb9JNA+obqaT2Zn2mRYwYrQSVyv8rqJwGn8+crTYGeeQIotiKgCLSJAglSY+vHDpzKO/KWt31XaphphdH+rYwVBQAABNq64fx33/G+3/jEh5OafP+PvGN8aJQ0yyV31gpUHRtAKfcUfkRDRBEQiiLqp6QRoKBg9BSotKovHeG/VCWKVjyTEcApsiybmZs9eOTYoWMn51oOSA2UQa70LSKXSBQc3j8l8f0dRSGkTnMzkmy69+LxCzft+/yLX3/xlUST+Zqew43MWYEoV2iKQrkISLtZZ6ENIiRMhqmWmDRJammtliRpjQ2blA0zYisaL3YDT0UR045UVa2i2kKxgLE9PhQzeKPvX0ZQxNvLAekNEp8K61dFFQrfCE1EnYiIr4ap6sUPpOCiIlRrClBRyIsWFUCcOCfOOeeTPhSq6pwER0LDuYIQUl8OQxkMARkyAJjFSid31NCNl2xv759+7Jlnt9tt64ZGBmpNlTzRXCAOAiKr7BzlwBN7dj9zYrq2emjnXdesvWCtHUCOFnGADyXAWaQiBojUSS2kIlVM6B7sJIzlYZte4J0WfVPRBq+JF1XOU6iBQEyKGA1Qa6RJPe1Md6Qr1EdkSCJAF2JS0VNDBQtCz8yicgngV/xEI9uaRKyYbjK1f2q0f9WtO29LkqRwuFfGGxorCgAACKhx+i/+4T/7zOc/89k/eGTuJL73+9++dsvqtJYkyHMn4FzFqbJKjUgAG7NSouEKRHHmrZXwTPcSnLkQ/F7KEPvMWlZoN3cL7fbxk6cOHT0yN9vOMmVNgyuhykVlm4LsHr4hhXohouRAUGILJyCYpH/zqsvee+vcEyee+/JTThzqiRPnjIqIUaiyqJBKEOUxbJHnDgRo18P5hmDYJGmSGGPY1GtJkhhjDDMbYsMwzFBv2SsHi1U0WPUhVVQ0AO4UmC0RJephbZIi9DSICQQRcY5sGAVExMM1iBi+hp/Dono3QOJXEurmqwhERL1FLypQkdi9XH3HZaKQ2F0xdDVmaagoSJ1PEhMAMLyQZYOD/Re99YYXBp945tlX9pxubB7dODwwPNRsGiLDyMRmSqcW5p7Y/dyx7sK6q8675u6rhlb1daQdwjjKTBAVitGOsgExAkxOBZOo4gP1mv5L4qfFEXoec+3dz9PQSohNK9stfUdeTUXEnHeTJo1mX+tUe2G6VR/kwBBCgdhRIcxp0dQWzb4sC1RRGQSFMjETd+Y6M8fO3H/P21ePjxZJcSvjjY4VBRCGQjesW/8vf+Qnf/hf/fSDf/3CwZcO3fzWq2687aoNG9emjT7bmRPbdqpWIJ4ws5htUj7NkfGu6Eko8k6DFmamB0WsuMy6+XZnanrmxKlTp6fPtNuZCAPGu8wCCc93EVcuyOBRJEA8tZSVIKJERI4JbDXh4drG67Y+//Tzbq6Vu1xF1MM54uMYhrxlFpn93oWBQn3DWetyBUGok3s5y8QEYWJiGDZskDAbNmmaGGMSwwzPYyUCkYEWGlEjHBGTHorlqqiAKlYdJqQVcpWG4ZsxBF0RDHzx0p7EqRNRiCiciDhfvtWpg3cdvB+lqLSFD1UfIIG3HmMIGo5fTE7U1+4AEcCsKm3p1AbN9rdeM7Z97Z6HXnxi/4v5fuGcB+v9owODBjy10J5amG3V8mvuvn77DRe5RtbSFhKrSiSIRoCC1DswCDBjuC2lqR8nVP5effx6oZ349VK5raW0XiLSX0WMlideslF8yBWASUyj2Zdn+fyZhebmkbwamwnRGADUezil3gsrv1rC6iEiBifOHD96xi7kd915Zz1NVuT/tzxWFEAYBFbIu9797t/83d9//MWXW1P66Jee3Htg3+YdW7bvuGzblg0jfQPiWjnleQ6hRGMP8MgsDCBPJbAZwGvyeoCYNOacMvnWX90sn2u1zszOnZyaPjMzNzfXtqrMiRITjLpKW94KFKok
FE1nAAySAmhSJVYGB5oLKUh3vbTr5LHTzYGmncvFwqVkQCRgX5YgwErsQfUImUTEgUKFOYkvqPWFhERBouoCfqGqBGbmqEkKpIfJBNaoNwaZiUOeARsTCEMU/0bHv+JJFbKuYuwHRQDvCDjnQSFxIioqqiKFyxH7pEU0IV5eATsUvwewRQsKanl+iiY2BUhKxDDBOiXuku0qDMyq8zdOnrOpNTU9fWDq5K4zJ/aceP7QcSjnNYysHnr7ffeMbh1s1Vod6UgCIjJqFNaR9RiJFhkWQPyxV+IXFeDi31J9Va6vVxgulf49mxR8hart/2rStMd/CFtKfDwZMDVOG2ytnZ+ZX0NjPdP3K9qjkopqWSUViQoDh+ITH6qLKIGMMjkDa44dPDExPnnlldcSvdp8V8arjxUFUAwFaKA58Av/6l999/vf357myY1DfX0jx49Pnzz90HPP1s7fMLl968aRoaGEG05z4a5TbxAWb0y0aiOrHZE7H3n+otDcuTyTbidbaHfOzMydnpmZnpmbn2+pMpSJjSf6M1RIiDiWY4vvSkwdKKJoPhytAXf3aQQAAJYakvq8Pv+1FwBsOndTK1/odDqUMozHw4k40FE9mh9eJYkC058lRF+9TuMiDSxEwgNp3ZvnFJtS+sOqF2zF8aJSVHgPgCOuVcqf8J9GwzsKEEUpfCK0FNRsyUoJDkGQHlR+BmivZCvcjriIGkmIVORjRwu8enD1VXkITA4hlUHgjFjOIZwwaKK+dsN5W3cOyJQ98Ozhh77yjYm1I7fedVtjtG8BMx3qKIeHw6kUUCFCSjkVmWdF0DPS5wvXqPyxsKjPitD0uqkx0lusNlWlf3GgVxlLdYxW7guBmCltJA55t50zTLwLceLlo4ug0Rcb+OGI/kUKdy0qDVK/8tQ6tTA3tfC+N79v7bo1ZwW/VsbrGCsKoBiB13nXHbfef/utH/vkJxbW1dFNBkabtkZnsvZDr7z89OH9myY3nLvpwvHBRi1JGbknimpIAoiPKQBAo3Tz5EPnRES6ebdr3Wy7PTfbOj0zNzMzNz/fViVj6v5QIVlKQBBmKFz5JsRSEai+I4iuvlc+RCKeSo8E9UZ38Nhz+2cPT2+/5IJzrzz/m0dfGDw1NdlcnTA7aEVQxlBtMN45CHBEg5TA5IO0EA15TaGVjH+PGZ7LoTHfIcri2HgACg4qTAVErNHIrjDxK8K6IpN6zVKKwHC0D4vPe9ajPNpySEI1NVujpRmQl4pILQ8YGSwU9yBHIGILAZOSEZWWdo1yYkg0E8m7+cILx1465/oNl990paPujCxk6DISn0ahLidfP0GK9uXhLleUYUEE0qqNjACiVa6nuHdaiNy4ZlUfIRZXK0CbssF6IW0rz3DPClR+Rs8Ui8fIk/gpTQ2R2sx6PoQAqD4XhZYmlFhUvH/xMIgPBhE48K5I/EPW4PqJYydlQW7eeROroxUh9rcYK2vXMxiocfJvf+EXP/nZTx85MjVyYqJ/skmGOamldc6de2Xfvv2HjkwOj5y7ccOGtRMpJ2wIRKQUemR7kiOVstKp5NZ1u1mr3Zmbn5+Za83Mzc8ttNvtXJUMpQp2ooCHU5gCbFSYar3mFuDhkcL+j8WICICIsCGGgWhdOWmbZx5+ZnjtyMU7L28PWDPfmJ2aWb9ukhIiQFl8EDna9IGsUaSfBv8mpCqAACZiYoWnZEe0IpI4K/ZhkPlVKaUhTgtCYIYgwBgBBCpcgUVJTVgij1B6JpUR9y+YuCXoEyZTzWkodg4Uxd4zRaVTuQFBUnpmu3pnK5b+s0oMkHEqpKxIjx499tAXvrp27cYLd16cp9ZJ7ljYMUMZvnk8Q13kOGmJ/cSzFm5ZZcbVeQeToDpdIDCovFQv/aDSRY3CvRpKjgd9TSvaa4kloGShoZTAxGTSBEDeySveR5Hujt5VpeJhQXETohoiKjwwgXocUeAILTpzZG5ibOKGm25MzEr1/7/VWFEAiwYRcMGOC/+PH/nR
X/2fv3b80PE1WyfSJqk4n6Bbr6UsemJ65vjMmcE9AxvXrp0cGR4aaSaGSZEkxncFceKsOutsq9Pp2s5CqzPfas8tdOfm251WnmdWIQCDSKC+zVNBx9AIgXpOOC81yCrYSI989W6AECkz0KTa7udfnj926pb33CWrkq7p9K8Zmj11SnNQzfPZlYlJXGF4RYmuRVaNrz6fBNJR0BPqaykXNM0grQpPHXF6GkmvUdqEYvvejGdP2UEUb54lGrVaOEop1Sswh9cuhQqMQqSUaYXcE0X1i0V4RymIdPESVy8iXF4U/6RqQC44AjDGWHK+njRzWqM0O73w2OcfWjMycePNN+TIbZ6rIVYmQFWduGDYk/fDJCpciFctPqdKIYXBrOVUI42gFP3FapAHSRTCEApuQnlDKNS59avFWKJBzz6qi6OLUnVjxEgBzytgw+IksIV9eKNw1sKVhgvx38S0EMTkj0IPelfP515CRBM2NVPvzNqj+078wDv+weBAcwX//1uOFQWweBDAwP/5s//yrz7913te2rv+vHVrRlal9cQa6ztsgf0mOtfqvLx3z37DA0N9gwMDQ82BvnqdiKAus1mW552s3el02912u9PtdPNupuoAMHkmWyhnAI8rVwRVJEKWkyokWPHiaZG5E/D0EAUIeVY1NWjlzzz85KrNa0c3rOlQ5lLUmukZI612e7jZYIaoi85GOGY0IyNKwMwoKPkhwC1aePGB5xTFZFWKFoiS+rL6FCRUD67NPh20gmhX+I/hXy/ICwiCer+vCOoeYVZgGdW9lso76vlx8SKjBw0JcsyrSQqZ22CCiFAgPpma1mXWPvK5rw02Bm65+U4HsciVAVElEdJYHEICqgYoQyEBHIsq0S+XqCIk1RVMKSquw8eDQqosAfA1+6CsygAKPoK3oCk8YxK0cuFcVNbpdQ2q/ESVpS6+MkyAEHO1c00RTqJi6lVPorryGlPLKfhtgAJqmEgIHZo9NkcdvebKK2s184ZmvjKWjhUFsMwgYHxs9MP/9Ve+64e+75UnXhlaUx+cHAInVnNxwfolIohYVevc/Kns5NR8ahL2jR1BTpyqOHEiIiHF1DeMIkIUiSXEE03n0tIFsEimLZ1jBevw7wuRc5oYSsk0pe/gS3taM/NX3Xldt086RkDUSGuNWmN2YWFopMbMSr5rLZcnC4m3hQ2t8Lx4T4EX9TlkfqKBs+5bJoY5lcK9sNgJWkQBUP2wCHdSpWnAMle51DwH0JMV1TsqwdLys9IpUFQmU0yoUChUbF7Ml0qJ69fIn1zAbAARZzkhw0kifSZLv/HVh6lLN7z5+rwvb1PHkRoAmov4gDGT8U2WyYmSCAPeWoaC4AMCQeGSeothyWyJyFOpmFRZSZ2AQAJfcQgxmKxckAUIos4X41DlkMdQ6u4lK3bW0eNp9X7jFVpgANdqtaiE4t0N0BQFziuqqx4UW/gq6ENP+1cHMf6aBcbp0f1Hz9m8+aYbbyKsOAB/27GiAJYdpMBtd975D7//H/4/f/Abu5/Yt2Pn+fXRAVVmEYWADdTTdGCICXCq1uUEUriAFUSuiie7MBkGizrAN/CNTnGPUVuI0MJmpUqArmKraWV/ivwaQWoShZAY08Zzjz03snVseMvqtrG++btJkkZff7fddm6YjRCXbPxoK1aMX3/QUCeAQwU4VvjEsWiboerWBzmrMQU3qoIeNntUXlpeV2HOn03lFZerSz7r3azQRKqLVIRPQ+v5sMfSDxcQoZaqDih3KMomAIAoGHBJkio5EjKu9sKj35w6duqm229qDve1peMpupT4np9QdUbFs3ahYowyITGUJEmScM3U0oQTTpjZMIuIinrtHCtEwInkzubOZZltdbu5kzy3IGKwQaKh8jYJxIkDfMMur980tJj2aX/xrvRe51lcgfjxoshMtNTj2oQV8r6oGGMUYPLFjYoFLzC/6o3QeLxo73N4g4p7qiRQSlxCbZw5NnvXfe+eXDO5Uvzhbz9W
FMDyg0FA8hM/+U8/8+XPH3rxpWMDA1uu2ZqkPgFMRYTJVwVQcc4/9YH4QhqY597XDWE5UlFSjZWHqTDlAAQ5HiVh1TSKn1QURclnjAK6QIUDx5oM+Pi+Q6cPz1/zfbdlfeqoqyBVYUZffzNrt61zdW9oKUlh/lIBMJRGcQH4BkSWIOqrXUrUAxWEN0ywGswLkiNQWKOwiYtQGveFLSoV/YdCP5TCGYW12CuwtKrACoCsYJkoqv8Vp4sRigpz1+/kvTSurAAQF0cVREwsVpH6o9T6ksaBZ/buevrZq667amLj5IK087olYSe5c8QAyBqWVNQYrdfTwUb/UKM+Ojw0PNTf39/X7OtLOanVUgYlxihgqGxRTGSYSUVMYiAQqHOaW9vOsnY3PzU9c3p27szswskzM/OdrOustxmUyRKpEyIDo2Rh1Otd8dZ41P/RYSivsBxhVcrlLjaI9kp1a3+bRMRpUkvKXL3i3lIF5+lVAPF3jZyC6o1VAowadrWpY/Oa8w07bzTkEcqV8bcaKwrgrEOh68Yn//VP/9wP/qMfPPbSyYGx5sQFkxmzhDwhUWElYSaVyI4IKGdBxY/ySxRQgaOC/aLyKqcu962ILC0N0HCWIIAJiIa6wEE47ZoXHnuxb2xwfMOaDtoK9X3giTlNE+lQZl3qb70C7OfS49dH2U0RXQIAMEQFyiEb119cIIAEIY+q9C98glJmFPBLRdx6yw+V81cFQFRDpUosuT0VG51691pkYVaEVmVJy22qxqwCJd1WQ8Z2KeZCAQtVFWYjBIFrUGPh5Pw3vvLoxg0btl2yrSWdrJZbsQShFExaS7iZmqGB/uH+5ujA4NjwyFCjPtzsq7Gp11NjyDDDF4UIXl+wCRQqLjRpUAWRg2FVIGGt14b76wLasHqVsnFCuchCuzM1P3f81Km9hw+fnpldyDqOCKy5IyZS31RONIQAohsQPNUIxZSLu8QjW/xB4SsVN0qh1kElrSWOpOpelLfI55uXO/fokFBHJB7SWx5CYp2rd82ZIzOrBscvuvASYwwKwtMKEvStjhUFcNZBIAbuv/+tP/Hkj/3m7/7m/qf3D61eZcZqqrkyjC8w6XteUcgQhU+Ois+6F2/xAVcHD2yyFmGuJS9Y7wTKoYHhF6Rq8fJEMiVUlQksbLL63Mm5gwdPXnzj5ZJqTmrgCCoqDE4Sk6UmU+kDtEzSKtBUirAVCLFsJ4Ut4DPHCODANFXxkUuo06pRF3bRCk8oXkZh/6PHQC8grx7JHangVKqLyvIUfkvlmIu0QDhNyU+qkukrC01Ld4kzoZggVp0AE4taQ5RokrKhFn35sw8mfcnVd17TqtkF0xWyBEnI1gyG6n0To4MbVw1NjA6PDg030lo9bRjyXFwyRAqfI+EZvRLdSQJEFInh4uKJyFcm9XwpZSGAWYlESfpqPJiaieGR89eMXHfhufOt7tFTJ5/ZtfvgySkGWyVRgqjxJSwQgXgUvN9y5c8mUZdsQwXdNDyJorZrSXlgeLBIAaTwvSv2oyLO5N3W6AlSSDCMN8GzzdQxUZokcHT84PH7bv2uc7duVhWEJhNnmevKeB1jRQG82iCgbpKf+al/+dhDjzz4/NcPrj286fpNXGOF5DZnNsy+5K+EvK/42EZCm9cBPj4XHlTpqUDzqkMrNlqU+xqFZMml8xR+ZgVIuc+lu59/GQnWbd+Ya0ahuA2Hog+snCQiCM3FguKAD+f6FscgxJY1ITHWiwr2kAiH1H82BCYSiMJp7rmnRTVOrxS1aJAetRaibwHfgSziPPFyJF5yoIuGS9ZCGZWrRpUfglm+ZAXDkaNOqwBcVNmgopKKHm3xzkHLNggSbgWrgg0DapwZpqFHH3rqzJHT97zrrnQ4aWlXyTLljTQdHxjasHpsw+TkWH9ztK9WT0xiEkO+oCwMiEAxCE+kqk7Zs2FBoThdsNQppmvHmYV11mAFqBhWUQtShiaJqTHVBxurBjZu33Lu4ROnHnnqyb3H
j1txjlJ1EJQwe4wzoRS7vTp1yTtRXa2SfhuCAMLSFRJOGzWB63G3lv+hks4Q7gcFkMpPgpSUSNmIyeZtPptdcdml9ZoxK2b/t2OsKIDXGAQMDgz9ws//+/d+8B0vPPnC0PqBVZvHLTuFqBIiEF4a8xE0DSacf7MLuCYQYNBrS5393ECwzXrf01LZhG1IRIgNq5GOe+W5XRObNzSGmgvaNkTirBLYkBVL4HqSKokVx4aYSCW8gkQAGKFYabS5/Q8MhLLPUbpH15sMSMCUOmu5DFmHEK8hY9WSD05qqBkTwXmORi0AiChBQSwq0CB5laq6ATG2vHSdAl7VI9rRs8ARZwrgfrFrKPET7c3Cri2Emtc6WoReiFRUVAxpirSB5qn9U88/9fylV108vmWiZVrKtgE32myet37NlsnVY0P9A/VGklAtSX02VKwBDhGBKlfcOvi0PBIoQmFV5mhiA2p88wWNl+3rVIsCSqos6ogTjvrQsBDYSGfr6uF1d9y878Sxh5569ujUGaupCFRTVRGONkqxlhUMqly9csl6Vz/0PStySAhOOwvtJE1qgzVHeaWYU/CEI04Yi55qVXEXHomoPzbBcyhISHI9dvj4YP/wBRde5Pdb0v1zZbzhsRJFee3BhJ03X/fT//xnuFN78asvdE7Nc8RHgNjcxQ+fxEPeiqHIZIP4xk5UATLOTmOsDvVkwPhiKJZjRAZoiJhBgpOHTndPtc7dca5jC/Ixai/cNSFOwEouy+atzZwvBs2qBo7VBWOMIFBfX9OjseFCw4tM4cWE73LiJSsRjKewmCRhkxiTGpMakxiqJ/XUJKkx9SRNkyRN/Z/UGGLDxvjiQgVopowYmCRQJMerhiJvyy6axm2i+4EKxbYkZKFYRF/UIhrAvYoi3kfE4hqFAA5JagpCkiQkpLmiax574PFVowOXXn2Rpl2qd5sDbts5626/5qprL7xw0+rx8cFmf2rqZEhAwr48DjkmxwyTcEpqSJnA4VkJ10ihdWaI7TMpqxCKMqgKLYoKKcWkBAOFOPU9WLyOIQMl26hjy/qJu2+++rIdGxu1rjG5UysRtonXf3ZXqvqw9a48ok3gFTQrzZ2e66v3mTorROK6hxoUXg8X9VXDI1U5XYWJpqqeO81MTIZzOrr7xI7tl+y45GLCiuz/9owVD+C1hxd2P/D9P/TQww/9xV/92a7H9227/cK0LxV1TkU5ZvRAStpdafZ4xFM9mBwkyLJjOVCopLWEiRS4EApjVhWkZDhRJ4nS3pd3JYP1iTVrcnIKZSgpOVFiBZNChwf6h4dqnFLXZs7BEEl0AQSq4vmCABlP/6dYCsifPADSIbsrpBFoKH4BmGDfA/BGbAhdAKq+ZTAAqCgZKhisAmFRhQFgC4tYq7pSGbR04ZYF/INnFDePQfJwD8rPencqXbIgVQtrODpZ8OoIzEQqRElfrX/fM/tOHjv1lrff1Ddmun0Lk2MD501u3LFh81i9wSQJgzSkiakSfHs1pZgWx5BoPGvl4ajiMVH5FDON4Xf2ljI0ZnJ471JjqhVBVADNHZTEkKaMyZG+ay/ZlhrzzRf35VZBMc+s+OfsQrXUnxWHIVRAEfjMOFK4XOZm5oZXDZtarGNY1C0qPBd/d2Jh0AIqRWQde5pVeMJVIcxqJEfekp033Nisp4bKuMjK+NuMFQXwOocO9w/84s/9/ANf+/r+Z48kQ40Lrttq0kRhRaFimRIUHcBRpNJGLnwQm0VNnSUmZ/Xn8rXwv0V8IL43hflUEHX8Nsxk29mhvYfXbdpk+usdnoUvR0SUGHaSqzCRjI6O3nPj9Vm7tfvg3uPTU5nNuk7FCSeGQI4UzKK+/VUExIt3l2KbVi+tok9PgA8QaEwL86FhACKOilLPgcoUmsl7hmNCLI5AoaEjB4M9YEn+5Exx3bT8HxUJsAhEKCWhlzxFQCFupYv2B6GIdofjaJD3FP4tMCRv8Bo11LYvf/P5Cy/aumnbGgzk4+vGLjzn
/HPGJuuidc+4IQFxaJPTw2wq8KQi1BJVeQi4EKlvChmWQQEXfZZgTaCYcnhiSv6B1wG+h67XuiIEYWgzTc85Z/OuQ6dmF9olGFZI9EX+UPkhLftdCR95/QbO5jut+YXV529CTYWdpx6gMAIQlW9BHisanxW3VSvZzRQAMbF65uhMX9o877zzjOFyYis64G83VhTA6xoe696+fcdP/Yuf/tl/9/N7n9i/avWqteeOGCWwgg3gi1yCtIDVCsQzvDnLvGRV6kUJXRcwEYDSpgtHooosK45ExGoolzPHT9tWfs6F51hjHamKGG+qS2g5rOqajWS4pmMjo1vG+uY683tPHj96ZnZ2ZqGdZdaF/q2kCKWKfAFo5RJwDUHZgLyX1jbBgxWxhJzvAiZsuILOhHAwRZfA/88ADJGE4jUiqhRKbZOgKBi0GCum6uJRlM+I32k0qatqImpXKm9Cz02pqnD4BNoizqnBmiUylBhb27fniMtmbrn1jrHJ+tD6gcsuvHDINFJR9uCfItj6DBViNhRkZWQEhCqyVEw9aiwCYlVqqKgo4HyLg4qsrjxScRHCLOMXPlSjYkAqJApHQgqSBGKg6hkJKIDgXssknmW5T6PKEn+F7EO1xuRmdnous3ZwYoCMxBxzLWpAeDpEeGQ1MA+kGvQPXbJ95F7FqUk4MTV0zeG9R4eaY9fdcG1Va6+Mv+VYUQCvd3gg6Ef+0Q9/8atf+NQX/uaVx/Yyb5zYPA6IlZx8Hf5QOodK3L5q32i0MwuNEKUUgJCWVA1kVuUdxagCiqqKoNDNC2BAYDI+vOsI6hhdO9ZBV2A5WFiqKsTghERg864xrmak2V8faKbDQ0PnO5mdnT955sz07Mzp+bl2lme5c+oxWAJIEJpNAvEygnGK4H8UHrlS6E+pPnLqaewamwwUcjd4CRSvDlAyBIEhNh78FVGvDHwfYIkQR3E/tOIY9eRKB24qBeS5yJXr0cHFUaK4LCVn4WH4T0qQBOprOKnNbbfxzSeeu/2OS865cGRs/dAF27Y1UYPNjcIwRf6UB0l89SiK9jrHmGkw1BF6fGpUjeVMvc50Kk4DO14UxEUkO5AKKGgThKsvtCWDfMFBj8cbOItTJ6bn5jq+JJAvSaiVG1NZFsRlocqHhT/qf/ahXIVRcS7t1s8cnjEm6R8bkNgDWOPjS4FpFlYSFNmu8UnyzwNRbIxG8NYDLNM8dabmLz33sqHhQeMJb8u8oyvjDY8VBfAGBgFN0/yVX/ovB773PS8e2rX7MR1o9g+saUCRi4TCKwRSX43RKGKabWm6xRc7GHqxz0p5BlSs2MLxDtY3w7c/LOUYM/vetgYwOR/fd2L1htWm3zjqIJRTAYhVkQTRpWqddQ5EqpwoDyToT9LxVcm6kaE8s/NZZ3a+NbOwMNdaaHU67dxlLu86caJWyYmA/RsfnJHoFEhocelxMD9RJYBMTC0oArMBNfLlP2M3qcgngWeAsIEyQcln+xhhEREfCi66QVYRnUKqVIg9njMDqgq0Qg3FHLeQwxcdKy3ciGL1i0w2BXy4hWtp+soz++p9ev1dV5x7wdoN69fXnWErUG8JeKNaAAYMqSEigmNfJgRcKcQj3iEQ9Sxa8bCeP63AOXVOxYe/w9R90AVFzjRF8Kf0C5WgEFZiolytqjKpYXXWtBbkpb1H5rsWXGDzpR9RaJOKJqxK2sqDquUjbNUlyglSzmonDkwNjA43RmsZtxWOKD7PYVkpAlxx5UPWffAB1Kc3ILSmY29hWJ0+NN2d7rz3ve9OmJfkH6+Mb32sKIA3MLxwOW/Tlt/6tQ/d/653zh/q7P7GwXOvWNO/pimsOTkAKo6VA1BC5XsS+6UHI7aI+BWiaQmtmShGUP0f8qI3iilvVpHAG0TsOJ/PZ6fmt158nkusSSBKDBIVXyNUnYA0cjGZYEiJRBM2AjFESZqKSQcajVXNQQdk1rba7VaWL2StuW631c3aWd7NrROxvsadihNxIqJwGjJ2
QEREKhLmCaEQP/bYvvf7FQE28JwXLaRNaeMXlqKHrggmMQwNhS5F4du+R+NeA5YQZ1HZPxw5zk18k3ef6ADAJ2GpInSn7BF4CP6TB2XY30oWQ9Yceunld33P3Rdedv7qoYGG1Ekcq7Axok697inYqX46pArygHjFvyOfiu39FEZMuwCcioNfXhE//ZgwEK/HH6UAwfyzFAhMqqSkFo4ZzvmyRZQ7vHjgyKGTp0UpAVeIlBSxHkKvyD/ru+CZnCrEwgoCTG4609n86daWi7ZwA2SIQKrEVOj20mUsckFQgHfeQ/BOUfFAsJKyyXnuxILpJlddcXWarFQA/XaOFQXwxob3l6++8tpf/aX/+s/+2f9xcvcJgVx0y/Z0iMSpQokTK05JDTMp2KNCAfUMBlU0sQo5EFNmK0ogvAEipXwDVIJQJbB3OCBgJQEZ4VMnTsNictO6HFbEkQpQMG+CXcwwEHK+sCeMz23ybWxIyad2JTVSULOWDDbqTiXXkcy5zObdLHPOtjuZFc1zm7s8E9fuZq08b3e7We4c4MSDNia0D5Mg4Dx9NKT5e4alqELJcGCSIpDtY9xXAonQ09w5Kg6/OKxQGMTQI7zkYw29awJYLxJC5kHGKVSFyQTIIchL9WFqhDBJYCp6MQoEIclKzqkxmnLN2PrRfcdWr0pvuX7H+PBILamT87m1opprLJsHJQ5tHW2Bc2hEdcgru3CvvZdE4lMjlASaa+78Ivpnp4SptMAEK0qOAAgJVHyPHwUrnKgk5MvEGiJz4PSJR198sZs7YxJRKeIMBVDpR4FIFqK2AgnF35QAX6eTGUw5pXly8MBRyWXV+klXUzVS+CZagH1EHFIQVSvZbZFp7BeDFQJSdaqWVci23clDp+64+02r10yupH99e8eKAnjDgwADes+73tGfJj/0oz9y9KWTfcPNTRdN1gfTDNaqY/ZGsHrhRyW0UETxdPExNRqp/vfI5yDfYczjNSFySSwk3rAk9c3cCZQiOXV4qtbX1xxsdrirLLG3iIJYS2INqao6IQbEM8cDB1+LNzv47QRWZk7BDU0VNZGmaLBJnXNWNBM33+m2smyh02lnWe6kmzsnzjoRJ9aKU+scxKmoOOdzvKAKUQf2lHYXQQEA5DlLgNdv7GEjUa9LFEGgR+M3MnXibfHEmQLNIRNURsDbVQAJORNBLHneIZOqEFhi/xQKUFRwI5ihqsYQwGKRWHPglVfe+923bd4wWTP1RNnfPDKIWj5o9yKRoKdcQgU7C7MrtgVAEHKi4tRJeVNQ6KLyASkUm/+A4MET/1iIigGSpOZcJqICmWllDz793JmFHEjIgQxTBciLAWRUZlYO7fnZZ+76ubFXJAlx4vjwnqP1voH+sUElpzGcGyDBiDAR4r4VFmhgBhQnUIgRNqROqUsLUy3t6rVX3pCYpJzQiib4dowVBfCtDJ8D9ea33vc7H/7IB3/0gy899DIyOf+68+r9idq2GhAHsoN41reawMmDj/oF17cUy5HbF8uyaXCTS5VQQbIVDIh6rCAe1umJ48dXrR5FyoKArYR3K6aXejmjqk5yQAADRLXj65aSZ2FC1fnMsuJ7gJWUmB1bBdSQMDulRr2eOdfpZl3nurntZrlVZ0WcE3HOiVgLa8VZZ8VaK06cdS53VhTWehKtiFBxJk9W8jKaw4UGm1li3qsPBvhdgpQODdUqUJpKNJy9E6ZEIMOIPEqCCfsGdCgKWWhsZRxcNufrPbHXT3rm9OlGnW65ZWd/o8nksSyr6vyKcwB7wlJzSVXiGPmJ5nbxTQj+BgPeibNqJQBavluQL6nhtVf0lzxk43vMAAIWiiyCUOeTxKmA2JiFXL/y5FMHjp5gSj3uRJ5khVLw+1FJAl5exFLMi9agkpQ1IaG5mdnp6ak1kxvTfmPZSvB2gHgSv28BfPpFKNKcA0gUT+/fjIRMXevHj51qJPV77r3XcAySr0j/b9NYUQDf6lCkJr3/rW/53Q//z+/9oR96
6eu71fGWy9Y2x+tdsT7D1kMeHrzwdNGyeLN/SSvGnX+kvaMcodEgIdSzOUrngdj7DOJLSMLA5K1sZurMBZddhhS+xrOX9wg2sxBFNF3gRKLbH49ZvvVEitCdXhDZNR7eZ6iSCpGCffMbsDENMn1krGrX2m7dOhGnzqk6cU6cCDlHKuLEOSe5c9brAGutuFxcnjubi7POORGnnvISMX3WiOsTG1YARlUZ6mvqibD3C4LiFK1QKwEoCVWIs95mpQrsRkH6xk73wRwvTHYlH8/2yL0xSV3T3fteuebqyycmhqHWUApRYhWC86m63gKPVNcigF+dQsSDKJag8ACOCtSps+qciMBb8/5ZCGhUIaqVKkn8Qd1QdI+UiYnY9xMgTee77qFnXnhm1z5IquSbl8W+xtSjj6peaO8oTh0eQ3jM3q9LrqlLjhw65vJ8Yv0ENUmNFM9q0BTwiRRVjw3leRUFI8h/52lDsKAOdWc6wwNjG9ZvSHilA/C3eawogG91ePFH/KZ73/Ir/+FX/sk//ccvP/aS2HzbdVuTIc4ckNL/x957x2t2XuWhz1rv3t936syZqtEUTVEvVpdsWW5yA4OxwQQTk9AvJDekQMJNIMklhYSE/HLDDcXmJiQQAqEFCJCAjcG4YbnIsqxmdY1GZfqc+p3v23u/a637x3rfvfcZiXuTH5bGf5zX8pxzvrrrKs961rOMk7ICmZEKU18LAkCOerxnh7K1Rhvvu/Vws90S7jxfJ06SMRYsFEYr55brqNsv3q4QYiMzIk4dRD4osoV2jUTVK6VdvJwFN89rkaJsHNh7WYFAwfmhfuuWzMQ0YFay6QFHKaJpIxoliqcBRqamULWg6lrxGlVFTM0aiRJVREW0biRKbGppGmlijCIqJEoiakbiYJEZXEs4AEbGEGV16MMApDEFvTqm9fCXhEVTcCuIjKuTWdCMw/lRMj85igCoWuCgaiEUQcLK2Rfe9pb3lUwgNhFSE6Suhw1Scpn1ko5mgqbaRKz1Avk7DWJ+xDSF5TkXQv60VPbJmGEysebPJtdFRGDnjJKKVCg/8/Cjn/riI4YhFcHFh6CKrgFh42X9UvhKjstbmIqA3CJRcEAox8MXnjhdFsX2vduoVPOejvYjW8wOaaMT1pbr9RkOAnJOS6AALqisxvH5o8ff9fb3cvB0aNMHfDnXpgP4cy0zK0L4ju/6VtL6e//W9z3++SeowKHr9xZbi0hRAowl3S2c4IWM/uTbOaEXPRZQd0umVvjMmoABxkZqIDI1JlgATLmxpRNny8Dz27aIShKJSV1cni6oU+pgULMooi5tSaG73bPzyZm6R2yWgjHn3ECTVSMOLd0GINe2NA5MpYUhmxaFiNaxiapC6uJ5xmQgRVAzVRIR1VKhamhEVU1Uo1qMGiWKWFSNUWOUGKOqqkLcgwjUfF49DCym6tQbg9cn8u4kC0eWbTEllmdma7ZBeRvre3aABOQoJS0H4ggNgVaeX961ff7I/n0wCwGAiUkb67ZRuaVaRA/ES6e0F8TnVMTdgSZWlbSngb2RwJExtHlMetK8pyoTNX3aMIMJSU+ITM343ocf/fR995sVAiahwMhegvqXWnvdtf/Si57a+NOcp0mGQnl0ZrJ8Ym1hYWF6a1lTFFWP4i3rpLc+OWFCvXIXfDeyT0tgqSd+tVbLsZDwpje+cVCWtHGDN9eff206gD/fIgowZv7L3/Edyvw3//bffPq+pxHtytuPlDvCWCaqIDZhUeWQsFvrB97JgGbcuG/9jQxgMpAxm2avQaA0aENMApgoDDF97vi5uYUtw63DdVpzjAgZD7dUi/RQjAyIJuJK9OZk+VSj7t3hRnkuh5kpaaoVmgKhmyIAsqw3QDAGk1kwI2ajINBhCGIWEaOqwsQV3dQIUIOIqkM+QCMSzdRMoFHEbOCSbSImUUTN4X8TE9Uoomr+ehWYOWii3rxmUVU01TLUTAydwGjqMO7bZjIgCTq1fpgMRkxsDDWNSoGK
siApHz/2wmtuv3FmyEwGggmIWSEmZmbMbve4A1TaTiq389pG7hnxIAZM1dRUk5cFmBOOlHDxXE9O8T6TMRGU0qy5tBNetcnZZG3lfU8+/dF7v1hFAnEIgdSCi8tpqqD/L2DpiSJLgLcVG3MgQQBPYfj40Sebqr7oksvCHFuoQIq2xaOdleFXVt6bnGRBfSqdU6UzJKamASFQOHPizNCGt996Wz8b2lxfrrXpAP5cKwWRZoNB+Z3f8Z1M+Gt/628+++AJVbv8lv3DXUVFKiagVEoEQXMJLKf01LoBN8JEyJMkPY1P0V/rGswACFh91jBqYGyLJ88tXLxTgyAwIMQ9pBkgZlOBQU0J5JMMrHFWfa7DdZi4fw8nvr2bHkOaUN7zXwmpR+tEOkSbgEDMTiYiLoN5hdMdgAMmVkAUYlEIUU1MxZMAjpY5/laolOr2yu1EymDU3KNIVInpI6N59dm5SuKbp2Jw3qtnBVyo80OdkuqoiqlLeRCApIXHIGIlUkZJKNgMwzC7fOrM1V/31qIMHMwsqgnDCe9KKemgpH2US6xtL20fXzcAloEamNfCU407XxgbQ3Tq+LLd5Udi6gWb3EgbDKSKRuj+o898/N77YlM4l4yV2mJPIuZvCMJ7CdGfsSxxQ42gSmRkJYUwYVqzE0+f4KLYsn/BpoUKIwUZm2kXWuTbJQ0ja6s06RpNqJfTuJKrUQoNr54ZHT50+d6D+zYbwF6OtekAvgyLiAIQCv62b//Oqen57/mr3/3co89R0Kted/Vwa6zrxkyNLJoGL8C1sVHGXbKxT4wW/1spsdwJQOpCUhCxBYCgqiykNOTBaHltdXl01e03C8w1PRM+C0sfYGACUeAigEklRYz52zkZfwC5G4c6zR+HgtJ0J4GmIDOTmPzV/RGX2po6MqaARFWyghNS4Rx9eGiOQh0ggomZd/zGKGqq3gZl0GSzTdT3LaiRmqoZRKOIKdS0IVWR2IiqmZailjrVLCcQQIwCK1qKp1e2k6nywNS5PgayAGMuClAU4kEYjE5NZFQdOXgo0dYNzGQGUXE2JBOpGcHIHH9jELLuveZ6SsfhJIJCxTTtC2XZO6YkhYwO/EsYFpSQZ6QBAaQmBim88UpMjRoaPvTMsT+5577lUcU0SPKuCQxDmwB11Wnkn46a9S1tjgwMnnSYEcOYTIhloGVohmeeX1o+u7x1546Zi+brgShJ6vBuPyDFEP1CgrWdy0itv7ljxDQQlxSssXq9WV5af/fXvc6BuD/HPbq5XnptOoAv3zIMCv6Lf/GbLr5o+3d93/cefeC54czMoev2T8+XNcXaYjf6CWoGTpQRam1DisEcaGnlhCwZY8t4RS7tAkwKY6VTL5yJwgu7tpML6FDCe7oezFRzVA7BYI3UUeOAinxb9sCftkcZGWJuqwM9XCOFgy2m1UaHHajV4tNKKfYj/1AysoS4GIO8SRjE/ilewLBgDmWLCuBjx0hUBFkLwrdPlV3zwd2DqRv9KCrRsw0VEUluBRLFjBwvShmGZySWasC+zw7PszKUzWeqlWHL1Ny5J08PaLB1ywybWKKuQlV889zOcjaZzoP0/u8O9ss9yn3+r/+HdNrazMFJuATqXpkEwFOvBhFBLHqyoQbRWKAgLr909NjH7v3C6lokngZpIHRzZPyL2mksf8bKpZEuTklvTm13xIGZVBsZ1OH5J05q1L2H9/EcCyqxiKRClB1Kei86zSbkC6d1MZblrpwTFa3gsHh6OYBvufXWshz8r92Mm+t/bm06gC/fIhCsZLzxrrt++9f/21//wb/+6Y9/eu3Myo1vunVqm8LWzcy0UUJgNjVRYeYU85Nr7wMZ6c+/BMfihbNZ9yDV0uRU01gMwtFnnpnfMV/ODoQbSygrkrJWmjhFYB/pSDE2MUaf+NIjgrthSEaQACfn54jQWnsPf3nbu2ZeMOimYKILLLkFDVpykaMV7auSBcgdUwCDzdTM0wYi0wJECjGiKLEx0TQix7RX
v0gpj09EcTcALxSbiIh6TzVUxCTVs9t/YeRv8mMgSGO3WAkCgVIxAGxuajhaX5mbnZoqC5ioemHfPDlhZqZAPcA7kFlGeey8g5ZcauczATBB0T2QGKy5Mw3ZDiu5H6CcPjKIA0G1NlhEcfT4yT+97wuLqyPioWXGAScXn1OPzI7tX73+b3bfOd7u0TYNLv5J3sIXUAQr15eqE8+cnp6bvejQbi5M2EjJSK1XYEgn3DZ8W6I3+P7lNkfvTjQ1ER0Ww/W19dnB9B133M6b8M/LszYdwJd3EcwK5puuf9Wv/6df/oEf/Lu/+du/cfbE8qvuvG7fVbsl1DVZA4uxCVyGECxH2HBait+vzvrJzUmZx544LZl6bQYjRQhBqubUidOHr7yCZsiC5Jmy6CGv1vsL8Bl7yJhvJqMAbdDe3au+eQmJSLlBh2J0Ow2HK7Lxy2O+iJC0HFI4mS1hYr5Swpfy5yC/JsXCCUpigkkOQnP/Qm84lvNciVw3zczIBdiQRjYTQSn3zRFC3n4zA9xWBTLPh0gpOwADjFS1KIgDhlxO1ta2zM+VUwUo2fb2AKafOW2DweXxjBzdaDE2P5kJ/zH4WebAJpY61txDMlrNy77t8wvAEyBlAMRkpKIhFET8pWePf/K++88srQOFl7ETW7hXUchUgBetHkCz4cF8reQGZkMaasOllc88eWwymhy4fN/8rtlJqEQlpzOW4pBcBCd0TRHt9eV9AWxgb71270QIHKzB2tL6oYOXb906x7Q5u/BlWZuH9cu9iAjEsH179v/sT77/b/+Nv7P43Nm7/8cnn773OayGoQwLMFsbHBozMQcCGBRywxcbiKwAytzARUi6Lc5sISZisNBQps4dX67Xm/1H9lGpQj59JTNI2ve4xU32Vp1L39mA7DHcnjmdMhvm9H+D95ChQ3dsg+XecF8nkMahjdzU0MOP2uYmQs5q+lFnm1uQLzCFVFpVULK2zMzM6UggTSQjgJiIKcAImhi0DAqcvjO1VBkIxkSB4FMpC+aCqCAKxEXggrkouCy5LENBXCLMTc0sL61Oz88zEZkxc66Iph1yilOLphmZuiI2ctNycgNEBGZiYiYOPvGQKJBveS7atJV2tB+i5FoWlqJnsBEJKBpiI/zMycVP3f+lZ08vRyuMC8/LnCzKadxm5tn0V4fQvMj8917LRmTBR7dZkIKY62KyWD396NGyKPZdcQgzjJAKzdReMxtP6oa4xNJ/yPlBygTNYFwUZT2OZ08u3nrrreWg/P9AqzbXn2dtOoCXZxkTYceOhX/54z/2q7/wyxfN7P7073ziE7/92XNHV4exHIZBQO7ucrtAaO0TMTzKT7QNZ6YYszF3sbiCyQhDnXn+yZMYYutFWxtESTY36UMk50FGZETEzEQQE5GYBoxY38xnUDxTdZKRT1CJiqq149rbPc3b3f1ALvq1qI5bvewI/G3Z6HfvzFBT/zAmaXgzN6++a5RRLmIK7BJ2ydBzIGYw1KBGBhPLHPukkZa4qGZOS/J6ALodBrxrzV0gFGQIpEj1aCI2i+asT/d0DrSBvDqbN9yrC60nbMP47LySa8t0fJ/J6Qwkag1iPsBoDSUUau2hNKgamRUcTiwufeq+B46fXQZNAawxcgBnqeW+aaeNdj4TX9F+S88nWNd9mEEkK8RYCRg0xYlHnl9fGs3vWth5eGfDtSBynvWGVAynNtnYsDvtdnRuLl0TiS2rNF5sZC3edP31RZFn6WyuL/fadAAvw7JsFmEF6Ju+8b1/+MEPvftrvv74/Uc/+t8++dRnj/FKOcSg1EDGaBkxpOa0bpgyjMCu+dnybIyC5wdOOhSewnAQp45+6dglV+znGW1CFcqg5kNc2Aip65/geAgMxByImRnmsHDCYAyqUDUVb/L11/sNmlAqghOMWvA8pRY9dkbCq9smI8sFZmqdSdsVihTSWiYnJiuYCIqcEwTybZNGY1RREyPLPKJM8ifNDQmSPpYM7PMwfRdVVSTG
qIKcGCVBIUpMq7bpTtOmQ9VimkJApkwo6kmcmZlppFGIQsV5sUSUIn0vJ0Azym95oA2ykyAYkVEanNlScswH5mQP1B6ihLWg9U0ggAmBAIgLQlGU4uTS+DMPP/b0qXNN4+4WRSjYYKr+Zc5y7Q40kMgBcDt/HibkyRq1fppABlJK2hysRdCiXoxHH3meiS+79jCmJJJEabwvo3/+zvP66D1KCZ/0uMeJZMSBATVpls+sbpleuP3WW3Petrm+/GvTAbwMqzOITGA2veLyK//Tf/xP/+CH/kE823z+w1+896MPrj67HpQLDgA7pG5pAHaLnrIDNcRsBCU1SkaXWMsimFiQsHp6vHJ6+dJXXSZlQyUZlDmVDtx0E8Cc8CCDMaMsi1AU7G2syOLBORCj1nAjJwgJ6umh/4bkTnoGIsM5ObDPIWXv3k0v4+5VyIiItyclAAfuBjPwpElTyFIOkGNqpDFjOdWxzhKrKQxMzOQjdCwPVIQH8ulPsGNxqhD/V1TURMz8ZWIASYwiUldCIBGbNA0CnKHk7jY1PHQb7gGwqSvZZRKQpQPU5Vuafk3UeG+Lc6kHzUfdsqcVMiFjqLf4uiyTRDq1tP65Lz3+1LOnYsMEkCqgTLkAnSoOBtJ+DnZ+ZP4Sl7HlH2TkbhUECUasIVSDFx4/vnxufcfFu3cc3o5pRaHMlHKefC37JdV6mhevDmGEpRKAGYMGPDj3wsrh/ZcvLGzPdOXN9eVfmw7g5VzJxHEAbZ3f8iP/5B//7m/+zkVbLnrqi0/d98kHx2eVI7MV4MJAQqZMQj4WhMmUiEMgNeMklGhgspCC6cA0YH7owftmd81tu3h34+AxhBNZyBuwiDjjCmXBgUOwUIA4qCUsOQWcbdSZAs0cowJo/QmyMfN7dSNI3X6Mf2hboLUMk6N9NbITQRvdInOKNBUHcoDqmUKjjZoqkgZRYkimD3SDrtnueg2YUtVaTUViFFWLolHaITamZoqkualQIWtYYsighx8G9x9kxihCCcXayurqeBTJjMmYlNg4cTN9BEGAum90C+4ZlZJGFcvZhQLiEw9cDQkmsMYaRTSoIDq5U2Fti4AYnMvELIQaEA6ojU8sjz/zyGMPPvPsqBY1I0hwMW3VxCRuyz3WHTK8lDXunaUWujfAWIkQDGZoQHGAUE5m1k/HZx99lokO3niEtnHDY7Xaa97MQcVS1pcKx5Qvgg4i81lo7eE2Upd601otUmHF6pnRq19752BY5ktic33516YDeGUWmdmgKN75te+49wuffc9X/4UTDx3/wkfuPf3kEtc0IA4hACRpkBaImYiFNEIdxWELbD5aNhhByFhZGzv21NGDV15qhRmiKyQbGwcwcwipSAoCBVfvJCYqysDMud/Vet2plBHYVKtEa9d9ZRi7Jfi9+K5MliaTPtGa0jb/773WWsOQS8JElGYkOrcdqU8qIzaufODNDqkSYanB2Emf3g+Q6Opi5m3C3mDsmI/2TJ9lDIaUkhSpf4RDXejQLlWrq2owNRONQ1lEa4RqI/+63C9N3TFtKyupKJJYt6kfwc2fpHjfxBLWlMJnI1GpNeZ+OseBmMFEQdSimppNannuzNI9jz72xDPHx2OXLDUQU9Kd5h5xl1o3DqArv3eXZ/K1vb87z02U1GUtCMhYMKiGzz90aunU6s59Fy0c2GahbqwWVa9nmyn3cjw6/1O7MnBvO8gICjVWClxSuXpiFCJffeQqBm9WgF++tekAXqGVqpzQ3Tv3/PzP/dwHfvoDWMO9H/780tFRWC4H1bC0cmDFQAJFIBqBJaBmcSYHEwIRg3xEWMFcWnHihbOV6eXXHjFutOVHEhkl+54awtDdhQZjnxcomnk8nbH2F/iQmWTU0mNuk9HyaHJW8OLILH1oih9bT0HIJB1qtwwZV9r4Xp8Fpp3NdMdmrblX9SheNSb9n/ZBE/H2MTVTldg0sWmiuJiQphwlfW6ubPvyYrDHpZ5QSPp6mJoVpAWVZVha
PtdUsa6TOhE4yVdHmOT/UoAPQ1vi8KYE08b/E21Ua5NKNJqD+V6cRq7GgPJ4AIWaE/BVrIlRTRHWann6zJl7H3vs8WPPjSbRDE4QIHeADKSkhNEe677nO9/8t7+86AkfUIMIVmhBcXpgs8snlo89cpQGxaHrDxZzFksJTmLLMg+cfeFLfUt7JXXPGLlH9C7GMKDByunVKYRbb7m5LYtvrpdjbfYBvJKLYMRkW2anvuu7vuPO19zxvd/3v3/+D+45ct2lOw9uXdi7pZwaGFO0WMMaExCIKRXvkOqwIYVWNF2Uj3/piT0HLx5sn5qUIzNHfJBkjinPQ8lvTmQZVVWtqyrGWBYBlBt+KX0+iEyVuPUm+RdHRGB5mq53puYgPHesWgYPsi/K2UQO+tIPb2jy5/I39U2EG0M1U5iQSS4f+9zfBG1oGjWJXHH2VCLpAkVpmtiIiIh/j9PxTVtxvISwczq6vjHJ6qX4110LawB4yMUUnzl3+vSJxcFgYboMMBAVRnBlVknUl/bD03F1f9l5uuTlLJX31dKEzhbSMqixz4aRlG25hrMRLCrWquqpF049euzZ4yeXqhpioe3lMGp3wrLR79v3vDbY0/waAiwz9V1/EFAyQENBMGEJIQ5sVBx94OnlxaVLX3XprsM741A0ie0l/0pA362n+CK3mFF+sGMQpPE/ICIjmGhoivVz413bd+3bt599LNEmBPTyrM0M4JVdifdHJfiaa675nd/67e/+5u++/0P3fPEj9z9195PrxyZYRdEMghUuMcbKjrgkUjkps4/Z5dWl9ePPnbr0mqubMgqZlwACkUMFjkeAjANRoBDIACIORVDTqpqISC/8T9uW7IDfclnRMjdrGbKiGjLu31r1zP9sgYZUwHXHkJoRqC0xWBIBAvzuJ8oWzMxVDdRcGlpFzYF7NUgWjItqUU1SppCgFcuRoiqiaFXXtYf/ampwuQZNKJJTi3LEDSjARqEVyW+9UWrO46aKq+PRTa+5ZjxZvf8Lj8RJ2URSkHsmkIlKO8xGWxFS3xH44xSNo6ExbVSjiZiISmMdSBVVG7NoiObbrI1EgSo3GpqG4prEZxdX7nny6OefePLoybOjaJEIZqwgNTJi82pRB6y9ZKyfk7jujGUqQPKNbCAEWCANBFZVMpTKU9X084+ceOaxo/MLMwevPxi2BAuS5Oy6ihF6znxDBroxI6D+RedkLCYKFLThcyeXr7n2+iJwzio218uyNh3ABVlExGy2Y/v2H/8X/+J3f/t/7J/b/+AnH/rTD/7p4/c8vX58XE7CQIuBhsJnSYKSng6ZQQOVQ5p94qGnZ+fnt120oKEGAeyNq+TFSW+EYgZIic2S4qS4HHzdNOuTyrs5nXuaQzK/JxOtvr1BUyU5wQnIFHbaCOFsMDXW/ueDfakrrdqGdyVfotl8OAKfZaK10TRBTLQthybGqsFcIk/zJ4pZVKtjUzd1E1PtV3z4jM8nE2s/x6WnFRah0ZRUSdpiNCnSDOFAPnahgNmuPXODId3/wFOjJtRqkcygYqLwgrC/w3LLln9FCu8VGjU2Ko2KmPjeaaIg5eKEk4mYmIPAKtMamJiMY1wZ16dWlp9+4cR9jzz50KNHT51ZbSL7WDQH3illaX4qOlivPZXJEHeNBqBOrcF32oMMtD7ew3I1YzJSDGVqfHz85L1PShMPX39oy4GFiipin5lgbfpBbedH3oQukMhflj/d3Q4RKMk5QRlcj2T93OjGG24oAtPG62pzfXnXJgR0IZbn3MxsOizCO772q2+68fqf+Q8/8+9//t8/ePfDLxw7ffjqSy46uHu4fUABkTVylRuBjCmwDmJNTzzy+LU3vYqmNDKpaChSTU3VnCaemf8WAszABGbMThfTZUnMVSNRjAKFPAm9j39k9AIJ2nFj7zT2Nk7MnCAj7xzgvHMJBvIMPyEzPf5JSwfPNNP2CYCgKsn6m8bExnQsiABoC2+5ZhEBTg0yMxUfFdDEWMfGjDsO
ZuKZEOWirzNq8zh3EDhh/grLEJcL7ogJsXERVOLs1nD4in1PPn3i2RNnD83OzVJwNq2auJIPkbElSClBHJq+2SiXR9LW+EG0NOErVYi8I4PUbKyoGhVAVOvYrKyPT54+9/zJ0+dWJk2jpiUxQZTB7NRTaj0tpU5zIk6SUdk799mU+bi3xV5fnCVHQIBaNAusjKKMU7LCT97/pZXTS7su2XXguv02WwsLQ2HCBJ+9DGvVr9uroP26zi9Z3xtYgivVjJQIWF1cLa24+YYbeZP+8zKvTQdwIVaXibOZhcD7L9n3j//RP/3qt3ztP/1nP/qRj//J6WMn91568WXXHt57eBe2shRkHqKyGYeimH7miefHVbXnsotsqjFycWcCs9d4OaDwQnFBXNBgEIqyHJYDAuampwZFGE4No0ojEpgDccvK9G3K8Cz1AGRqAWXeSBdJT1AmreeXta/om39/SwKGstEDweNfzhQgycQYUU12U7NJajeRiLw+3Cvn1k0TGx8x2Y4zbvk4Le7sdQRKCYyfBUChqRcsKwUxsQu3GoHFBDo1Pbjlzht/8f4/fuDhJ3defOVgahqpX47E30lUsLsd+JYhDXVvyVBAqvcae6ceSM0gSTBHzCTapB5Xhog4is3qyvjs4tK5xdVzK2ujSSViMIIRiwHGCbj3GUF+SDUjK6aO5G0E4ZAr/L7B6TQ5NKlgsEHB7nC9S4VKK3hSPvelZ5959JlyZnj5zVfO7pkeFWMikEK1BX3IkgBqe27Ph6D6NYKsDETwUowpcVHqYPXcyuxgy7U3XMub5v9lXpsO4IItT7UzO4iGXL7hDa/77f/227/yS7/8f7//px555IGTx08eetXBw7cdnNs9Ewq2Wpi4lGILlY9+9t69h/YMt8xOwtjYmLiWejgYMMAWmWnAtHVuduvM1NYts3PTUxy4GJZ1XbtGvRGNYxyLBC6KgjoD3ZXqejPCiHLvldmGWzpbNgNaeTW0FV8zgHy6X88dWJ4NaJBMF/ImXkTTaNpoTIxPU1XTdoKO5I0hc8VQ59XEqGqJ4N9E0ShJ1dNMOZdytReHMrcuKlNdlYgV5vNmTS2PO1BipoKaRkomLmnSyKEjl+7Y8vnPf/pzN99+YGq6mJ4JUGNwCGxmRQg+DTEqXI/VyEgJZqTGqZHWj5Mp0Dh3l4ORRtWqiQ5emcVJo+OmWlwdnVtcXVqarK6tx5j6r8m1QpzrCSinXKrTj6bUzsFOIU0xubdZGJErryUH2IfwAgK0IA5ijaAJAYWGop6bbmZf+NILj33+6Vr0ylsO7bpiZz1U5jSfgYkUIpmtkM16Kipn3/LiLNBScpmUay2EYCCmsLI8PnLZlUyb9v9lX5sO4IKu82pjZvMzM9/zvd/z9e/7hp//uZ//uV/8D49/8dFTiyeuuuXQjl07tixsG4bZLTw89aUTZx45+5bvfBOmY1NotFiUXAxLSM1EW2eHC1vndm7Zsn1u6/z0cGZ6MCymzLQhHU1GdXQM2sy0aRrhwgrqIJt+tExJjNjhnhQU56JdCh8T64Uy+77VjXA4uEWKkLEk6jRushlOFgoaTaNJVAFIVExbBVD1soMYMoVG1UxFomrTSHTJfxPJlQwzZXQjWHKmQjkWdhxJu3qlqY/MYfjwNoWBmKKpRSqpgFpsIhdzi+eWL9q19czZxUcefXwwc6WFqYJDUUBVA4UmCplxYPVeXspyCmY+fNkABXmtWAy1ai1aNaOqaeoYo4jBRLSu67W1am00Xl4bra6M62gaU5FWocScD3AaFGFZRjA/6MW9XL93kKU9KZl/hPSRrSAIxJRJnXWAwlRlYGFQl2tPLz9xz1OrZ9YvvuyiS647yPNorFITMwsU4JKrULBrzWU2by7zZhlWSjhQyzmjlptFIHghXypZObN22zfcMRwOzssmN9eXfW06gK+QZdk8GQG75nf8Hz/wg9/+nd/2B3/wuz/3Sz/3xMe+9NzWkxcfvmRufmHPzOwf/8FH57cP
dh3YFVkNMRSi0JJ4fmqwY2HLRVtnd+3Yvn1+y9xwqgzeChaiSW0iOlCrjYwNFrVumqYYiAb4KL7O9LdNuL1oP/9l+Z7sdYn14mvrRo1Q7zlCDvPSQ0lSwjmUZiomUTVqFNftb7mSOU5XU2vZQSKioqKNSBTv7e1w9WzXEjaCxFZ1H5aFrC25O4bHz0o+A9JbHiyAYaaIKBhEFpnn5reder755V/89b/6zd904rlTf/zRP9135GBZErFNTU2TYnrAMOOCVYRQMFy3FQwnuEPAMFGjGlqpVFHGTRxP6kldNXUUJ72a1U1cG4+XV8bjtfGkajQakoiDl2u453p9B5PSq189Zpp0h1ogKkN8lrAz6k4KMiPIAB/tDEQT4ghjVh7E6fpM9cBnHzz7/OnZbdOHbj0ydWCmLsfGVihpmk7tcy28fcRTNssQXx+Aa/1xOlGJM5qTzhipKII0onW89MiRotisAL/sa9MBfIWsNheg/IftXtj1rd/8bV//7nd97I8+8d9++7//6ec++cTSY6B66eTotvfcUGwZrtuyWjTVwYB3b9tyaOeuPdsWdi/Mz00NCRSYOOOtpIBZQUVAjEaU8YcYJUokZqAnuJ559mmLMk6QDXqy/t3dS8Qpn3eTStlJdLWETPtpq4AJf3D0P0qMqtEkmuTvpmwfkiynqIhBRJoYY4xpLLDl7lnkDMV8tGMaMpu/2qfn+GYnSKItc3gZVn0ImcvzBBcIRUElm2nUYnpqNKrf/89/9sbrr3j7m2598slnfuX3/+hzn3ni5tfsX5ifFipJbVILAeX00MQGIbBFJgQjJEW4YBAxqUXWYzOu66qpqiZWtUZRjVCzKDKe1Cuj8fp4MhlHbdS7QUKK0M28yNOdhQ2XT5KTcoZPYu6aeeE85Tqc9hxegjBCmlmQJQTZRIkMQQtQaVPVaXv8M0+dPbbERbjsxsN7rtrTTDXKPjeTGNxnA7OxwXX5vI/Nk4+UAeYKeAuC5Y3yH4yiKMh4tLI+Pz1z6JKDjPDnvq021//P2nQAX7GLAoi43Daz8xve9Z6v+er3rC2f/Q+/+O//3g/+g22Xbr32+mvHMp4UYxQ6N1Uc2bPv0n0H9y9sGbBOD4d5pgxBjZgNVDArghQamCHwOVhRpI4xCicByQST9KThgFy0c3OcrSgA14OwhDJ0kVpLEspYgFuC9un0BMRSx6wjPyamUR29MedCKgxsaqaiUbVuGlETiTFKbu01uCtrIWd0/VRJujQZm7TVaq3Zd5YLFMpscGFSJSgTwaBNE0tmABoHszRcPMPv/5n/cMm22X/2Q98zM1Xvv2TnW9/22j/444/v3Ps1dHCqiiPRZnV9PJia2oKFOS4DxEzVSFOJWyVYZaibetJUk0lV1bU0IqqaLCA3dbM2WV9ZXR+N6liLCZmkagwZkXoHm/Vx8eyAkTKulmSUmjSSu005QmJncq4TmB8kTvE6G1GjQlwTUyE8jFN2durhTz9+9JHnKNq+6/cfue1ImNWGIzFYs5Jdl+2lgY/EnmKwmZIROHN7narUK/0mTAhQ0wKhLIcW+ezi8vaFHde+6mpvz/jz3USb6/9nbTqAr+jllAyBFINyZmHmvgceRok3fO2dYQFjrPBU2Ltj+7WHLju0fddcUcyEdrAuEYiJXTnZodWSy8jKxAwSVROLLHVs6iYwUZo53yYg1hHGN3D2O0h2g0JjUmvoQUZtVKjoJoJlGMCRCouev0AFJm7bXB/boR6zKBJFY2yiaF03jvVIL+a3jFhZxnXIq67eWWspi8i+wlpIyVlEOcVQImiaHaZqKk3jAjiN1HM2u7TS/OxP/Fqs5cd/5vu3zljdNHPzU3/hm97+J5/43B/990999TvfVA7jWhzXdcMc5uaWD158cV3WM8NBoGAhimkjsRGtEKvYVHXdxEZEKabNi43WzWRlbX15db0e1yogZWiiRZmZ5j7odBB7R7krsOa5M+moZwIvKA2Xd3DOfM6m64WYcHLZMAoK
taBMWhoGzdBO0TP3nXjqi8dkUm/ds+2K264Y7gzrNFGVkOoa6SzQhuwwaW6kzvC0fUQE4+DXTYL+OCFYAJjJCKbKKJbOLF+568ZBOeBN6//yr00H8JW+CBzAVaz+9Y/9+K/88i9f/vpLdt+wcBrn5raWVx85/KpLrtgSBqUp+SwwIlVzZmISKTP4hCwxZaJQMAtrQ06imUSZEgkaiBAIbA4woBOQQJ4fDjippL9l1iYERJr4/h3ygmSNEuzugj5qyMgPxFTUG3HJjJ3Kb2ai1qg2GiVKVTdRYuOID6C5ruvItf9FmlioLVxlSK1lbqMcBHdEPEFLLck1izSYCTNEGzJBkFAMVcvZMHv2TPPvf/aXqtWln/2pf7Bv3zZtzEJBxgcWtvz1b3/Xv/mZX73v8w/O7Ni+3qwUXASz6dnl5bPjhS3zu/fsmJ4tDU3JQZqalKM2US26QwYbQ6M0Tb2+Pl5cXK0mTWzIlFgDMYQikboQrAaKlsx7S7xFatZO8H2yxt453uu7S3F/IukbyNudFRbI+wYp/Y2iYUJh5bRM65mph+9++MkvPiNjmdpZ3HTX1dsOTE9CjaGQQEVC1zWST3TL/QTYr4e2luRcN7+yvO2BXLCboZZdkAaGTrB8cvmqt10xGBZ9GvLmepnWpgP4Sl8CQPDrv/IbP/HT//aiK7ff/s6bFoeLOy5eePXVV18yv3tOA0tkKABhBRMJIykAs8PwCahVkLlKF7vpVBXEWMUmBG8/zZIyIMv9921zFygT6/PKEEOLQzBaKmJ+WSdw4+aHEv4kpiBK41YoyTn4vJcYpW6kik3C+hvJwm0tecTFMfyLjIg0WfgM7nveI+kFG9uvjKgtUSREyryEgVpBAZEMJMEU82F6soz/9FO/sXZm5QP/5geuu+wiKKgAGavoNA2+6q47P/vZB//kngdvesNdMzNbiGESo9jxEyeeO/7C4Lly69b5hS2zC/NbS7apYqgQM7JABjKl2MRqUi8uLq+PR3Vj1sCMvEJvAoMaLCAwZXlRAjzH60i3ROZn19pWDmp3C8RATGbavOWWUmaQRvEQwMQKgJUIQ4RQ0/iF+rG7jz59/9OyFsuFqStuvHzfNXvqaREWUyMVIxZvlU6tgQn0S1lXW69IZ95HHXuJyBLW2NWICQAHghlbaNa4Wdc9F+3ZtP6vzNp0AF/pywx3333PD//ID2OKrr7j4EVXbDt07a5L9++f1yHXEpQAUYACKZmpBYDMgWxKDH0yB2Ypz6F16QARY9XRpHLjISiGzOyixuST2L2AmAjePlayzfMTERRZej9RetDBwtkWuOi0wUTN9TtjovnACK700MQYRWqJdRPrOjYSfR6LJWudWEjUWsMcbKoTT4w8C1HJkj/OlUf2DrnAbqqZMQMzYyaLRgEWEOsabMwDyPQMQjMe/JcP/Mbisef/xY/+jVuvP2hRqBjGpkGgEMxQz5RT3/c93/L40/9kcvb5vQcuq21cBZXYFAUVXJjo4snFlTPLz/HJ4VRRhmI4HJZlCSaD1nVdV3Xd1KqkUbyBj2CphQDm4h4EtlRpYcfQRTWlP5nIn2azUU9y38lBqi4Km/XdLCBJhQQq1OfFB6mtCgWYmZvBsJpdfmzpobsfPvn0GdLAW/mqOw5feueRuNWUG4b5DDQhNSKmYGYKTRuZdfA6npilhr/kvpP/JoMqGWDsU49UTcFMZrZ0etUauvnm6ymfr831sq5NB/CVvEwsLi3X//Cf/N2VevXa2w9/1XvecsUt+7dsGUwph+hjT4woKKmZFwx6aKxllouj35RmZrnSjxGpaiUR/giTESiEgpktDRFAUqGkhOUSVLWtFAAdwu4vSGYe3h+alpoakNT2TVxuJ6qqWRNFTJoodYx108QoUaWJrszcmn4QkagRcw502zQk8Yha7CdNoVHzoqPbxha1at9oamCvcZCouvCOiRFBScXiQAaD4dbf+YX/8cT9j/3kv/j+17/hOqGaGCAhIiaDz1vR
at+uhb/9Pd/6Yx/4hW3bd9JMUZSsg2C1kIGNmEszU+HxWrOuFfHYKy0ed4P9+BEQCAbNlHhib97lbEZ91CV7yTs3ZSdVi1ToBSUKvhdclUDErK1eBhQZFgoEUTOIOwQKBLOhFHYunHj4ucfvObr4/FLBpRLv3L/j0puPFLuGFU9cG1BUOTDDBIkC4C656/2ynvVP5fa20ttua3v+TGEc2IIpGcewdmY00MHhSw7wZhPYK7I2HcBX7lIAxP/mJ//l5x76/CVX7Pym73jHLXdcQUU1JGqaRpWDU8Ezq9vSYHHJBboUt/udmNGd1BGlaWiXTsS0sjR8qiwHHMrAgZkdHMj9poR2bG0mFQKZedJC/xnqSao2ruhgCo2iYtqoeddu1Chmk7r22L9poku2SY4x0yIApB2u4NC+f1OiPLYgjyF3CqQ6s6Wyp+92t7GUwQeDW0kDhAo2H802PZgaljv/x3/92Bc/f98P/d1vfeObrxWqUTAIGoUoaycxlGIp+tpbbvq6Nzz+kfs+f8l118eGi8CNNaZgKl0ciA1KzGWh0DTexBMYJQabKShF9fBGr0T1Mfe//tqQJIbyiSQj7zPOuVYCXLKRTU+SqzvAKZVKSSaIiCioAgHEWhRNsCU+/cC5x+95euXMChFJGcNseeObr5/aM9OEWn1yaB7aRUDRayawrqKSUhFY0pTtmALtBDkvAnhgkhnDRWCNkaKOV9YP7ju4deuWTfP/yqxNB/AVutyS3v/Q53/6//mpuT3D9/2Vd938+supaEJQUWLjRJjxkehgj8IInaXPUZdXOFtcJulEu72MIiJSk9YxVk1TlYOpshyW5bAsgloISWW6SO4ldRl5XThX/Frb78oHqjBRE0BNvOYZVWOMjUhMYm3SNLFRiTG6EnIe6EgZ5edkrEHZXPqw3WT981QABUi1LfFm9qMvcrfghrUtSbfRJxnU6UMEYvZRCTTkmaHMP3jPg5/9xJ/8ze99z3vf9/ZGRqEAFWyiFBgwg2ROjgkbrPmOb/0L9z/x8LFjTx44dI1oRUVpQqqpDZiJjaGqTJwyszZvUqFec3SGxZKOm0I0k7MIZKapWts7x22t13cwk4LISJXFgIJCujCQJpeZKRXBCMHAiqEVtsRP3fPUsw+cHC/VhEK0jlH2XrZ/+pKFOBSFuaZdOhE+mQ3UYj6AdduUjm7+f9qxRCrzjmFHJluumRHEjIghNDq3ctuNNzFv2qVXaG0e6K/QJYi1Vn/vh/+hDeI3f9fXve6rXkVTNUFUjS042oOs+25ppoY5y86ZdTlUTNGuWb7zMobjXbUNUEk9rm29rodlPT0YTA+HQymnBoNCqeBQENx0hVRVBoDMKQQAlwJ2nreoRtMo2qioSCPSiDUSE5Mn9+56QmBmmox+m09kXCNDB2YZVW5phznqd0hBu9onLDfEutwnZXOYPjxVEpJf4ewInfgfmILpkLadeHb0h//1v/2d7/0L7/sLd9VxdTAMHEKsYyBSR8Q80nUCfyCTpmD9R3//+7/7b/3IqfKZXfsOiAkHayBQFFw2ojALqRRC/l6vZ7Q9DF5qybAOURooRnlrFQxNz7T4T66OZHvqYT8lzM6r/ShSygMFg9X9lpEFGBNzXdoqPf25Z48+8Nz60mQQphoyEAWynRdtCVPccE2k7vAUapbI+ZlRyuf3HrRuoE2zcq0ibWX2UvBrxz1bRABrjWbUXHvddWVZ8GYB4BVZmw7gK3EpANB/+vn//Cd3f/wbv+PN7/zG15Zz0bQxRQBHkUEIaewqEuiew2VNesQOh6ebyABy6TTbECGrqEQgRhFR1E0ZqrWimBlODYfF9GA4VRSDsigDT3EomArvSHX0PBtlH8YrKmoQaCOxjrFppJEmRolRKhdri6Jp6G6i+SdblpmbHlRqG+OnzSfz9gBzAqhlZ9Zy+k01YeVebkTixrSuqguPU8jpfVHpee+N1cDMylM0XFkZ/eZ/+aUf+L5vffdX38nluDCAEGOk
tlkO4o1tTAxYU8cyFIBsm5395z/8w3/17/yD6eHWLTt2NDpiDoCpRgI7udYhNCMlN+dtMRrZowJtiYVaFnyLqni8bzlv8eeS/ma7eUgNFVl2yb2ekc+7VGMEUACVEoomTE7pE/c9ffT+Y/Wo2bZ924FDBx595HFZk1CUoSisICVhRDBICEoG4fZAZ6y/d+W2DtbawQLUnopuuzNjCwCnowGh0blxKXzVkSsDbVr/V2htOoCvxNVIPH7y1D/9l//qmluu+Kbvevv0PJM1liS3nC+RwNwU+meKhd9UQkixXv4fqG3dMU1IuapZFK1F1cetwBq1IFI1cToORuuT4aCcKgeDshgWoeAQOBS8cX6XQs1EJJpE1TrGGJtGJEaNKipR1KIrWOYmJTM4YMXESX/fyYNmfX3QDDkoYNGrlz0eaAJKLKu75WREiSmDDIY0Hgv9wrivZIkYBFUtgxWBS5sPNv17v/xL3/zOt77zHbcp1huVoiw0wvuDsxHzbohAhKhiBZpgJkqIV1560Y/98Pf//R/9yWtuu23H3ovVxkaNmCgAZTPOAtUMuHkWuBZ0Bwy5JzO07Re+7+m3/Eokzo8XAvKm5YoAQQyWa8ySkgQzM2JjhMDMGng0WHt+/akvHjv26LFqPN558cKR6/fvPbz35NILJx9bKyTwxAjRILnMwkTECJY1nSydp3xQe1BU55/ShqVHkTwdZXqB5EEPzFqsL48K4Vtuvqnz2pvrZV6bDuArbinMEH7kH//IWOp3f/M7du2dK0KjoGBEzBotKbZ39Aqn3rSquwxAUxWQFfBE2zQh7c6LhPsCkaZuoogklpCZ1gVxVVeDoijGoSyKYRHKInBgJgouM0mmRpYkOVXUXMotjWZ3sNnpfo7KJL0x8oougU1FLI3m8vzEAX7LtJZc3DSDgRPM3KY01ivspuJpxqeTNMWLKIReYmjBpECBQGrCxEwoag6Y/rVf/I1brrzqW7/xXYGWmGEcYoxFKMkKM8m9bs5rtNSpaySiDAvMbPbG1976ne/9xp/6xd987ZvetGP3bCSxorRIpr5zxokU70Y8zTukXD/NqZylgo2RpoYM86YtRhq/m48RU/ozOUICCRkxWTr6xiQ+/4ARCrWAIkiBNTvzyJlHPv/42ecXwXTRkT1X3HHZ1r3buRhuv2TXqadPkoV6HEXUexbMlAwgbqvnLR3Lzu8PyacjP9i9EGTtqfJnErIkoShpjeJaNTc7v7CwbbMH+BVbmw7gK24Z9I8+8qHf/dDvvuGrX3fnXdeXYV1hHioppa4uNfMR8DDNmH7WTwPDlNw0kndZiRoULGbis0g8vo5WN1FNo5nFXGIlrsyqOlakw7IclGiCBtZciBRTcdWclJEYDKTQXN3raKbB0eeEmLsenbZ0zWT+nYoEQM1cNCZH8NSWHNJQd294SvhIqkTDWlwnI0MgELhVH+IMkntQbaylERM0SiwGysyFzc4OtvzGz//eQjn9fX/lm2HLRKAQTBGMO94RUvuSqYfqCgOYgoG58M5axOq7vu0bB8PB+z/wyze95XVbdu8mNIFrjSoEZjJ1nJ7JzA9qAnASgyYVWtPuZM6OkU/2DACR93ukGlCHdRFAxgrzsZJQIGoAcfDtZkJZTsqiKepzzdGHn3zi/seqUT2Ymz547cG9N1xSbi2aIhqqPVfuefyeh23V1s6NueEwCMTiJl/TtB7KbloZnBWvM/yfO0Haq9l/dLlAC0smz+0elS1i8fTKq+98U1GGzS6wV2xtOoCvrNVYXFo89fd/+O/uObjwF7/z7bNbGk2gKZDFLmFw22+pJbat9WoCfFOYC4+FXRdAfHJirg4AsCypr9qGcGSW7JuYTeomig6Hg0FJRSi8lwySOgecwEGAGWv68BwepuKkw00QVbKeOKQmBAomlrFqsqwQnQCEXDnMfBKYUvZ0QK4QWz9fQJKeyaASJazHwGRmbI6OAWTKGko2FTJjmfrEH336zNHjP/ajPzQ7HYMf
JVGAmJ0Bm7wsgb0BwMxCHiUPZjOhQCIxBApc/+X3fg019H//3C/edOebd128W4OYzzJIEwgoad4lFCe5bUpwSk4GWuA8o/lIChJkZIwcPSefAHOHzK7oaTAumZmCyyqxFgOb0iV77skXnrr/ybMnTjNj9/6L9t9wePuRHTQLK1SImZqZPYP5PVsWl8drSxXGMcxAWi+bcrjkT7tt7NbGEL9bXRNYerNZbjcnA0yM6mC13nrzLUXgTQjoFVubDuAraBmspOJf/8RPHF96/i9/y7svPjjFQyPOiEBCX8iynWj7rlqo2wBNgFAyHa6044Qf9RYrR2lg0TQmabVWYSZLOvikPxUj01rFilKlLIrAKMDkpD3AI1lRC2DzQS5p2K6aQjXPX0eu2WrL58nQRkuqSaWMXCRMQWCqZlJSr3SLQa3yfVv38OOQ4GlKTBqylENIchrkcghRIkIkJhYe0uzRh05+7IOf/zt/432HD88RReVE8nTJBiDRaAgw0mzICNCAYEYEVphF44BoMZhwKP7yX3rH3v07/v7/+RNX3HDLocuuLAZ1FcYqda7UKwHEHCHpvJmko+/7bOwlDs0dHESqpEwMQZJM9RNsILVA7A0fSJgZUWBp1JRCMxgiVOeap5949vlHn148dbagsGvfjgNXHd5xeAdvL3QQwU4RUlev23PZnuWnjk2WJ/VSNbVtEEPIpFzK8FPL5W89ft9oZ6rniy9wPzOdP6BEEDVrKp0qp1/zmtvbJsTN9QqsTQfwFbTM9KFHHvvV3/ivV1938PVvuWYwh9Rjb20dLb2S8r8pdqbu2axx2fFgjCCKaBpVo/fdA4B5AS4Hm8mSti28BDaQRjOJ0sSamJkZ5nVg5kCB2DFoJVUDWVRVqCHNcXfBYFEP+x2wcUtKAMzryZxVioFEWKEXDQK3rtmNWpTE99VfTtkaZmiGkOJpR1AsNQwzFErRq8kiMtSpuGYf/rWPven2N77jba8uB6sKZhAxkwWDUJazRFf6dOAdSG25DLHAAQRTDWwulRxC87a33l6P/tqP/8QvLp9Yvvbma2d2TFUFVEQsTTYjnz5JyffBkPyqzybzDts8z9GgRKQwL22TEbdWOQ1MY3Xup3KgQBZMUK/Vy6dXnnn0+dPHTlRrk9np8pLDB/Zcsmfhkt28tdTpaCRgteB9XjASULH34J4np1+QSVw9szR7cE/SmSMzjQoF2BtCOtebLpj+aesxlbIj9+YRbxlPowvccTMVCJOxNBPdv2/Ppu1/JdemA/hKWQYY2f/1E/9KEO96x+2792/jEDXl35YrfEBKpz0GRroJe233SbIz60AYoEbi1l/F671eFhCXYjOcdy+7Mm9qmGVASVU1jWrP5D72Ts8E+ydZsBwSWiIHdtGhfyzBiChYagrygR8+GD3ViYHsIjokoQWB/K82akxeMR+VfBQpzXdRnySjSfWHTdSMyKgmJoo0xVPcbPnwb32MG/nub/u6opxQoEFZxkaJiaCZVJlD8Cwk6gbZaZwOyAQk0QWYA/FKVlmsv/qdr92xY+FH/uFPfuZjZ6+84fZt+3aVU6sTnXiBXFQTqua9tGqcavltNmTZH1NA8JyOWdRIhYzzyScBJ4WfoimoLkJNo+Xx6ROnn3vy2NLpRRgWdizsvfLSbQd3Tu+c5pkilqohNtYEJmZSNVeVUwTVOLdrbuvO2eVnV86eXrpILobmInw6wrm7EOg1g7Wdgd35xobTmH7Nr/dCMTGYFGhQj0VrjlWz6QBeybXpAL5SVtT4wP33/sGHP3jTG1910+uvGgxUGSZGrf2HY7/ZTMP/bUFwo9ZFdL03ZDAxjSaNxCgi5nx8aVSiiGm+sRPunswwUdGmEklhKGu8a7L1jvJQpusbzGvPKe5Lsa1vJJGHtHAl0l6DVgY4kPYQyLIClDOCtK8JKQJau5z2PzckADmOVsegiNrDR6YwNoqkgJgZKxc2//Bnnn7wc4/82N/73w8emlJaMUMToxNmWj3j
VsmY0LN25mkCAeAEPRlxUBhEODCIuAyw5jV3XveB9//oj/yjf3v3hz964LLLL73mwNaL5mOIE63AkmUr3CQaLHDCfMxg3lYNEJubZ4UqAgGBDCZExkYobEgSOMIaHq/WZ184e+rp588cf6Ge1Fu2zl961eHdBy6e3b01zIVmUNeFGtVEQoSCSVSg7dx4MlOmEAratW/bmWfPrJxdQyQrU9EpZXBdhaIP96cIogv602VlWSwon67UtWDp0PmeClYW1y/au2/Hrh1fnttpc/3PrU0H8BWxvOD40+//WczVb/iqa3dcPJBSQgic5155KN1133YxsaXwLINAbeJNxmriMsu11HVsGkk6ayJW1dEpQLDACWOhHhc9FxUy49L1RY2zIIRRvpOBth03dzYRoeDUI+o2tIV7AW+hTaCOqkHNRUpTMVgzJNKCRabsHaeZUoSU9KRSrbNJU11VE4pFasoQzgVUMzOLVSwGVpYD1ukzZ0af+NBH3v66G17/+suLciJlQawaNTB3ovVm3lmQgKpk+5W99EIUkp9MRVkigNhEjUmgTADksisuev9P/pN/9W9+/qOfuuezJ09fcd0VF+/fMZiZKaaaJlYITEkVO3lDJShQAJ5bAOz0pcBEYDVix4GESilN2SZYXRmdO3Hy1HPHV86dsxi37Nhy5U0Hdx65eLB9xoaFFlaF2siYTFU8t1MxGFxzzbLPDsxQtUALu7cZ62h5PU6Epnzuo9dX2s5f6nF1LKdK3bWTL0J3aBmjtMxNa727gUCDoqxH9c7tu5FHxGyuV2ZtOoCviGWmDz/06Ic/8pFbXnft9a8+RLOlWpNql2QwdoZPC4q090g2uO1Tqcsm5AqpwB2ANFEaFeejS5SmiVHFBYOt/eR8R6vBxYk5aeU7KORf2DX/ZBYOshCFb0uSifAZVBnRT+hAauBtIRxKb1VLrUvq1KJU+fV99PkB6EoVLYvJVKWPSACgoOybISZiZlAWgwCmZSi0abRRbmY/84f3bCnDd33n108vBKWGwCZIfW4GhbmVB5jTOJX2GCeWKgwKyeeFkxt0JQVHn8hBnXrr7sGP/Iu/dtdHvvCbv/LBL973hccfLvZefmDfpfumt85IYRIrEmN3XCagAJcq8iqwEYgYKKUMMnCsqKqq9ZXR0umVc8eXzp0411TjqZly78UXXXH9ke27dhQzQQZWc9MMVWxiwczFYs0YRMYw8fPb7+MlMjUNCAbMzc9wQaPV9XrcDHaECPFrzc8x1Hm2Lez/Ypu9oQZMieCVKiqdGwfSxdsYIm684UZqXen/5J2zuf58a9MBXPhlgJD+m5/5v8I83vr1b9yyaw5UE7OOlZi0w8TJJ3Gn6Df7AR+A0qKtIeOwRhCgMRlLXI9NrVHEiEgNtUkldZQ2mAOyME0mj3Y+JjFqmGAEZo0NZRKqG35C97/cCZAcU6ardmEmCDkaTNF9quRKxn9cwZhSddqQOK/tTKvsfpIWEHIm0SL0fnQUJgHRx6q4YIWpWQC4lMET9z3z1INP/42/8nUHLtshoaZAEJCBQsKuua1ImKZaPDqrRdmmEVMuZfrWam4wNiY2IYDACjQDq+968zVvf+stjz/29H/8L7/7sU999tgTR6+46fpte7dPzc5pUWtTU2FkJipFIAaIgocADDKhteXJyplzS6eWF5eX1ydrYnF6y/TOHdtuuOa6LdvnirmhMMA2ZlOKCmWYeQ+zpnQunwNttdw6U5uUOcj9wPTsdFkWdR3Ho/EQM87Ccg0+UO7Mc0CyLX5kCKglc7XFG83XmFl2pZTKWkwM0WYisZZXvepVbV/f5npl1qYDuPAranz4kfs/9umPXHfHlZfdeDCwOeBLCSzNfVEtDn4eSy71J8FVvhJ9ECRqjclE4iTGuomipmpEFEXHk2pSNaZELiqXSJo52CfKUJN1KTmRqGkUEKGdSZI3JbufzNQnO+/ZXCHuq/TkhML3iTKEn0P81thqMr5JMtQ3OKkJufU1j8dT
mUI0ehuZigolUSQfgts0EnSIsd738S/ceOWV7/jq1wSqxECmjJAIP5yNeaYWtcg/uhpmrlVYt39Z5AC5E5lgRmwKNhOCgKNwPHLl7n/8z77/qSdP/t5vfuT3P/xHTz4qV914/dbdu8JwqsaYApHPfnHpaCYwlGh5eemZx54alvM79u06eN3hqdmpwSBoIUaqokYSrdLgNN4u/YKi26p0oPOZ6HgE/ZIKzGAq09MzxTCsr1bVuGbME+rs8XNq2aWMKetpwccN1r/73HRhUaYROLDIxADWVtchdnD/gc0e4Fd4bTqAC7wMKDj86m/+RgyTO++6ccu20kIkI21iFnKwRI/0cmAnstV3AylwZ8BYCSTGFcVxbNabelw3MaoqABZYDWliFFEYMwcf2UId+5KgagZm55d6XEgAClYC12n8YuKfIuHCidzdblOy7j2dmi5peTHKm0yr6xqkt1iqD7qUkCWVCII3EQOtL3ATpMk5pN40U7BBmIgAgSmEAwZhiqvZh+55tBkvf8v7/tL8loFyDEVQER+PqznlsERWRGpJTu7AMtrdOYKekfP9Sn1MuYYOU3XpaUDMIsCI1VWXzl/1/d/4vve+7td+63c+fPdnlpePHLzmWrBpbIoA00gBRhBTY2Zgy/btt73pIhDXSDN1aquVVEkpwMzYCM6uTEc084f892ygLfc2tCmZv5p6HSYGCmUoQhBRqSMTm1Gf+9/ubK4DtBlAxiT7ykBtOSeBlaCWt6wWTaaKQVOtz07NHD5yYBP+eYXXpgO4wEtMzi4tfujDH778yn3X33aIpxQMFeNQwHuSbENS3PF8suWhtiDnubYBZA1kYs261OtNXcdoAjZmhpo2MVZ1LdGIoJZzcXcgKQInECXmvm8Ag0DsPE4h4qCdDgTQqwajRyWlVn25d0t3Br5/mxNZykLc0LvQUDLk6SlzIXp0vWRtRQEJ5cj1BYBSwTyBQmQUYEy6jvVT8ZF7Hnvra2+49dYrFBUFAMYc2uSK2oyhzVdycTyjJCkla3Md32NLBjGbvwwaEShLeyppYbBhCdUxKFxyYPsP/sB3v+cb3/0rv/mHdz/46b2XHhnMzrnOh7FnVIxAJIrAE1MgCgvYRL2UmqlTRjC4DqzllCfb0uSz8qlFr5byUovMTIkRipAmtyFdVcwMUtHUG9Ev1HccLaQNa9OB/FtuV+l9rROEXOZ16+wWi8LDTYv0iq7Nw30hl0ebv/5ff/3s0un3/tVv2LqjDMHLlmBmgeZguJdx++IOJrE2HTAAaer6ROOoqdaqatI0ospGTjBU1bqJVRWbKGlCPOVwLo/t9m8iI1ElYuJkZOtJtXxuZX5hexiE3k0NZAihpYIjWYFEjk8bngCEXBJQZJIR+RgwE7iGQ4J6svXPJshy0Tm7Acv73nqUVl+fciuDAmTGyiBIYJRf/PR9U4y/+BffOTUFKYxDMFGfmQJiSh4F3T8bAuV8NpLDTDvXmjzrkqJUI2gVqk0jeW+xaNP4ZGYzUmuqSw/s/KHv//bHX3jhp//9f16rl7Zu36NK4qrOBIsRRBxio7kvOJ/5tvifbDC1elCGnhfqZ4o50+r9nU5QmzpkjJ8AGDPnRo1ECm6loKnnzbuLwdB/LF1L7fWFNlxwdxTKMmiFc6cW3/yqO8syYHO9smvTAVzIJdA6rv/qb/zGxUf23HzHVcNCGxERC1aoSr7BKJtWApCbKD3mA3CepSKB1RpHcbLWVOOmjj5UigkEn/s4qetJVVeThoMRcZmxerI0QybbdgNItVYyZrJamnH1+INP7dsb9xzeYyUAJWU3M9oOX8ySAW2gvhGosvzR+Rfzf320uyM9aTdzKtB/NfqmpTV9yIF/SjhcQdmdF7R1JWXNy+dWjj32xLe+750HLtutGPsLc4SeauEdvJZqAEY56kci5xjlyVzWNil3BtThFM05QoLnCg6AmTEFFvO5WiSQsghNHKOYHNm/8I/+3t/873/ykU/de//s3E4pgqTh
bQxwlJx1mHFy2SllArVz2fNWehrTP/y9A0fZ3G/IAqh7lau9qUQKPBgMiJjaun/qNkZqSc4pXuZlZZSpzU/bM0/IdIHklAwwVVgY8GD51PLFr93Dm4OAX/G16QAu5DLTuz/7mcefe+xb/trXbFkwKiQgzds2TVFlDtDbYN9Nokv8OjrhK/V81aqjGEd1M6nqJsaUeZO5MH0UbSptxhLH4lWFJgPZXQtASi6S7otoDGCORFLM2Pyj9z6xa9sezGulDRTMUIWRs4/QId/tPiZmSNcojM4AtSara+RKMb31tqPdJjrvY9O84/bfTColUyUqoGSmYsKkJYYDbPn8p+/etTD91W+/LQxiJC04QNXpTb7HuV8t5xAMV4NIwSu5OW63NVu2FgbqYv8WI3EJaEdWkriGpW5tM2ijDDZCg2jbp4fv+5q37Nu548Of+RzRnAgCERdcq5oph1JMAuCyEBndyocjfUcHvRgS8GRtyI7WjZ4HxqRz5CkaDKomgqIsZ+Zm1FRT96/1St0pmUgNvx3da0P4nw4LaSo8WFspafv5VGpp1upD+w+GsFkBfqXXpgO4YMsAVfzmb/3W1u0zV193RTldmk4ACuRSvyGFwVkSwBIUlHAUc8OXxoYowGISTcfSrDXVqJpUVaNeQWRSaC0KUJSoUbQxqVUt80B7mG3aMoMRFMrEgIoRGQ8w2H/wkqOPHz/37NmtR+Yai4hiZEYM8kIhp7D8fDuQrEWOEnP0mu2I5S/tLephW10OkJ7Lzipr0uXuIpcpcOPmfoktFGRKPCkni/L8E8e/+1vevO/ggpFwQUQg5Ra4z71ouUpNRL22pGzjkNE3S3azowl1JjZhM14K6RWQW5zE94qZxRjQIGUgkqYpC7rr1TctbJ3/tQ99zGjIKBpRBCJlJsqS2amWa3kcTnvaNgByG44ZNqZQLaEnn6oEXpFBjaHRmqouy3I4PfBGPNfVCOxC2z5rxzIBrUtQyRi55pyvrXY7KMcyyU0SoQjFZLVpRs3s1Mxm+P/Kr00HcMGWAUurS3/8Rx+99vZDV1y1DzYhLi3GpBpsRK6b7yiCx6CqmjFUUQXUx2xTMDGpNE5MViaT9VjXjWgEGxcUACjEk4D18Xh9NG4aUYUknkui1vemsCawIQWERg4CNFwPZoe7L9r1wBfuf+PeOwK4CQCZT6k3kAv7tyXZjavXxHz+T3ThfTYrqTiQkagUlyYr3L3RgR6kakMGhcwJTGYkLk7BMixs6v7PP7Rzy9TXfv1d5YCoAIg0WpFxa/QYr0BrtRKKTwT4XPp87jQBHZTNPNgtbFst9mYxAD4MksjAbp0TbJbxGxCrgMw4lGTgZnLzZUfCYOZXPvghES14GCPUfDi7qVlgR6C6QN7bonsl6d6x7Y5VPsD57zbhan2zQYkYhqpqYqPTW2eHc4NoE0tFevdtnNNTZI5ur4aPHoKX9zBN/ULuOMwTygxKas1Ig5aHDh/ItLLN9cqtzZzrgi2D/fFHPr7erF33qsu2zhM4iMsXk6WRKig81BIyhaqpmKlaVImqjseqmZE1qhON69Ks1dV6U1dNbBoVJWIGkRqiYFI1K6vjpeXR2npVR1UDKbN5BBvYC31tJTF5IJ9CxcGHvBAry94jFy8tr547tjiQoDAlC4xcPm7ZJj423DUAuDWU1CFa/f8SjmKGpE2UGaAp6fEsiFIFtP0vF5BzxTjbPMpImZoKxcZq1qCj8uSTz9/1xtfOLZSGiVgUM+ptAnIy0WEo+QsBKJl3M2XL5QxVtSR51EsUOiSt9VXqrRN5gg0SmOTvs1xeJTKwGohCgF59yZ53vfFO5rHahMmKIiTCFmemlO+oi6wiW/NsW9ORQ6qIoGXIbkB+0pVouW5sWYSjXm+0trm5OQouDJ4SnnavX7z6H5dzEeQjkDZGYZbRJGIQYI3Wq7Iwt/3A/ouZz8sbN9fLvjYzgAu2FPJrv/1rxSzd/NprYLXbLAZ5/G8wSTYQ6hLO3Z3m4ZSK
qKcAlWEs9Xpdr1eTupZoYgQK1JjWtSksSlwfTZZX15dX1tfHtSgAcIaVOrEuZCQY3vqbvspJnxAio607tuy5aNdD9zzw+p1vGA54PQr7lGIv2WYsHF0xubUGXayNNrRPZWB/NMld9HApy7x+AOhbmbyxoDYKzXmDZwpK1FhDQFEMpmzL04+eLK2+6203hWloQQzXIGqlqPsf2C0vd6bIltKQxcwB8mGXXTEeKfTvEJlkVQ0gdZ+RFTyTozBLgmiJsqPKDGWGYQrxNVdfubK29rEv3FPVVIQpjaacvCFaIMWyEl93uDekTu2mbUCAzsfb/LVETFBlo9W1dVPbsmMbCoiJ15zyddt+YvbcyLXdjHzlU9x9myX/124gkxHDIDpaHh05clkIAS/hnDbXy7s2M4ALsDwkq5q1z33hc9sObZnbU3JhUDb1WDCrdao0IlGiREmzXMwkZe4kZg20gTjov1pPRlU1qWs1AYGYFFLFZqLN6mR85tzKydNLZ8+urK1OmiZpKGgXlydkIkMKlFr1Qdabu5iCN8LVN157+tzKsSeeC1oysbIp1A2h5W4izwB68IO1lB+0WI3/m9CHDEi0x6g9UunfNvDuf1z3xhYS8UfVhAOBYTWmZO7Yw08e3LP9yGX7eFAggEA+FhjwHIjacfft16at84PRzjjI1j9wknLesNn5GHaHNX0a57owkHvu2kObWEzOWgJDmUAFFWWMb7r55rtuu01lNQThfLN2jjAH/T09p+6fPgyUDlM+9m2KkE9V3m1NRd71lQk0zC3MgtXjgYxrcUby291tOwjbiym7/A2HpTX9eQvNNYXCyuLaRbv2bVKALsjadAAXYDne8dSjTy8trV7xqivKWatRRa0UTWNNVIkmjcWooqZpfLqYWh4IzmSswiaEsdhq3axNqknVSKOsoeCiBEOsrmLVxPG4Wl4enTm7vLyyPh7HGA2KdqirtRCC+v2fYI5E8m6Z7m4bmaxAzZOZ3TP7L9n/mbu/GJdkKg7IyGfEU3vf52ZTahuEetBN95L0jR1E0dl9mKmZ5vnB2fZa9gZooQ70DU2LRqQp6og8U8w8f/TZ088/+7avvWtmroCJqZizY9POAd2cW/Q/UZEG3RtSOSYBN8xMzAn7yV6zd4bzcfNjSSAmYwYxegbSUZmeiCb5FB73ooqCeAb06qtedcct1zXNmqoSsQ/xsRSJd0iLok1G3LimNGsDOtMCUNR/JLvQlKUAkdZW1pnD1h0LQprqUG3p/UV5WMq5cn2jh4b1HUViNHfHyIgpqNJ4Uh289HDYtP8XYm06gAuziOz+++9vtNpz8XYtZNyMG60baaoYG2mi1qIiSAIIbvXd8BNYRSqRcYxr0izVk5V6MpY6mhhAYI00mcS11cloNFlZHZ05vXL29MpoeVKNVSORBjKGUg+p9TAyhfvGZmRKCu//z4W5nPirBpqguvyWK2D04CceCiMOysqszuNPL0pG2+17a7uTC+hiaTNYGhtppim9sWT93X5l0eDWrrSpQjb+vWQAmV7jBk0NwEyYeuL+x2am6Jbbri/YSDUDIoq+58guxbIZVC8jePux5vg1uUImr4pQL3HABrOKZCr93JFzSVPHhbd4eALRt4qJi0tkxOYv1KkS115+eOvclJGaCiWkXntHIuHsOTl5SaBng9W283733C37ZBJaObMynJqa3jIAfH5ZukbO2830ARu9X/7wHhwG9DLNtnGPVC1Wwo0c2r+fabML7AKszRrAhVmE8NjTTwiNZuYVFNfryaBElKiiTFBXtEeeFsWJjy5mqqhUa5L12IyaSS0qGX0X0bqq1sdNVVVVLVVVj9fH4/VaxCTCI9AWJkmhWpahTw919zEnTIGyiIzbSwUTKWOwc+r6W6/+4kcfPHzgkq3XztWWdHNMBblS6mVNaqmSlEqVHT+RMmuE8pdSBrRdCW8jIpRt8J99UBMJJ70wUKESY62nT77wqusvn98xo4hEZirMAb2agbN4MpZk
LcrSUhy9Ouy2H275TbKwTaK6dLgMoT0EIEuwiWl2p270c7bE5ONqumYz8sfMwJU0Y6kZumvbjsW1JUWWwAMs8Y5MAJBp6qBLn0I9ROw8698e/O4vMgMYHEAao03iaHVlftvsYGHQ8MQgCfBvPdqf4Vd6+9XD4zacwlwHsuQ+q4lapIMHDgTeDEYvwNp0ABdgeUD58CMP7tw92H3RrKpRMahN1JQD5Tm9TvxkkKmxqSnQqFYSx9KMmskkxmhpnoqqTOp6PKomk7VaIYLJeDJeq5oqSmPk7ArLDcR+K1MvdOuMna9O2cXNdQJzzMBkajEKc3Hw2iOP3fvMZz5+75t23zHcgXrgg39VRdRHYhkb1OM601avaMOiDeYBie+DHvoEtPhKX7Ny46e86Fc3pIapYnrpxNpkZeXyKw8PZkO0CQEMpqxZlng/uam1jfNzJN3B25waYnMYm71NGxdTKmBzch/dATWDT1Zot8w7DBKlP81J85BZATYjUrKJ1CONZ0bLSyuTgqcCFaJi6Mosbkxb6Z0+9gLqjm3eoZ7j9Tf23uXwTwAFK6uVuqkm2w/swZQpRUuNxvkMnXfEzzsD2QN1Z6onFZo611zDwnlRtVktUsUXXxub6xVYmw7gAiw3GxOprr3+0m3bt1cxoABpY+zjRQgwFTG1JqZprSqxaqSSWElsRIRMBQo0sRmPJ+OqrutaIlRtNKkmo6qaNFILlMiCqPNrnOfXj2wp+wI3+F6bADlKQpzAIcdvPaw0EBCYFaZD3Pa22/74Vz70yCfuu/mtdzRzkzpoQUTMUDATCyEPiUwGsXUqPccDOCi9QbWgpZVgw+va3639oxWwcyQfLTsfJoppKk6ePsOkV157uJjhCBmQH1G4xJ2DKZ3HA9pgnrwsSckbhRBA8LZeVSMKqVaSX97bKJjlmQq5mB6SmWYjU3Crz8lerlBOGRMDZLU1EViNkzOra2eWR+dGzUSIKLBFkBhx6lDLEF5r1bvj1Ttm1ulVbDDc+cUGELFBxYSKplxcXI0ad16yzQaKpEibWbndOerFCy9KCLKLbr1Ge6786rO2paFar63G3OzcZgXggqxNB3ABlpqeOnvuuVPHr33jxYPZclQ3MBpILSYgBpGoikQViVFF0IiqSjREk0YsileKtW7qOjaiGtVEqRlX41E9GlciZsIBBZLKASWGZQsZ95CSXuzWhq208dlcFm4RG4OSSaHz++auuuWqxz7zyO49zy7ctCvMQqkhJRCbNWbgwOzibhso6K2JzQjLS/A7s5JO97aNeFD7Rw/m8H5gawVUmRnlqedPL2yd233RNiUVE296yG0FyRG2tW6k0Dl3eSVmC9hRf6R/QZrl1nIWsXHL+rCaM0lzMOydVN44oUQEH0MGZgrKoqQRUpksjqul9dHi6nhxXJ9ZXl9ZHfs4IMtONBdH4JOi2+J3/6TaiwCZ3hFuz7pnLkaBJMowTi2fWmUKCzu2gNRy6uBhwPnJV+fR0WFv/VP14sA+552mYNBoeZ1puGPb5ijgC7M2HcAFWEy8Pl45d+7M6vpMY1aNV0ouzDSqRjNTi6J108QoGk0MUVREBTAl1/OpoxeLY6MqtUkjsWmqyaSuG4nQCNJWKdIod+Yn6aBUiATgwVg/lkMPuWjjdENLAmznPxkphEu99jXXnHnqxOc+/YW79r11avewHqqxF0+NmdpIOHHps02njD+4R7JO1qCH8mTAOW9JTlRA7Wf5kxsAo4y7u/Eteerc6XN7dy9s2zFnqNWUUHThPrkKECg7vnRkmOCofK7w5jqv7wzaVtwUPrffnDAVzc7I+wxcr41So0TuuPIzEpjNSEmNTNhqs3EtK5PJ6dF4ZTxZHVUrq+Ols2vjtXVY0x2o3motf8uzt04RiNAdxpdYG3pzAUYRZHj2+PL0/Oz8jtnGJpbFjja0Z/SuF+ujTef5h43tJWg9tCObZkS8vra+dXZux45t1kYJm+sVXJsO4AIsg00mlTKaQKNm0mBS
T7hqtI4iqiLqMj0uxW4UoqARETUVi2qxVhGJkghDUklT1VEsNpDIBApIoaIz+zL6A6BvPDvT2vMBvXygi9f9ldSaOjNjgIgjaZgdvOFdb/i9X/qDez9y76u/6tXlReWEmmDMDA9WFWlaDKWYOUPluZiJLl7euAVt/pH+8F2h3jZhY6ngRYGuwSysj9Z2XnrpcDYAGBQDF9jMNV2AjIk5zSFm8g64ziUg5QHJhfa2ZiPM3x6bbBIp51KJ/mheZ/WDaAogeC3AAGYzbUjH2qzV9fL6eHF9vLjerE3q1dW1pXNr55ZWqthYizadv7v5zFDnhbpXtJvePbJxEci8LgIWljUbLY127t3GM2RsqkpdorbR1KfyUM/B9DOE/HCPwAUAZgpiAxEH1BQn1b6LLhkOBrxp/S/E2nQAF2ApbDKZKDCY2bIykSauBWEaUyXRxYddhI2ZwWhEnRbaiDaNNI3GRjSKxBhrbapoqqaomggNjMK006rsEfpbQ/liA3Ceveib+g0RHeWMnthETc2YQsN1uTO8/mvu+Mh//fhDdz989Vuv46mCyybC1OKgKNrm4Iy3tCH7S0D7rXndEOSmQcgZjOhtdGuQ20SGuu1nAjeVVJN6246tU7NDMylCwSbEbeWX3e4USfLe86O0nZSRKCKfspihHp+lCU7HmXobbhkX2uCJqC0O54oCEyixjihGaIU4kmZpfbyyXi1N6tVxszySlfW1xaVzo5WVphYx52m+OPw//8zlP6kFnqi1wl0QkGP39GL2nSy0WDy9NBlNLr5knw3MSIiSklH+NstmvTtzmVDQfUGXdyRSl+djaP8goOBQrdVxEm+96zbe1AG9QGvTAVyARWARGgwGCFNN1OXR8tzWmZKnXUnRlNhLctEakXEjTYOqliZqVTciolGkkRgFyt6lz4bSggIKMUpSEm5zDQAU3KmXJbP50rTuLg3PQmbUhnDJpVA7gx4mEg02tIUrd15x+zWP3f3wcDh32auP2Lw0A5JgjUVCGdpPc80ykBr1Q9YubM0KN+cdL2t/68MKaYu7B6i1dQYKYGZpDA1mZqeKMrVZExCS3U+je5mYwVkJwTq8J+03t9WRvOOkKm1xuIdstduAzPXsNpC8vEsOAxG8+guqUa/reGkyWRyvr4yqlUmzOmnWRs1oXZYXl1dHq01VJ7fEoNSOYF3mkw5Jbr3rAvQc+mcfYVkA1H1kYp22PcSsIApCZ4+fK8Jg2+7tqdyQexe6QD9/S4sO5Rfm3e0lAb1OvYQG+kapKZibcVOPqquvumqTA3qh1qYDuACLgPnZmcHUcG29Xp8ooQxcBCuIKCAwQWFNHSexWVkbjycyqbSqRAQiAgJiA0I0mAhHZiNVTSr1bB0LsW9aU+jZycd31t96JrVraMo88lRzzCJpBKiADEwGIBCBlUyC3fCm69aXVr/0mS9u2zKz+5q9xUzg6RokcB0h76piAkwNPg0ra+f3Y0rr2yw3pHlU7fmcUXTWv7VR7fvMoIzQ1IDR1PSACyYWzyPIpU/ZR6IRg0IrAtfFyP6T89FyPMt8giGItIOAel/b7Yd3WLcfl2c8+uxiVlWo8kRlNdbnJqNz6+srk2o0juOqGVXj0erK2uJ4sl6LENNAEI1AAnTTYNCWzsmoPU8dJtQF4/1jTBny7/mKxDwiqFnUk8dP79i5fWbrjGLi7WZkMHDu8DVLPsTSpNAWRuyfGesfly7bSL2A7E5U66oOxgcP7A+bOhAXaG06gAuwCChCAaPTi+dW1ic7d8/Xk3ERpgfDaYta1fWkrtfW1tdG49Fo0iiLsEQn2YMIPgzGldzM+SgZTielNCssi291KKwbLEujflMJ0ldqENiYCmxEb1MqbyAQgzVPEwaZRhBbU8bb33LLn5xd+9MP331LfdslNx6oB9YwmAwqqhoCRdPATGA1133r4xnJyJ4XTKcw27eBehvXblgGuDzCzEbHmCwQx0ahVJRBJBYBDGUmjzcDBaLECO32nDZ8
Q0vxTCFs222LdkN7QXF3rPOmUddUZqQAKVQJjUotUk3i0vpocby+WK2vNNV6FatJnIyqpZXVyWRNGos+2b0nrs3IbNc+rt792Tmu8/xqQqY6WCg9TeZk39R+MV6djJbXLrnqSJgLTaGJsw8DlBBarMeyu+lCixwyoD1A3QZuSOgs52GkqEb1oJzeuXMnNtcFWpsO4AIsgy1smd+6sOX46UUIYxKLYqjg1dXxeFKvjsaro/F4fVJHI01aONCcuqd4PU2NMRKjFKRl6DnbsBQmpvhMNpj3NFOekOBzjyIzRpvVnTcCLqlZyiulBnb7qEbECpGAsHXwune/4aP//aOf/+jniGXvrfuqEGjKjJMSJHOwxgrjwOxQiCtMpDZWOm8i70b0Kf30Z63/KBFpu5Xt7huZIBhDQRS4CETRQgCTAgFeKScfrtz1uHq+0kpkZOm9bPXTX543pIlsBN3gmjiBUHACrKopBaslglhUK8jqpF5er1bW1lcnk+UmThqpK5mMq9FotLY2mVRSNQpn3rdSpwDIekqcnYnNdYguzO/SKWQRzozapINKZJSqCmAYiSkXMjh18rRG27Vvtw5UgsKVQZJeFLdTB7IP6LCl7CoNXdbWc+XdNoNApAQjNq5GNVkYlOVm+H+h1qYDuACLgIWdC7u373z6+KnxanVq/Vw5PYhxPKon1aSq6thEUzFjYoRkdKy7kz0Ga5s7k45ZCzjnr0BnGtq14eket9PtSYq2NzBEWzp+i/z298NTBwNgShYHsdw39dp33PHR3/74PR+79xbBgZv2r4dxM7AiEEQ1WgCCq9kZqY+OcrpkNllEvW/KwSrhvJA31Xlb60x5exyuyg3MxgCMRMVMDcac4H923Dw5HMrl2c6KmSF3QCHH/SBY7u5K30kZe89bRUAAKZEQ2ExhIFaQKmsVm9H6ZGl9tDQar1Rx1DTrUapo1ViqtaZaHY8m46qO0QgICk2ZHDqvD2sHwZ8fYG8s8reJk3kM39VhOz+W25kBYiBiMBmcfubs3Pzc9NZhbbGSBmRMME1Rf98TJmSpd/20oGJLjbV+RSe5sO5iIkU9FrJik/9zAdemA7ggixh86SWHP/qle868cC7yok0p0dAIiB6fMRGbQVQZxlkhp29+LeuzdSaoC/u6L9rwA+c/1TZ2brgHe6Bx77VOULKNwbffymLGBG1YxWT+kq1vfM8bP/k7n/jCx++d3zq15crtZGJBFC4bCgmmVoOZvAvXgYhEVqVscttdavlImvfWs4O84yn4zRmNJUNHxDBjMkAn1aSJysHxayPA2PcHGaMhMwVBU5BNWVouyy7krgT2IoKBIF48dTYRsjl0tyQm5lQpmKpO6moc5czK2vLa+sqoGtVNbdqIVlHGk6Zar8ejST1uYlRNn4pku1tCUbLYL6GnsaF60X/YWi+e87k0Aa7N/mBkQhqMg03LCp97fnnP3n1T24txWPVpQ64fZYAhKkNA1NU3vO2ujRDaikd7VfWrI23bhKUjLxorMwlkmw7ggq3N4vsFWabQ177m9c0KrZxZpzCIgkasiSbGRsGMVAAlNnLsNXertv+lz0lOoecaNlhygEBsSXlm43/nvd3gqH4vnejJMwBk/ZZ/f3d3O2cTo0CFZuuhhbe8543zO2c/+sFPLT6yODsZBmME5hAU2ogoLEaNChWoQkUtmolBzMRMQAKIIf9LYiTkgTUpkl6S99UakfrwsrSnMJCjTgYzAWwyETWIqcB8XkLae6fWExLSlv8TVTHN2+KTANLSLFPtKULSSm4jaoOYCGKjsYE0JBOT5WpyYnn5iROnjp46+8K51TNr1ajSca1rVbM6qpeX1laWR2vjZhy1NoqAY2H91GLDGX/pC6p9UXb51j3ef03XLuwHEBYAGEJTrJ+q6zVduHirzlZaCHycvbpEHwAzEqM0psh6+WCXquUEwXKS0b8ccxLlEyIpINTjqNGaGDc9wIVamxnAhVkMvvbKV22Znls6uzK/dxdI
RFCwkzHEUwSzPLKrfztZe1+9KOA/H6LpHu0Qkg0BvJnlinGbwL/E23sI+IZIOxscj6RNjIKRKTdSNzM7Z9/y3rd87Pc//qcf+tM77fXbrt3WRKEyqlGEFBRAIBMCk/bKk6Y9tmLXpWxA6iFLSUHiJCksC/ZQlwwApG6LbUhEIYzXqqayYgoGqELJAFUDweEHtRyeJkzJ1Hp2rQdyE8HghQMQjLOIkeVTYmpNjFGgkVA1srxenV1eOb20sjJu6gaqENIIaZo4GU/W16t60jSVRoWBFMoEU+pmtKXD3vnhlzo//Z/pGmlBu7ZQ0D/Flg8nGbEEs1DEcPzYc6Eotl+yoy5roAkwWLunRtBCusBgw3d7pmE5TVJPLimr1qXrp+tFJB9yR1KjsGI4KM/fqc31Sq1NB3BhVkC4ePe+Sy4+fPb06b2yjQim0RDIm4Q0W/zOpiVgBEAfvEV6tM8732A2kbjnPaJgu9p7cgOe3FqMLrDrIUmWepFac+KbSt7cr+YcvzJUXJcXh9e/+7Wf+r3Pf/KP/vT2+tUXX7m7mq7jAMSmEplLgrmmTvp0dZ2djVuaYJWUpSTb5WiRg9yuV+8BM7cgtwEEpqmZQRjw6sr6uKoL1SkMYD63gAkgE+RiNzKTKlN9vEaQ8CbH3rNyhh+AYDADG4RgBlPTaKIWxaSxONa4uDo+tbhyZnFtZbU2HqiSQdWskaoaV+P1ajKuBKTiBto5pklVdKOl/zMCf2Rz3ntR+9L2AG7cmQ3+xEAKBA2Y0MlnX9iysGVqYaA0rqUhAyMQHBLUXI5o08LeNUZt21dC+ruUIB+xhLERvOXOoNWkJnA5mArF5iSAC7Y2IaALsgjA/Mz8W1735sVzKyALlIF+UjMY96koyPdsG+F1VtlaW9wBQS1MxN5xmnT287wPUEZk4WQWAihhJy2IkpFngHqfb7l3qvd1BgIzyPvRAAGZsdShHlvF26ff9J437j148X1/cs+5x8+Wk7mpem5KysJCHrLOHq76viu10bYP4co2P3NOydx+UCbaONHSYR9KM3uRUGpVEW2KQbG0srY+rpo6appGkw6FQsVUzQTmCqh5SxLfyg8XAa1yRN5WUzUxiGpUraNOmnocx+NmMmmacaVLa/r8qfXHjp16+vkzZ5eaSstaUGmcaD1aH60sjpaXRuNJoyii02qMkPUYPH/xKBpo29deCibJCWJ3VVnvDRm0y2Ta7ikzSjAjASFYE1aW1pZWT++7fAcPNZqmIW9mSiokyupfr+TzgqAwJROCEASkIAFFRiQ0jIY1simTwhQ+GEjBZlBThVlBoaAQq3qqKANvOoALtjYdwAVbDHv97a9ePzNePbFGDZUhEBmbAzOZf5iQ6v7KcVwPpOivbMN6OK91tgDpyRz35m9Bn0uY4lH07F16tN22NMQrjQw2UzCYQQWCiaooEVFZVBSbWX3TO1+3a9/Oj3/ok8/c+8zUapjSmUAlQVSS8SYzh9PVB4mRgbjzbTmY7LYfAAiWB85Yh1AhR6uqJhqp1Jn52brWGFU0Nk0dzYySYRKopLmPoiTaTn2EE0RT3UXNDFCQ+kweQ1RtTCuJtTRVrNebyagZj5rJJFZrdXV6ZfTEc6cfffr4C6dWVtesaUgaq5tYVZO1pbXR2nhcNU1ELVaLePEB3pyr6PiV+Xy9FOiz4Wf3xJ+dJ3SflaAsg9NwyUwxsMGp508ZxZ0Ht0nhBSlxpwp3zPlycwBfjZTSCIH2GmMDqwcUhCRz1OpnGHIhCgw19bpKjM3c7DRvToO8cGsTArpgqwS9+sYbDsxsO/aFp29+802T0Kg1ZAjB4dnMgDc638jnfBvozHp/tTDIBtDn/I9ouT5d2dBfb6DcqJVem9MNAKkLKf9pBjgzhondJITAYgohJbOgZBanp9/03rf86Qc/8cVP3Lty6szVd95SXDSHcllJCCALEcoczIgN7EPHyIgcm89iRrmDqMt5cgdSWxRwx5ACYQaITW1+65ZxPanr
SBbG48nUzACCwJJzGU39bF7WNfigRlXLHVjWHh4zn1rJamqoozYao5oaaVSJigZ2ZqU6fmblxKmV0XgM5oKHBayJ44k2Vax95mcUNXEeDbnSjkNQLYryEmd1AxD3Eu6hrRq0Z6Y9JIaNT/hRMjCIYwhS8iQce/TY/MLU/NZBY9HUiNny1eUgThuRAL1rI22DkbeGU6pbgciS3Id5SzYZGdTzCApM4Mmkrpt6157ds/NzL97dzfXKrE0HcCHXtoUt7/nqd/7UL73/xteCpmEMUvEJwKnJCMBLmG7gvMDQNv7a2a2XsCTp3Z3xt57xz5x+YGNA2en/9L7X2lZdIqhap9tgAKkJERsCjQe1DPTVX3VbYH3iC8cmy3rtHVctXDoczyCaiBIFNpiIEAU3NmlbiPOxQFdkPK8NDL1pMq36kT+vRgGzW2ZfePZ0jBbrenp2WDWqhEFOfs3MZzJ21tK0Bf6NkoMwqDe9RVOBiIipKImIGjRKFONxFc+urh09sXhmeb2qNFA5CAXU1mM9qceVNFFExCCtD/NGuAT2bCRNbhAZaqskLz7dG894/mDb8Lp8mvOfyZqTC+ZRw6Mz9bnnzl15016eMeEa3rZs5t3LKcHK1ZpcacpnP32T5T7B/P3BO83ymYMZ5wjDrGA2mDS6dX7LS16lm+uVWZsO4EIuBr33G97z/n/3s8888dzem/cMQ2kUPelOJqgXeqf14mz5vNsn2e0X5Q1o0aMNL865woZovx9OZjZOhqGt3YockDtb3dF4kKmmVlGKMI4mYBFtZmanbn77q0saPHnvU/d9dPXq1et3XL632LJWh0YjG0sIgYDIpinm12AEg/bYqS1nPHstF75uDae1T7ExWaitmp4erK6Nz51b2r5zanpGEGujoIqCudXrz2XfhDUl1qebftP89VAjMY1iZpFMo0UX7o6GlfXJqcXlZ0+cXl6vG9HAA4LGuhGL43rSVE3imZq14JwfJXehmneMurOeMf3zTuSfYf37579NhixfMtb9pNRpAlZDtFgiHnvyeKGD3fMHZwcLTVgUjQZzpAeAQ2YEStUKpa5Ona+1Fq/rWrKV0J41EuTCEmXUMMZIoJ07d/JLljc21yuyNh3AhVwMXHfNtdcduvGej3zuwFX7p7baSEUojYRvB1u1so7pbRtv6w0rsSfbJ6hnL/ES6UGn6+KGyPrf04Z41L27jSVzcpJ7Ya01LkoENijnxk9jgENlOpgvb3vHnTOD4f2fefjTf/ipS5+75Oo7r5zfPbOOptJoHA1w1Uw1ZU46CE4JzYfivL3eqCZkOQb1wrJEFMWO3QuyvH70yROHLj28NlqfniJlM7CqM4FSk6tj722e0VL+TaGpIuPwt5lBVX04mxFX0c6srD13+tzJxeX19WgKJg5MqnFNq1qbRs2i9s2cZQWjVoGps/75JTkD7J3MlzrdG9eL2sQyp7Z39h15Yj/TQkJVdfroyRBnV8/FledXw85iJhQa1NiE1BIY5jNyfBNDzrMsdfKZ5nI5EWkfu1TP5dytefdd2lKejCdMdsn+/UXYpIFesLXpAC7kYvCwnP7n//RHv+pbvu6JTz124xuvG07pOirrbIGl6ucGq94LEc+LDan1Aen59jl7Ufif3EprzXvk8fTuPqicuffZZBmMsjxxphhacI2gZKmTHIAyBYUpi5mgsGvedtPufbs/90efe/yBp08eP3ndG27efcVODLnWiZK45J0RCGwgM3YaYtjgg6ivwZD6mFscyzdWlYLVPNmycwagE8+ebapLo46JhqLQIgQ2CuoNwyYKQOFkoFyDN1LkOndCQdTrAKJRoY3R2rg6vTJ65rnFxdW1SRSiIsDYoFabxVqbRiMZBypgguwwkz/r12hSpO82mjacupd09f9fziCdastGuj0o+QyTqjFZwcxSrJxeXz9Xb5/eeezhk4898RTN0fzu6fkt08O5qfmtc9OzU8OZYSgKLpUDUQATG7vGBYiUCGaMYGDX2WYF
pEhtBhpqFRNol70RARw0TNbGbLZz5wKdf2FurldubTqAC77sjW+64823vf4jd39w/xUHFg7NMDcC8zJsH6TvDEGHFmw0A6mE2ALAG4JjtLag/eLuuRZkboFna4u9+eNalIiS6+iABkp6NYnzYW3vAYjZYKpQRgBCUYUopjtu3HvXzrse+uiDTzx69FMf/NNDJy658rarp3dMj7kRRLOamKAAM0iVjRRB/bs5fy+nWjlIW4Z7K+pDrgiktdWmgadnzz2/Ml6LM1tCFaVRUSm4CByJieFbyKxI8a63iaXmApAZBKoKdehfoCoT0+X1yfHT554/fmZtbGLMBUtsQmCYVfUEwYyIrWAzNhXLkznbbMYPq5+1FmbLJ7vLCl7iksFGD5CbPM4rGfhLKRczHKUhg4KYjEQVkOLUs2esqn/qp/+1lfbkU0898ewTJ5ZOPn/i2bOPnTlVnY4xej96URIPOJTMAw6DEMpQDsowDMOpQSjDoBxQMDGtVieTURUCLeyZ33rRbLF1Og5YGGoaDFkpD4NQro/GzGFh6za8aKM31yu2Nh3ABV6BSFR+4Vd+4ZJDRz76B59693e+fTgsqiACVUPBLfjTWv/2rX0L38I37Z/IiHjbGGada8ivMTN4yN4LR93ZdFBy+0RuhTWPUr2T1hLB3AgEhauHAZpeZwRidn4rGoswCFPDOrV3+qavu+OiK/bf9+n7nrz3mcXR2lW3XTu/e0c5qAIrIpxJxMSiGpjVKE+VQZoQZgSzkCb4Zul6Qq6sMhE3avPTw4WZLY899KQ0bzFwhJEYm7FE3wkyEIw4GIFcKMLBH3dqxmIa1TSqiohEFRtHOTUeHX3+5NK5NbgTH00AACmwSURBVDWzEMyMtGFTiSRmVEA1sZg0oecuemPUOwddrpQx9Pa0IdckUvXFzrsOetlZ+17rPd5GCY6K5fPXgoFBC7bheBlPP3L8LW+89bKrd6+Oli/Zf8Nd8YZaRaKKNqvLaytLy4tLKyvr68uryyurayujldX18fpktLq2Nj43WRuPztSrdVPXVT2pa2ni1GBw+aEjb3n961ab5Y/+4SfmDu04dPNhmquijYkIEBgZQ0mXRism5baFXbxp/S/c2nQAF26lmI8UfObZ5duvueOzj33qyU9/6crXXyoFS8rflSjAOuNNG6xAa3n9Lu969Lvsv7UMnTxka827GoAlkAPUmpIcOPZRZH+lAkq5vEfGbm1SJi/p492saRL6NCR1NmMQAgI3hLDAB2+77OJ9ex74zAMPPPToZ07efeXNV++7Ync5PyA2ZZHcP2oKzhqfRqYp9mcgbST365JJUUKVjRBixI5dOx6/72GpUTdqGtn7X9k7h4m8BUw9MFYXE1VTVW1EiIOo1o3EKFHEVJsmnl5afeqFM+tVbUIAMxkjmpp3IivUhLIv9ePGKUUxo6yV0J6b7BISjt7L4c6/Xl78YFun6bn/9gJJV4dlvpb7ZvIWQaNyHJ564Fw8x1/1trfFuKbNpGpqiVKEgkyHIQwXtu7Zsd3AoODe3AiqJqYiFiXGGOuqGk/GdVXFqE1TX7zn4h3btgbC3Ja5I7uO/MMP/Muq0StvOzg9N5QQI0AKlw9cH42msO2pR547dGDftm0zGzLRzfVKrU0HcOFWum9psjJ+8oGHfuB7/rcf/8nTX7r73kPXHpy6aFBr5CKQqqgQOCAgSWdmc5/KAN0EX4/1e1gC9WPJ7n3IkHaC88VpL23HK/exY2qdAdhFlNGLw30+lRH5NEpTQBnEHDS/1J0NUQCMGcRsqjCg8EJxcdGhg4cPXHXHa45/6IN/8Pjd9588unDw2st2HdgZpgcoVSVyEaCqwcwJKOwlB5cR8mZWiqo99weFGkMJxKFR3nNg7+Ofe+Cxh5685U1XWlQKpAGqKo2ScTAGALZENzISFwoCi2JSj0RUjdSsAa2N6xMnzpw6fU4lmDKl0gQZwCCfk4PUcpub1NJ57gfu2di1rH/a8PsGEMd6b7SX
fAnQ5gDtIUibRT3H75mEghAsFHFQrcjRh5695rLrrrvqusl4wiAT81FFxCSmxFpTVAgAUhCxRQMQQAXTkImGbDNTxDNMgZmYuK4a4Wp9XNdx8sbXvvF9jz3+y3/y65PVxZted+NwIVAZDRLAk3E9GdWHdu2B2Cf/8CP7D+y+8rprZ7bM/y/cPpvry7E2HcAFXAkjefi++7fPzIUtcz/41//6D/zo//Gx3/3km77lzVNzg4k2akIcCFARyqB/ryaYm2LzT+o93OL62RAArtzTPZ70jJXSvFx/JEM+BPQZli0WZT6sJBksa79fCcYcDKZpSpkSOc/SI2MmIigVxhAulAk0qdcm6+Nzx5aOP/JsXK6mVvnc0tLiC/ctXLLzwBUH9l6+Zzg9FJhwBCQpNasRkakahIsgohQISTE77wUxYBojEYxl654tYDx8/9O3vvmGOq6oUjBQAaaCrQiWvKEiRhFVFZWqljrGKJ4JmCqiyOJofOrs4upKFYUKTr2unDF76ersbZGkrbQnC59ncqZHepdCrzDQK+SkU7FhbZDubqG5DS/unGG3HarGIA4MiIGCDI8/8mxzdv1r3vu2mUEpk4qo8JpPJyBhBkEAO4PHgJ52vzNEBQZTEtRRiLyxQVCWBNao9Xd/27cXZfGLv/ufR8srN77++p0H5ymQNWF1VNcr8fLbD+/duy1OyqWl1U9/6lOXHDx45LJLuSw2U4FXbG06gAu4yGBnz5ytJuMCYMW+Xfv/8jd860/8yr999AuPXXvntcZWaVTSkMaVt41XeLEV6QTEWieQOXf55d6an/IGv9FdTIEsEEDGQNJ46cmGZbGQrLPsnD8Gez3RbQXnad/e/+mIkAMeasoloCg5QEIgFBJCxOri6MSJxReeev70seOTlfGWcv7G62541TdcX0/qzz/wxfueeuSLzz107tjqJZcf3Hbx9mI6xHJE1KgZcuBtphoVYI1GUFDSgjMznzlcIBBLxOrM3Mxg1/SjDx8bryhvYyWTOkKYGU6HJOUIrTXWdSOidayjmYrFKKYQU1EdTSYnTy2O1xq1ELgwCHE31TijbhnV70XsyAF7j8ll2HgakT6ipbBiA5//vNe1fi77ij6I1H1tfrt76sBkaqoWmAoajs/R0QeOX7Z/312vvSXBX2hDjJxUUmoAzBeeZb+QKwnoBtSQu3ywmTruJFaVYfi/ve/bt81t+cD/+MAnfv+TN7/2xkOHD09NDUanj2Eihw/unZ7BWGmAKaX47LNHV1eWr7jmmtm5uY1ubnO9XGvTAVzIRaCVpRVWZWOwIurrX3Pnb/7R7z1yzxMHrzowfXE5cYl8H8DtyzIm008F+lw/tyOJgd8nS/qd3GUNPbQgq6C14WWL/HTfkIoETI7mUyJL5i0z1XZDQmAn7xOHgJJBzERCFGVtafz8Ey8sPX9u8fgyVdi+devbbn3L13zVO6+47EoWmAk4fMt7/9JjTx39z7/+a5/7wmePP/zcjn07dx/etfvwji27ZrlktSimjQqYVY0Dw4zYtW1y8mNw/U41KJo4qPcc2n3soWOnTq5unx5oOSk4qIpprZoOqf8RozRe6hUxZQduqiaePrW4NhqpsFkRKLgWktvFvs1Ox67LsqiLznM9pTv5L7oYuj/y+aQNL2jPMVH7fNYptSTxY+lre9vgp9gpVQgC4qEOjj50eny6/pp3v3371rlqMgrEsOTFqd0C69UkEgZo+UG/iHpEtPZaIDaAAaOgGoPhfe997y133vxTv/BTn/vDz586cOaWW284++hiWZdXXXZEmgpmUCO2qXK4srp6/333Xn3VtQs7trsCHzbXy7n6V8rmekWXW4YHvvjwmaeeLSkoq0Irot/5yO9/4Jfff/mbLrv+LZeuYz1qAWW2JJUM61llAO2DvT97xqgFhfP8qw2lws4bJEw/Iwob7Q55KTGDCcikwlSYJUOqapKRIXARlAK4NLZIQXl9HE8+d+r00eNnnz3TjKrZ6flDey+5
7ZY7rrj08isvvWJ+yyyiwbRp6iKEGKUoC0KopX7+5ImPfeKjv/8Hf3h25Uy5dWrbvh0HL9u/46Idw63TUsSaG4EoKWBRhZ0mlCRpxJSE1cjUbBB4+Zlzn/mte979Pe9607teXYe1kkUV0mgTY2xUVEUtKkRNIEZg5iZqI7K8Mj57bkka05gVcojYzCy2EqHJ8G88cP3YvH8su5PzEgT41pnYRoO/4ZOoM8td417u6d7gRjxiJwQAokKlBKJgQ17ij/7HBw/M7/+XP/pD27dNx6byyV/UFfzbC2WDfaDz/u4uofOXIVNbjWHBmNdp/Mef/vC/+3fvHw5nltZ0x8K2//wzH5gfzDQxcgQKNhMlYuamaa685uqL9u2HKnhTsPJlXJsZwAVbKdRmCLSgwgA1IuD2G2/77Q/vf+aRk1fdfvFgq0U0qiVblvLthXbAhmpAztKzbUD7VMrW3ZRYCpE7Q26UxFo2eIMNm2pIAjlG6E1gcftiSmAiBnNQIwEaqarqxIlz5547d/rYmdVzo1LDZXsPvfn133DNZdccOnTZ/JZ5ChgOB7FppKmlakIITAS1sggqItYMirB/z8Xf8a3f/k1f/01f/MIXP/bJT3zqi5/99CN3F/ODXft27Tm8Z/v+bdNbpmkYRCUEJW8ISNNduEEkEGlRGJFg2/wOmho89aWjb/ra28ViRAOlGMn1oMUQASEgBEgQadYmk8XllXOLSxKJqQxUUjAz6Y42Uc6bekjNRjR+g32nFz1sOM+un39yk6XvI3sbVnbuKTujNmBPjxgbt317VMBIQDzE8KmHj1Yrq1/1nrfu3LlQT1Zf7Kn6H3/eQxvaT9Lf/SiSuh8+RNQM0Ng001ODd731a++48faf/MAHPvapu6+75s752fk4HpuCQvAGDGIzEWJ+5EsPF0W5Y/eul3Cjm+vLtzYzgAu5DHjmyacfvfehqcG0kSqkMRXwf/ngr/7yH/6XG77qyFW3Xb6mVaNEESxIcXgCdvqpQG6xz+yTXmgIuGFps3VrjX7eiJ6NIcrGPYVwyRWkCBFERj5pPRAXxgCzsRhP1nVtPS6fWj7z7Mnlk2dHSyssdtH2nTddd8Orrrz22suv2bl9x3Q5VQQy02iRSopR3KwCxCARCUxGMDUuYLBoRqFgQcEDET5z9uQXHnzo/23v3GIsu47zXFVr77PPpe/dMyI5F3J4E6WhJFK0SVOyLSdwEAQJEARKgCAwAgMBkofAeTD8kBe/JYjzmrznLUCCGAiQy0NgA4kB2nJIiheJoShRQ4rk3DnT09OXc9l7rao81Fpr79PdQw4hUjPWqW8G3X1u++zLOVWr/lVV689ffun1H751MB1jRdXK6MTpzdWT6+sn14seUd9BwUIgGIhAMEBwxM5BUdXlS//zL0N/8gf/9nd9NQvk2QMIMYMIBQ6NBM8yq8P+eLp9e/fmrW0OUpV9IqenhiEwBydY6grASJyXZYb2ZLe9HTryfVLKjzNkmEWVQ9/EVFXXlfpU4cm+QgBi9VuSmKJGo87cEVPs/EYYikAQ+lSFj/v/+z/92RPLZ//oD/9oebkMwWPq2vapEcAdHVF3ZJKfKLE7KBIhSB08OUeubIL/2cXLG5ubD64t+aZBIt150vbeIlKQZnw9+9xzw9EyWLvoLwyLAO4lCLC+uuo5BGSVYJ0IIb3wtWf+65/+8fUPtp97fnnUW556ZueFmT1zENY1rdJ6kVG61zZmWbqNNim3kxEt3Up/JdGIIWd/aohBUVFuBQ1M+j8BAWLwgQM3dTOehYPdyf7tya0bO3u3Dia3JmHKVVmc3jr9zFd+4/yTX336qa9sbm5UDsvSIWs5VAjAGFe+lAJIc+d15xxpylA0tyjoSOetMUjjA5/c2vqbv/Xbv/2dv35wMH777R+/+uZrr7z5+qU3PrrAFwC4NyoHa4Ph2rA/qkYrw+XlAQMzQwjMDZa1C5Mw3h9vX91feahqnKQKNmLhAK6u693x/rXrt/Z292dN
IOoNqwELIgeWAACE5FSNYdGM1Gjk8uxoMvm50Ta0A/P2iif5J90TXbEc9wTuxHLpvbpD7ORvUFJbOcitK0B7MAF6RAQMKOyoGPjqnTeuTq/P/v7v/O3N9Wo684SEyMLQUbOiqARJd/yEYbjkTxu2d6WfuhQ1BmZAKVwh6Jq6HlTVlx9+BJwE74lIQOsnUCBWfUBgRJzV/t133/36176B6Gwy4AvCHMA9Zri80qsqz1yQoKBDApRHTz3yzCPn/+K1l//4yp9srK8Phivl0JX9qlcUrixcSa4kVzhySM4hiCsIkQpXCDMSAgIhMosDUl08BNbMjhAYEJgZECUwoSN0iChemEE8M0vdNL72fubrWWjqenIwbWazycFkOp5NDyazWe2bpmk8sLiiGA0GJ0+eePyBs+d//cnHHjl37uwjq2urVTHAOPgLCMDcAKVuRIIimm/UKiFxBiEtLEtIap5dQGFBQhYpeijsRRpCWl3qfevFZ7794rNNaG7evPnu+z996513Ll659OHFD2+8d3M31ADQNI0P2tknAEi/V/HU1TT58RsfPbv5FFdhrz6Y1mFWcz2rJ+PpdH86mU28FxDsl5UjB6FxTjyAc47jai3qYbXdXdBQSdp2eno4OhubXHA02Wons1/oWNM2cSgSHUs+QUnSix3VYv+16BWw9esEBALsAAMACCITEolDDrUDKqFHTX9vmy+8/rPnv/orv/7Ci+yDIxEGSNM5qYn/oV08fKPdd4E81khHE482dTFBQUCHKjAi+6pwHAJzQ+wEczu+NF+us8jAEGDQK29v3/z4+scnH3rgrr5LxmfHHMA9puwV65sbN27eiGnygCJQlYN/+g//2fnXf+WDDy+ND/bHO7W/We81ByE0gYMIBxFB34QGCJi1pQEjADM756ggCSkjU0BEgg/A4IroNjSOEC+iK2NpnwPPUSli/c66wpW9XlW6st/vr1Xrq1sro1PD9Y31jdX19dXVk1tbG5vrq6vL1aBXlk5z8VEkgJdmAk7LxXSWGGNr/dw5utNTLhuMpEvlRyXWJbMAAAa1niIQGBik4cDOFZtbaye/9MK3fu3XmGUyme4fjMfjg8lksr+3T+R8CFVVjZaXitJVxfK//w//7k//x5+cefY0r9Xb451b++N6FsK05gacdyKOSDTZPc+5YFzlRJctjAP6TvFdK9fP5eHm6tuuNJcmiyF16cjb7LiB1Fg1zs/Ez0mq5NAgI5YToLarQ46d1pAhrtOl54rV2xMhAKNw6Yc/fvMCTfG7f/e7g/4gzGYMpPV9eX8Td6UMd+oNoD0Lca6pfQJm0w7IWrlNRSe6SJ1GEVKhBAIyBC6wuHL50ubJLVdax9AvBHMA957Hn3zsyvcuF9gnNTkEEOTx0489ee5pdEIYmLUXmQQOQYRD8IEb3/jQzJp6PJ1N6/rmre3d/f29/f0b2zf29vZ2bt3aPziYzSYhcAieRQaDwcrKyubG5mhpVPX6hOjQVWWv6pVE1CtKVxRLw+GgPxgNRv3BYDQYDgb9sihc4QpXiDASEhEiEmGsEgIRYUARDogYOJC2RaBYW9yxcZ3GFHl6Id2RtYeczSQdwxmH1OnPqBYEICTNPWUfdHvDQTkarCFuCIBD7TtGmFabFIB//A+++/K/+t7PfnzxzDcf8DOZhhBIHLqyJAQn7EFYSyNYxBGhrnQIIOBjf7nYUC3tXBqlS/sTIOVmtlMo0OkKka1sekaeuoUjT83zMtk6prfWNwxp0kcAG/WgqNmcCIIBJQB4QGGBqlftfjT96Rsf/o0Xnn/x+Webep8AHVKnHZ109uoz0hG5MMuMem1j1kEa4Mv8AaabKoVpWgIBIjlmKJAO9vbG4/Hy6sqdohDj58EcwL1n7eTG2vLq9GAqojExEGJgbqYHQJ4cYxo+g6DKuoVzFRFAH0ckgkJE54gBiUgAONV7URqRIunssQTPAFi4In//MI5zRQSEAwBA6oasBpmFEYOmgIN4iKXAufGn
tgNWTd8xpz6geV4asrmK4/rU+nJeV77Tt7ttLZpt4ZxSrW9DMeNE3ywAQACvXeGi5ILgqHzqyfNnNs/+v7/40aPPnKlchcVE2EMBUgsUIoE4MGEa+kuaI0ndqeeMbwpvWht/aMfbXZX2riMNXDsH1H2o83dMtRUmbaos2U/Gwg4hlqBXHkRn67XJnwACkiAEhKII5dvf+9FGb/Wf/KPfKcR7IL3SWcPvBivS3a/DHK8NtSIQdu9s56ZjpJfL1dsuRSLdpyNx8I6cc8QsLDKdzpZXj9sR4+fGHMC9BgEAv/bsM//3pZcCkgoNDgi4QadfUWTpmLEoJ0dtRU0UIaKgQxEfPYQ+ORtazdlkkco5FkZpQNq6Kd0N6UwdIwkCqbhNaezepiTGSVBMM8txhKd2KRUJZAmg1TOSqpGHvodnRDM5n71Thzp/2lqz0Wlol6xWTICXqNGolsTCRdV/4bnn/+P/+S+zndlgMBzK/tjXcZcFIMpwmuIJLIJO2ye3dRZZ5IlqXdaykobRivzxJGQ3J9Kx/njkj7nPRNeKSjSRyCSdGCm32RMhh8gcAFK9tl5MBCR0UpAUJa/+7AdXr793+fd/95+fPf3QeLznXJq9zu+Szqm0l+awm74LWs/RLSLTh1K8J9ANmjAtjBMn1INzJCAcYhxlSUBfHFZkcV+wtrp25uGHa54GQIA0SGIBQQlIQsCE4pARmUiQmFCQGJEJGSGAprKTjsxBEASBEUX/qyVyCCweQYSDMAMLas9jEQkCrP8BGTAgsKCuzMKIgsAQ/8fAIBWOQuzDE32I5KQXTEYwOa22FcJhs3/MOBMhjcMP3Y3RIwjmvwkJYiKTHrruU7S+DAIocQY38F/79ndoCt//szddXa6Xy6PeUoE9JIcIFA8NJa4EjNr/EiieUEEAEqE86xs9Jkoa8IueIuzI95ia68w5jxREQIq10ok9dC5aYUiS8h+3JRq0qUAFuoCjXikkcA7IMRUN4QxhQjuXDt586e3vfPP5v/d3/tZ4Osa2oUMnKJFWmusGIMd9YI8Bj7nVkf3i5TkUPsTTJCiMEPPaKC2Jh8IgVLrllZVuqaPxOWIRwP0BwpfPf3Vvd/fW9rYrq6CNhQkBGRg4RvqQbIwGAvmrqyVJnThaf0pKCtfbInNj9jTozzsQXxnl65S00w7l4rNE8tPTYD7fkzqywdwIPb/DoTvmbkraeM5El5TW+qlf/Vw7276Vxj3SmuV4cEEefuTcN89/4+X/9ertunnhN58bOedgOuUJQ3CIAUC0EyhzbG6DIFrwjCmcAgEUSlV0ksOZrk9ozxEnMQagNbA5csrTqIdVGIkCltrIuD3qvKX6Rw1CGFDAaU0FOgcNOsQKkIKrD8K1y9vvvPLRkyee+Je//3vQjBGZyHFgihd67vLi3Jm8O+bjhXxvzHySuVhgzgvGcUPME9APNrGOfAAKmjSzx89+uSzLzxaEGHeNFYLdJ4gwhFnz/e+/fHtnp8A+AQsRgmgbuCjUSmoX1prFfE+ru2LbB7q11rEmM04Qtq/OT2j/TqYp2x2Zfwocte93YTY6m5/Tf7rbyM/sqhNdif2Y7gnzL+reSIZfRXwSAATHlfvgo/f/8N/86/cvvAtnhmfPnzp97szqyeVq2OPAgYUlAIoAM4pIiMuN5dyfpM64OGmZ05fSeY5PglxMJ0AorS9rfXLribtHnw9VAOLsR6pZyEmfAoAoiEAoyMiCzkGcjQcPTmi8N9u+eGv74s0bH93GuvzVb/zGH/zev9gcAE8mjC5WFAilLNZWxbpDZ4dPIA1EUmbSoVr0bqCn5ybXt0XRKZ03AQ3liAoKLFNozpw9d+7co0TYNiU0PlfMAdw3iIBIPavfeO21ne2dqqiYQ0zqT98cgNaap29dO7w6bEK6t+PT0jC08/jxNh2TZzm6m3nDGPe684rjD+zYQXybCdqx2u3hSNql/MDhjjjZbsr8ZufuwlY6UAvjAJHATf3s9be//5//+3/7
4U/fmtzeB5DiS8snzpw4c/bU+tbKyuZIoA4le/aBhIl1ARzWTWtTNV3apD2KNPndPebUTRM6rivVYAsm24nxWOPIX++JgZC6fJ0GihIJIwICETgXtB7bcQMFhKn3H1/dvnTh8s7FW2HCy8O1sydPf+vF33r6qa89duqR0jXIHoRFiw41JjxSwHvkFHbOd5y07dS5HXNxZf5XfIHkqfX8UtQwh4EIASjNIDFIE5q1jbVT5x7b2NpwYNb/C8QcwP2EiAiw5w/evfD+e+9R4QBYO0FDHD1GH5D+tTO3mTa3QjepktGRp6pNOmS+5x++824ebyHu9EE6uq25t4o6VatBASRvdEiS6O5+K+0ct932gVYDQgQRJNE0VVcUjmrEg2nzwQeXX3ntldd++MpbF96ajfcBAgxp66GNM19+8OwTp6rVfgMcwDMKo6ZHiUoWzHOztYLRScWDPqxfRdsnKb0KkiCU11HWlyFA7DGBcTyNGAQRWVjAiRCRA+RADoQbt/vx/s6l29feuzQ+mPRc/yvnnnrmqeeffurrD516aGk4AuDCAQYfpNbJDQAU1oZ2MBflfcrYHztX/lDMOPda1BkUxDYywhT6iSBSOgmAREB6WAiBEbCqyo2NrQfPPNQfjKAoyOGnfBaNnw9zAPcVcVDvG75x9dq7P3m7brxDB7HzcC72P6SKdH/dIRrQR4686lgRZi5y/zRPcMyfdySO7I8ZtB99o6MRAECuFYbuweTfnSwgSO+i+gLl2WcUIIoqjecA4ErqQU9E2Pu96e233/3Jn7/6l6/+4IcXL13z3Ljl/nCjGq0tn33iwfUvrY1W+oJCVdlgCDBm1sb3qd4L40BXAKK0ohGA6uCaiZtb90SFHEVAXTwixXRTRgQQZiJEYQdIIATaJqQMNc2mcuv67SsfXb119ePpzngog7NfOvvtX/3Nx889ee6RR1eXByWVBIzCQYIjaLx3+cx1ZSc9syKH/WmaIG4jlPn5p2MuYreHEAKAtsoQ5DRMofTqeLmRUZhZAMBRf9DfWN/cOnFiaXnFlVpngpai8gvAHMD9h14Rxp2bNy9c+Mn+7l6eZc3fsnQzTU4eNd0dfST9PBKzt1/iuTvvENsfG+xnteauD+6INHWHcSRAZ8gfTWd7+84KRBSyNYkVGUHncHVzuZQsWkICAKYAhMJBil4h6IAKoWJy0Nze3b967do77//k/Ysf3di5fvn6h9PZuOyXqxuro63h5qmlla3lquoBOQYQgiAsIl5YYsansMRkIgZmAgZOddDq6QWjlA8gKKJpSCIsgALMgMLCMhM/DePdyfbVm9tXb+9cPyi4tzZYefTUo8+c/+b5p77+8INnlkd9CVyWBYGwNAhBKKRGTzrpgKwpXJJ7+rcnsGvQj5F+JGlVrWvtBD0AsZVctPS6EckeGfUoU8MMzXoikqrqr66tLa+srp/YqvoVgBbtxSU/sXv5jS8McwD3KSIMARjClYtXLn304f7+vogU5FKxJKQ+vxj18fTCjqw8v8Gjd3am4NJrj3DngOLI5ju/2qM4ZMQhK1fHqkiftv27NwlRfdeXkWQX0E55ShKGABAFmFgbF4s4CEKuLKkERGFsGt80vD85eP0Hb7311o9efvW1i1cuM+wCMBZUVsXSynA4GgyXh4Ol/mBp2Ov3ykFVlEVvWBWlc4Oi6AFXjKUE5BBLqJk9S2Bi5Ia59rs7e9PbB/WsmU1m3nMzberaz2ZBamjGstxf2to6cfrEg88+/ezj55549JHHql6BUKAEEg6hAcchBEJSTS1eN0QWjiOEnAAmHUsOrR/X54tkmR/b65WuQVRk2gSt+auYihAENHFBWIRFAjM5Knu90XC0vLyysrq6tLTUH1RIDgkA43KTd/9pMD4vzAHcr+jQTIIE4BB2trevX7t269a2Lt4NiBStPuIRu9gabZzfHnYMabLBaVjcNfVHRJlj7O4hE55CAWkfTgPBz3TYhyTmY7zE8YlAh4WJfFQSiwNA
YyVGiKmayehJalCURuvaSg8kDnAZiZwAOSx9gyiunobLlz6+evXSB5c/uHTl/QvvX7h56/rOzk7dTBtpG0SjQyBdm9KxC1jxysnh1qmN5ZXlACIhOoAwDc3ESxPqST3sDZZHS2vLGw89cOrE5oOrK+trS5sFVGfPnO33q9HSsHCIIITIHLzUzB4IEDXXVKeHgYBiG4446xFLBmMs0I35uhesvVzYeg9oLwVqswYd8AtmLyHpX7z8cWFIJOf6VX84Gg1HSytrq/3BsNfvaaU6EoHECsO2MPizflKMzwNzAPc/cSjFEnwTxvt7N29u397ZmU7GwQfhoE3fEJKUrN/c45VaVaoT87pRRynKqdvzrzy6Y8f/hjSevIOxPnxs2Qp03QmkjBM47EikNeB3MhoSVQokljh7oqp0nvSMU65q83WOIAjqtCi0wRUgMManAAgKgSNEcmVVViICLA03IuibZu/g4GA83t3d3d/f29vfn81mO7s79ay+fuPm1WvXuJQHTp984IGTD558sF9VCNgv+4PBYNgfINLK8opzRb8aFK5w6ApXFOQIyaELjRcJzEEgsHgR8eKJSMfYROh945zjwLo0s3YLTR5ej5nzwB2S0Z7/BKjfT5+YpNR3PACmyxAX3MmtQgSRiFzhyl41GAyGw+FotFQNBqPRgIi0edTcp6k7WjGrf68xB/BXgfwNZRWJAVG48bPpdDqZTCbjejZr6qauZ7Pp1DcNi3AIzLnyC1LsLt15vU5e5d0pMofG5UcVhKyrZPFZZE7zObI9abcAkCSI9F7JKmGnT0Xe/ieeKgEQRBFwKmgIAkogEIgSSRK1AQGACVC03Wh8ROvlUlFx3AUEirI9MbCOgRGICIWB0HHspaxN8xwiEpFmuJRlKYIcL5/EWu3AEiV/SNmisYm2FqCx1i+7JOkwo2gDUBDRhlECAuRQV5PLVWXUGeFjbA+RPHIU47sNO9I1yDO06fSmdYb1HQWJiLAoiqrfr6p+fzDoVVV/NCx7VdWvXFEgISJhahF1t4V8xr3DHMBfZQQgfaFBIiGEEEJTN977yWQSQpjNZl5b+DeN9z54HzgEHwCEWcWR9nufE0yTwKSWOA8Msxrclf3nnUfH1LRikhz71EjH7mPrTOZcR1dwPqpvtVtqz0us/koKD+RCpe7mAHJuFTK0UyIcW1oIEAtg+zVRJQQBUyv7zpnQbBddK1I6Xyxd9IYJBMnlI1Xbmqr2NOLBVOellV/aYFWzS1HXoYmlfMIxdMFcuU0IIpQWds9BlD4DhKF78tugTQAgOh3tBygSJ2MdOVf0emVR9gaDQb8/GA2HVVX1B/2iLImIHAEAIqn3OCYl2bjvMQfwy0hq9Ja+jgiYogdWLcl77wOHpm6auvbeN3Xtg/d14733vvHecwghBBFm5iTxYjaX+jbZELd9JjDd3xn6YSviJPtz6EMnc3apuybJJ1mUzvu375gqoxlzANHZNLR+JG05+wVJ3T0BUhYnaHZOfELa1UM7n3xkKrnAztlAhtxzT4fo+frkDJ18GDlEU+udFmpROx7tcj5uYuaYg6qZNVrj3YnrYsUCahJOUm4AmLl7ZhDJOdfrlWXZK3u94WhU9nrDpaWyLHtVz7nSFS5WKjjq9HE1W//LgDmAxUZkzixrSMEAICEEEfGNZ+a6qTWqYA45kshOIoTAProKEW29LzkiySJDsjidcAAg6VMx2sizgsm8dFIUsWuy52pSOyQDq5YQonyj/mG+E3NnZ9JsqCS9vC0/a3ceclub+HpO76cOJ07EQB6mR7lFIGW1AyBwbDGk8vy80+p4SFKVT9uDC0BHx8e0dzFsSkk3KZ+psxu6m3rwIgLiiqJXlmWvqqpeFHCqftWvemVJzvV6PdDQJu/w3Fk1fgkxB2B8Iockg/kHNOWdmYWZQ2Bm770w++A5BN/4EHzwPoSg3oI5sBLSH9rxvUNOJ0nv0xm6A6Rx+dwO5VKm+QkF0qbZWafJpjvbbTWtulqm
E21CmfySxGR5yVn7SefQG535lUMnJdUDJyuv0wBy5JsmebIkHx6kPRUA5E4gFScPIB+lSkEoRIgEhFT2ekRUFGWhU8hErijKsuz1+4RUDfq9qlf2SgAsyiKa+Kzb2Lh+UTEHYHyRSGeqt52rSL9UctaQgTnNXbMAZD8R4g/OAQe3TxdmlsBBWMMPTpqVlmBpPALQFl5JW6qbhtII2uenAJVutLM0xb1vYxiW7hwEHIk94iahDVg6JyFuCDsPpRAnTrcgAAAR6QQyAQCBLsBGziEhEhZFWZRFURbOubJXOVf0+72yLHu9HiJSURCpKN9J3lFovuXo0T00FhVzAMYvmkO2p3sz/921V8mJtBOX8VfqkZeUq6Q/cQAADswizAwgHFgjFf3JIeh8ZxNCHbyXAMzNeAIAwTORAyRyRAjBewBiEV14J+6RJLmncxxd2USVdyLSEIBIc4N0ehjJOeecI+cKl9EBOyIQuTiPrJ4hLuqmG8/pqblQK1b55o5PrczWnjgz9cYdMQdg/FLSmXXuzBQIZLPZijSg43tdAA1RhLMw3yYn6c92emJOuml1svYGHHn4OA65vjm/F+dD0k6aITc+f8wBGAvHUWMb/cThWYQYXBzuQfRJpviTHr4b+92NgczeG1805gAMwzAWFGvAZBiGsaCYAzAMw1hQzAEYhmEsKOYADMMwFhRzAIZhGAuKOQDDMIwFxRyAYRjGgmIOwDAMY0ExB2AYhrGgmAMwDMNYUMwBGIZhLCjmAAzDMBYUcwCGYRgLijkAwzCMBcUcgGEYxoJiDsAwDGNBMQdgGIaxoJgDMAzDWFDMARiGYSwo5gAMwzAWFHMAhmEYC4o5AMMwjAXFHIBhGMaCYg7AMAxjQTEHYBiGsaCYAzAMw1hQzAEYhmEsKOYADMMwFhRzAIZhGAuKOQDDMIwFxRyAYRjGgmIOwDAMY0ExB2AYhrGgmAMwDMNYUMwBGIZhLCjmAAzDMBYUcwCGYRgLijkAwzCMBcUcgGEYxoJiDsAwDGNBMQdgGIaxoJgDMAzDWFDMARiGYSwo5gAMwzAWFGKQe70PhmEYxj2A8F7vgWEYhvGLRwD+P9+1NmicLKQOAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<PIL.Image.Image image mode=RGB size=512x512>"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prompt = \"A green pokemon on white background\"\n",
    "image = pipe(prompt=prompt).images[0]\n",
    "image"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: PixArt-alpha-ToCa/notebooks/train.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c423d2a1-475e-482e-b759-f16456fd6707",
   "metadata": {},
   "source": [
    "# Install"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0440d6a7-78b9-49e9-98a2-9a5ed75e1a2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "!git clone https://github.com/kopyl/PixArt-alpha.git"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0abadf51-a7e3-4091-bb02-0bdd8d28fb73",
   "metadata": {},
   "outputs": [],
   "source": [
    "%cd PixArt-alpha"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4df1af24-f439-485d-a946-966dbf16c49b",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!pip install torch==2.0.0+cu117 torchvision==0.15.1+cu117 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu117\n",
    "!pip install -r requirements.txt\n",
    "!pip install wandb"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d44474fd-0b92-48fc-b4cf-142b59d3917c",
   "metadata": {},
   "source": [
    "## Download model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06b1c1c9-f8b1-4719-8564-2383eac9ff28",
   "metadata": {},
   "outputs": [],
   "source": [
    "!python tools/download.py --model_names \"PixArt-XL-2-512x512.pth\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f298a89c-d2a5-4da7-8304-c1390da0ba58",
   "metadata": {},
   "source": [
    "## Make dataset out of Hugging Face dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e17b8883-0a5c-4fa3-a7d0-e8ee95e42027",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from tqdm.notebook import tqdm\n",
    "from datasets import load_dataset\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92957b2c-6765-48ee-9296-d6739066d74d",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = load_dataset(\"lambdalabs/pokemon-blip-captions\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0095cdda-c31a-48ee-a115-076a5fc393c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "root_dir = \"/workspace/pixart-pokemon\"\n",
    "images_dir = \"images\"\n",
    "captions_dir = \"captions\"\n",
    "\n",
    "images_dir_absolute = os.path.join(root_dir, images_dir)\n",
    "captions_dir_absolute = os.path.join(root_dir, captions_dir)\n",
    "\n",
    "if not os.path.exists(root_dir):\n",
    "    os.makedirs(os.path.join(root_dir, images_dir))\n",
    "\n",
    "if not os.path.exists(os.path.join(root_dir, images_dir)):\n",
    "    os.makedirs(os.path.join(root_dir, images_dir))\n",
    "if not os.path.exists(os.path.join(root_dir, captions_dir)):\n",
    "    os.makedirs(os.path.join(root_dir, captions_dir))\n",
    "\n",
    "image_format = \"png\"\n",
    "json_name = \"partition/data_info.json\"\n",
    "if not os.path.exists(os.path.join(root_dir, \"partition\")):\n",
    "    os.makedirs(os.path.join(root_dir, \"partition\"))\n",
    "\n",
    "absolute_json_name = os.path.join(root_dir, json_name)\n",
    "data_info = []\n",
    "\n",
    "order = 0\n",
    "for item in tqdm(dataset[\"train\"]): \n",
    "    image = item[\"image\"]\n",
    "    image.save(f\"{images_dir_absolute}/{order}.{image_format}\")\n",
    "    with open(f\"{captions_dir_absolute}/{order}.txt\", \"w\") as text_file:\n",
    "        text_file.write(item[\"text\"])\n",
    "    \n",
    "    width, height = 512, 512\n",
    "    ratio = 1\n",
    "    data_info.append({\n",
    "        \"height\": height,\n",
    "        \"width\": width,\n",
    "        \"ratio\": ratio,\n",
    "        \"path\": f\"images/{order}.{image_format}\",\n",
    "        \"prompt\": item[\"text\"],\n",
    "    })\n",
    "        \n",
    "    order += 1\n",
    "\n",
    "with open(absolute_json_name, \"w\") as json_file:\n",
    "    json.dump(data_info, json_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "25be1c03",
   "metadata": {},
   "source": [
    "## Extract features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f07a4f5-1873-48bf-86d0-9304942de5d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "!python /workspace/PixArt-alpha/tools/extract_features.py \\\n",
    "    --img_size 512 \\\n",
    "    --json_path \"/workspace/pixart-pokemon/partition/data_info.json\" \\\n",
    "    --t5_save_root \"/workspace/pixart-pokemon/caption_feature_wmask\" \\\n",
    "    --vae_save_root \"/workspace/pixart-pokemon/img_vae_features\" \\\n",
    "    --pretrained_models_dir \"/workspace/PixArt-alpha/output/pretrained_models\" \\\n",
    "    --dataset_root \"/workspace/pixart-pokemon\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9fc653d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "!wandb login REPLACE_THIS_WITH_YOUR_AUTH_TOKEN_OF_WANDB"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2cf1fd1a",
   "metadata": {},
   "source": [
    "## Train model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea0e9dab-17bc-45ed-9c81-b670bbb8de47",
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m torch.distributed.launch \\\n",
    "    train_scripts/train.py \\\n",
    "    /workspace/PixArt-alpha/notebooks/PixArt_xl2_img512_internal_for_pokemon_sample_training.py \\\n",
    "    --work-dir output/trained_model \\\n",
    "    --report_to=\"wandb\" \\\n",
    "    --loss_report_name=\"train_loss\""
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: PixArt-alpha-ToCa/requirements.txt
================================================
torch==2.1.1
torchaudio==2.1.1
torchvision==0.16.1
mmcv==1.7.0
git+https://github.com/huggingface/diffusers
timm==0.6.12
accelerate
tensorboard
tensorboardX
transformers
sentencepiece~=0.1.99
ftfy
beautifulsoup4
protobuf==3.20.2
gradio==4.1.1
yapf==0.40.1
opencv-python
bs4
einops
xformers
optimum
peft==0.6.2

================================================
FILE: PixArt-alpha-ToCa/scripts/infer_pixart_8_bits.py
================================================
# pip install -U accelerate transformers bitsandbytes
# pip install -U git+https://github.com/huggingface/diffusers

from transformers import T5EncoderModel
from diffusers import PixArtAlphaPipeline
import torch
import gc


def flush():
    """Run Python garbage collection, then empty PyTorch's CUDA cache.

    The collection runs first so that objects dropped via ``del`` are
    reclaimed before the CUDA cache is emptied.
    """
    gc.collect()
    torch.cuda.empty_cache()

def bytes_to_giga_bytes(bytes):
    """Convert a byte count into units of 1024**3 bytes and return the quotient."""
    bytes_per_unit = 1024 ** 3
    return bytes / bytes_per_unit

# Stage 1: load only the T5 text encoder, quantized to 8 bits.
# Loading in 8 bits needs `bitsandbytes`.
text_encoder = T5EncoderModel.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    subfolder="text_encoder",
    load_in_8bit=True,
    device_map="auto",

)

# Build a pipeline around the 8-bit text encoder; `transformer=None` is passed
# so no transformer is supplied to this first pipeline (it is only used for
# prompt encoding below).
pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    text_encoder=text_encoder,
    transformer=None,
    device_map="auto"
)

# Encode the prompt once, without tracking gradients; the four returned values
# are fed to the second pipeline further down.
with torch.no_grad():
    prompt = "cute cat"
    prompt_embeds, prompt_attention_mask, negative_embeds, negative_prompt_attention_mask = pipe.encode_prompt(prompt)

# Drop the text encoder and the first pipeline, then reclaim their memory
# before the next pipeline is loaded.
del text_encoder
del pipe
flush()

# Stage 2: reload the pipeline without a text encoder, in float16, on the GPU.
pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    text_encoder=None,
    torch_dtype=torch.float16,
).to("cuda")

# Run the pipeline from the precomputed embeddings. `output_type="latent"` is
# requested, and the result is decoded manually with the VAE below.
latents = pipe(
    negative_prompt=None,
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_embeds,
    prompt_attention_mask=prompt_attention_mask,
    negative_prompt_attention_mask=negative_prompt_attention_mask,
    num_images_per_prompt=1,
    output_type="latent",
).images

# The transformer is no longer needed once latents exist; free it before decoding.
del pipe.transformer
flush()

# Stage 3: decode the latents with the VAE (after dividing by the VAE's
# configured scaling factor) and convert the result to PIL images.
with torch.no_grad():
    image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]
image = pipe.image_processor.postprocess(image, output_type="pil")

# Save the first image to the current working directory.
image[0].save("cat.png")

# Report the peak CUDA memory allocated over the whole run.
print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")

================================================
FILE: PixArt-alpha-ToCa/scripts/inference.py
================================================
import os
import sys
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import warnings
warnings.filterwarnings("ignore")  # ignore warning
import re
import argparse
from datetime import datetime
from tqdm import tqdm
import torch
from torchvision.utils import save_image
from diffusers.models import AutoencoderKL

from diffusion.model.utils import prepare_prompt_ar
from diffusion import IDDPM, DPMS, SASolverSampler
from tools.download import find_model
from diffusion.model.nets import PixArtMS_XL_2, PixArt_XL_2
from diffusion.model.t5 import T5Embedder
#from diffusion.data.datasets import get_chunks, ASPECT_RATIO_512_TEST, ASPECT_RATIO_1024_TEST
from diffusion.data.datasets import get_chunks, ASPECT_RATIO_256_TEST, ASPECT_RATIO_512_TEST, ASPECT_RATIO_1024_TEST


def get_args():
    """Parse the command-line options of the inference script.

    Options are declared in a table of ``(flag, add_argument keyword dict)``
    pairs and registered in that order, so ``--help`` lists them in the same
    sequence as the table.

    Returns:
        argparse.Namespace: the values parsed from ``sys.argv``.
    """
    option_table = (
        ('--image_size', dict(type=int, default=256)),
        # The four path defaults below are placeholders: change to your own path.
        ('--t5_path', dict(type=str, default='../autodl-tmp/pretrained_models/t5_ckpts')),
        ('--tokenizer_path', dict(type=str, default='../autodl-tmp/pretrained_models/sd-vae-ft-ema')),
        ('--txt_file', dict(type=str, default='asset/samples.txt')),
        ('--model_path', dict(type=str, default='../autodl-tmp/pretrained_models/PixArt-XL-2-1024x1024.pth')),
        ('--bs', dict(type=int, default=1)),
        ('--cfg_scale', dict(type=float, default=4.5)),
        ('--sampling_algo', dict(type=str, default='dpm-solver',
                                 choices=['iddpm', 'dpm-solver', 'sa-solver'])),
        ('--seed', dict(type=int, default=0)),
        ('--dataset', dict(type=str, default='custom')),
        ('--step', dict(type=int, default=-1)),
        ('--save_name', dict(type=str, default='test_sample')),
        # Token-cache options.
        ("--fresh_ratio", dict(type=float, default=0.30)),
        ("--cache_type", dict(type=str, default='attention',
                              choices=['random', 'attention', 'similarity', 'norm', 'compress'])),
        ("--ratio_scheduler", dict(type=str, default='ToCa',
                                   choices=['linear', 'cosine', 'exp', 'constant',
                                            'linear-mode', 'layerwise', 'ToCa'])),
        ("--force_fresh", dict(
            type=str, default='global', choices=['global', 'local'],
            help="Force fresh strategy. global: fresh all tokens. local: fresh tokens acheiving fresh step threshold.")),
        ("--fresh_threshold", dict(type=int, default=3)),
        ("--soft_fresh_weight", dict(
            type=float, default=0.25,
            help="soft weight for updating the stale tokens by adding extra scores.")),
    )

    cli = argparse.ArgumentParser()
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()


def set_env(seed=0):
    """Seed torch, disable gradient tracking, and draw 30 throwaway noise tensors.

    Args:
        seed: value passed to ``torch.manual_seed``.

    Reads the module-level ``args.image_size`` for the spatial size of the
    throwaway tensors.
    """
    torch.manual_seed(seed)
    torch.set_grad_enabled(False)

    side = args.image_size
    throwaway_shape = (1, 4, side, side)
    # The results are discarded; presumably these draws advance the RNG state
    # to match a reference run — TODO confirm.
    for _draw in range(30):
        torch.randn(throwaway_shape)


@torch.inference_mode()
def visualize(items, bs, sample_steps, cfg_scale):
    """Sample an image for every prompt in ``items`` and save it under ``save_root``.

    Relies on module-level state prepared by the ``__main__`` section: ``args``,
    ``model``, ``vae``, ``t5``, ``device``, ``base_ratios``, ``latent_size`` and
    ``save_root``.

    Args:
        items: Prompt strings to render.
        bs: Number of prompts handed to the sampler per batch.
        sample_steps: Step count used by the 'iddpm' and 'dpm-solver' samplers.
        cfg_scale: Classifier-free guidance scale.
    """
    # The ToCa cache options are identical for every sampler, so collect them once.
    toca_kwargs = dict(
        cache_type=args.cache_type,
        fresh_ratio=args.fresh_ratio,
        fresh_threshold=args.fresh_threshold,
        force_fresh=args.force_fresh,
        ratio_scheduler=args.ratio_scheduler,
        soft_fresh_weight=args.soft_fresh_weight,
    )

    for chunk in tqdm(list(get_chunks(items, bs)), unit='batch'):

        if bs == 1:
            prompt_clean, _, hw, ar, _ = prepare_prompt_ar(chunk[0], base_ratios, device=device, show=False)  # ar for aspect ratio
            prompts = [prompt_clean.strip()]
            # Only the 1024 setting keeps the per-prompt size returned by prepare_prompt_ar.
            per_prompt_size = args.image_size == 1024
        else:
            prompts = [
                prepare_prompt_ar(prompt, base_ratios, device=device, show=False)[0].strip()
                for prompt in chunk
            ]
            per_prompt_size = False
        n = len(prompts)

        if per_prompt_size:
            latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8)
        else:
            # Size the conditioning tensors by the real batch length: the last
            # chunk may hold fewer than ``bs`` prompts.
            hw = torch.tensor([[args.image_size, args.image_size]], dtype=torch.float, device=device).repeat(n, 1)
            ar = torch.tensor([[1.]], device=device).repeat(n, 1)
            latent_size_h, latent_size_w = latent_size, latent_size

        null_y = model.y_embedder.y_embedding[None].repeat(n, 1, 1)[:, None]

        with torch.no_grad():
            caption_embs, emb_masks = t5.get_text_embeddings(prompts)
            caption_embs = caption_embs.float()[:, None]
            print('finish embedding')

            if args.sampling_algo == 'iddpm':
                # Create sampling noise (duplicated for the conditional / null halves):
                z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device).repeat(2, 1, 1, 1)
                model_kwargs = dict(y=torch.cat([caption_embs, null_y]),
                                    cfg_scale=cfg_scale, data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks,
                                    **toca_kwargs)
                diffusion = IDDPM(str(sample_steps))
                # Sample images:
                samples = diffusion.p_sample_loop(
                    model.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True,
                    device=device
                )
                samples, _ = samples.chunk(2, dim=0)  # Remove null class samples
            elif args.sampling_algo == 'dpm-solver':
                # Create sampling noise:
                z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device)
                model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks,
                                    **toca_kwargs)
                dpm_solver = DPMS(model.forward_with_dpmsolver,
                                  condition=caption_embs,
                                  uncondition=null_y,
                                  cfg_scale=cfg_scale,
                                  model_kwargs=model_kwargs)
                samples = dpm_solver.sample(
                    z,
                    steps=sample_steps,
                    order=2,
                    skip_type="time_uniform",
                    method="multistep",
                    model_kwargs=model_kwargs,
                )
            elif args.sampling_algo == 'sa-solver':
                model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks,
                                    **toca_kwargs)
                sa_solver = SASolverSampler(model.forward_with_dpmsolver, device=device)
                # NOTE(review): S is fixed at 25 here, so ``sample_steps`` is not used on this path.
                samples = sa_solver.sample(
                    S=25,
                    batch_size=n,
                    shape=(4, latent_size_h, latent_size_w),
                    eta=1,
                    conditioning=caption_embs,
                    unconditional_conditioning=null_y,
                    unconditional_guidance_scale=cfg_scale,
                    model_kwargs=model_kwargs,
                )[0]
        samples = vae.decode(samples / 0.18215).sample
        torch.cuda.empty_cache()
        # Save images:
        os.umask(0o000)  # file permission: 666; dir permission: 777
        for i, sample in enumerate(samples):
            # File name is the prompt truncated to 100 characters.
            save_path = os.path.join(save_root, f"{prompts[i][:100]}.jpg")
            print("Saving path: ", save_path)
            save_image(sample, save_path, nrow=1, normalize=True, value_range=(-1, 1))


if __name__ == '__main__':
    # Script entry point: parse options, build the model / VAE / T5 encoder as
    # module-level globals (visualize() reads them), then render every prompt.
    args = get_args()
    # Setup PyTorch:
    seed = args.seed
    set_env(seed)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    assert args.sampling_algo in ['iddpm', 'dpm-solver', 'sa-solver']

    # only support fixed latent size currently
    latent_size = args.image_size // 8
    lewei_scale = {256: 1, 512: 1, 1024: 2}     # trick for positional embedding interpolation
    sample_steps_dict = {'iddpm': 100, 'dpm-solver': 20, 'sa-solver': 25}
    # --step == -1 means "use the sampler's default step count".
    sample_steps = args.step if args.step != -1 else sample_steps_dict[args.sampling_algo]
    weight_dtype = torch.float16
    print(f"Inference with {weight_dtype}")

    # model setting
    if args.image_size in [256, 512]:
        model = PixArt_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device)
    else:
        model = PixArtMS_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device)

    print(f"Generating sample from ckpt: {args.model_path}")
    state_dict = find_model(args.model_path)
    # The checkpoint's positional embedding is dropped; strict=False tolerates the missing key.
    del state_dict['state_dict']['pos_embed']
    missing, unexpected = model.load_state_dict(state_dict['state_dict'], strict=False)
    print('Missing keys: ', missing)
    print('Unexpected keys', unexpected)
    model.eval()
    model.to(weight_dtype)
    # Look the aspect-ratio table up by name rather than eval()-ing a string
    # assembled from a command-line value.
    base_ratios = globals()[f'ASPECT_RATIO_{args.image_size}_TEST']

    vae = AutoencoderKL.from_pretrained(args.tokenizer_path).to(device)
    t5 = T5Embedder(device="cuda", local_cache=True, cache_dir=args.t5_path, torch_dtype=torch.float)
    # Two directory levels above the checkpoint file. dirname() also copes with
    # paths that have fewer than two components, where the former
    # os.path.join(*parts[:-2]) raised TypeError on an empty argument list.
    work_dir = os.path.dirname(os.path.dirname(args.model_path))

    # data setting
    with open(args.txt_file, 'r') as f:
        items = [item.strip() for item in f.readlines()]

    # img save setting
    try:
        epoch_name = re.search(r'.*epoch_(\d+).*.pth', args.model_path).group(1)
        step_name = re.search(r'.*step_(\d+).*.pth', args.model_path).group(1)
    except AttributeError:
        # re.search returned None: the checkpoint name carries no epoch/step tag.
        epoch_name = 'unknown'
        step_name = 'unknown'
    img_save_dir = os.path.join(work_dir, 'vis')
    os.umask(0o000)  # file permission: 666; dir permission: 777
    os.makedirs(img_save_dir, exist_ok=True)

    save_root = os.path.join(img_save_dir, f"{datetime.now().date()}_{args.dataset}_epoch{epoch_name}_step{step_name}_scale{args.cfg_scale}_step{sample_steps}_size{args.image_size}_bs{args.bs}_samp{args.sampling_algo}_seed{seed}")
    os.makedirs(save_root, exist_ok=True)
    visualize(items, args.bs, sample_steps, args.cfg_scale)

================================================
FILE: PixArt-alpha-ToCa/scripts/inference_ddp.py
================================================
import os
import sys
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import warnings
warnings.filterwarnings("ignore")  # ignore warning
import re
import argparse
from datetime import datetime
from tqdm import tqdm
import torch
from torchvision.utils import save_image
from diffusers.models import AutoencoderKL
import torch.distributed as dist
from torch.utils.data import DataLoader, DistributedSampler

from diffusion.model.utils import prepare_prompt_ar
from diffusion import IDDPM, DPMS, SASolverSampler
from tools.download import find_model
from diffusion.model.nets import PixArtMS_XL_2, PixArt_XL_2
from diffusion.model.t5 import T5Embedder
from diffusion.data.datasets import get_chunks, ASPECT_RATIO_256_TEST, ASPECT_RATIO_512_TEST, ASPECT_RATIO_1024_TEST


def get_args():
    """Parse the command-line options of the distributed ToCa inference script.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    samplers = ['iddpm', 'dpm-solver', 'sa-solver']
    cache_types = ['random', 'attention', 'similarity', 'norm', 'compress']
    ratio_schedulers = ['linear', 'cosine', 'exp', 'constant', 'linear-mode', 'layerwise', 'ToCa']

    # (flag, add_argument keyword arguments), registered in this order.
    option_table = [
        ('--image_size', dict(type=int, default=256)),
        # change to your t5 path
        ('--t5_path', dict(type=str, default='../autodl-tmp/pretrained_models/t5_ckpts')),
        # change to your tokenizer path
        ('--tokenizer_path', dict(type=str, default='../autodl-tmp/pretrained_models/sd-vae-ft-ema')),
        # change to your txt prompt file
        ('--txt_file', dict(type=str, default='asset/samples.txt')),
        ('--model_path', dict(type=str, default='../autodl-tmp/pretrained_models/PixArt-XL-2-1024x1024.pth')),
        ('--bs', dict(type=int, default=1)),
        ('--cfg_scale', dict(type=float, default=4.5)),
        ('--sampling_algo', dict(type=str, default='dpm-solver', choices=samplers)),
        ('--seed', dict(type=int, default=0)),
        ('--dataset', dict(type=str, default='custom')),
        ('--step', dict(type=int, default=-1)),
        ('--save_name', dict(type=str, default='test_sample')),
        ('--fresh_ratio', dict(type=float, default=0.30)),
        ('--cache_type', dict(type=str, default='attention', choices=cache_types)),
        ('--ratio_scheduler', dict(type=str, default='ToCa', choices=ratio_schedulers)),
        ('--force_fresh', dict(type=str, default='global', choices=['global', 'local'])),
        ('--fresh_threshold', dict(type=int, default=3)),
        ('--soft_fresh_weight', dict(type=float, default=0.25)),
    ]

    parser = argparse.ArgumentParser()
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)
    return parser.parse_args()


def setup_ddp():
    """Join the NCCL process group and select this process's CUDA device.

    Returns:
        The rank reported by ``dist.get_rank()``, which is also the CUDA
        device index that was selected.
    """
    dist.init_process_group(backend='nccl')
    # NOTE(review): the process-group rank doubles as the CUDA device index;
    # confirm this matches the launch setup when more than one node is used.
    rank = dist.get_rank()
    torch.cuda.set_device(rank)
    return rank


def cleanup_ddp():
    """Tear down the default distributed process group."""
    dist.destroy_process_group()


def set_env(seed=0, local_rank=None):
    """Seed this process, disable autograd and return its CUDA device.

    Args:
        seed: Base seed shared by all processes.
        local_rank: Rank of this process; added to ``seed`` and used as the
            CUDA device index. Must be an integer (the ``None`` default is not
            usable, since it is added to ``seed``).

    Returns:
        ``torch.device`` for ``cuda:<local_rank>``.
    """
    # Offsetting by the rank gives every process its own seed.
    rank_seed = seed + local_rank
    for seeder in (torch.manual_seed, torch.cuda.manual_seed):
        seeder(rank_seed)
    torch.set_grad_enabled(False)
    return torch.device('cuda:{}'.format(local_rank))


@torch.inference_mode()
def visualize(items, bs, sample_steps, cfg_scale, device):
    """Sample this rank's share of ``items`` and save the images under ``save_root``.

    Prompts are split across processes with a ``DistributedSampler``. Relies on
    module-level state prepared by the ``__main__`` section: ``args``, ``model``
    (DDP-wrapped), ``vae``, ``t5``, ``base_ratios``, ``latent_size`` and
    ``save_root``.

    Args:
        items: Prompt strings to render.
        bs: Per-process batch size.
        sample_steps: Step count used by the 'iddpm' and 'dpm-solver' samplers.
        cfg_scale: Classifier-free guidance scale.
        device: Device on which noise and conditioning tensors are created.
    """
    sampler = DistributedSampler(items, shuffle=False, num_replicas=dist.get_world_size(), rank=dist.get_rank())
    data_loader = DataLoader(items, batch_size=bs, sampler=sampler, drop_last=False)

    # The ToCa cache options are identical for every sampler, so collect them once.
    toca_kwargs = dict(
        cache_type=args.cache_type,
        fresh_ratio=args.fresh_ratio,
        fresh_threshold=args.fresh_threshold,
        force_fresh=args.force_fresh,
        ratio_scheduler=args.ratio_scheduler,
        soft_fresh_weight=args.soft_fresh_weight,
    )

    # Only rank 0 shows a progress bar.
    pbar = tqdm(data_loader, unit='batch') if dist.get_rank() == 0 else data_loader
    for chunk in pbar:
        if bs == 1:
            prompt_clean, _, hw, ar, _ = prepare_prompt_ar(chunk[0], base_ratios, device=device, show=False)  # ar for aspect ratio
            prompts = [prompt_clean.strip()]
            # Only the 1024 setting keeps the per-prompt size returned by prepare_prompt_ar.
            per_prompt_size = args.image_size == 1024
        else:
            prompts = [
                prepare_prompt_ar(prompt, base_ratios, device=device, show=False)[0].strip()
                for prompt in chunk
            ]
            per_prompt_size = False
        n = len(prompts)

        if per_prompt_size:
            latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8)
        else:
            # Size the conditioning tensors by the real batch length: with
            # drop_last=False the final batch may hold fewer than ``bs`` prompts.
            hw = torch.tensor([[args.image_size, args.image_size]], dtype=torch.float, device=device).repeat(n, 1)
            ar = torch.tensor([[1.]], device=device).repeat(n, 1)
            latent_size_h, latent_size_w = latent_size, latent_size

        null_y = model.module.y_embedder.y_embedding[None].repeat(n, 1, 1)[:, None]

        with torch.no_grad():
            caption_embs, emb_masks = t5.get_text_embeddings(prompts)
            caption_embs = caption_embs.float()[:, None]

            if args.sampling_algo == 'iddpm':
                # This path has not been tested; there may be bugs.
                z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device).repeat(2, 1, 1, 1)
                model_kwargs = dict(y=torch.cat([caption_embs, null_y]),
                                    cfg_scale=cfg_scale, data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks,
                                    **toca_kwargs)
                diffusion = IDDPM(str(sample_steps))
                samples = diffusion.p_sample_loop(
                    model.module.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True,
                    device=device
                )
                samples, _ = samples.chunk(2, dim=0)  # Remove null class samples

            elif args.sampling_algo == 'dpm-solver':
                # Main strategy; this path has been tested and works.
                z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device)
                model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks,
                                    **toca_kwargs)
                dpm_solver = DPMS(model.module.forward_with_dpmsolver,
                                  condition=caption_embs,
                                  uncondition=null_y,
                                  cfg_scale=cfg_scale,
                                  model_kwargs=model_kwargs)
                samples = dpm_solver.sample(
                    z,
                    steps=sample_steps,
                    order=2,
                    skip_type="time_uniform",
                    method="multistep",
                    model_kwargs=model_kwargs,
                    rank=dist.get_rank()
                )
            # not supported now
            elif args.sampling_algo == 'sa-solver':
                model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks,
                                    **toca_kwargs)
                sa_solver = SASolverSampler(model.module.forward_with_dpmsolver, device=device)
                # NOTE(review): S is fixed at 25 here, so ``sample_steps`` is not used on this path.
                samples = sa_solver.sample(
                    S=25,
                    batch_size=n,
                    shape=(4, latent_size_h, latent_size_w),
                    eta=1,
                    conditioning=caption_embs,
                    unconditional_conditioning=null_y,
                    unconditional_guidance_scale=cfg_scale,
                    model_kwargs=model_kwargs,
                )[0]

        samples = vae.decode(samples / 0.18215).sample
        torch.cuda.empty_cache()

        # Every rank waits here before writing its own images.
        dist.barrier()
        os.umask(0o000)
        for i, sample in enumerate(samples):
            # File name is the prompt truncated to 100 characters.
            save_path = os.path.join(save_root, f"{prompts[i][:100]}.jpg")
            save_image(sample, save_path, nrow=1, normalize=True, value_range=(-1, 1))


if __name__ == '__main__':
    # Script entry point: join the process group, build the model / VAE / T5
    # encoder as module-level globals (visualize() reads them), render this
    # rank's prompts, then leave the process group.
    args = get_args()

    # Setup DDP
    local_rank = setup_ddp()

    # Setup environment
    device = set_env(args.seed, local_rank)

    # only support fixed latent size currently
    latent_size = args.image_size // 8
    lewei_scale = {256: 1, 512: 1, 1024: 2}
    sample_steps_dict = {'iddpm': 100, 'dpm-solver': 20, 'sa-solver': 25}
    # --step == -1 means "use the sampler's default step count".
    sample_steps = args.step if args.step != -1 else sample_steps_dict[args.sampling_algo]
    weight_dtype = torch.float16
    print(f"Inference with {weight_dtype}")

    # model setting
    if args.image_size in [256, 512]:
        model = PixArt_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device)
    else:
        model = PixArtMS_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device)

    print(f"Generating sample from ckpt: {args.model_path}")
    state_dict = find_model(args.model_path)
    # The checkpoint's positional embedding is dropped; strict=False tolerates the missing key.
    del state_dict['state_dict']['pos_embed']
    missing, unexpected = model.load_state_dict(state_dict['state_dict'], strict=False)
    print('Missing keys: ', missing)
    print('Unexpected keys', unexpected)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])
    model.module.eval()
    model.module.to(weight_dtype)
    # Look the aspect-ratio table up by name rather than eval()-ing a string
    # assembled from a command-line value.
    base_ratios = globals()[f'ASPECT_RATIO_{args.image_size}_TEST']

    vae = AutoencoderKL.from_pretrained(args.tokenizer_path).to(device)
    t5 = T5Embedder(device="cuda", local_cache=True, cache_dir=args.t5_path, torch_dtype=torch.float)
    # Two directory levels above the checkpoint file. dirname() also copes with
    # paths that have fewer than two components, where the former
    # os.path.join(*parts[:-2]) raised TypeError on an empty argument list.
    work_dir = os.path.dirname(os.path.dirname(args.model_path))

    with open(args.txt_file, 'r') as f:
        items = [item.strip() for item in f.readlines()]

    # Pull the epoch / step tags out of the checkpoint name; each falls back
    # to 'unknown' independently when its pattern is absent.
    epoch_match = re.search(r'.*epoch_(\d+).*.pth', args.model_path)
    epoch_name = epoch_match.group(1) if epoch_match else 'unknown'
    step_match = re.search(r'.*step_(\d+).*.pth', args.model_path)
    step_name = step_match.group(1) if step_match else 'unknown'
    img_save_dir = os.path.join(work_dir, 'vis')
    os.umask(0o000)
    os.makedirs(img_save_dir, exist_ok=True)

    save_root = os.path.join(img_save_dir, f"{datetime.now().date()}_{args.dataset}_epoch{epoch_name}_step{step_name}_scale{args.cfg_scale}_step{sample_steps}_size{args.image_size}_bs{args.bs}_samp{args.sampling_algo}_seed{args.seed}")
    os.makedirs(save_root, exist_ok=True)

    visualize(items, args.bs, sample_steps, args.cfg_scale, device)

    cleanup_ddp()


================================================
FILE: PixArt-alpha-ToCa/scripts/inference_lcm.py
================================================
import os
import sys
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import warnings
warnings.filterwarnings("ignore")  # ignore warning
import re
import argparse
from datetime import datetime
from tqdm import tqdm
import torch
from torchvision.utils import save_image
from diffusers.models import AutoencoderKL

from diffusion.model.utils import prepare_prompt_ar
from tools.download import find_model
from diffusion.model.nets import PixArtMS_XL_2, PixArt_XL_2
from diffusion.model.t5 import T5Embedder
from diffusion.data.datasets import get_chunks
from diffusion.lcm_scheduler import LCMScheduler
from diffusion.data.datasets import ASPECT_RATIO_512_TEST, ASPECT_RATIO_1024_TEST


def get_args():
    """Parse the command-line options of the LCM inference script.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    # Model / asset locations.
    add('--image_size', type=int, default=1024)
    add('--t5_path', type=str, default='output/pretrained_models/t5_ckpts')
    add('--tokenizer_path', type=str, default='output/pretrained_models/sd-vae-ft-ema')
    add('--txt_file', type=str, default='asset/samples.txt')
    add('--model_path', type=str, default='output/pretrained_models/PixArt-XL-2-1024x1024.pth')

    # Sampling options.
    add('--bs', type=int, default=1)
    add('--cfg_scale', type=float, default=4.5)
    add('--sample_steps', type=int, default=4)
    add('--seed', type=int, default=0)

    # Run bookkeeping.
    add('--dataset', type=str, default='custom')
    add('--step', type=int, default=-1)
    add('--save_name', type=str, default='test_sample')

    parsed = parser.parse_args()
    return parsed


def set_env(seed=0):
    """Seed PyTorch, disable autograd globally and discard 30 random draws.

    Reads ``args.image_size`` from the module-level ``args`` namespace, so the
    command line must already have been parsed when this is called.

    Args:
        seed: Value passed to ``torch.manual_seed``.
    """
    torch.manual_seed(seed)
    torch.set_grad_enabled(False)
    # The tensors below are thrown away; only the calls themselves matter.
    edge = args.image_size
    for _draw in range(30):
        torch.randn(1, 4, edge, edge)

@torch.inference_mode()
def visualize(items, bs, sample_steps, cfg_scale):
    """Run LCM multi-step sampling for every prompt and save the images.

    Relies on module-level state prepared by the ``__main__`` section: ``args``,
    ``scheduler``, ``model``, ``vae``, ``t5``, ``device``, ``base_ratios``,
    ``latent_size`` and ``save_root``.

    Args:
        items: Prompt strings to render.
        bs: Number of prompts handed to the model per batch.
        sample_steps: Number of scheduler timesteps to run.
        cfg_scale: Accepted for signature parity with the other inference
            scripts; not used by this sampling loop.
    """
    # 4. Prepare timesteps
    scheduler.set_timesteps(sample_steps, 50)
    timesteps = scheduler.timesteps

    for chunk in tqdm(list(get_chunks(items, bs)), unit='batch'):

        if bs == 1:
            prompt_clean, _, hw, ar, _ = prepare_prompt_ar(chunk[0], base_ratios, device=device, show=False)  # ar for aspect ratio
            prompts = [prompt_clean.strip()]
            # Only the 1024 setting keeps the per-prompt size returned by prepare_prompt_ar.
            per_prompt_size = args.image_size == 1024
        else:
            # Clean every prompt of the chunk (previously this branch referenced
            # an undefined ``prompt`` because the loop over the chunk was missing).
            prompts = [
                prepare_prompt_ar(prompt, base_ratios, device=device, show=False)[0].strip()
                for prompt in chunk
            ]
            per_prompt_size = False
        n = len(prompts)

        if per_prompt_size:
            latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8)
        else:
            # Size the conditioning tensors by the real batch length: the last
            # chunk may hold fewer than ``bs`` prompts.
            hw = torch.tensor([[args.image_size, args.image_size]], dtype=torch.float, device=device).repeat(n, 1)
            ar = torch.tensor([[1.]], device=device).repeat(n, 1)
            latent_size_h, latent_size_w = latent_size, latent_size

        with torch.no_grad():
            caption_embs, emb_masks = t5.get_text_embeddings(prompts)
            caption_embs = caption_embs.float()[:, None]
            print('finish embedding')

            # Create sampling noise:
            latents = torch.randn(n, 4, latent_size_h, latent_size_w, device=device)
            model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks)

            # 7. LCM MultiStep Sampling Loop:
            for i, t in tqdm(list(enumerate(timesteps))):
                # One timestep entry per latent in the batch.
                ts = torch.full((n,), t, device=device, dtype=torch.long)

                # model prediction (v-prediction, eps, x); keep the first 4 output channels
                model_pred = model(latents, ts, caption_embs, **model_kwargs)[:, :4]

                # compute the previous noisy sample x_t -> x_t-1
                latents, denoised = scheduler.step(model_pred, i, t, latents, return_dict=False)

        # Decode the denoised estimate from the final step.
        samples = vae.decode(denoised / 0.18215).sample
        torch.cuda.empty_cache()
        # Save images:
        os.umask(0o000)  # file permission: 666; dir permission: 777
        for i, sample in enumerate(samples):
            # File name is the prompt truncated to 100 characters.
            save_path = os.path.join(save_root, f"{prompts[i][:100]}.jpg")
            print("Saving path: ", save_path)
            save_image(sample, save_path, nrow=1, normalize=True, value_range=(-1, 1))


if __name__ == '__main__':
    # Script entry point: parse options, build the scheduler / model / VAE / T5
    # encoder as module-level globals (visualize() reads them), then render
    # every prompt.
    args = get_args()
    # Setup PyTorch:
    seed = args.seed
    set_env(seed)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # only support fixed latent size currently
    latent_size = args.image_size // 8
    lewei_scale = {512: 1, 1024: 2}     # trick for positional embedding interpolation
    sample_steps = args.sample_steps

    # Initialize Scheduler:
    scheduler = LCMScheduler(beta_start=0.0001, beta_end=0.02, beta_schedule="linear", prediction_type="epsilon")

    # model setting
    if args.image_size == 512:
        model = PixArt_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device)
    else:
        model = PixArtMS_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device)

    print(f"Generating sample from ckpt: {args.model_path}")
    state_dict = find_model(args.model_path)
    # The checkpoint's positional embedding is dropped; strict=False tolerates the missing key.
    del state_dict['state_dict']['pos_embed']
    missing, unexpected = model.load_state_dict(state_dict['state_dict'], strict=False)
    print('Missing keys: ', missing)
    print('Unexpected keys', unexpected)
    model.eval()
    # Look the aspect-ratio table up by name rather than eval()-ing a string
    # assembled from a command-line value.
    base_ratios = globals()[f'ASPECT_RATIO_{args.image_size}_TEST']

    vae = AutoencoderKL.from_pretrained(args.tokenizer_path).to(device)
    t5 = T5Embedder(device="cuda", local_cache=True, cache_dir=args.t5_path, torch_dtype=torch.float)
    # Two directory levels above the checkpoint file. dirname() also copes with
    # paths that have fewer than two components, where the former
    # os.path.join(*parts[:-2]) raised TypeError on an empty argument list.
    work_dir = os.path.dirname(os.path.dirname(args.model_path))

    # data setting
    with open(args.txt_file, 'r') as f:
        items = [item.strip() for item in f.readlines()]

    # img save setting
    try:
        epoch_name = re.search(r'.*epoch_(\d+).*.pth', args.model_path).group(1)
        step_name = re.search(r'.*step_(\d+).*.pth', args.model_path).group(1)
    except AttributeError:
        # re.search returned None: the checkpoint name carries no epoch/step tag.
        epoch_name = 'unknown'
        step_name = 'unknown'
    img_save_dir = os.path.join(work_dir, 'vis')
    os.umask(0o000)  # file permission: 666; dir permission: 777
    os.makedirs(img_save_dir, exist_ok=True)

    save_root = os.path.join(img_save_dir, f"{datetime.now().date()}_{args.dataset}_epoch{epoch_name}_step{step_name}_scale{args.cfg_scale}_step{sample_steps}_size{args.image_size}_bs{args.bs}_sampLCM_seed{seed}")
    os.makedirs(save_root, exist_ok=True)
    visualize(items, args.bs, sample_steps, args.cfg_scale)


================================================
FILE: PixArt-alpha-ToCa/scripts/interface.py
================================================
import argparse
import sys
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import os
import random
import torch
from torchvision.utils import save_image
from diffusion import IDDPM, DPMS, SASolverSampler
from diffusers.models import AutoencoderKL
from tools.download import find_model
from datetime import datetime
from typing import List, Union
import gradio as gr
import numpy as np
from gradio.components import Textbox, Image
from diffusion.model.utils import prepare_prompt_ar, resize_and_crop_tensor
from diffusion.model.nets import PixArtMS_XL_2, PixArt_XL_2
from diffusion.model.t5 import T5Embedder
from torchvision.utils import _log_api_usage_once, make_grid
from diffusion.data.datasets import ASPECT_RATIO_512_TEST, ASPECT_RATIO_1024_TEST
from asset.examples import examples


# Upper bound (inclusive) for randomly drawn seeds: the largest int32 value.
MAX_SEED = np.iinfo(np.int32).max


def get_args():
    """Parse the command-line options of the Gradio demo.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    # (flag, value type, default), registered in this order.
    option_specs = (
        ('--image_size', int, 1024),
        ('--model_path', str, 'output/pretrained_models/PixArt-XL-2-1024-MS.pth'),
        ('--t5_path', str, 'output/pretrained_models'),
        ('--tokenizer_path', str, 'output/pretrained_models/sd-vae-ft-ema'),
        ('--llm_model', str, 't5'),
        ('--port', int, 7788),
    )

    parser = argparse.ArgumentParser()
    for flag, value_type, default_value in option_specs:
        parser.add_argument(flag, default=default_value, type=value_type)

    return parser.parse_args()


@torch.no_grad()
def ndarr_image(tensor: Union[torch.Tensor, List[torch.Tensor]], **kwargs,) -> np.ndarray:
    """Arrange ``tensor`` into a grid with ``make_grid`` and return it as a uint8 array.

    Args:
        tensor: Image tensor, or list of image tensors, passed to ``make_grid``.
        **kwargs: Extra keyword arguments forwarded to ``make_grid``.

    Returns:
        The grid scaled by 255, rounded and clamped to [0, 255], with its first
        axis moved last, as a CPU ``uint8`` NumPy array.
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(save_image)
    grid = make_grid(tensor, **kwargs)
    # Add 0.5 after unnormalizing to [0, 255] to round to the nearest integer
    return grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()


def set_env(seed=0):
    """Seed PyTorch, disable autograd globally and discard 30 random draws.

    Reads ``args.image_size`` from the module-level ``args`` namespace, so the
    command line must already have been parsed when this is called.

    Args:
        seed: Value passed to ``torch.manual_seed``.
    """
    torch.manual_seed(seed)
    torch.set_grad_enabled(False)
    # The tensors below are thrown away; only the calls themselves matter.
    remaining = 30
    while remaining > 0:
        size = args.image_size
        torch.randn(1, 4, size, size)
        remaining -= 1


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    """Return ``seed`` unchanged, or a random seed when ``randomize_seed`` is set.

    Args:
        seed: Seed to keep when no randomization is requested.
        randomize_seed: When true, ignore ``seed`` and draw a value in
            ``[0, MAX_SEED]`` instead.

    Returns:
        The seed to use.
    """
    return random.randint(0, MAX_SEED) if randomize_seed else seed


@torch.inference_mode()
def generate_img(prompt, sampler, sample_steps, scale, seed=0, randomize_seed=False):
    """Generate one image for ``prompt`` with the selected sampler.

    Relies on the module-level ``args``, ``device``, ``model``, ``vae``,
    ``llm_embed_model`` and ``base_ratios`` created in the ``__main__`` block.

    Args:
        prompt: Raw prompt text; it is appended to a dated log file and parsed
            by ``prepare_prompt_ar``.
        sampler: One of ``'iddpm'``, ``'dpm-solver'`` or ``'sa-solver'``.
        sample_steps: Number of sampling steps handed to the sampler.
        scale: Guidance scale handed to the sampler.
        seed: Seed to use when ``randomize_seed`` is false.
        randomize_seed: When true, a random seed replaces ``seed``.

    Returns:
        A 4-tuple ``(image, prompt_show, model_info, seed)``: the decoded image
        as a uint8 array, the prompt text returned by ``prepare_prompt_ar`` for
        display, a description of the model/sampler, and the seed actually used.

    Raises:
        ValueError: If ``sampler`` is not one of the supported names.
    """
    seed = int(randomize_seed_fn(seed, randomize_seed))
    set_env(seed)

    # Keep a dated log of every prompt submitted to the demo.
    prompt_log_dir = 'output/demo/online_demo_prompts/'
    os.makedirs(prompt_log_dir, exist_ok=True)
    prompt_log_path = f'{prompt_log_dir}tested_prompts{datetime.now().date()}.txt'
    # Explicit UTF-8 so prompts with non-ASCII characters can be logged under any locale.
    with open(prompt_log_path, 'a', encoding='utf-8') as f:
        f.write(prompt + '\n')
    print(prompt)
    prompt_clean, prompt_show, hw, ar, custom_hw = prepare_prompt_ar(prompt, base_ratios, device=device)      # ar for aspect ratio
    prompts = [prompt_clean.strip()]

    caption_embs, emb_masks = llm_embed_model.get_text_embeddings(prompts)
    caption_embs = caption_embs[:, None]

    # Embedding used for the unconditional branch, repeated once per prompt.
    null_y = model.y_embedder.y_embedding[None].repeat(len(prompts), 1, 1)[:, None]

    # The latent grid is the pixel size divided by 8.
    latent_size_h, latent_size_w = int(hw[0, 0]//8), int(hw[0, 1]//8)
    n = len(prompts)
    # Sample images:
    if sampler == 'iddpm':
        # Create sampling noise (duplicated for the conditional/unconditional halves):
        z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device).repeat(2, 1, 1, 1)
        model_kwargs = dict(y=torch.cat([caption_embs, null_y]),
                            cfg_scale=scale, data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks)
        diffusion = IDDPM(str(sample_steps))
        samples = diffusion.p_sample_loop(
            model.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True,
            device=device
        )
        samples, _ = samples.chunk(2, dim=0)  # Remove null class samples
    elif sampler == 'dpm-solver':
        # Create sampling noise:
        z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device)
        model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks)
        dpm_solver = DPMS(model.forward_with_dpmsolver,
                          condition=caption_embs,
                          uncondition=null_y,
                          cfg_scale=scale,
                          model_kwargs=model_kwargs)
        samples = dpm_solver.sample(
            z,
            steps=sample_steps,
            order=2,
            skip_type="time_uniform",
            method="multistep",
        )
    elif sampler == 'sa-solver':
        # No noise tensor is created here; only the shape is handed to the solver.
        model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks)
        sa_solver = SASolverSampler(model.forward_with_dpmsolver, device=device)
        samples = sa_solver.sample(
            S=sample_steps,
            batch_size=n,
            shape=(4, latent_size_h, latent_size_w),
            eta=1,
            conditioning=caption_embs,
            unconditional_conditioning=null_y,
            unconditional_guidance_scale=scale,
            model_kwargs=model_kwargs,
        )[0]
    else:
        # Without this branch an unknown name would surface later as an unbound `samples`.
        raise ValueError(
            f"Unsupported sampler {sampler!r}; expected 'iddpm', 'dpm-solver' or 'sa-solver'."
        )
    samples = vae.decode(samples / 0.18215).sample
    torch.cuda.empty_cache()
    samples = resize_and_crop_tensor(samples, custom_hw[0,1], custom_hw[0,0])
    display_model_info = f'Model path: {args.model_path},\nBase image size: {args.image_size}, \nSampling Algo: {sampler}'
    return ndarr_image(samples, normalize=True, value_range=(-1, 1)), prompt_show, display_model_info, seed


if __name__ == '__main__':
    # Script entry point: load the diffusion model, VAE and text encoder as module
    # globals (generate_img reads them), then build and launch the Gradio UI.
    from diffusion.utils.logger import get_root_logger
    args = get_args()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger = get_root_logger()

    assert args.image_size in [512, 1024], "We only provide pre-trained models for 512x512 and 1024x1024 resolutions."
    lewei_scale = {512: 1, 1024: 2}
    latent_size = args.image_size // 8
    t5_device = {512: 'cuda', 1024: 'cuda'}
    if args.image_size == 512:
        model = PixArt_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device)
    else:
        model = PixArtMS_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size]).to(device)
    state_dict = find_model(args.model_path)
    # Drop the checkpoint's pos_embed before loading; tolerate checkpoints that do not carry it.
    state_dict['state_dict'].pop('pos_embed', None)
    missing, unexpected = model.load_state_dict(state_dict['state_dict'], strict=False)
    logger.warning(f'Missing keys: {missing}')
    logger.warning(f'Unexpected keys: {unexpected}')
    model.eval()
    # Pick ASPECT_RATIO_512_TEST / ASPECT_RATIO_1024_TEST by name without evaluating code.
    base_ratios = globals()[f'ASPECT_RATIO_{args.image_size}_TEST']

    vae = AutoencoderKL.from_pretrained(args.tokenizer_path).to(device)

    if args.llm_model == 't5':
        llm_embed_model = T5Embedder(device=t5_device[args.image_size], local_cache=True, cache_dir=args.t5_path, torch_dtype=torch.float)
    else:
        print('We support t5 only, please initialize the llm again')
        sys.exit()

    title = f"""
        '' Unleashing your Creativity \n ''
        <div style='display: flex; align-items: center; justify-content: center; text-align: center;'>
            <img src='https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/logo.png' style='width: 400px; height: auto; margin-right: 10px;' />
            {args.image_size}px
        </div>
    """
    DESCRIPTION = """# PixArt-Alpha 1024px
            ## If PixArt-Alpha is helpful, please help to ⭐ the [Github Repo](https://github.com/PixArt-alpha/PixArt) and recommend it to your friends 😊'
            #### [PixArt-Alpha 1024px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5. This demo uses the [PixArt-alpha/PixArt-XL-2-1024-MS](https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS) checkpoint.
            #### English prompts ONLY; 提示词仅限英文
            Don't want to queue? Try [OpenXLab](https://openxlab.org.cn/apps/detail/PixArt-alpha/PixArt-alpha) or [Google Colab Demo](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing).
            """
    if not torch.cuda.is_available():
        DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

    # Input order must match generate_img's parameters:
    # prompt, sampler, sample_steps, scale, seed, randomize_seed.
    demo = gr.Interface(
        fn=generate_img,
        inputs=[Textbox(label="Note: If you want to specify a aspect ratio or determine a customized height and width, "
                              "use --ar h:w (or --aspect_ratio h:w) or --hw h:w. If no aspect ratio or hw is given, all setting will be default.",
                        placeholder="Please enter your prompt. \n"),
                gr.Radio(
                    choices=["iddpm", "dpm-solver"],
                    label="Sampler",
                    interactive=True,
                    value='dpm-solver',
                ),
                gr.Slider(
                    label='Sample Steps',
                    minimum=1,
                    maximum=100,
                    value=14,
                    step=1
                ),
                gr.Slider(
                    label='Guidance Scale',
                    minimum=0.1,
                    maximum=30.0,
                    value=4.5,
                    step=0.1
                ),
                gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=MAX_SEED,
                    step=1,
                    value=0,
                ),
                gr.Checkbox(label="Randomize seed", value=True),
                ],
        outputs=[Image(type="numpy", label="Img"),
                 Textbox(label="clean prompt"),
                 Textbox(label="model info"),
                 gr.Slider(label='seed')],
        title=title,
        description=DESCRIPTION,
        examples=examples,
    )
    demo.launch(server_name="0.0.0.0", server_port=args.port, debug=True)

================================================
FILE: PixArt-alpha-ToCa/scripts/interface_controlnet.py
================================================
import argparse
import os
from datetime import datetime
import numpy as np
import sys
from pathlib import Path
from typing import List, Union

current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))

import gradio as gr
from gradio.components import Textbox, Image, Slider
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from torchvision.utils import _log_api_usage_once, make_grid, save_image

from diffusion import IDDPM, DPMS, SASolverSampler
from diffusion.data.datasets import *
from diffusion.model.hed import HEDdetector
from diffusion.model.nets import PixArtMS_XL_2, ControlPixArtHalf, ControlPixArtMSHalf
from diffusion.model.t5 import T5Embedder
from diffusion.model.utils import prepare_prompt_ar, resize_and_crop_tensor
from diffusion.utils.misc import read_config
from diffusers.models import AutoencoderKL
from tools.download import find_model

# Latent scaling constant: generate_img multiplies the encoded condition by it
# and divides the sampled latents by it before decoding them with the VAE.
vae_scale = 0.18215

# Markdown text describing the demo; referenced from the __main__ block.
DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/logo.png)
        # PixArt-Alpha 1024px + ControlNet. This is the demo for ControlNet combined with 1024px PixArt-Alpha.
        # The input reference image need to be around 1024x1024. And descriptive prompts also need to be provided.
        # You may change the random seed, if you didn't get satisfied results.
        """


def get_args():
    """Parse the command-line options of the ControlNet demo.

    Positional:
        config: path handed to ``read_config``.

    Options:
        --num_sampling_steps: sampling steps, default 14.
        --cfg_scale:          guidance scale, default 4.5 (fractional values allowed).
        --image_size:         base resolution, default 1024.
        --model_path:         checkpoint handed to ``find_model``.
        --tokenizer_path:     path handed to ``AutoencoderKL.from_pretrained``.
        --llm_model:          text encoder name, default 't5'.
        --sampling_algo:      'iddpm', 'dpm-solver' (default) or 'sa-solver'.
        --port:               port the Gradio server listens on, default 7788.
        --condition_strength: strength of the condition, default 1.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("config", type=str, help="config")
    parser.add_argument('--num_sampling_steps', default=14, type=int)
    # float rather than int: the default itself is fractional, and with type=int
    # a value such as "4.5" is rejected on the command line.
    parser.add_argument('--cfg_scale', default=4.5, type=float)
    parser.add_argument('--image_size', default=1024, type=int)
    parser.add_argument('--model_path', type=str)
    parser.add_argument('--tokenizer_path', default='output/pretrained_models/sd-vae-ft-ema', type=str)

    parser.add_argument('--llm_model', default='t5', type=str)

    parser.add_argument('--sampling_algo', default='dpm-solver', type=str, choices=['iddpm', 'dpm-solver', 'sa-solver'])

    parser.add_argument('--port', default=7788, type=int)
    parser.add_argument('--condition_strength', default=1, type=float)

    return parser.parse_args()


@torch.no_grad()
def ndarr_image(tensor: Union[torch.Tensor, List[torch.Tensor]], **kwargs) -> np.ndarray:
    """Arrange image tensor(s) into a grid and return it as a uint8 NumPy array.

    Args:
        tensor: A tensor or list of tensors, forwarded to ``make_grid``.
        **kwargs: Extra keyword arguments forwarded unchanged to ``make_grid``.

    Returns:
        The grid scaled by 255, rounded, clamped to [0, 255], with the first
        axis moved last (``permute(1, 2, 0)``), as a ``uint8`` array on the CPU.
    """
    # Record API usage (keyed on save_image) unless running under TorchScript.
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(save_image)
    grid = make_grid(tensor, **kwargs)
    # Adding 0.5 before the uint8 cast rounds to the nearest integer.
    ndarr = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()
    return ndarr


def set_env():
    """Disable autograd globally and seed torch's RNG with 0."""
    torch.set_grad_enabled(False)
    torch.manual_seed(0)


@torch.inference_mode()
def generate_img(prompt, given_image, seed):
    """Generate an image for ``prompt``, optionally conditioned on an edge map.

    Relies on the module-level ``args``, ``device``, ``model``, ``vae``, ``hed``,
    ``llm_embed_model``, ``base_ratios`` and ``save_prompt_path`` created in the
    ``__main__`` block.

    Args:
        prompt: Raw prompt text; it is appended to a dated log file and parsed
            by ``prepare_prompt_ar``.
        given_image: Reference image (the UI supplies a PIL image) or ``None``.
            When given, its size overrides the aspect ratio / output size and
            its HED edge map becomes the condition.
        seed: Value passed to ``torch.manual_seed``.

    Returns:
        A 3-tuple ``(image, c_vis, prompt_show)``: the decoded image as a uint8
        array, the decoded condition as a uint8 array (``None`` when no image
        was given), and the prompt text returned by ``prepare_prompt_ar``.
    """
    torch.manual_seed(seed)
    torch.cuda.empty_cache()
    # Honour the --condition_strength option (default 1) instead of a hard-coded 1.0.
    strength = args.condition_strength
    c_vis = given_image

    prompt_log_file = f'{save_prompt_path}/tested_prompts{datetime.now().date()}.txt'
    # Explicit UTF-8 so prompts with non-ASCII characters can be logged under any locale.
    with open(prompt_log_file, 'a', encoding='utf-8') as f:
        f.write(prompt + '\n')
    prompt_clean, prompt_show, hw, ar, custom_hw = prepare_prompt_ar(prompt, base_ratios, device=device)  # ar for aspect ratio
    prompts = [prompt_clean.strip()]

    caption_embs, emb_masks = llm_embed_model.get_text_embeddings(prompts)
    caption_embs = caption_embs[:, None]

    # Embedding used for the unconditional branch, repeated once per prompt.
    null_y = model.y_embedder.y_embedding[None].repeat(len(prompts), 1, 1)[:, None]

    # condition process
    if given_image is not None:
        # given_image.size is indexed as [1] / [0] to form the ratio and the custom size.
        ar = torch.tensor([given_image.size[1] / given_image.size[0]], device=device)[None]
        custom_hw = torch.tensor([given_image.size[1], given_image.size[0]], device=device)[None]
        # Pick the supported resolution whose ratio key is closest to the image's ratio.
        closest_hw = base_ratios[min(base_ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))]
        hw = torch.tensor(closest_hw, device=device)[None]
        condition_transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB')),
            T.Resize(int(min(closest_hw))),
            T.CenterCrop([int(closest_hw[0]), int(closest_hw[1])]),
            T.ToTensor(),
        ])

        given_image = condition_transform(given_image).unsqueeze(0).to(device)
        hed_edge = hed(given_image) * strength
        hed_edge = TF.normalize(hed_edge, [.5], [.5])
        hed_edge = hed_edge.repeat(1, 3, 1, 1)
        posterior = vae.encode(hed_edge).latent_dist
        condition = posterior.sample()
        c = condition * vae_scale
        # Decode the condition again so the UI can display what the model is conditioned on.
        c_vis = vae.decode(condition)['sample']
        c_vis = torch.clamp(127.5 * c_vis + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()[0]
    else:
        c = None

    # The latent grid is the pixel size divided by 8.
    latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8)
    n = len(prompts)
    # Sample images:
    if args.sampling_algo == 'iddpm':
        # Create sampling noise at the per-request latent size (not the global square
        # `latent_size`), so non-square resolutions work like in the other samplers.
        z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device).repeat(2, 1, 1, 1)
        model_kwargs = dict(y=torch.cat([caption_embs, null_y]), cfg_scale=args.cfg_scale,
                            data_info={'img_hw': hw, 'aspect_ratio': ar},
                            mask=emb_masks, c=c)
        diffusion = IDDPM(str(args.num_sampling_steps))
        samples = diffusion.p_sample_loop(
            model.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True,
            device=device
        )
        samples, _ = samples.chunk(2, dim=0)  # Remove null class samples
    elif args.sampling_algo == 'dpm-solver':
        # Create sampling noise:
        z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device)
        model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks, c=c)
        dpm_solver = DPMS(model.forward_with_dpmsolver,
                          condition=caption_embs,
                          uncondition=null_y,
                          cfg_scale=args.cfg_scale,
                          model_kwargs=model_kwargs)
        samples = dpm_solver.sample(
            z,
            steps=args.num_sampling_steps,
            order=2,
            skip_type="time_uniform",
            method="multistep",
        )

    elif args.sampling_algo == 'sa-solver':
        # No noise tensor is created here; only the shape is handed to the solver.
        model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks, c=c)
        sas_solver = SASolverSampler(model.forward_with_dpmsolver, device=device)
        samples = sas_solver.sample(
            S=args.num_sampling_steps,
            batch_size=n,
            shape=(4, latent_size_h, latent_size_w),
            eta=1,
            conditioning=caption_embs,
            unconditional_conditioning=null_y,
            unconditional_guidance_scale=args.cfg_scale,
            model_kwargs=model_kwargs,
        )[0]

    samples = vae.decode(samples / vae_scale).sample
    torch.cuda.empty_cache()
    samples = resize_and_crop_tensor(samples, custom_hw[0, 1], custom_hw[0, 0])

    return ndarr_image(samples, normalize=True, value_range=(-1, 1)), c_vis, prompt_show


if __name__ == '__main__':
    # Script entry point: load the ControlNet-wrapped model, VAE, HED detector and
    # text encoder as module globals (generate_img reads them), then launch the UI.
    args = get_args()
    config = read_config(args.config)
    set_env()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    save_prompt_path = 'output/demo/online_demo_prompts/'
    os.makedirs(save_prompt_path, exist_ok=True)

    assert args.image_size in [512, 1024], "We only provide pre-trained models for 512x512 and 1024x1024 resolutions."
    lewei_scale = {512: 1, 1024: 2}
    latent_size = args.image_size // 8
    weight_dtype = torch.float16
    print(f"Inference with {weight_dtype}")

    model = PixArtMS_XL_2(input_size=latent_size, lewei_scale=lewei_scale[args.image_size])
    # NOTE(review): the wrapper is chosen from config.image_size while the base model is
    # sized from args.image_size; for any other config value the model is neither wrapped
    # nor moved to `device` — confirm the two settings are always kept in sync.
    if config.image_size == 512:
        print('model architecture ControlPixArtHalf and image size is 512')
        model = ControlPixArtHalf(model).to(device)
    elif config.image_size == 1024:
        print('model architecture ControlPixArtMSHalf and image size is 1024')
        model = ControlPixArtMSHalf(model).to(device)

    state_dict = find_model(args.model_path)['state_dict']
    # Drop the checkpoint's positional embedding (under either key) before loading.
    if 'pos_embed' in state_dict:
        del state_dict['pos_embed']
    elif 'base_model.pos_embed' in state_dict:
        del state_dict['base_model.pos_embed']
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    print('Missing keys (missing pos_embed is normal): ', missing)
    print('Unexpected keys', unexpected)
    model.eval()
    model.to(weight_dtype)
    display_model_info = f'model path: {args.model_path},\n base image size: {args.image_size}'
    # Pick ASPECT_RATIO_512_TEST / ASPECT_RATIO_1024_TEST by name without evaluating code.
    base_ratios = globals()[f'ASPECT_RATIO_{args.image_size}_TEST']

    vae = AutoencoderKL.from_pretrained(args.tokenizer_path).to(device)
    hed = HEDdetector(False).to(device)

    if args.llm_model == 't5':
        print("begin load t5")
        llm_embed_model = T5Embedder(device=device, local_cache=True, cache_dir='data/t5_ckpts', torch_dtype=torch.float)
        print("finish load t5")
    else:
        print('We support t5 only, please initialize the llm again')
        sys.exit()

    # The description is handed to the Interface itself: a gr.Markdown component created
    # outside any Blocks context is never attached to the page.
    demo = gr.Interface(fn=generate_img,
                        inputs=[
                            Textbox(label="Enter a reference image, the resolution of image need around 1024 x 1024",
                                    placeholder="Please enter your prompt. \n"),
                            Image(type="pil", label="Condition"),
                            Slider(minimum=0., maximum=10000., value=0, step=2, label='seed'),
                            ],
                        outputs=[Image(type="numpy", label="Img"),
                                 Image(type="numpy", label="HED Edge Map"),
                                 Textbox(label="clean prompt"),],
                        description=DESCRIPTION,
                        )
    demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=args.port, debug=True)


================================================
FILE: PixArt-alpha-ToCa/scripts/pipeline_pixart_inpaint.py
================================================
# Copyright 2023 PixArt-Alpha Authors and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import html
import inspect
import re
import urllib.parse as ul
from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from transformers import T5EncoderModel, T5Tokenizer

from diffusers.image_processor import PipelineImageInput, PixArtImageProcessor, VaeImageProcessor
from diffusers.models import AutoencoderKL, Transformer2DModel
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
from diffusers.schedulers import DPMSolverMultistepScheduler
from diffusers.utils import (
    BACKENDS_MAPPING,
    deprecate,
    is_bs4_available,
    is_ftfy_available,
    logging,
    replace_example_docstring,
)
from diffusers.utils.torch_utils import randn_tensor


# Module-level logger obtained from diffusers' logging utilities.
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

# bs4 and ftfy are optional dependencies: each one is imported only when the
# matching diffusers availability check returns true.
if is_bs4_available():
    from bs4 import BeautifulSoup

if is_ftfy_available():
    import ftfy

# Usage example for the inpaint pipeline, written as a doctest-style snippet.
# The snippet now imports PIL's Image and defines `mask_image`, both of which
# it used without introducing; the file paths are placeholders.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from PIL import Image
        >>> from diffusers import PixArtAlphaInpaintPipeline

        >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too.
        >>> pipe = PixArtAlphaInpaintPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)
        >>> # Enable memory optimizations.
        >>> pipe.enable_model_cpu_offload()

        >>> prompt = ""
        >>> # Replace the placeholder paths with the image to edit and its mask.
        >>> image = Image.open('path/to/image.png')
        >>> mask_image = Image.open('path/to/mask.png')
        >>> image = pipe(prompt,
                        image=image,
                        mask_image=mask_image,
                        strength=1.0).images[0]
        ```
"""

# Resolution bins around 1024px. Each key is an aspect ratio written as a string and is
# approximately the first entry of its value divided by the second (e.g. "0.25" -> 512 / 2048).
# Presumably each pair is [height, width] — TODO confirm against the code that consumes it.
ASPECT_RATIO_1024_BIN = {
    "0.25": [512.0, 2048.0],
    "0.28": [512.0, 1856.0],
    "0.32": [576.0, 1792.0],
    "0.33": [576.0, 1728.0],
    "0.35": [576.0, 1664.0],
    "0.4": [640.0, 1600.0],
    "0.42": [640.0, 1536.0],
    "0.48": [704.0, 1472.0],
    "0.5": [704.0, 1408.0],
    "0.52": [704.0, 1344.0],
    "0.57": [768.0, 1344.0],
    "0.6": [768.0, 1280.0],
    "0.68": [832.0, 1216.0],
    "0.72": [832.0, 1152.0],
    "0.78": [896.0, 1152.0],
    "0.82": [896.0, 1088.0],
    "0.88": [960.0, 1088.0],
    "0.94": [960.0, 1024.0],
    "1.0": [1024.0, 1024.0],
    "1.07": [1024.0, 960.0],
    "1.13": [1088.0, 960.0],
    "1.21": [1088.0, 896.0],
    "1.29": [1152.0, 896.0],
    "1.38": [1152.0, 832.0],
    "1.46": [1216.0, 832.0],
    "1.67": [1280.0, 768.0],
    "1.75": [1344.0, 768.0],
    "2.0": [1408.0, 704.0],
    "2.09": [1472.0, 704.0],
    "2.4": [1536.0, 640.0],
    "2.5": [1600.0, 640.0],
    "3.0": [1728.0, 576.0],
    "4.0": [2048.0, 512.0],
}

# Resolution bins around 512px. Same keys as ASPECT_RATIO_1024_BIN, with every
# dimension exactly half of the corresponding 1024px entry (e.g. "0.25" -> 256 / 1024).
# Presumably each pair is [height, width] — TODO confirm against the code that consumes it.
ASPECT_RATIO_512_BIN = {
    "0.25": [256.0, 1024.0],
    "0.28": [256.0, 928.0],
    "0.32": [288.0, 896.0],
    "0.33": [288.0, 864.0],
    "0.35": [288.0, 832.0],
    "0.4": [320.0, 800.0],
    "0.42": [320.0, 768.0],
    "0.48": [352.0, 736.0],
    "0.5": [352.0, 704.0],
    "0.52": [352.0, 672.0],
    "0.57": [384.0, 672.0],
    "0.6": [384.0, 640.0],
    "0.68": [416.0, 608.0],
    "0.72": [416.0, 576.0],
    "0.78": [448.0, 576.0],
    "0.82": [448.0, 544.0],
    "0.88": [480.0, 544.0],
    "0.94": [480.0, 512.0],
    "1.0": [512.0, 512.0],
    "1.07": [512.0, 480.0],
    "1.13": [544.0, 480.0],
    "1.21": [544.0, 448.0],
    "1.29": [576.0, 448.0],
    "1.38": [576.0, 416.0],
    "1.46": [608.0, 416.0],
    "1.67": [640.0, 384.0],
    "1.75": [672.0, 384.0],
    "2.0": [704.0, 352.0],
    "2.09": [736.0, 352.0],
    "2.4": [768.0, 320.0],
    "2.5": [800.0, 320.0],
    "3.0": [864.0, 288.0],
    "4.0": [1024.0, 256.0],
}


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    **kwargs,
):
    """
    Configure `scheduler` through its `set_timesteps` method and hand back the resulting schedule.

    Args:
        scheduler (`SchedulerMixin`):
            Scheduler whose `set_timesteps` is called and whose `timesteps` attribute is read afterwards.
        num_inference_steps (`int`, *optional*):
            Number of steps passed to `set_timesteps` when no custom `timesteps` are given.
        device (`str` or `torch.device`, *optional*):
            Forwarded to `set_timesteps` as `device`.
        timesteps (`List[int]`, *optional*):
            Custom schedule. When given, it is forwarded as `timesteps=` and `num_inference_steps` is
            replaced by the length of the schedule the scheduler ends up with.
        **kwargs:
            Forwarded unchanged to `scheduler.set_timesteps`.

    Returns:
        `Tuple[torch.Tensor, int]`: the scheduler's `timesteps` and the number of inference steps.

    Raises:
        `ValueError`: if custom `timesteps` are given but `set_timesteps` has no `timesteps` parameter.
    """
    # Default spacing: let the scheduler derive the schedule from the step count.
    if timesteps is None:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    # Custom spacing is only possible when the scheduler's signature accepts it.
    supported_params = inspect.signature(scheduler.set_timesteps).parameters
    if "timesteps" not in supported_params:
        raise ValueError(
            f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
            f" timestep schedules. Please check whether you are using the correct scheduler."
        )

    scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    schedule = scheduler.timesteps
    return schedule, len(schedule)


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """
    Extract latents from an encoder output.

    Lookup order:
      1. `latent_dist.sample(generator)` when `latent_dist` exists and `sample_mode == "sample"`;
      2. `latent_dist.mode()` when `latent_dist` exists and `sample_mode == "argmax"`;
      3. the `latents` attribute when present.

    Raises:
        `AttributeError`: if none of the above applies.
    """
    has_distribution = hasattr(encoder_output, "latent_dist")

    if has_distribution and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    if has_distribution and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents

    raise AttributeError("Could not access latents of provided encoder_output")


class PixArtAlphaInpaintPipeline(DiffusionPipeline):
    r"""
    Pipeline for text-to-image generation using PixArt-Alpha.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`T5EncoderModel`]):
            Frozen text-encoder. PixArt-Alpha uses
            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
        tokenizer (`T5Tokenizer`):
            Tokenizer of class
            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
        transformer ([`Transformer2DModel`]):
            A text conditioned `Transformer2DModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
    """

    # Matches a run of one or more characters from a fixed set: the listed symbols
    # plus parentheses, square brackets, braces, pipe, backslash, slash and asterisk.
    bad_punct_regex = re.compile(
        r"["
        + "#®•©™&@·º½¾¿¡§~"
        + r"\)"
        + r"\("
        + r"\]"
        + r"\["
        + r"\}"
        + r"\{"
        + r"\|"
        + "\\"
        + r"\/"
        + r"\*"
        + r"]{1,}"
    )  # noqa

    # NOTE(review): presumably tells DiffusionPipeline that these components may be
    # omitted when loading — confirm against the base class.
    _optional_components = ["tokenizer", "text_encoder"]
    # NOTE(review): presumably the order used by DiffusionPipeline's model CPU
    # offloading — confirm against the base class.
    model_cpu_offload_seq = "text_encoder->transformer->vae"

    def __init__(
        self,
        tokenizer: T5Tokenizer,
        text_encoder: T5EncoderModel,
        vae: AutoencoderKL,
        transformer: Transformer2DModel,
        scheduler: DPMSolverMultistepScheduler,
    ):
        """Register the sub-models and build the image and mask processors.

        Args:
            tokenizer: Tokenizer registered under ``tokenizer``.
            text_encoder: Text encoder registered under ``text_encoder``.
            vae: Autoencoder registered under ``vae``; its config determines the scale factor.
            transformer: Model registered under ``transformer``.
            scheduler: Scheduler registered under ``scheduler``.
        """
        super().__init__()

        self.register_modules(
            tokenizer=tokenizer,
            text_encoder=text_encoder,
            vae=vae,
            transformer=transformer,
            scheduler=scheduler,
        )

        # The scale factor doubles for every VAE block after the first one.
        vae_block_count = len(self.vae.config.block_out_channels)
        self.vae_scale_factor = 2 ** (vae_block_count - 1)

        self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
        # Masks go through their own processor: grayscale, binarized, not normalized.
        self.mask_processor = VaeImageProcessor(
            vae_scale_factor=self.vae_scale_factor,
            do_normalize=False,
            do_binarize=True,
            do_convert_grayscale=True,
        )

    # Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py
    def mask_text_embeddings(self, emb, mask):
        """Apply an attention mask to text embeddings.

        Args:
            emb: Embeddings indexed here as a 4-D tensor; the third axis is the
                one that is truncated or masked.
            mask: Mask that is summed (single-sample case) or broadcast as
                ``mask[:, None, :, None]`` against ``emb`` (batched case).

        Returns:
            ``(embeddings, length)``. For a single sample, the embeddings cut to
            the first ``mask.sum()`` positions and that count; otherwise the
            embeddings multiplied by the mask and the unchanged ``emb.shape[2]``.
        """
        if emb.shape[0] != 1:
            # Batched input: keep the full length and zero out masked positions.
            return emb * mask[:, None, :, None], emb.shape[2]

        # Single sample: drop everything past the number of unmasked positions.
        kept = mask.sum().item()
        return emb[:, :, :kept, :], kept

    # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        do_classifier_free_guidance: bool = True,
        negative_prompt: str = "",
        num_images_per_prompt: int = 1,
        device: Optional[torch.device] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        prompt_attention_mask: Optional[torch.FloatTensor] = None,
        negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
        clean_caption: bool = False,
        **kwargs,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
                instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
                PixArt-Alpha, this should be "".
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                whether to use classifier free guidance or not
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                number of images that should be generated per prompt
            device: (`torch.device`, *optional*):
                torch device to place the resulting embeddings on
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the ""
                string.
            clean_caption (bool, defaults to `False`):
                If `True`, the function will preprocess and clean the provided caption before encoding.
        """

        if "mask_feature" in kwargs:
            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)

        if device is None:
            device = self._execution_device

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        # See Section 3.1. of the paper.
        max_length = 120

        if prompt_embeds is None:
            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                add_special_tokens=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {max_length} tokens: {removed_text}"
                )

            prompt_attention_mask = text_inputs.attention_mask
            prompt_attention_mask = prompt_attention_mask.to(device)

            prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
            prompt_embeds = prompt_embeds[0]

        if self.text_encoder is not None:
            dtype = self.text_encoder.dtype
        elif self.transformer is not None:
            dtype = self.transformer.dtype
        else:
            dtype = None

        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
        prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
        prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens = [negative_prompt] * batch_size
            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_attention_mask=True,
                add_special_tokens=True,
                return_tensors="pt",
            )
            negative_prompt_attention_mask = uncond_input.attention_mask
            negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)

            negative_prompt_embeds = self.text_encoder(
                uncond_input.input_ids.to(device), attention_mask=negative_prompt_attention_mask
            )
            negative_prompt_embeds = negative_prompt_embeds[0]

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)

            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
        else:
            negative_prompt_embeds = None
            negative_prompt_attention_mask = None

        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        """Build the optional keyword arguments accepted by `self.scheduler.step`.

        Schedulers do not share one `step` signature, so `eta` and `generator`
        are forwarded only when the scheduler's `step` declares a parameter of
        that name.

        Args:
            generator: Value forwarded as `generator` when supported.
            eta: Value forwarded as `eta` when supported (η of the DDIM paper,
                https://arxiv.org/abs/2010.02502, expected in [0, 1]).

        Returns:
            dict: Contains `"eta"` and/or `"generator"` (in that order), or is
            empty when the scheduler accepts neither.
        """
        step_parameters = inspect.signature(self.scheduler.step).parameters
        optional_arguments = (("eta", eta), ("generator", generator))
        return {name: value for name, value in optional_arguments if name in step_parameters}

    def check_inputs(
        self,
        prompt,
        height,
        width,
        negative_prompt,
        callback_steps,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        prompt_attention_mask=None,
        negative_prompt_attention_mask=None,
    ):
        """Validate the user-facing arguments of the pipeline call.

        The checks run in a fixed order and the first violation raises; nothing
        is returned when every check passes.

        Raises:
            ValueError: If `height`/`width` are not multiples of 8, if
                `callback_steps` is not a positive int, if `prompt` and
                `prompt_embeds` are both given or both missing, if `prompt` is
                neither `str` nor `list`, if `negative_prompt_embeds` is
                combined with `prompt` or `negative_prompt`, if embeddings are
                given without their attention mask, or if directly passed
                positive/negative embeddings (or masks) differ in shape.
        """
        if not (height % 8 == 0 and width % 8 == 0):
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        # `None` fails the isinstance test, so it is rejected here as well.
        callback_steps_ok = isinstance(callback_steps, int) and callback_steps > 0
        if not callback_steps_ok:
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

        has_prompt = prompt is not None
        has_embeds = prompt_embeds is not None
        has_negative_embeds = negative_prompt_embeds is not None

        # Exactly one of `prompt` / `prompt_embeds` must be supplied.
        if has_prompt and has_embeds:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        if not has_prompt and not has_embeds:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        if has_prompt and not isinstance(prompt, (str, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        # Pre-computed negative embeddings exclude any textual prompt input.
        if has_prompt and has_negative_embeds:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if has_negative_embeds and negative_prompt is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        # Embeddings passed directly must come with their attention masks.
        if has_embeds and prompt_attention_mask is None:
            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")

        if has_negative_embeds and negative_prompt_attention_mask is None:
            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")

        if not (has_embeds and has_negative_embeds):
            return

        # Both masks are known to be present here thanks to the checks above.
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )
        if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
            raise ValueError(
                "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
                f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
                f" {negative_prompt_attention_mask.shape}."
            )

    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
    def _text_preprocessing(self, text, clean_caption=False):
        if clean_caption and not is_bs4_available():
            logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
            logger.warn("Setting `clean_caption` to False...")
            clean_caption = False

        if clean_caption and not is_ftfy_available():
            logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
            logger.warn("Setting `clean_caption` to False...")
            clean_caption = False

        if not isinstance(text, (tuple, list)):
            text = [text]

        def process(text: str):
            if clean_caption:
                text = self._clean_caption(text)
                text = self._clean_caption(text)
            else:
                text = text.lower().strip()
            return text

        return [process(t) for t in text]

    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
    def _clean_caption(self, caption):
        """Clean a raw caption string by applying a fixed sequence of rewrites.

        The caption is converted to `str`, URL-unquoted, lower-cased and then
        passed through a series of regular-expression substitutions that drop
        URLs, HTML markup, @-handles, CJK character ranges, ids, file names and
        common boilerplate phrases, and that normalize dashes, quotes and
        whitespace. `self.bad_punct_regex` is read from the instance; `ul`,
        `BeautifulSoup`, `ftfy` and `html` must be available at module level.

        Unlike the original implementation, the intermediate `strip()` result
        is kept, so the anchored quote/punctuation trimming near the end also
        works when whitespace is left at either end of the caption.

        Args:
            caption: The caption to clean; any object accepted by `str()`.

        Returns:
            str: The cleaned caption with surrounding whitespace removed.
        """
        caption = str(caption)
        caption = ul.unquote_plus(caption)
        caption = caption.strip().lower()
        caption = re.sub("<person>", "person", caption)
        # urls:
        caption = re.sub(
            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",
            # noqa
            "",
            caption,
        )  # regex for urls
        caption = re.sub(
            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",
            # noqa
            "",
            caption,
        )  # regex for urls
        # html: keep only the text content
        caption = BeautifulSoup(caption, features="html.parser").text

        # @<nickname>
        caption = re.sub(r"@[\w\d]+\b", "", caption)

        # 31C0—31EF CJK Strokes
        # 31F0—31FF Katakana Phonetic Extensions
        # 3200—32FF Enclosed CJK Letters and Months
        # 3300—33FF CJK Compatibility
        # 3400—4DBF CJK Unified Ideographs Extension A
        # 4DC0—4DFF Yijing Hexagram Symbols
        # 4E00—9FFF CJK Unified Ideographs
        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
        #######################################################

        # all types of dash --> "-"
        caption = re.sub(
            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",
            # noqa
            "-",
            caption,
        )

        # normalize quotation marks to one standard
        caption = re.sub(r"[`´«»“”¨]", '"', caption)
        caption = re.sub(r"[‘’]", "'", caption)

        # &quot;
        caption = re.sub(r"&quot;?", "", caption)
        # &amp
        caption = re.sub(r"&amp", "", caption)

        # ip addresses:
        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

        # article ids:
        caption = re.sub(r"\d:\d\d\s+$", "", caption)

        # literal backslash-n sequences
        caption = re.sub(r"\\n", " ", caption)

        # "#123"
        caption = re.sub(r"#\d{1,3}\b", "", caption)
        # "#12345.."
        caption = re.sub(r"#\d{5,}\b", "", caption)
        # "123456.."
        caption = re.sub(r"\b\d{6,}\b", "", caption)
        # filenames:
        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

        # collapse runs of quotes and runs of dots
        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
        caption = re.sub(r"[\.]{2,}", r" ", caption)  # ".." / "..."

        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

        # this-is-my-cute-cat / this_is_my_cute_cat
        # Only split on "-" / "_" when there are more than three of them.
        regex2 = re.compile(r"(?:\-|\_)")
        if len(re.findall(regex2, caption)) > 3:
            caption = re.sub(regex2, " ", caption)

        caption = ftfy.fix_text(caption)
        caption = html.unescape(html.unescape(caption))

        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
        caption = re.sub(r"\bpage\s+\d+\b", "", caption)

        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

        # dimensions such as "1920x1080" (latin x, cyrillic х or ×)
        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
        caption = re.sub(r"\s+", " ", caption)

        # `str.strip` returns a new string; the result must be assigned so the
        # `^`/`$`-anchored substitutions below see the caption without
        # surrounding whitespace.
        caption = caption.strip()

        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
        caption = re.sub(r"^\.\S+$", "", caption)

        return caption.strip()

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(
        self,
        batch_size,
        num_channels_latents,
        height,
        width,
        dtype,
        device,
        generator,
        latents=None,
        image=None,
        timestep=None,
        is_strength_max=True,
        return_image_latents=True,
    ):
        """Create the initial latents, the noise used for them and the image latents.

        Differences from the previous implementation:
          * `scheduler.init_noise_sigma` is applied exactly once. Previously
            the result was multiplied a second time after both branches had
            already scaled it, which also scaled the image+noise mixture used
            when `is_strength_max` is false.
          * `image_latents` is `None` instead of an unbound local when image
            latents are neither requested nor needed.

        Args:
            batch_size: Effective batch size; image latents are repeated to it.
            num_channels_latents: Channel count of the latent tensor.
            height, width: Pixel size; divided by `self.vae_scale_factor` to get
                the latent size.
            dtype, device: Target dtype/device for new tensors and for `image`.
            generator: A generator or a list of generators (one per batch item).
            latents: Optional pre-made noise; moved to `device` and scaled by
                `init_noise_sigma`.
            image: Image batch, or latents if its dimension 1 has size 4 (then
                it is not encoded again).
            timestep: Timestep passed to `scheduler.add_noise` when
                `is_strength_max` is false.
            is_strength_max: True means start from pure noise; false means start
                from `image` latents with noise added.
            return_image_latents: Whether to compute image latents even when
                they are not needed to build `latents`.

        Returns:
            tuple: `(latents, noise, image_latents)`; `image_latents` is `None`
            when it was not computed.

        Raises:
            ValueError: If a generator list does not match `batch_size`, or if
                `is_strength_max` is false while `image` or `timestep` is missing.
        """
        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if (image is None or timestep is None) and not is_strength_max:
            raise ValueError(
                "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
                "However, either the image or the noise timestep has not been provided."
            )

        # Always bound so the return statement is valid on every path.
        image_latents = None
        if return_image_latents or (latents is None and not is_strength_max):
            image = image.to(device=device, dtype=dtype)

            # A 4-channel input is treated as already-encoded latents.
            if image.shape[1] == 4:
                image_latents = image
            else:
                image_latents = self._encode_vae_image(image=image, generator=generator)
            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)

        if latents is None:
            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
            if is_strength_max:
                # Pure noise: scale by the scheduler's initial sigma.
                latents = noise * self.scheduler.init_noise_sigma
            else:
                # Image + noise at `timestep`; the scheduler handles the scaling.
                latents = self.scheduler.add_noise(image_latents, noise, timestep)
        else:
            noise = latents.to(device)
            latents = noise * self.scheduler.init_noise_sigma

        return latents, noise, image_latents

    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
        """Encode `image` with `self.vae` and scale by the VAE's `scaling_factor`.

        When `generator` is a list, every sample along dimension 0 is encoded
        separately with the generator at the same index and the results are
        concatenated; otherwise the whole batch is encoded in one call.

        Returns:
            The encoded latents multiplied by `self.vae.config.scaling_factor`.
        """
        if not isinstance(generator, list):
            encoded = retrieve_latents(self.vae.encode(image), generator=generator)
        else:
            per_sample = []
            for idx in range(image.shape[0]):
                # Slice (not index) so the batch dimension is kept.
                sample = image[idx : idx + 1]
                per_sample.append(retrieve_latents(self.vae.encode(sample), generator=generator[idx]))
            encoded = torch.cat(per_sample, dim=0)

        return self.vae.config.scaling_factor * encoded

    def prepare_mask_latents(
        self, mask, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
    ):
        """Resize the mask to latent resolution and expand it to the batch.

        The mask is interpolated to `(height, width) // self.vae_scale_factor`
        first and only then cast to `dtype` on `device`; the original comment
        gives the reason as avoiding breakage with cpu offload and half
        precision. `generator` is accepted but not used.

        Returns:
            The mask repeated up to `batch_size`, and concatenated with itself
            along dimension 0 when `do_classifier_free_guidance` is true.

        Raises:
            ValueError: If fewer masks than `batch_size` are given and
                `batch_size` is not a multiple of the number of masks.
        """
        latent_size = (height // self.vae_scale_factor, width // self.vae_scale_factor)
        resized = torch.nn.functional.interpolate(mask, size=latent_size)
        mask = resized.to(device=device, dtype=dtype)

        num_masks = mask.shape[0]
        if num_masks < batch_size:
            if batch_size % num_masks != 0:
                raise ValueError(
                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
                    " of masks that you pass is divisible by the total requested batch size."
                )
            mask = mask.repeat(batch_size // num_masks, 1, 1, 1)

        if do_classifier_free_guidance:
            # One copy for each half of the guidance batch.
            mask = torch.cat([mask, mask])

        return mask

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
    def get_timesteps(self, num_inference_steps, strength, device):
        """Select the tail of the scheduler's timesteps that `strength` calls for.

        `int(num_inference_steps * strength)` steps (capped at
        `num_inference_steps`) are kept from the end of
        `self.scheduler.timesteps`; the start index is multiplied by
        `self.scheduler.order`. `device` is accepted but not used.

        Returns:
            tuple: `(timesteps, steps_remaining)` — the sliced timesteps and
            `num_inference_steps` minus the number of skipped steps.
        """
        steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
        skipped_steps = max(num_inference_steps - steps_to_run, 0)

        first_index = skipped_steps * self.scheduler.order
        remaining_timesteps = self.scheduler.timesteps[first_index:]

        return remaining_timesteps, num_inference_steps - skipped_steps

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        image: PipelineImageInput = None,
        mask_image: PipelineImageInput = None,
        strength: float = 1.0,
        negative_prompt: str = "",
        num_inference_steps: int = 20,
        timesteps: List[int] = None,
        guidance_scale: float = 4.5,
        num_images_per_prompt: Optional[int] = 1,
        height: Optional[int] = None,
        width: Optional[int] = None,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        prompt_attention_mask: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        clean_caption: bool = True,
        use_resolution_binning: bool = True,
        **kwargs,
    ) -> Union[ImagePipelineOutput, Tuple]:
        """
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to
                be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch
                tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the
                expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the
                expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but
                if passing latents directly it is not encoded again.
            mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask
                are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
                single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one
                color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B,
                H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W,
                1)`, or `(H, W)`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
                timesteps are used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to 4.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            height (`int`, *optional*, defaults to self.unet.config.sample_size):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size):
                The width in pixels of the generated image.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. For PixArt-Alpha this negative prompt should be "". If not
                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
            negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
                Pre-generated attention mask for negative text embeddings.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
            clean_caption (`bool`, *optional*, defaults to `True`):
                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
                be installed. If the dependencies are not installed, the embeddings will be created from the raw
                prompt.
            use_resolution_binning (`bool` defaults to `True`):
                If set to `True`, the requested height and width are first mapped to the closest resolutions using
                `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
                the requested resolution. Useful for generating non-square images.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images
        """
        if "mask_feature" in kwargs:
            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
        # 1. Check inputs. Raise error if not correct
        height = height or self.transformer.config.sample_size * self.vae_scale_factor
        width = width or self.transformer.config.sample_size * self.vae_scale_factor
        if use_resolution_binning:
            aspect_ratio_bin = (
                ASPECT_RATIO_1024_BIN if self.transformer.config.sample_size == 128 else ASPECT_RATIO_512_BIN
            )
            orig_height, orig_width = height, width
            height, width = self.image_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)

        self.check_inputs(
            prompt,
            height,
            width,
            negative_prompt,
            callback_steps,
            prompt_embeds,
            negative_prompt_embeds,
            prompt_attention_mask,
            negative_prompt_attention_mask,
        )

        # 2. Default height and width to transformer
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        (
            prompt_embeds,
            prompt_attention_mask,
            negative_prompt_embeds,
            negative_prompt_attention_mask,
        ) = self.encode_prompt(
            prompt,
            do_classifier_free_guidance,
            negative_prompt=negative_prompt,
            num_images_per_prompt=num_images_per_prompt,
            device=device,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            prompt_attention_mask=prompt_attention_mask,
            negative_prompt_attention_mask=negative_prompt_attention_mask,
            clean_caption=clean_caption,
        )
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)

        # 4. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
        timesteps, num_inference_steps = self.get_timesteps(
            num_inference_steps=num_inference_steps, strength=strength, device=device
        )

        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
        is_strength_max = strength == 1.0
        init_image = self.image_processor.preprocess(image, height=height, width=width)
        init_image = init_image.to(dtype=torch.float32)

        # 5. Prepare latents.
        latent_channels = self.transformer.config.in_channels
        latents_outputs = self.prepare_latents(
            batch_size * num_images_per_prompt,
            latent_channels,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
            image=init_image,
            timestep=latent_timestep,
            is_strength_max=is_strength_max,
        )
        latents, noise, image_latents = latents_outputs

        mask_condition = self.mask_processor.preprocess(mask_image, height=height, width=width)
        mask = self.prepare_mask_latents(
            mask_condition,
            batch_size * num_images_per_prompt,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            do_classifier_free_guidance,
        )

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 6.1 Prepare micro-conditions.
        added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
        if self.transformer.config.sample_size == 128:
            resolution = torch.tensor([height, width]).repeat(batch_size * num_images_per_prompt, 1)
            aspect_ratio = torch.tensor([float(height / width)]).repeat(batch_size * num_images_per_prompt, 1)
            resolution = resolution.to(dtype=prompt_embeds.dtype, device=device)
            aspect_ratio = aspect_ratio.to(dtype=prompt_embeds.dtype, device=device)
            added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio}

        # 7. Denoising loop
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                current_timestep = t
                if not torch.is_tensor(current_timestep):
                    # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
                    # This would be a good case for the `match` statement (Python 3.10+)
                    is_mps = latent_model_input.device.type == "mps"
                    if isinstance(current_timestep, float):
                        dtype = torch.float32 if is_mps else torch.float64
                    else:
                        dtype = torch.int32 if is_mps else torch.int64
                    current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
                elif len(current_timestep.shape) == 0:
                    current_timestep = current_timestep[None].to(latent_model_input.device)
                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                current_timestep = current_timestep.expand(latent_model_input.shape[0])

                # predict noise model_output
                noise_pred = self.transformer(
                    latent_model_input,
                    encoder_hidden_states=prompt_embeds,
                    encoder_attention_mask=prompt_attention_mask,
                    timestep=current_timestep,
                    added_cond_kwargs=added_cond_kwargs,
                    return_dict=False,
                )[0]

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # learned sigma
                if self.transformer.config.out_channels // 2 == latent_channels:
                    noise_pred = noise_pred.chunk(2, dim=1)[0]
                else:
                    noise_pred = noise_pred

                # compute previous image: x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

                init_latents_proper = image_latents
                if do_classifier_free_guidance:
                    init_mask, _ = mask.chunk(2)
                else:
                    init_mask = mask

                if i < len(timesteps) - 1:
                    noise_timestep = timesteps[i + 1]
                    init_latents_proper = self.scheduler.add_noise(
                        init_latents_proper, noise, torch.tensor([noise_timestep])
                    )

                latents = (1 - init_mask) * init_latents_proper + init_mask * latents

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)

        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
            if use_resolution_binning:
                image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
        else:
            image = latents

        if not output_type == "latent":
            image = self.image_processor.postprocess(image, output_type=output_type)

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)


================================================
FILE: PixArt-alpha-ToCa/scripts/pipeline_pixart_reference.py
================================================
# Copyright 2023 PixArt-Alpha Authors and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import html
import inspect
import re
import urllib.parse as ul
from typing import Callable, List, Optional, Tuple, Union
from PIL import Image

import torch
import torch.nn.functional as F
from transformers import T5EncoderModel, T5Tokenizer

from diffusers.image_processor import VaeImageProcessor, PipelineImageInput
from diffusers.models import AutoencoderKL, Transformer2DModel
from diffusers.schedulers import DPMSolverMultistepScheduler
from diffusers.utils import (
    BACKENDS_MAPPING,
    deprecate,
    is_bs4_available,
    is_ftfy_available,
    logging,
    replace_example_docstring,
)
from diffusers.utils.torch_utils import randn_tensor
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

if is_bs4_available():
    from bs4 import BeautifulSoup

if is_ftfy_available():
    import ftfy

# Usage example in doctest form. `replace_example_docstring` is imported above, so this text is
# presumably injected into the pipeline's `__call__` docstring -- confirm at the decorator site.
# NOTE(review): the snippet imports `PixArtAlphaReferencePipeline` from `diffusers`, while the class
# is defined in this script; verify the import path before copying the example.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import PIL
        >>> from io import BytesIO
        >>> import requests
        >>> import torch
        
        >>> from diffusers import PixArtAlphaReferencePipeline
        
        >>> def download_image(url):
        ...     response = requests.get(url)
        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")

        >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too.
        >>> pipe = PixArtAlphaReferencePipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)
        >>> pipe = pipe.to('cuda')
        
        >>> img_url = "http://p1.qhimgs4.com/t01fef6f9d5e69335dd.jpg"
        >>> ref_image = download_image(img_url).crop((0, 0, 2160, 2160)).resize((1024, 1024))
        >>> image_out = pipe(
        ...           prompt='',
        ...            height=1024,
        ...            width=1024,
        ...            image=ref_image,
        ...            num_inference_steps=20,
        ...            guidance_scale=4.0,
        ...            ).images[0]
        ```
"""

# Lookup table of resolution buckets around a 1024x1024 base size.
# Each key is an aspect ratio rendered as a string and each value is a two-element list of floats
# whose quotient first/second matches the key (e.g. 512 / 2048 = 0.25).
# Presumably the pair is [height, width] and is used for resolution binning -- confirm at the caller.
ASPECT_RATIO_1024_BIN = {
    "0.25": [512.0, 2048.0],
    "0.28": [512.0, 1856.0],
    "0.32": [576.0, 1792.0],
    "0.33": [576.0, 1728.0],
    "0.35": [576.0, 1664.0],
    "0.4": [640.0, 1600.0],
    "0.42": [640.0, 1536.0],
    "0.48": [704.0, 1472.0],
    "0.5": [704.0, 1408.0],
    "0.52": [704.0, 1344.0],
    "0.57": [768.0, 1344.0],
    "0.6": [768.0, 1280.0],
    "0.68": [832.0, 1216.0],
    "0.72": [832.0, 1152.0],
    "0.78": [896.0, 1152.0],
    "0.82": [896.0, 1088.0],
    "0.88": [960.0, 1088.0],
    "0.94": [960.0, 1024.0],
    "1.0": [1024.0, 1024.0],
    "1.07": [1024.0, 960.0],
    "1.13": [1088.0, 960.0],
    "1.21": [1088.0, 896.0],
    "1.29": [1152.0, 896.0],
    "1.38": [1152.0, 832.0],
    "1.46": [1216.0, 832.0],
    "1.67": [1280.0, 768.0],
    "1.75": [1344.0, 768.0],
    "2.0": [1408.0, 704.0],
    "2.09": [1472.0, 704.0],
    "2.4": [1536.0, 640.0],
    "2.5": [1600.0, 640.0],
    "3.0": [1728.0, 576.0],
    "4.0": [2048.0, 512.0],
}

# Lookup table of resolution buckets around a 512x512 base size.
# Same key set as ASPECT_RATIO_1024_BIN; every value here is exactly half of the corresponding
# 1024 entry (e.g. "0.25" -> [256.0, 1024.0] versus [512.0, 2048.0]).
# Presumably the pair is [height, width] and is used for resolution binning -- confirm at the caller.
ASPECT_RATIO_512_BIN = {
    "0.25": [256.0, 1024.0],
    "0.28": [256.0, 928.0],
    "0.32": [288.0, 896.0],
    "0.33": [288.0, 864.0],
    "0.35": [288.0, 832.0],
    "0.4": [320.0, 800.0],
    "0.42": [320.0, 768.0],
    "0.48": [352.0, 736.0],
    "0.5": [352.0, 704.0],
    "0.52": [352.0, 672.0],
    "0.57": [384.0, 672.0],
    "0.6": [384.0, 640.0],
    "0.68": [416.0, 608.0],
    "0.72": [416.0, 576.0],
    "0.78": [448.0, 576.0],
    "0.82": [448.0, 544.0],
    "0.88": [480.0, 544.0],
    "0.94": [480.0, 512.0],
    "1.0": [512.0, 512.0],
    "1.07": [512.0, 480.0],
    "1.13": [544.0, 480.0],
    "1.21": [544.0, 448.0],
    "1.29": [576.0, 448.0],
    "1.38": [576.0, 416.0],
    "1.46": [608.0, 416.0],
    "1.67": [640.0, 384.0],
    "1.75": [672.0, 384.0],
    "2.0": [704.0, 352.0],
    "2.09": [736.0, 352.0],
    "2.4": [768.0, 320.0],
    "2.5": [800.0, 320.0],
    "3.0": [864.0, 288.0],
    "4.0": [1024.0, 256.0],
}


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
        scheduler,
        num_inference_steps: Optional[int] = None,
        device: Optional[Union[str, torch.device]] = None,
        timesteps: Optional[List[int]] = None,
        **kwargs,
):
    """
    Configure `scheduler` through its `set_timesteps` method and return the resulting schedule.

    Either a step count or an explicit list of timesteps drives the configuration; any extra keyword
    arguments are forwarded verbatim to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            Scheduler whose `set_timesteps` is invoked and whose `timesteps` attribute is read back.
        num_inference_steps (`int`, *optional*):
            Number of diffusion steps to configure. Only used when `timesteps` is `None`.
        device (`str` or `torch.device`, *optional*):
            Passed through to `set_timesteps` as the `device` argument.
        timesteps (`List[int]`, *optional*):
            Custom timestep schedule. When given, it takes precedence over `num_inference_steps` and the
            scheduler's `set_timesteps` must accept a `timesteps` parameter.

    Returns:
        `Tuple[torch.Tensor, int]`: the scheduler's `timesteps` after configuration, and the number of
        inference steps (the length of the schedule when custom timesteps were supplied, otherwise
        `num_inference_steps` unchanged).

    Raises:
        ValueError: if custom `timesteps` are given but `scheduler.set_timesteps` has no `timesteps` parameter.
    """
    # Default path: let the scheduler pick its own spacing for the requested number of steps.
    if timesteps is None:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    # Custom schedule: only usable when the scheduler's signature exposes a `timesteps` parameter.
    supported_params = inspect.signature(scheduler.set_timesteps).parameters
    if "timesteps" not in supported_params:
        raise ValueError(
            f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
            f" timestep schedules. Please check whether you are using the correct scheduler."
        )

    scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    schedule = scheduler.timesteps
    return schedule, len(schedule)


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
        encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """
    Extract latents from an encoder output object.

    Args:
        encoder_output: Object exposing either a `latent_dist` attribute (with `sample(generator)` and
            `mode()` methods) or a `latents` attribute.
        generator: Forwarded to `latent_dist.sample` when `sample_mode` is `"sample"`.
        sample_mode: `"sample"` draws from `latent_dist`; `"argmax"` returns `latent_dist.mode()`.

    Returns:
        The sampled latents, the distribution mode, or the `latents` attribute, in that order of preference.

    Raises:
        AttributeError: if none of the supported access paths applies.
    """
    if hasattr(encoder_output, "latent_dist"):
        distribution_readers = {
            "sample": lambda dist: dist.sample(generator),
            "argmax": lambda dist: dist.mode(),
        }
        reader = distribution_readers.get(sample_mode)
        if reader is not None:
            return reader(encoder_output.latent_dist)

    # Either there is no distribution, or the sample mode is not one of the two handled above.
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents

    raise AttributeError("Could not access latents of provided encoder_output")


class PixArtAlphaReferencePipeline(DiffusionPipeline):
    r"""
    Pipeline for image-to-image generation using PixArt-Alpha.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`T5EncoderModel`]):
            Frozen text-encoder. PixArt-Alpha uses
            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
        tokenizer (`T5Tokenizer`):
            Tokenizer of class
            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
        transformer ([`Transformer2DModel`]):
            A text conditioned `Transformer2DModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
    """

    # Pattern matching a run (one or more) of punctuation characters treated as noise in captions.
    # `_clean_caption` substitutes every match with a single space. The pattern is assembled piece by
    # piece into one character class: `[` ... `]{1,}`.
    bad_punct_regex = re.compile(
        r"["
        + "#®•©™&@·º½¾¿¡§~"
        + r"\)"
        + r"\("
        + r"\]"
        + r"\["
        + r"\}"
        + r"\{"
        + r"\|"
        # A lone backslash: joined with the next piece (`\/`) it yields `\\/` in the final pattern,
        # i.e. a literal backslash followed by a forward slash.
        + "\\"
        + r"\/"
        + r"\*"
        + r"]{1,}"
    )  # noqa

    # Presumably read by `DiffusionPipeline` so the pipeline can be built without these components
    # (e.g. when prompt embeddings are precomputed) -- confirm against the base class.
    _optional_components = ["tokenizer", "text_encoder"]
    # Presumably the component order used by the base class's model CPU offloading -- confirm.
    model_cpu_offload_seq = "text_encoder->transformer->vae"

    def __init__(
            self,
            tokenizer: T5Tokenizer,
            text_encoder: T5EncoderModel,
            vae: AutoencoderKL,
            transformer: Transformer2DModel,
            scheduler: DPMSolverMultistepScheduler,
    ):
        """
        Register the pipeline components and build the image/mask pre-processors.

        Args:
            tokenizer: Tokenizer registered under the name `tokenizer`.
            text_encoder: Text encoder registered under the name `text_encoder`.
            vae: Autoencoder registered under the name `vae`; its `config.block_out_channels` determines
                `vae_scale_factor`.
            transformer: Denoising transformer registered under the name `transformer`.
            scheduler: Scheduler registered under the name `scheduler`.
        """
        super().__init__()

        components = {
            "tokenizer": tokenizer,
            "text_encoder": text_encoder,
            "vae": vae,
            "transformer": transformer,
            "scheduler": scheduler,
        }
        self.register_modules(**components)

        # The scale factor doubles for every VAE block after the first one.
        num_vae_blocks = len(self.vae.config.block_out_channels)
        self.vae_scale_factor = 2 ** (num_vae_blocks - 1)

        scale = self.vae_scale_factor
        self.image_processor = VaeImageProcessor(vae_scale_factor=scale)
        # Masks are converted to grayscale and binarized, and are not normalized.
        self.mask_processor = VaeImageProcessor(
            vae_scale_factor=scale, do_normalize=False, do_binarize=True, do_convert_grayscale=True
        )

    # Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py
    def mask_text_embeddings(self, emb, mask):
        """
        Restrict text embeddings to the tokens selected by `mask`.

        Args:
            emb: Embedding tensor indexed as `emb[:, :, token, :]` (four dimensions).
            mask: Mask tensor indexed as `mask[:, token]` (two dimensions).

        Returns:
            A `(embeddings, length)` tuple. For a batch of one, the embeddings are truncated along the
            token axis to the first `mask.sum()` positions and `length` is that count. For larger
            batches, the embeddings are multiplied by the broadcast mask and `length` is `emb.shape[2]`.
        """
        if emb.shape[0] != 1:
            # Batched input: keep the full length and zero out masked positions instead of slicing.
            return emb * mask[:, None, :, None], emb.shape[2]

        valid_length = mask.sum().item()
        return emb[:, :, :valid_length, :], valid_length

    # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
    def encode_prompt(
            self,
            prompt: Union[str, List[str]],
            do_classifier_free_guidance: bool = True,
            negative_prompt: Union[str, List[str]] = "",
            num_images_per_prompt: int = 1,
            device: Optional[torch.device] = None,
            prompt_embeds: Optional[torch.FloatTensor] = None,
            negative_prompt_embeds: Optional[torch.FloatTensor] = None,
            prompt_attention_mask: Optional[torch.FloatTensor] = None,
            negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
            clean_caption: bool = False,
            **kwargs,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt not to guide the image generation. A single string is repeated for every prompt
                in the batch; a list must contain one entry per prompt. `None` is treated as `""`. Only
                used when guidance is enabled and `negative_prompt_embeds` is not given. For PixArt-Alpha,
                this should be "".
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                number of images that should be generated per prompt
            device: (`torch.device`, *optional*):
                torch device to place the resulting embeddings on; defaults to `self._execution_device`
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the ""
                string.
            prompt_attention_mask (`torch.FloatTensor`, *optional*):
                Attention mask belonging to `prompt_embeds`; required when `prompt_embeds` is passed.
            negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
                Attention mask belonging to `negative_prompt_embeds`; required when those are passed.
            clean_caption (bool, defaults to `False`):
                If `True`, the function will preprocess and clean the provided caption before encoding.

        Returns:
            A 4-tuple `(prompt_embeds, prompt_attention_mask, negative_prompt_embeds,
            negative_prompt_attention_mask)`. Each tensor is repeated `num_images_per_prompt` times. The
            two negative entries are `None` when classifier free guidance is disabled.

        Raises:
            ValueError: if `negative_prompt` is a list whose length differs from the prompt batch size.
        """

        if "mask_feature" in kwargs:
            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)

        if device is None:
            device = self._execution_device

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        # See Section 3.1. of the paper.
        max_length = 120

        if prompt_embeds is None:
            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                add_special_tokens=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            # Tokenize once more without truncation so that dropped text can be reported to the user.
            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                    text_input_ids, untruncated_ids
            ):
                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1: -1])
                # The text encoder of this pipeline is T5, not CLIP.
                logger.warning(
                    "The following part of your input was truncated because T5 can only handle sequences up to"
                    f" {max_length} tokens: {removed_text}"
                )

            prompt_attention_mask = text_inputs.attention_mask
            prompt_attention_mask = prompt_attention_mask.to(device)

            prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
            prompt_embeds = prompt_embeds[0]

        # Pick the dtype of whichever model is available; the text encoder is an optional component.
        if self.text_encoder is not None:
            dtype = self.text_encoder.dtype
        elif self.transformer is not None:
            dtype = self.transformer.dtype
        else:
            dtype = None

        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
        prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
        prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            if negative_prompt is None:
                negative_prompt = ""
            if isinstance(negative_prompt, str):
                # One shared negative prompt for the whole batch.
                uncond_tokens = [negative_prompt] * batch_size
            else:
                # A per-prompt list is used as-is; wrapping it in another list would nest it.
                uncond_tokens = list(negative_prompt)
                if len(uncond_tokens) != batch_size:
                    raise ValueError(
                        f"`negative_prompt` has {len(uncond_tokens)} entries but the prompt batch size is"
                        f" {batch_size}. Pass a single string or one negative prompt per prompt."
                    )
            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
            # Pad the negative prompt to the same sequence length as the positive embeddings.
            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_attention_mask=True,
                add_special_tokens=True,
                return_tensors="pt",
            )
            negative_prompt_attention_mask = uncond_input.attention_mask
            negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)

            negative_prompt_embeds = self.text_encoder(
                uncond_input.input_ids.to(device), attention_mask=negative_prompt_attention_mask
            )
            negative_prompt_embeds = negative_prompt_embeds[0]

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)

            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
        else:
            negative_prompt_embeds = None
            negative_prompt_attention_mask = None

        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        """
        Build the optional keyword arguments accepted by `self.scheduler.step`.

        Schedulers do not share one `step` signature, so `eta` and `generator` are only forwarded when
        the signature names them. `eta` corresponds to η in the DDIM paper
        (https://arxiv.org/abs/2010.02502) and should be between [0, 1].

        Args:
            generator: Value to pass as `generator` when the scheduler accepts it.
            eta: Value to pass as `eta` when the scheduler accepts it.

        Returns:
            A dict containing `"eta"` and/or `"generator"`, restricted to what `step` accepts.
        """
        accepted = inspect.signature(self.scheduler.step).parameters
        optional_args = (("eta", eta), ("generator", generator))
        return {name: value for name, value in optional_args if name in accepted}

    def check_inputs(
            self,
            prompt,
            image,
            height,
            width,
            negative_prompt,
            callback_steps,
            prompt_embeds=None,
            negative_prompt_embeds=None,
            prompt_attention_mask=None,
            negative_prompt_attention_mask=None,
    ):
        """
        Validate the user-facing arguments of the pipeline call.

        The checks run in a fixed order and the first violation raises. Nothing is returned on success.

        Raises:
            ValueError: if `height`/`width` are not multiples of 8; if `callback_steps` is not a positive
                `int`; if `prompt`/`prompt_embeds` or `negative_prompt`/`negative_prompt_embeds` are
                combined inconsistently; if embeddings are passed without their attention masks; if
                positive and negative embeddings (or masks) differ in shape; or if `image` is `None`.
        """
        dims_ok = height % 8 == 0 and width % 8 == 0
        if not dims_ok:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        # `None` is not an int, so a single isinstance test also rejects a missing value.
        if not isinstance(callback_steps, int) or callback_steps <= 0:
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

        has_prompt = prompt is not None
        has_embeds = prompt_embeds is not None
        has_neg_embeds = negative_prompt_embeds is not None

        # Exactly one of `prompt` / `prompt_embeds` must be supplied.
        if has_prompt and has_embeds:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        if not has_prompt and not has_embeds:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        if has_prompt and not isinstance(prompt, (str, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if has_prompt and has_neg_embeds:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if negative_prompt is not None and has_neg_embeds:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        # Pre-computed embeddings are only usable together with their attention masks.
        if has_embeds and prompt_attention_mask is None:
            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")

        if has_neg_embeds and negative_prompt_attention_mask is None:
            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")

        if has_embeds and has_neg_embeds:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )
            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
                raise ValueError(
                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
                    f" {negative_prompt_attention_mask.shape}."
                )

        if image is None:
            raise ValueError(
                "Provide `image`. Cannot leave `image` undefined."
            )

    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
    def _text_preprocessing(self, text, clean_caption=False):
        if clean_caption and not is_bs4_available():
            logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
            logger.warn("Setting `clean_caption` to False...")
            clean_caption = False

        if clean_caption and not is_ftfy_available():
            logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
            logger.warn("Setting `clean_caption` to False...")
            clean_caption = False

        if not isinstance(text, (tuple, list)):
            text = [text]

        def process(text: str):
            if clean_caption:
                text = self._clean_caption(text)
                text = self._clean_caption(text)
            else:
                text = text.lower().strip()
            return text

        return [process(t) for t in text]

    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
    def _clean_caption(self, caption):
        """
        Clean a raw caption with a fixed sequence of regex substitutions and text fixes.

        The value is converted to `str`, URL-unquoted, stripped and lower-cased, and then passed through
        an ordered chain of substitutions. The order matters: later patterns run on the output of
        earlier ones. Requires `BeautifulSoup` and `ftfy` to be importable.

        Args:
            caption: The caption to clean; any object accepted by `str()`.

        Returns:
            The cleaned caption with surrounding whitespace stripped.
        """
        caption = str(caption)
        caption = ul.unquote_plus(caption)
        caption = caption.strip().lower()
        caption = re.sub("<person>", "person", caption)
        # urls:
        caption = re.sub(
            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",
            # noqa
            "",
            caption,
        )  # regex for urls
        caption = re.sub(
            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",
            # noqa
            "",
            caption,
        )  # regex for urls
        # html: keep only the text content of any markup
        caption = BeautifulSoup(caption, features="html.parser").text

        # @<nickname>
        caption = re.sub(r"@[\w\d]+\b", "", caption)

        # Remove characters from the following Unicode blocks:
        # 31C0—31EF CJK Strokes
        # 31F0—31FF Katakana Phonetic Extensions
        # 3200—32FF Enclosed CJK Letters and Months
        # 3300—33FF CJK Compatibility
        # 3400—4DBF CJK Unified Ideographs Extension A
        # 4DC0—4DFF Yijing Hexagram Symbols
        # 4E00—9FFF CJK Unified Ideographs
        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
        #######################################################

        # all types of dash --> "-"
        caption = re.sub(
            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",
            # noqa
            "-",
            caption,
        )

        # normalize quotation marks to a single standard
        caption = re.sub(r"[`´«»“”¨]", '"', caption)
        caption = re.sub(r"[‘’]", "'", caption)

        # &quot;
        caption = re.sub(r"&quot;?", "", caption)
        # &amp
        caption = re.sub(r"&amp", "", caption)

        # ip addresses:
        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

        # article ids:
        caption = re.sub(r"\d:\d\d\s+$", "", caption)

        # literal backslash-n sequences (two characters), not real newlines
        caption = re.sub(r"\\n", " ", caption)

        # "#123"
        caption = re.sub(r"#\d{1,3}\b", "", caption)
        # "#12345.."
        caption = re.sub(r"#\d{5,}\b", "", caption)
        # "123456.."
        caption = re.sub(r"\b\d{6,}\b", "", caption)
        # filenames:
        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)

        # collapse repeated quotes and repeated dots
        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""

        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "

        # this-is-my-cute-cat / this_is_my_cute_cat
        # Separators are only replaced when more than three of them occur in the caption.
        regex2 = re.compile(r"(?:\-|\_)")
        if len(re.findall(regex2, caption)) > 3:
            caption = re.sub(regex2, " ", caption)

        caption = ftfy.fix_text(caption)
        # Unescape twice so that doubly-escaped HTML entities are resolved as well.
        caption = html.unescape(html.unescape(caption))

        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231

        # common marketing / web boilerplate
        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
        caption = re.sub(r"\bpage\s+\d+\b", "", caption)

        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...

        # dimensions such as "1920x1080" (Latin x, Cyrillic х, or the multiplication sign)
        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)

        # tidy spacing around colons and after punctuation, then collapse whitespace runs
        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
        caption = re.sub(r"\s+", " ", caption)

        # NOTE(review): the result of this call is discarded (strings are immutable), so the anchored
        # patterns below still see any leading/trailing space. Left unchanged here because assigning
        # the result would alter the cleaned output -- confirm the intended behavior before fixing.
        caption.strip()

        # strip a pair of enclosing quotes, then stray leading/trailing punctuation
        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
        # drop captions that consist solely of a dot followed by non-space characters (e.g. ".jpg")
        caption = re.sub(r"^\.\S+$", "", caption)

        return caption.strip()

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None,
                        image=None,
                        timestep=None,
                        is_strength_max=True,
                        return_image_latents=True,
                        ):
        """Build the starting latents (and optionally the image latents) for denoising.

        Args:
            batch_size: Effective batch size (first dimension of the latents).
            num_channels_latents: Number of latent channels.
            height, width: Pixel-space size; divided by `self.vae_scale_factor` to get the latent size.
            dtype, device: Target dtype/device for the sampled noise and the image.
            generator: A `torch.Generator`, or a list with one generator per batch element.
            latents: Optional pre-sampled noise. When given it is used as the noise and scaled by
                `self.scheduler.init_noise_sigma`.
            image: Pixel image, or already-encoded latents when it has 4 channels.
            timestep: Timestep passed to `self.scheduler.add_noise` when `is_strength_max` is False.
            is_strength_max: When True the latents start from pure (sigma-scaled) noise; otherwise from
                `self.scheduler.add_noise(image_latents, noise, timestep)`.
            return_image_latents: When True the image latents are always computed.

        Returns:
            Tuple `(latents, noise, image_latents)`. `image_latents` is `None` when it was not computed.

        Raises:
            ValueError: If the generator list length does not match `batch_size`, if strength < 1 but
                `image` or `timestep` is missing, or if image latents are required but `image` is None.
        """
        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if (image is None or timestep is None) and not is_strength_max:
            raise ValueError(
                "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
                "However, either the image or the noise timestep has not been provided."
            )

        # Always bound, so the return statement below cannot hit an unbound local.
        image_latents = None
        if return_image_latents or (latents is None and not is_strength_max):
            if image is None:
                raise ValueError("`image` must be provided when image latents are required.")
            image = image.to(device=device, dtype=dtype)

            if image.shape[1] == 4:
                # A 4-channel input is treated as already-encoded latents.
                image_latents = image
            else:
                image_latents = self._encode_vae_image(image=image, generator=generator)
            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)

        if latents is None:
            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
            if is_strength_max:
                # Pure noise start: scale once by the scheduler's init sigma.
                latents = noise * self.scheduler.init_noise_sigma
            else:
                # Image + noise start: add_noise already yields latents at the right noise level.
                latents = self.scheduler.add_noise(image_latents, noise, timestep)
        else:
            noise = latents.to(device)
            latents = noise * self.scheduler.init_noise_sigma

        # NOTE: the init sigma is applied exactly once above. A further multiplication here would
        # scale the latents by init_noise_sigma ** 2 for schedulers whose init sigma is not 1.
        return latents, noise, image_latents

    @staticmethod
    def classify_height_width_bin(height: int, width: int, ratios: dict) -> Tuple[int, int]:
        """Map a requested size to the bin whose aspect-ratio key is nearest to height / width.

        `ratios` maps aspect-ratio keys (anything `float()` accepts) to `(height, width)` pairs.
        On a tie the key that comes first in `ratios` wins.
        """
        target_ratio = float(height / width)

        def _gap(key):
            # Distance between this bin's aspect ratio and the requested one.
            return abs(float(key) - target_ratio)

        nearest_key = min(ratios.keys(), key=_gap)
        bin_height, bin_width = ratios[nearest_key][0], ratios[nearest_key][1]
        return int(bin_height), int(bin_width)

    @staticmethod
    def resize_and_crop_tensor(samples: torch.Tensor, new_width: int, new_height: int) -> torch.Tensor:
        """Scale `samples` so it covers (new_height, new_width), then center-crop to that size.

        Dimensions 2 and 3 of `samples` are read as height and width. The input is returned
        untouched when it already has the requested size.
        """
        src_height, src_width = samples.shape[2], samples.shape[3]

        # Nothing to do when the size already matches.
        if src_height == new_height and src_width == new_width:
            return samples

        # Use the larger scale factor so both target dimensions are covered.
        scale = max(new_height / src_height, new_width / src_width)
        scaled_width = int(src_width * scale)
        scaled_height = int(src_height * scale)

        scaled = F.interpolate(
            samples, size=(scaled_height, scaled_width), mode="bilinear", align_corners=False
        )

        # Center crop: trim the excess equally from both sides.
        left = (scaled_width - new_width) // 2
        top = (scaled_height - new_height) // 2
        right = left + new_width
        bottom = top + new_height
        return scaled[:, :, top:bottom, left:right]

    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
        """Encode `image` with `self.vae` and multiply by `self.vae.config.scaling_factor`.

        When `generator` is a list, each batch element is encoded separately with its own
        generator and the results are concatenated along dim 0.
        """
        if not isinstance(generator, list):
            encoded = retrieve_latents(self.vae.encode(image), generator=generator)
        else:
            per_sample = []
            for idx in range(image.shape[0]):
                single = self.vae.encode(image[idx: idx + 1])
                per_sample.append(retrieve_latents(single, generator=generator[idx]))
            encoded = torch.cat(per_sample, dim=0)

        return self.vae.config.scaling_factor * encoded

    def prepare_mask_latents(
            self, mask, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
    ):
        """Resize the mask to latent resolution and expand it to the working batch size.

        The mask is interpolated to (height, width) divided by `self.vae_scale_factor`, moved to
        `device`/`dtype`, tiled up to `batch_size` when it has fewer entries, and duplicated
        once more when classifier-free guidance is on.

        Raises:
            ValueError: If the mask has fewer entries than `batch_size` and `batch_size` is not a
                multiple of the mask count.
        """
        # Resize before the dtype cast so cpu_offload + half precision does not break.
        latent_size = (height // self.vae_scale_factor, width // self.vae_scale_factor)
        mask = torch.nn.functional.interpolate(mask, size=latent_size)
        mask = mask.to(device=device, dtype=dtype)

        if mask.shape[0] < batch_size:
            if batch_size % mask.shape[0] != 0:
                raise ValueError(
                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
                    " of masks that you pass is divisible by the total requested batch size."
                )
            copies = batch_size // mask.shape[0]
            mask = mask.repeat(copies, 1, 1, 1)

        if do_classifier_free_guidance:
            mask = torch.cat([mask] * 2)

        return mask

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
    def get_timesteps(self, num_inference_steps, strength, device):
        """Return the tail of `self.scheduler.timesteps` selected by `strength`, plus its step count.

        `int(num_inference_steps * strength)` steps are kept (capped at `num_inference_steps`);
        the skipped leading steps are multiplied by `self.scheduler.order` to get the slice offset.
        `device` is accepted but not used here.
        """
        steps_to_run = int(num_inference_steps * strength)
        if steps_to_run > num_inference_steps:
            steps_to_run = num_inference_steps

        skipped = num_inference_steps - steps_to_run
        if skipped < 0:
            skipped = 0

        offset = skipped * self.scheduler.order
        remaining = self.scheduler.timesteps[offset:]
        return remaining, num_inference_steps - skipped

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
            self,
            prompt: Union[str, List[str]] = None,
            image: PipelineImageInput = None,
            strength: float = 1.0,
            negative_prompt: str = "",
            num_inference_steps: int = 20,
            timesteps: List[int] = None,
            guidance_scale: float = 4.5,
            num_images_per_prompt: Optional[int] = 1,
            height: Optional[int] = None,
            width: Optional[int] = None,
            eta: float = 0.0,
            generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
            latents: Optional[torch.FloatTensor] = None,
            prompt_embeds: Optional[torch.FloatTensor] = None,
            prompt_attention_mask: Optional[torch.FloatTensor] = None,
            negative_prompt_embeds: Optional[torch.FloatTensor] = None,
            negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
            output_type: Optional[str] = "pil",
            return_dict: bool = True,
            callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
            callback_steps: int = 1,
            clean_caption: bool = True,
            use_resolution_binning: bool = True,
            **kwargs,
    ) -> Union[ImagePipelineOutput, Tuple]:
        """
        Function invoked when calling the pipeline for generation.

        The reference `image` is pasted onto the left half of a canvas that is twice the requested width, the
        denoising loop runs over the whole canvas while the left half is blended back from the reference latents
        after every step, and only the right half of the result is returned.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                instead.
            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                The reference image that guides the image generation. NOTE(review): the value is handed directly to
                `PIL.Image.Image.paste`, so in practice it presumably has to be a single PIL image - confirm.
            strength (`float`, *optional*, defaults to 1.0):
                Passed to `get_timesteps` to select how many of the scheduler timesteps are run. When exactly `1.0`
                the initial latents are pure noise; otherwise they are the image latents noised to the first
                selected timestep.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            num_inference_steps (`int`, *optional*, defaults to 20):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
                timesteps are used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to 4.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            height (`int`, *optional*, defaults to `self.transformer.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to `self.transformer.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image. Internally the working canvas is twice this wide.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. For PixArt-Alpha this negative prompt should be "". If not
                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
            negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
                Pre-generated attention mask for negative text embeddings.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. With `"latent"`
                the latents are returned without VAE decoding.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
            clean_caption (`bool`, *optional*, defaults to `True`):
                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
                be installed. If the dependencies are not installed, the embeddings will be created from the raw
                prompt.
            use_resolution_binning (`bool` defaults to `True`):
                If set to `True`, the requested height and (doubled) width are first mapped to the closest resolution
                in `ASPECT_RATIO_1024_BIN` when `self.transformer.config.sample_size == 128`, otherwise in
                `ASPECT_RATIO_512_BIN`. After the produced latents are decoded into images, they are resized back to
                the requested resolution. Useful for generating non-square images.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images
        """
        if "mask_feature" in kwargs:
            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
        # 1. Check inputs. Raise error if not correct
        height = height or self.transformer.config.sample_size * self.vae_scale_factor
        width = width or self.transformer.config.sample_size * self.vae_scale_factor

        # Build a canvas twice as wide as requested: the reference image goes on the left half,
        # the right half starts out white.
        width *= 2
        ref = image
        image = Image.new("RGB", (width, height), (255, 255, 255))
        image.paste(ref, (0, 0))

        # Matching mask: black over the left (reference) half, white over the right half.
        # `width // 2` is the originally requested width, since `width` was doubled above.
        mask_image = Image.new("RGB", (width, height), (255, 255, 255))
        balck_rect = Image.new("RGB", (width // 2, height), (0, 0, 0))
        mask_image.paste(balck_rect, (0, 0))

        if use_resolution_binning:
            aspect_ratio_bin = (
                ASPECT_RATIO_1024_BIN if self.transformer.config.sample_size == 128 else ASPECT_RATIO_512_BIN
            )
            # Remember the canvas size so the decoded image can be resized back after decoding.
            orig_height, orig_width = height, width
            height, width = self.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)

        self.check_inputs(
            prompt,
            image,
            height,
            width,
            negative_prompt,
            callback_steps,
            prompt_embeds,
            negative_prompt_embeds,
            prompt_attention_mask,
            negative_prompt_attention_mask,
        )

        # 2. Determine the batch size from the prompt (or from the precomputed embeddings)
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        (
            prompt_embeds,
            prompt_attention_mask,
            negative_prompt_embeds,
            negative_prompt_attention_mask,
        ) = self.encode_prompt(
            prompt,
            do_classifier_free_guidance,
            negative_prompt=negative_prompt,
            num_images_per_prompt=num_images_per_prompt,
            device=device,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            prompt_attention_mask=prompt_attention_mask,
            negative_prompt_attention_mask=negative_prompt_attention_mask,
            clean_caption=clean_caption,
        )
        if do_classifier_free_guidance:
            # Batch layout with guidance: [unconditional, conditional].
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)

        # 4. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
        # get_timesteps re-slices self.scheduler.timesteps according to `strength`; the `timesteps`
        # value returned just above is overwritten here.
        timesteps, num_inference_steps = self.get_timesteps(
            num_inference_steps=num_inference_steps, strength=strength, device=device
        )

        # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
        is_strength_max = strength == 1.0
        init_image = self.image_processor.preprocess(image, height=height, width=width)
        init_image = init_image.to(dtype=torch.float32)

        # 5. Prepare latents.
        latent_channels = self.transformer.config.in_channels
        latents_outputs = self.prepare_latents(
            batch_size * num_images_per_prompt,
            latent_channels,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
            image=init_image,
            timestep=latent_timestep,
            is_strength_max=is_strength_max,
        )
        latents, noise, image_latents = latents_outputs

        mask_condition = self.mask_processor.preprocess(mask_image, height=height, width=width)
        mask = self.prepare_mask_latents(
            mask_condition,
            batch_size * num_images_per_prompt,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            do_classifier_free_guidance,
        )

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 6.1 Prepare micro-conditions.
        added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
        if self.transformer.config.sample_size == 128:
            resolution = torch.tensor([height, width]).repeat(batch_size * num_images_per_prompt, 1)
            aspect_ratio = torch.tensor([float(height / width)]).repeat(batch_size * num_images_per_prompt, 1)
            resolution = resolution.to(dtype=prompt_embeds.dtype, device=device)
            aspect_ratio = aspect_ratio.to(dtype=prompt_embeds.dtype, device=device)
            added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio}

        # 7. Denoising loop
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

        # The model input is built once here and rebuilt at the end of every loop iteration.
        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                current_timestep = t
                if not torch.is_tensor(current_timestep):
                    # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
                    # This would be a good case for the `match` statement (Python 3.10+)
                    is_mps = latent_model_input.device.type == "mps"
                    if isinstance(current_timestep, float):
                        dtype = torch.float32 if is_mps else torch.float64
                    else:
                        dtype = torch.int32 if is_mps else torch.int64
                    current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
                elif len(current_timestep.shape) == 0:
                    current_timestep = current_timestep[None].to(latent_model_input.device)
                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML

                # predict noise model_output
                noise_pred = self.transformer(
                    latent_model_input,
                    encoder_hidden_states=prompt_embeds,
                    encoder_attention_mask=prompt_attention_mask,
                    timestep=current_timestep,
                    added_cond_kwargs=added_cond_kwargs,
                    return_dict=False,
                )[0]

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # learned sigma: when the model outputs twice the latent channels, keep only the first half
                if self.transformer.config.out_channels // 2 == latent_channels:
                    noise_pred = noise_pred.chunk(2, dim=1)[0]
                else:
                    # no-op branch: the prediction is used as-is
                    noise_pred = noise_pred

                # compute previous image: x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

                init_latents_proper = image_latents
                if do_classifier_free_guidance:
                    # The mask was duplicated for guidance; both halves are identical, keep one.
                    init_mask, _ = mask.chunk(2)
                else:
                    init_mask = mask

                if i < len(timesteps) - 1:
                    # Noise the reference latents to the level of the next timestep so they match
                    # the freshly stepped latents; on the last step the clean image latents are used.
                    noise_timestep = timesteps[i + 1]
                    init_latents_proper = self.scheduler.add_noise(
                        init_latents_proper, noise, torch.tensor([noise_timestep])
                    )
                # Keep the unblended result for the unconditional half of the next model input.
                latents_ = latents
                # Blend: where init_mask is 0 take the reference latents, where it is 1 keep the
                # denoised latents.
                latents = (1 - init_mask) * init_latents_proper + init_mask * latents

                # With guidance the first (unconditional) half receives the unblended latents_ and the
                # second (conditional) half receives the blended latents.
                latent_model_input = torch.cat([latents_] + [latents]) if do_classifier_free_guidance else latents

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)
        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
            if use_resolution_binning:
                image = self.resize_and_crop_tensor(image, orig_width, orig_height)
        else:
            image = latents

        # Keep only the second half along the last axis, i.e. the right half of the canvas that was
        # not covered by the reference image.
        image = image.chunk(2, -1)[1]
        if not output_type == "latent":
            image = self.image_processor.postprocess(image, output_type=output_type)

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)


================================================
FILE: PixArt-alpha-ToCa/timing_analysis.py
================================================
import json
import numpy as np
import matplotlib.pyplot as plt

# Load the list of per-call timing records; each record is expected to carry a
# 'timing_info' dict mapping a metric name to a list of samples.
with open('timing_info.json', 'r') as f:
    data = json.load(f)

# One flat sample list per metric, accumulated across every record in file order.
attn_times, cross_attn_times, mlp_times, block_times = [], [], [], []

for record in data:
    samples = record['timing_info']
    attn_times += samples['attn_time']
    cross_attn_times += samples['cross_attn_time']
    mlp_times += samples['mlp_time']
    block_times += samples['block_time']

# Mean of each metric over all collected samples.
(
    average_attn_time,
    average_cross_attn_time,
    average_mlp_time,
    average_block_time,
) = (
    np.mean(metric_samples)
    for metric_samples in (attn_times, cross_attn_times, mlp_times, block_times)
)

# Textual summary, one line per metric.
print(f"Average Attention Time: {average_attn_time:.4f} ms")
print(f"Average Cross Attention Time: {average_cross_attn_time:.4f} ms")
print(f"Average MLP Time: {average_mlp_time:.4f} ms")
print(f"Average Block Time: {average_block_time:.4f} ms")

# Bar chart of the same four averages; labels, values and colors are kept in
# matching order so each bar lines up with its metric.
labels = ['Attention', 'Cross Attention', 'MLP', 'Block']
avg_times = [
    average_attn_time,
    average_cross_attn_time,
    average_mlp_time,
    average_block_time,
]
bar_colors = ['blue', 'green', 'red', 'orange']

plt.bar(labels, avg_times, color=bar_colors)
plt.ylabel('Average Time (ms)')
plt.title('Average Time per Module')

# Written next to the script's working directory; overwrites any existing file.
plt.savefig('module_average_times.png')


================================================
FILE: PixArt-alpha-ToCa/timing_info.json
================================================
[{"timing_info": {"block_time": [10.906271934509277], "attn_time": [7.704576015472412], "cross_attn_time": [0.9379839897155762], "mlp_time": [2.0203518867492676]}, "current": {"num_steps": 20, "step": 0, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.602560043334961], "attn_time": [0.5560320019721985], "cross_attn_time": [0.5662720203399658], "mlp_time": [0.30105599761009216]}, "current": {"num_steps": 20, "step": 0, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4970879554748535], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 0, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4981119632720947], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 0, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4755840301513672], "attn_time": [0.4925439953804016], "cross_attn_time": [0.52019202709198], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 0, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4776320457458496], "attn_time": [0.48742398619651794], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 0, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4428160190582275], "attn_time": [0.4925439953804016], "cross_attn_time": [0.5038080215454102], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 0, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4407680034637451], "attn_time": [0.4761599898338318], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 
20, "step": 0, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.46943998336792], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 0, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.465343952178955], "attn_time": [0.48230400681495667], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 0, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4632960557937622], "attn_time": [0.48742398619651794], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 0, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4612480401992798], "attn_time": [0.4761599898338318], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 0, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4592000246047974], "attn_time": [0.48230400681495667], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 0, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.435647964477539], "attn_time": [0.47308799624443054], "cross_attn_time": [0.506879985332489], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 0, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.46943998336792], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 0, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5421439409255981], "attn_time": [0.5089280009269714], "cross_attn_time": 
[0.5631999969482422], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 0, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.474560022354126], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5048320293426514], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 0, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4725120067596436], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 0, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4499839544296265], "attn_time": [0.48230400681495667], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 0, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4899200201034546], "attn_time": [0.506879985332489], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 0, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4551039934158325], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 0, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.46943998336792], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 0, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5011839866638184], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 0, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": 
{"block_time": [1.4776320457458496], "attn_time": [0.4853760004043579], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 0, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.457152009010315], "attn_time": [0.4843519926071167], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 0, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4960639476776123], "attn_time": [0.5099520087242126], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 0, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.462272047996521], "attn_time": [0.4843519926071167], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 0, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4888960123062134], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.30105599761009216]}, "current": {"num_steps": 20, "step": 0, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.733855962753296], "attn_time": [0.579584002494812], "cross_attn_time": [0.567296028137207], "mlp_time": [0.3266560137271881]}, "current": {"num_steps": 20, "step": 1, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5523840188980103], "attn_time": [0.5181440114974976], "cross_attn_time": [0.5355520248413086], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 1, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.48742398619651794], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, 
"step": 1, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 1, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 1, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5242879986763], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 1, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.481727957725525], "attn_time": [0.48844799399375916], "cross_attn_time": [0.5038080215454102], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 1, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 1, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4796799421310425], "attn_time": [0.48127999901771545], "cross_attn_time": [0.5099520087242126], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 1, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 1, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.48947200179100037], "cross_attn_time": 
[0.5242879986763], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 1, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5001599788665771], "attn_time": [0.4843519926071167], "cross_attn_time": [0.52019202709198], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 1, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 1, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5130239725112915], "mlp_time": [0.2959359884262085]}, "current": {"num_steps": 20, "step": 1, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 1, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2959359884262085]}, "current": {"num_steps": 20, "step": 1, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [2.950144052505493], "attn_time": [0.5017600059509277], "cross_attn_time": [1.1509759426116943], "mlp_time": [0.9451519846916199]}, "current": {"num_steps": 20, "step": 1, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29900801181793213]}, "current": {"num_steps": 20, "step": 1, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": 
{"block_time": [1.4940160512924194], "attn_time": [0.4904960095882416], "cross_attn_time": [0.506879985332489], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 1, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 1, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5011839866638184], "attn_time": [0.4843519926071167], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 1, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4899200201034546], "attn_time": [0.4864000082015991], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 1, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5099520087242126], "cross_attn_time": [0.52019202709198], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 1, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [3.0791680812835693], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5181440114974976], "mlp_time": [1.8472959995269775]}, "current": {"num_steps": 20, "step": 1, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.6936960220336914], "attn_time": [0.6215680241584778], "cross_attn_time": [0.5591040253639221], "mlp_time": [0.30003198981285095]}, "current": {"num_steps": 20, "step": 1, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5421439409255981], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5355520248413086], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, 
"step": 1, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 1, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5452159643173218], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 1, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.6070079803466797], "attn_time": [0.5406720042228699], "cross_attn_time": [0.5591040253639221], "mlp_time": [0.3092480003833771]}, "current": {"num_steps": 20, "step": 2, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.598464012145996], "attn_time": [0.5355520248413086], "cross_attn_time": [0.5427200198173523], "mlp_time": [0.30822399258613586]}, "current": {"num_steps": 20, "step": 2, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5472639799118042], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5437440276145935], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 2, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29900801181793213]}, "current": {"num_steps": 20, "step": 2, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.56876802444458], "attn_time": [0.5191680192947388], "cross_attn_time": [0.5427200198173523], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 2, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5554560422897339], "attn_time": [0.5140479803085327], "cross_attn_time": 
[0.5304319858551025], "mlp_time": [0.3041279911994934]}, "current": {"num_steps": 20, "step": 2, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5544320344924927], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5437440276145935], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 2, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 2, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5738879442214966], "attn_time": [0.5263360142707825], "cross_attn_time": [0.536575973033905], "mlp_time": [0.30105599761009216]}, "current": {"num_steps": 20, "step": 2, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.558527946472168], "attn_time": [0.5191680192947388], "cross_attn_time": [0.536575973033905], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 2, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5472639799118042], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5386239886283875], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 2, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5007359981536865], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 2, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.563647985458374], "attn_time": [0.5191680192947388], "cross_attn_time": [0.5396479964256287], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 2, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": 
[1.5441919565200806], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5355520248413086], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 2, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.6773120164871216], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.3491840064525604]}, "current": {"num_steps": 20, "step": 2, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5656960010528564], "attn_time": [0.5191680192947388], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.3041279911994934]}, "current": {"num_steps": 20, "step": 2, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5493119955062866], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.30105599761009216]}, "current": {"num_steps": 20, "step": 2, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 2, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.546239972114563], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 2, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 2, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 2, 
"layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.572864055633545], "attn_time": [0.5283839702606201], "cross_attn_time": [0.5386239886283875], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 2, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.558527946472168], "attn_time": [0.5191680192947388], "cross_attn_time": [0.5386239886283875], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 2, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 2, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.504256010055542], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5109760165214539], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 2, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.506879985332489], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 2, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.539072036743164], "attn_time": [0.5109760165214539], "cross_attn_time": [0.532480001449585], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 2, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 2, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5578240156173706], "attn_time": [0.5150719881057739], "cross_attn_time": 
[0.5457919836044312], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 3, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.4915199875831604], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 3, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5001599788665771], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 3, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 3, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 3, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.491968035697937], "attn_time": [0.4843519926071167], "cross_attn_time": [0.5130239725112915], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 3, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 3, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 3, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": 
[1.5452159643173218], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2979840040206909]}, "current": {"num_steps": 20, "step": 3, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4970879554748535], "attn_time": [0.4853760004043579], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 3, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5001599788665771], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 3, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 3, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5011839866638184], "attn_time": [0.48742398619651794], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 3, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [2.6859519481658936], "attn_time": [1.0670080184936523], "cross_attn_time": [0.8294399976730347], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 3, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 3, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4960639476776123], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5109760165214539], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 3, "layer": 
15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 3, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 3, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 3, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.4843519926071167], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 3, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5011839866638184], "attn_time": [0.48742398619651794], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 3, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2959359884262085]}, "current": {"num_steps": 20, "step": 3, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.506879985332489], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 3, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5001599788665771], "attn_time": [0.4843519926071167], "cross_attn_time": [0.5273600220680237], 
"mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 3, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4878720045089722], "attn_time": [0.4853760004043579], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 3, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 3, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.506879985332489], "cross_attn_time": [0.52019202709198], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 3, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.511423945426941], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 3, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.547327995300293], "attn_time": [0.5232639908790588], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.30105599761009216]}, "current": {"num_steps": 20, "step": 4, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 4, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 4, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": 
[1.5329279899597168], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 4, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4940160512924194], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5079039931297302], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 4, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 4, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 4, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4960639476776123], "attn_time": [0.4833280146121979], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 4, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4981119632720947], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 4, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 4, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5400960445404053], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.29900801181793213]}, "current": {"num_steps": 20, "step": 4, 
"layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 4, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5011839866638184], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 4, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 4, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 4, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 4, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2979840040206909]}, "current": {"num_steps": 20, "step": 4, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.499135971069336], "attn_time": [0.4853760004043579], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 4, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5079039931297302], "cross_attn_time": 
[0.5160959959030151], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 4, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 4, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 4, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 4, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 4, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.4864000082015991], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 4, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4960639476776123], "attn_time": [0.4833280146121979], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 4, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5130239725112915], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 4, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": 
{"block_time": [1.533951997756958], "attn_time": [0.506879985332489], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 4, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 4, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.53711998462677], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5437440276145935], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 5, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.5089280009269714], "cross_attn_time": [0.532480001449585], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 5, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 5, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 5, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5089280009269714], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 5, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.539072036743164], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 5, 
"layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5411200523376465], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.3020800054073334]}, "current": {"num_steps": 20, "step": 5, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 5, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 5, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 5, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 5, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.4925439953804016], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 5, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 5, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5191680192947388], 
"mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 5, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 5, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5001599788665771], "attn_time": [0.48844799399375916], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 5, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5242879986763], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 5, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 5, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 5, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 5, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.4925439953804016], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 5, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": 
[1.491968035697937], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 5, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 5, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5011839866638184], "attn_time": [0.4925439953804016], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 5, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.5058559775352478], "cross_attn_time": [0.52019202709198], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 5, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.4925439953804016], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 5, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.481727957725525], "attn_time": [0.48230400681495667], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 5, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 5, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5565439462661743], "attn_time": [0.5181440114974976], "cross_attn_time": [0.5416960120201111], "mlp_time": [0.3041279911994934]}, "current": {"num_steps": 20, "step": 6, 
"layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4960639476776123], "attn_time": [0.48742398619651794], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 6, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 6, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5011839866638184], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5242879986763], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 6, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 6, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.506879985332489], "cross_attn_time": [0.52019202709198], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 6, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 6, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 6, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5181440114974976], 
"mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 6, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 6, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.5007359981536865], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 6, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 6, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 6, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4970879554748535], "attn_time": [0.48844799399375916], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 6, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4735360145568848], "attn_time": [0.48127999901771545], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 6, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5242879986763], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 6, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], 
"attn_time": [0.49459201097488403], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 6, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 6, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 6, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 6, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 6, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.4925439953804016], "cross_attn_time": [0.5242879986763], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 6, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 6, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 6, "layer": 23, 
"is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 6, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 6, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5472639799118042], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 6, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 6, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.535904049873352], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5355520248413086], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 7, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 7, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.5079039931297302], "cross_attn_time": [0.52019202709198], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 7, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.506879985332489], "cross_attn_time": [0.5171200037002563], 
"mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 7, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 7, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5130239725112915], "cross_attn_time": [0.52019202709198], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 7, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5452159643173218], "attn_time": [0.502784013748169], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 7, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 7, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 7, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 7, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5130239725112915], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 7, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], 
"attn_time": [0.5007359981536865], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 7, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 7, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.506879985332489], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 7, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 7, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 7, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5441919565200806], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2959359884262085]}, "current": {"num_steps": 20, "step": 7, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 7, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5038080215454102], "cross_attn_time": [0.532480001449585], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 7, "layer": 18, "is_force_fresh": 
true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 7, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 7, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5089280009269714], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 7, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 7, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.502784013748169], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 7, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5534080266952515], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2969599962234497]}, "current": {"num_steps": 20, "step": 7, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5109760165214539], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 7, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.506879985332489], "cross_attn_time": [0.5181440114974976], "mlp_time": 
[0.2908160090446472]}, "current": {"num_steps": 20, "step": 7, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5411200523376465], "attn_time": [0.5160959959030151], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 7, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.546463966369629], "attn_time": [0.5181440114974976], "cross_attn_time": [0.536575973033905], "mlp_time": [0.3031040132045746]}, "current": {"num_steps": 20, "step": 8, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5017600059509277], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 8, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [3.84716796875], "attn_time": [0.749567985534668], "cross_attn_time": [0.5457919836044312], "mlp_time": [0.30720001459121704]}, "current": {"num_steps": 20, "step": 8, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5130239725112915], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 8, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 8, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 8, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": 
[0.49663999676704407], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 8, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 8, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.506879985332489], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 8, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 8, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 8, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 8, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.502784013748169], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 8, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 8, "layer": 13, "is_force_fresh": true, "module": 
"mlp"}}, {"timing_info": {"block_time": [1.511423945426941], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 8, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.506879985332489], "cross_attn_time": [0.5242879986763], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 8, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 8, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 8, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5298559665679932], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 8, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 8, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.511423945426941], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 8, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.289792001247406]}, 
"current": {"num_steps": 20, "step": 8, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 8, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.506879985332489], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 8, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 8, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.506879985332489], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 8, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 8, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 8, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5425920486450195], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5375999808311462], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 9, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": 
[0.5099520087242126], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 9, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.502784013748169], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 9, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 9, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 9, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 9, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 9, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 9, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 9, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, 
{"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 9, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4981119632720947], "attn_time": [0.49459201097488403], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2744320034980774]}, "current": {"num_steps": 20, "step": 9, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4981119632720947], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 9, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.506879985332489], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 9, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 9, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 9, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 9, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.2908160090446472]}, "current": 
{"num_steps": 20, "step": 9, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 9, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 9, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 9, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 9, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 9, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 9, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.506879985332489], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 9, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.511423945426941], "attn_time": [0.4986880123615265], 
"cross_attn_time": [0.5191680192947388], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 9, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 9, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4981119632720947], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5109760165214539], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 9, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.499135971069336], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 9, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5166079998016357], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 10, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.502784013748169], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 10, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 10, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 10, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, 
{"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 10, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 10, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 10, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5109760165214539], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 10, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.502784013748169], "cross_attn_time": [0.5130239725112915], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 10, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.502784013748169], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 10, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 10, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5242879986763], "mlp_time": [0.27955201268196106]}, "current": 
{"num_steps": 20, "step": 10, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 10, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 10, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.506879985332489], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 10, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.506879985332489], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 10, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 10, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 10, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5820800065994263], "attn_time": [0.536575973033905], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 10, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.6005120277404785], "attn_time": [0.5335040092468262], 
"cross_attn_time": [0.5478399991989136], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 10, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.551360011100769], "attn_time": [0.5120000243186951], "cross_attn_time": [0.536575973033905], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 10, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 10, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.6680959463119507], "attn_time": [0.5765119791030884], "cross_attn_time": [0.5488640069961548], "mlp_time": [0.30003198981285095]}, "current": {"num_steps": 20, "step": 10, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5718400478363037], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5437440276145935], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 10, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5523840188980103], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 10, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 10, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 10, "layer": 26, "is_force_fresh": true, "module": 
"mlp"}}, {"timing_info": {"block_time": [1.546239972114563], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 10, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5750720500946045], "attn_time": [0.5294079780578613], "cross_attn_time": [0.5529599785804749], "mlp_time": [0.29900801181793213]}, "current": {"num_steps": 20, "step": 11, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 11, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.502784013748169], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 11, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5544320344924927], "attn_time": [0.5222399830818176], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2959359884262085]}, "current": {"num_steps": 20, "step": 11, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5089280009269714], "cross_attn_time": [0.532480001449585], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 11, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.539072036743164], "attn_time": [0.5171200037002563], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 11, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5375999808311462], "mlp_time": [0.28467199206352234]}, 
"current": {"num_steps": 20, "step": 11, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.539072036743164], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 11, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 11, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 11, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5646719932556152], "attn_time": [0.5345280170440674], "cross_attn_time": [0.5242879986763], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 11, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5400960445404053], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 11, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.506879985332489], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 11, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5242879986763], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 11, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5411200523376465], "attn_time": 
[0.5140479803085327], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 11, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5534080266952515], "attn_time": [0.5181440114974976], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.29900801181793213]}, "current": {"num_steps": 20, "step": 11, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.506879985332489], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 11, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [2.9480960369110107], "attn_time": [0.5120000243186951], "cross_attn_time": [0.8601599931716919], "mlp_time": [1.1284480094909668]}, "current": {"num_steps": 20, "step": 11, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5503360033035278], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5406720042228699], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 11, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 11, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.4925439953804016], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 11, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 11, "layer": 21, "is_force_fresh": 
true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.506879985332489], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 11, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 11, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 11, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 11, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 11, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5493119955062866], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 11, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5667200088500977], "attn_time": [0.5396479964256287], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.30617600679397583]}, "current": {"num_steps": 20, "step": 12, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5605759620666504], "attn_time": [0.5222399830818176], "cross_attn_time": [0.5294079780578613], "mlp_time": 
[0.2959359884262085]}, "current": {"num_steps": 20, "step": 12, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5482879877090454], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 12, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.502784013748169], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 12, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.5048320293426514], "cross_attn_time": [0.52019202709198], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 12, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5421439409255981], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 12, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.539072036743164], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 12, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5038080215454102], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 12, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 12, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": 
[0.5058559775352478], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 12, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5349760055541992], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 12, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.511423945426941], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 12, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 12, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 12, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.8565119504928589], "attn_time": [0.6768640279769897], "cross_attn_time": [0.5652480125427246], "mlp_time": [0.317440003156662]}, "current": {"num_steps": 20, "step": 12, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5615999698638916], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5447679758071899], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 12, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5534080266952515], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 12, "layer": 16, "is_force_fresh": 
true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5523840188980103], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 12, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5441919565200806], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 12, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5472639799118042], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5386239886283875], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 12, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.551360011100769], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 12, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5431679487228394], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 12, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.502784013748169], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 12, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.546239972114563], "attn_time": [0.506879985332489], "cross_attn_time": [0.536575973033905], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 12, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5441919565200806], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5345280170440674], "mlp_time": 
[0.28569599986076355]}, "current": {"num_steps": 20, "step": 12, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.558527946472168], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 12, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.551360011100769], "attn_time": [0.5140479803085327], "cross_attn_time": [0.532480001449585], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 12, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5534080266952515], "attn_time": [0.5181440114974976], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 12, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.593727946281433], "attn_time": [0.5375999808311462], "cross_attn_time": [0.5550079941749573], "mlp_time": [0.3051519989967346]}, "current": {"num_steps": 20, "step": 13, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5708160400390625], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5447679758071899], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 13, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5575040578842163], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5396479964256287], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 13, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5677440166473389], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5406720042228699], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 13, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5575040578842163], 
"attn_time": [0.5160959959030151], "cross_attn_time": [0.5386239886283875], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 13, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5595519542694092], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5396479964256287], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 13, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5503360033035278], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5355520248413086], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 13, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5779839754104614], "attn_time": [0.5242879986763], "cross_attn_time": [0.536575973033905], "mlp_time": [0.3031040132045746]}, "current": {"num_steps": 20, "step": 13, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 13, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5411200523376465], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 13, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 13, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5411200523376465], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5355520248413086], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 13, "layer": 11, 
"is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5605759620666504], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5375999808311462], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 13, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 13, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4940160512924194], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 13, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 13, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 13, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 13, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5298559665679932], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 13, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5283839702606201], 
"mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 13, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 13, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 13, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 13, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.502784013748169], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 13, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.511423945426941], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 13, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 13, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5421439409255981], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 13, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": 
[1.504256010055542], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2764799892902374]}, "current": {"num_steps": 20, "step": 13, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5440959930419922], "attn_time": [0.5212159752845764], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.30003198981285095]}, "current": {"num_steps": 20, "step": 14, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5073280334472656], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5109760165214539], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 14, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 14, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.504256010055542], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 14, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 14, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5089280009269714], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 14, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 14, 
"layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5130239725112915], "cross_attn_time": [0.52019202709198], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 14, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5349760055541992], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 14, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 14, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.504256010055542], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 14, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 14, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5022079944610596], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 14, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5099520087242126], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 14, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5089280009269714], "cross_attn_time": 
[0.5171200037002563], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 14, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [5.377024173736572], "attn_time": [1.6383999586105347], "cross_attn_time": [1.7756160497665405], "mlp_time": [1.4632960557937622]}, "current": {"num_steps": 20, "step": 14, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5130239725112915], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 14, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.539072036743164], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 14, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 14, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 14, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5349760055541992], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 14, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 14, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": 
{"block_time": [1.521664023399353], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 14, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5421439409255981], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 14, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 14, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5400960445404053], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 14, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 14, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.502784013748169], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 14, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.540992021560669], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.3031040132045746]}, "current": {"num_steps": 20, "step": 15, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, 
"step": 15, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5595519542694092], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.3031040132045746]}, "current": {"num_steps": 20, "step": 15, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5375999808311462], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 15, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 15, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 15, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 15, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5400960445404053], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 15, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2979840040206909]}, "current": {"num_steps": 20, "step": 15, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4981119632720947], "attn_time": [0.4997119903564453], "cross_attn_time": 
[0.5130239725112915], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 15, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 15, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 15, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 15, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5441919565200806], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 15, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 15, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 15, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 15, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": 
{"block_time": [1.516543984413147], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 15, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 15, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 15, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 15, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 15, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 15, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 15, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 
20, "step": 15, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5181440114974976], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 15, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 15, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 15, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.540287971496582], "attn_time": [0.5181440114974976], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2979840040206909]}, "current": {"num_steps": 20, "step": 16, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 16, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 16, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 16, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5130239725112915], 
"cross_attn_time": [0.5171200037002563], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 16, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 16, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 16, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5130239725112915], "cross_attn_time": [0.5242879986763], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 16, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 16, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 16, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5001599788665771], "attn_time": [0.49663999676704407], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 16, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 16, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, 
{"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.502784013748169], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 16, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 16, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 16, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4970879554748535], "attn_time": [0.48947200179100037], "cross_attn_time": [0.52019202709198], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 16, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 16, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 16, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 16, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5185920000076294], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2815999984741211]}, 
"current": {"num_steps": 20, "step": 16, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.502784013748169], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 16, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 16, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.506879985332489], "cross_attn_time": [0.5120000243186951], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 16, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.506879985332489], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 16, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 16, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 16, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 16, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [2.0336639881134033], "attn_time": 
[0.7659519910812378], "cross_attn_time": [0.6256639957427979], "mlp_time": [0.3164159953594208]}, "current": {"num_steps": 20, "step": 16, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.549888014793396], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5447679758071899], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 17, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.506879985332489], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 17, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.546239972114563], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5386239886283875], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 17, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.4935680031776428], "cross_attn_time": [0.532480001449585], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 17, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.506879985332489], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 17, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.502784013748169], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 17, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5120000243186951], "cross_attn_time": [0.52019202709198], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 17, "layer": 6, "is_force_fresh": true, 
"module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5058559775352478], "cross_attn_time": [0.532480001449585], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 17, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 17, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5482879877090454], "attn_time": [0.5160959959030151], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 17, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.546239972114563], "attn_time": [0.5171200037002563], "cross_attn_time": [0.5242879986763], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 17, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 17, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4981119632720947], "attn_time": [0.4904960095882416], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 17, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 17, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.509376049041748], "attn_time": [0.4864000082015991], "cross_attn_time": [0.5314559936523438], "mlp_time": 
[0.2836480140686035]}, "current": {"num_steps": 20, "step": 17, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 17, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 17, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5370240211486816], "attn_time": [0.506879985332489], "cross_attn_time": [0.5335040092468262], "mlp_time": [0.28569599986076355]}, "current": {"num_steps": 20, "step": 17, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5007359981536865], "cross_attn_time": [0.532480001449585], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 17, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 17, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 17, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5298559665679932], "attn_time": [0.5089280009269714], "cross_attn_time": [0.52019202709198], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 17, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], 
"attn_time": [0.502784013748169], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 17, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 17, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 17, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5196160078048706], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 17, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5308799743652344], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5283839702606201], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 17, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5360000133514404], "attn_time": [0.5160959959030151], "cross_attn_time": [0.52019202709198], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 17, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5399680137634277], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5427200198173523], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 18, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 18, "layer": 1, 
"is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5400960445404053], "attn_time": [0.5150719881057739], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 18, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.528831958770752], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 18, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5298559665679932], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 18, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.4976640045642853], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 18, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.4935680031776428], "cross_attn_time": [0.5304319858551025], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 18, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5237120389938354], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5345280170440674], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 18, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5242879986763], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 18, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.502784013748169], "cross_attn_time": [0.5273600220680237], "mlp_time": 
[0.2826240062713623]}, "current": {"num_steps": 20, "step": 18, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5206400156021118], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 18, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.491968035697937], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.27750399708747864]}, "current": {"num_steps": 20, "step": 18, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5017600059509277], "cross_attn_time": [0.532480001449585], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 18, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5083520412445068], "attn_time": [0.502784013748169], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 18, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 18, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.28672000765800476]}, "current": {"num_steps": 20, "step": 18, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 18, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], 
"attn_time": [0.5017600059509277], "cross_attn_time": [0.5273600220680237], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 18, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5155199766159058], "attn_time": [0.502784013748169], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 18, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5319039821624756], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 18, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.499135971069336], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 18, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [14.97599983215332], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5130239725112915], "mlp_time": [13.750271797180176]}, "current": {"num_steps": 20, "step": 18, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 18, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5124479532241821], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5109760165214539], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 18, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5104000568389893], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5232639908790588], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 18, "layer": 24, 
"is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5267839431762695], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 18, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.502784013748169], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 18, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5380480289459229], "attn_time": [0.5109760165214539], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.29388800263404846]}, "current": {"num_steps": 20, "step": 18, "layer": 27, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5278079509735107], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.2949120104312897]}, "current": {"num_steps": 20, "step": 19, "layer": 0, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.4960639476776123], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 19, "layer": 1, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.5038080215454102], "cross_attn_time": [0.5314559936523438], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 19, "layer": 2, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5032320022583008], "attn_time": [0.49459201097488403], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2877439856529236]}, "current": {"num_steps": 20, "step": 19, "layer": 3, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.481727957725525], "attn_time": [0.48025599122047424], "cross_attn_time": [0.5150719881057739], 
"mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 19, "layer": 4, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.511423945426941], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 19, "layer": 5, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.499135971069336], "attn_time": [0.4997119903564453], "cross_attn_time": [0.5140479803085327], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 19, "layer": 6, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5063040256500244], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5222399830818176], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 19, "layer": 7, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5247360467910767], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5242879986763], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 19, "layer": 8, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5329279899597168], "attn_time": [0.5089280009269714], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 19, "layer": 9, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.521664023399353], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5263360142707825], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 19, "layer": 10, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.533951997756958], "attn_time": [0.5007359981536865], "cross_attn_time": [0.532480001449585], "mlp_time": [0.2836480140686035]}, "current": {"num_steps": 20, "step": 19, "layer": 11, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": 
[1.4950400590896606], "attn_time": [0.48947200179100037], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.27955201268196106]}, "current": {"num_steps": 20, "step": 19, "layer": 12, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.5048320293426514], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 19, "layer": 13, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5134719610214233], "attn_time": [0.5007359981536865], "cross_attn_time": [0.5160959959030151], "mlp_time": [0.28467199206352234]}, "current": {"num_steps": 20, "step": 19, "layer": 14, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5298559665679932], "attn_time": [0.5120000243186951], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 19, "layer": 15, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5099520087242126], "cross_attn_time": [0.5181440114974976], "mlp_time": [0.289792001247406]}, "current": {"num_steps": 20, "step": 19, "layer": 16, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5226880311965942], "attn_time": [0.506879985332489], "cross_attn_time": [0.5171200037002563], "mlp_time": [0.2826240062713623]}, "current": {"num_steps": 20, "step": 19, "layer": 17, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5431679487228394], "attn_time": [0.5171200037002563], "cross_attn_time": [0.52019202709198], "mlp_time": [0.29183998703956604]}, "current": {"num_steps": 20, "step": 19, "layer": 18, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5431679487228394], "attn_time": [0.5140479803085327], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.2959359884262085]}, "current": {"num_steps": 20, "step": 
19, "layer": 19, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.5017600059509277], "cross_attn_time": [0.5058559775352478], "mlp_time": [0.2887679934501648]}, "current": {"num_steps": 20, "step": 19, "layer": 20, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5144959688186646], "attn_time": [0.49561598896980286], "cross_attn_time": [0.5253120064735413], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 19, "layer": 21, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5175679922103882], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5294079780578613], "mlp_time": [0.2805759906768799]}, "current": {"num_steps": 20, "step": 19, "layer": 22, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5257600545883179], "attn_time": [0.5058559775352478], "cross_attn_time": [0.5191680192947388], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 19, "layer": 23, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5349760055541992], "attn_time": [0.5079039931297302], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.29286399483680725]}, "current": {"num_steps": 20, "step": 19, "layer": 24, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.5052800178527832], "attn_time": [0.4915199875831604], "cross_attn_time": [0.5150719881057739], "mlp_time": [0.2908160090446472]}, "current": {"num_steps": 20, "step": 19, "layer": 25, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.516543984413147], "attn_time": [0.4986880123615265], "cross_attn_time": [0.5212159752845764], "mlp_time": [0.2815999984741211]}, "current": {"num_steps": 20, "step": 19, "layer": 26, "is_force_fresh": true, "module": "mlp"}}, {"timing_info": {"block_time": [1.504256010055542], "attn_time": [0.49663999676704407], "cross_attn_time": 
[0.5212159752845764], "mlp_time": [0.27852800488471985]}, "current": {"num_steps": 20, "step": 19, "layer": 27, "is_force_fresh": true, "module": "mlp"}}]

================================================
FILE: PixArt-alpha-ToCa/tools/VLM_caption_lightning.py
================================================
# {'model': 'LLaVA-7B-v0', 'prompt': 'You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab.You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.Follow the instructions carefully and explain your answers in detail.###Human: Hi!###Assistant: Hi there!  How can I help you today?\n###Human: ?\n<image>###Assistant:', 'temperature': 0.2, 'max_new_tokens': 512, 'stop': '###', 'images': "List of 1 images: ['793f00027d3dc5bd69445a388a2f289c']"}
import sys
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import argparse
import torch
from transformers import AutoTokenizer, CLIPImageProcessor, CLIPVisionModel, AutoConfig
from diffusion.model.llava import LlavaMPTForCausalLM
from PIL import Image
from tqdm import tqdm
from os import path, makedirs
from torch.utils.data import Dataset, DataLoader
import json


DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"


def expand2square(pil_img, background_color=(122, 116, 104)):
    """Pad an image onto a square canvas, centring it along its shorter axis.

    Args:
        pil_img: PIL-style image exposing ``size`` and ``mode``.
        background_color: Fill colour used for the padded area.

    Returns:
        ``pil_img`` itself when it is already square; otherwise a new image
        whose side equals the longer edge of ``pil_img``.
    """
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = max(width, height)
    if width > height:
        # Landscape: pad above and below.
        offset = (0, (width - height) // 2)
    else:
        # Portrait: pad left and right.
        offset = ((height - width) // 2, 0)

    canvas = Image.new(pil_img.mode, (side, side), background_color)
    canvas.paste(pil_img, offset)
    return canvas


def pad2square(image):
    """Resize an image, keeping its aspect ratio, to fit a fixed size budget.

    Despite the name, nothing is padded. The new shorter edge is the smallest
    of 400, the original shorter edge, and ``800 / aspect_ratio``; the longer
    edge is then derived from the aspect ratio.

    Args:
        image: PIL-style image exposing ``size`` as ``(width, height)`` and
            ``resize((width, height))``.

    Returns:
        Whatever ``image.resize`` returns for the computed target size.
    """
    width, height = image.size
    short_side, long_side = sorted((width, height))
    aspect_ratio = long_side / short_side
    max_len, min_len = 800, 400

    new_short = int(min(max_len / aspect_ratio, min_len, short_side))
    new_long = int(new_short * aspect_ratio)

    # Portrait inputs keep the long edge vertical; landscape and square inputs
    # keep it horizontal. The target tuple is (width, height).
    if height > width:
        target = (new_short, new_long)
    else:
        target = (new_long, new_short)
    return image.resize(target)


def load_model(model_path):
    """Load the LLaVA-MPT captioner and its tokenizer, placing the model on CUDA.

    The image patch token (and, when the model config enables
    ``mm_use_im_start_end``, the image start/end tokens) are registered with
    the tokenizer, and their ids are written onto the vision tower's config.

    Args:
        model_path: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        ``(tokenizer, model, context_len)`` where ``context_len`` is
        ``model.config.max_sequence_length`` when present, else 2048.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = LlavaMPTForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )

    use_start_end = getattr(model.config, "mm_use_im_start_end", False)
    tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
    if use_start_end:
        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)

    vision_tower = model.get_model().vision_tower[0]
    if vision_tower.device.type != 'meta':
        vision_tower.to(device='cuda', dtype=torch.float16)
    else:
        # The tower is still on the meta device, so reload it from its own
        # checkpoint and put the loaded module back into the model.
        vision_tower = CLIPVisionModel.from_pretrained(
            vision_tower.config._name_or_path,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
        ).cuda()
        model.get_model().vision_tower[0] = vision_tower

    # Record the special-token ids on the vision config.
    vision_config = vision_tower.config
    vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
    vision_config.use_im_start_end = use_start_end
    if use_start_end:
        start_id, end_id = tokenizer.convert_tokens_to_ids(
            [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
        vision_config.im_start_token = start_id
        vision_config.im_end_token = end_id

    model.cuda()

    context_len = getattr(model.config, "max_sequence_length", 2048)
    return tokenizer, model, context_len


class SanitizedLaion(Dataset):
    """Image/prompt dataset feeding the LLaVA captioner.

    Each sample is a ``(pixel_values, prompt, output_prefix)`` triple, where
    ``output_prefix`` is the image's relative path without its extension.

    Index entries are accessed as mappings with a ``'path'`` key (relative to
    ``root_dir``) and, when ``caption`` is true, a ``'prompt'`` key.

    NOTE(review): a ``.txt`` index is read as raw lines (strings), which
    ``__getitem__`` cannot index by key; only ``.json`` indices appear usable
    as written. Confirm the intended ``.txt`` format before relying on it.

    Args:
        root_dir: Directory that the index's image paths are relative to.
        index_file: Path to a ``.txt`` or ``.json`` index.
        prompt: Prompt template. When ``caption`` is true it is formatted with
            the entry's existing caption; otherwise it is used verbatim.
        config: Model name/path whose config provides ``mm_vision_tower``.
        img_extension: Extension stripped from the image path to build the
            output prefix.
        caption: Whether to inject each entry's caption into the prompt.

    Raises:
        ValueError: If ``index_file`` is neither ``.txt`` nor ``.json``.
    """

    def __init__(self, root_dir, index_file, prompt, config, img_extension='.jpg', caption=True) -> None:
        super().__init__()
        self.root_dir = root_dir
        self.image_processor = CLIPImageProcessor.from_pretrained(AutoConfig.from_pretrained(config).mm_vision_tower, torch_dtype=torch.float16)
        self.prompt = prompt
        self.img_extension = img_extension
        self.caption = caption

        if '.txt' in index_file:
            with open(index_file, 'r') as f:
                self.lines = f.readlines()
        elif '.json' in index_file:
            with open(index_file, 'r') as f:
                self.lines = json.load(f)
        else:
            raise ValueError(f'{index_file} format not supported')

    def __len__(self):
        return len(self.lines)

    def _output_prefix(self, rel_path):
        """Return ``rel_path`` with a trailing ``img_extension`` removed.

        Only a true suffix is stripped, so a directory name that happens to
        contain the extension is left intact.
        """
        return rel_path.removesuffix(self.img_extension)

    def __getitem__(self, idx):
        item = self.lines[idx]
        # Only touch the caption when it is actually used, so caption-less
        # index entries work with caption=False.
        if self.caption:
            prompt = self.prompt.format(item['prompt'].strip())
        else:
            prompt = self.prompt
        with open(path.join(self.root_dir, item['path']), 'rb') as f:
            img = pad2square(Image.open(f).convert('RGB'))
        pixel_values = self.image_processor(img, return_tensors='pt')['pixel_values'].squeeze()
        return pixel_values, prompt, self._output_prefix(item['path'])


@torch.no_grad()
def caption(tokenizer, model, context_len, images, prompt, prefix):
    """Caption a batch of images, yielding each caption as soon as it finishes.

    Runs a hand-written decoding loop: one forward pass over the full
    (left-padded) prompts plus images, then one token per step using the
    cached key/values.

    Args:
        tokenizer: Tokenizer for ``model``; called on the list of prompts and
            used for ``pad_token_id``, ``eos_token_id`` and ``decode``.
        model: Causal LM called with ``images=`` on the first step and with
            ``past_key_values=`` afterwards.
        context_len: Context budget; prompts are truncated from the left to
            ``context_len - 1024 - 8`` tokens.
        images: Batch of image tensors; the first dimension is the batch size.
        prompt: One prompt string per image; every ``<image>`` placeholder is
            expanded to 256 patch tokens.
        prefix: One identifier per image, yielded alongside its caption.

    Yields:
        ``(caption_text, prefix[i])`` the first time row ``i`` samples EOS.
        Rows that never sample EOS within 1024 new tokens yield nothing.
    """
    images = images.to(model.device, dtype=torch.float16)
    # HACK: 256 is the max image token length hacked
    replace_token = DEFAULT_IMAGE_PATCH_TOKEN * 256
    if getattr(model.config, 'mm_use_im_start_end', False):
        replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN

    prompt = [p.replace(DEFAULT_IMAGE_TOKEN, replace_token) for p in prompt]

    temperature = 0.2
    max_new_tokens = 1024
    stop_str = '<|im_end|>'

    # Keep only the tail of over-long prompts, then left-pad to a common
    # length so the last position of every row is its final prompt token.
    max_src_len = context_len - max_new_tokens - 8
    input_ids = [ids[-max_src_len:] for ids in tokenizer(prompt).input_ids]
    longest = max(len(ids) for ids in input_ids)
    input_ids = [[tokenizer.pad_token_id] * (longest - len(ids)) + ids for ids in input_ids]

    pred_ids = torch.zeros([images.shape[0], 0], device=model.device, dtype=torch.long)
    past_key_values = None
    finish = [False] * images.shape[0]
    for i in tqdm(range(max_new_tokens), leave=False):
        if i == 0:
            # First pass over the whole prompt. No attention mask is passed
            # here, so the left-pad tokens are not masked out.
            out = model(
                torch.as_tensor(input_ids).cuda(),
                use_cache=True,
                images=images)
            del images
        else:
            # NOTE(review): the mask has batch dimension 1 and presumably
            # relies on broadcasting inside the model - confirm.
            attention_mask = torch.ones(1, past_key_values[0][0].shape[-2] + 1, device="cuda")
            out = model(input_ids=token,
                        use_cache=True,
                        attention_mask=attention_mask,
                        past_key_values=past_key_values)
        past_key_values = out.past_key_values
        last_token_logits = out.logits[:, -1]
        if temperature < 1e-4:
            # Greedy decoding: pick per row and keep the (batch, 1) shape that
            # the sampling branch produces, so the concat below works.
            token = torch.argmax(last_token_logits, dim=-1, keepdim=True)
        else:
            probs = torch.softmax(last_token_logits / temperature, dim=-1)
            token = torch.multinomial(probs, num_samples=1)

        pred_ids = torch.cat([pred_ids, token], dim=1)

        # Rows whose newest token is EOS are done; emit each of them once.
        for ii in torch.nonzero(token.cpu() == tokenizer.eos_token_id, as_tuple=True)[0]:
            ii = int(ii)
            if finish[ii]:
                continue
            # Drop the trailing EOS before decoding.
            output = tokenizer.decode(pred_ids[ii][:-1]).removesuffix(stop_str)
            finish[ii] = True
            yield output, prefix[ii]

        if all(finish):
            break


if __name__ == "__main__":
    # Command-line driver: caption every image listed in the index and write
    # one text file per image under the output directory.
    cli = argparse.ArgumentParser()
    cli.add_argument("--model-path", type=str, default="liuhaotian/LLaVA-Lightning-MPT-7B-preview")
    cli.add_argument("--data-root", type=str, required=True)
    cli.add_argument('--index', type=str, required=True)
    cli.add_argument('--output', type=str, required=True)
    args = cli.parse_args()

    # Template used when an existing caption is available; "{}" receives it.
    prompt = """<|im_start|>system
    - You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab.
    - You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.
    - You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user
    Given the caption of this image "{}", describe this image in a very detailed manner
    <image><|im_end|><|im_start|>assistant\n"""

    # Caption-free variant of the template (not used by the run below).
    prompt_nocap = """<|im_start|>system
    - You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab.
    - You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.
    - You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user
    Describe this image in a very detailed manner
    <image><|im_end|><|im_start|>assistant\n"""

    dataset = SanitizedLaion(args.data_root, args.index, prompt, args.model_path, img_extension='.png')
    loader = DataLoader(dataset, batch_size=32, pin_memory=True, num_workers=10)

    tokenizer, model, context_len = load_model(args.model_path)
    for batch in tqdm(loader):
        for text, stem in caption(tokenizer, model, context_len, *batch):
            # The output mirrors the image's relative path, with a .txt suffix.
            out_file = path.join(args.output, f'{stem}.txt')
            makedirs(path.dirname(out_file), exist_ok=True, mode=0o755)
            with open(out_file, 'w') as handle:
                handle.write(text)


================================================
FILE: PixArt-alpha-ToCa/tools/convert_pixart_alpha_to_diffusers.py
================================================
import argparse
import os

import torch
from transformers import T5EncoderModel, T5Tokenizer

from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, PixArtAlphaPipeline, Transformer2DModel


ckpt_id = "PixArt-alpha/PixArt-alpha"
# https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/scripts/inference.py#L125
interpolation_scale = {256: 0.5, 512: 1, 1024: 2}


def main(args):
    """Convert an original PixArt-alpha checkpoint to diffusers format and save it.

    Tensors are moved out of the checkpoint's ``state_dict`` under their
    diffusers names; fused qkv / kv projections are split into separate
    q, k, v tensors. After loading, the function asserts that nothing but
    ``pos_embed`` and ``y_embedder.y_embedding`` was left unconverted.

    Args:
        args: Parsed CLI namespace providing ``orig_ckpt_path``,
            ``image_size``, ``multi_scale_train``, ``only_transformer`` and
            ``dump_path``.
    """
    num_layers = 28

    checkpoint = torch.load(args.orig_ckpt_path, map_location='cpu')
    state_dict = checkpoint.pop("state_dict")
    converted_state_dict = {}

    def move(new_key, old_key):
        # Consume the source entry so leftovers can be detected at the end.
        converted_state_dict[new_key] = state_dict.pop(old_key)

    def move_linear(new_prefix, old_prefix):
        # A linear layer is a weight/bias pair under a common prefix.
        for suffix in ("weight", "bias"):
            move(f"{new_prefix}.{suffix}", f"{old_prefix}.{suffix}")

    def split_fused(old_prefix, new_prefixes):
        # Split a fused projection row-wise into one tensor per target name.
        for suffix in ("weight", "bias"):
            pieces = torch.chunk(state_dict.pop(f"{old_prefix}.{suffix}"), len(new_prefixes), dim=0)
            for new_prefix, piece in zip(new_prefixes, pieces):
                converted_state_dict[f"{new_prefix}.{suffix}"] = piece

    # Patch embeddings.
    move_linear("pos_embed.proj", "x_embedder.proj")

    # Caption projection.
    move_linear("caption_projection.linear_1", "y_embedder.y_proj.fc1")
    move_linear("caption_projection.linear_2", "y_embedder.y_proj.fc2")

    # AdaLN-single timestep embedder.
    move_linear("adaln_single.emb.timestep_embedder.linear_1", "t_embedder.mlp.0")
    move_linear("adaln_single.emb.timestep_embedder.linear_2", "t_embedder.mlp.2")

    if args.image_size == 1024 and args.multi_scale_train:
        # Resolution and aspect-ratio embedders, converted only in this mode.
        for new_name, old_name in (
            ("resolution_embedder", "csize_embedder"),
            ("aspect_ratio_embedder", "ar_embedder"),
        ):
            move_linear(f"adaln_single.emb.{new_name}.linear_1", f"{old_name}.mlp.0")
            move_linear(f"adaln_single.emb.{new_name}.linear_2", f"{old_name}.mlp.2")

    # Shared norm.
    move_linear("adaln_single.linear", "t_block.1")

    for depth in range(num_layers):
        src = f"blocks.{depth}"
        dst = f"transformer_blocks.{depth}"

        move(f"{dst}.scale_shift_table", f"{src}.scale_shift_table")

        # Self attention: fused qkv -> q, k, v, then the output projection.
        split_fused(f"{src}.attn.qkv", [f"{dst}.attn1.to_q", f"{dst}.attn1.to_k", f"{dst}.attn1.to_v"])
        move_linear(f"{dst}.attn1.to_out.0", f"{src}.attn.proj")

        # Feed-forward.
        move_linear(f"{dst}.ff.net.0.proj", f"{src}.mlp.fc1")
        move_linear(f"{dst}.ff.net.2", f"{src}.mlp.fc2")

        # Cross-attention: separate q, fused kv -> k, v, then the projection.
        move_linear(f"{dst}.attn2.to_q", f"{src}.cross_attn.q_linear")
        split_fused(f"{src}.cross_attn.kv_linear", [f"{dst}.attn2.to_k", f"{dst}.attn2.to_v"])
        move_linear(f"{dst}.attn2.to_out.0", f"{src}.cross_attn.proj")

    # Final block.
    move_linear("proj_out", "final_layer.linear")
    move("scale_shift_table", "final_layer.scale_shift_table")

    # DiT XL/2
    transformer = Transformer2DModel(
        sample_size=args.image_size // 8,
        num_layers=num_layers,
        attention_head_dim=72,
        in_channels=4,
        out_channels=8,
        patch_size=2,
        attention_bias=True,
        num_attention_heads=16,
        cross_attention_dim=1152,
        activation_fn="gelu-approximate",
        num_embeds_ada_norm=1000,
        norm_type="ada_norm_single",
        norm_elementwise_affine=False,
        norm_eps=1e-6,
        caption_channels=4096,
    )
    transformer.load_state_dict(converted_state_dict, strict=True)

    # These two source entries are dropped without being converted; anything
    # else still left in the source dict means a key was missed.
    assert transformer.pos_embed.pos_embed is not None
    state_dict.pop("pos_embed")
    state_dict.pop("y_embedder.y_embedding")
    assert len(state_dict) == 0, f"State dict is not empty, {state_dict.keys()}"

    num_model_params = sum(p.numel() for p in transformer.parameters())
    print(f"Total number of transformer parameters: {num_model_params}")

    if args.only_transformer:
        transformer.save_pretrained(os.path.join(args.dump_path, "transformer"))
        return

    # Full pipeline: pull the remaining components from the reference repo.
    scheduler = DPMSolverMultistepScheduler()
    vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="sd-vae-ft-ema")
    tokenizer = T5Tokenizer.from_pretrained(ckpt_id, subfolder="t5-v1_1-xxl")
    text_encoder = T5EncoderModel.from_pretrained(ckpt_id, subfolder="t5-v1_1-xxl")

    pipeline = PixArtAlphaPipeline(
        tokenizer=tokenizer, text_encoder=text_encoder, transformer=transformer, vae=vae, scheduler=scheduler
    )
    pipeline.save_pretrained(args.dump_path)


def str2bool(value):
    """Parse a command-line boolean.

    ``argparse`` with ``type=bool`` (or ``type=str`` followed by a truthiness
    test) treats every non-empty string, including ``"False"``, as true. This
    converter maps the usual spellings explicitly.

    Args:
        value: A ``bool`` (returned unchanged) or a string such as
            ``"true"``, ``"False"``, ``"1"``, ``"no"``. The empty string maps
            to ``False``, matching how it behaved before.

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the string is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in {"true", "t", "yes", "y", "1"}:
        return True
    if text in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # set multi_scale_train=True if using PixArtMS structure during training else set it to False
    parser.add_argument("--multi_scale_train", default=True, type=str2bool, required=True, help="If use Multi-Scale PixArtMS structure during training.")
    parser.add_argument("--orig_ckpt_path", default=None, type=str, required=False, help="Path to the checkpoint to convert.")
    parser.add_argument(
        "--image_size",
        default=1024,
        type=int,
        choices=[256, 512, 1024],
        required=False,
        help="Image size of pretrained model, one of 256, 512 or 1024.",
    )
    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output pipeline.")
    parser.add_argument("--only_transformer", default=True, type=str2bool, required=True)

    args = parser.parse_args()
    main(args)


================================================
FILE: PixArt-alpha-ToCa/tools/download.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Functions for downloading pre-trained PixArt models
"""
from torchvision.datasets.utils import download_url
import torch
import os
import argparse


pretrained_models = {'PixArt-XL-2-512x512.pth', 'PixArt-XL-2-1024-MS.pth'}
vae_models = {
    'sd-vae-ft-ema/config.json',
    'sd-vae-ft-ema/diffusion_pytorch_model.bin'
}
t5_models = {
    't5-v1_1-xxl/config.json', 't5-v1_1-xxl/pytorch_model-00001-of-00002.bin',
    't5-v1_1-xxl/pytorch_model-00002-of-00002.bin', 't5-v1_1-xxl/pytorch_model.bin.index.json',
    't5-v1_1-xxl/special_tokens_map.json', 't5-v1_1-xxl/spiece.model',
    't5-v1_1-xxl/tokenizer_config.json',
}


def find_model(model_name):
    """
    Load a PixArt checkpoint by name or by path.

    If ``model_name`` is one of the known entries in ``pretrained_models`` it is
    handed to ``download_model`` (which downloads it when missing). Otherwise it
    must be the path of an existing file, which is loaded with ``torch.load``
    onto the CPU storage it was saved from.
    """
    if model_name not in pretrained_models:
        # Not a known release name: treat the argument as a local checkpoint path.
        assert os.path.isfile(model_name), f'Could not find PixArt checkpoint at {model_name}'
        return torch.load(model_name, map_location=lambda storage, loc: storage)
    return download_model(model_name)


def download_model(model_name):
    """
    Fetch (if needed) and load one of the known PixArt checkpoints.

    The file is cached under ``output/pretrained_models``; it is downloaded
    from the PixArt-alpha Hugging Face repository only when no cached copy
    exists. Returns the object produced by ``torch.load``.
    """
    assert model_name in pretrained_models
    cache_dir = 'output/pretrained_models'
    local_path = f'{cache_dir}/{model_name}'
    already_cached = os.path.isfile(local_path)
    if not already_cached:
        os.makedirs(cache_dir, exist_ok=True)
        web_path = f'https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/{model_name}'
        download_url(web_path, cache_dir)
    # Keep every tensor on the storage it was deserialized to (no device remapping).
    return torch.load(local_path, map_location=lambda storage, loc: storage)


def download_other(model_name, model_zoo, output_dir):
    """
    Download an auxiliary model file (e.g. a VAE or T5 file) if it is not cached.

    Args:
        model_name: Relative file path inside the PixArt-alpha Hugging Face
            repository, e.g. ``'sd-vae-ft-ema/config.json'``.
        model_zoo: Collection of allowed names; ``model_name`` must be in it.
        output_dir: Local root directory; the file ends up at
            ``os.path.join(output_dir, model_name)``.

    Returns:
        None. Nothing is downloaded when the target file already exists.
    """
    assert model_name in model_zoo
    local_path = os.path.join(output_dir, model_name)
    if os.path.isfile(local_path):
        return

    # Download into the directory that actually contains ``local_path`` so the
    # existence check above matches where the file is written, also for names
    # without a sub-directory or with more than one level of nesting.
    target_dir = os.path.dirname(local_path) or '.'
    os.makedirs(target_dir, exist_ok=True)
    web_path = f'https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/{model_name}'
    print(web_path)
    download_url(web_path, target_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_names', nargs='+', type=str, default=pretrained_models)
    args = parser.parse_args()
    # De-duplicate the requested checkpoint names.
    model_names = set(args.model_names)

    # Fetch the T5 files, then the VAE files, then the requested PixArt checkpoints.
    t5_dir = 'output/pretrained_models/t5_ckpts'
    for t5_model in t5_models:
        download_other(t5_model, t5_models, t5_dir)

    vae_dir = 'output/pretrained_models/'
    for vae_model in vae_models:
        download_other(vae_model, vae_models, vae_dir)

    for model in model_names:
        download_model(model)
    print('Done.')


================================================
FILE: PixArt-alpha-ToCa/tools/extract_features.py
================================================
import os
from pathlib import Path
import sys
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
from PIL import Image
import torch
from torchvision import transforms as T
import numpy as np
import json
from tqdm import tqdm
import argparse
import threading
from queue import Queue
from pathlib import Path
from torch.utils.data import DataLoader, RandomSampler
from accelerate import Accelerator
from torchvision.transforms.functional import InterpolationMode
from torchvision.datasets.folder import default_loader

from diffusion.model.t5 import T5Embedder
from diffusers.models import AutoencoderKL
from diffusion.data.datasets.InternalData import InternalData
from diffusion.utils.misc import SimpleTimer
from diffusion.utils.data_sampler import AspectRatioBatchSampler
from diffusion.data.builder import DATASETS
from diffusion.data import ASPECT_RATIO_512, ASPECT_RATIO_1024


def get_closest_ratio(height: float, width: float, ratios: dict):
    """Pick the entry of ``ratios`` whose key is nearest to ``height / width``.

    Keys of ``ratios`` must be convertible with ``float``. Returns a pair
    ``(ratios[key], float(key))`` for the nearest key; on ties the key that
    comes first in the dict's iteration order wins.
    """
    target = height / width

    def distance(key):
        return abs(float(key) - target)

    best_key = min(ratios, key=distance)
    return ratios[best_key], float(best_key)


@DATASETS.register_module()
class DatasetMS(InternalData):
    """Image dataset that resizes/crops every image to its closest aspect-ratio bucket.

    Metadata is read from JSON files under ``<root>/partition``; entries whose
    ``'ratio'`` exceeds 4 are dropped. Image paths are built by replacing
    ``json_dir_name`` with ``img_dir_name`` in ``root`` and appending each
    entry's ``'path'``. ``InternalData.__init__`` is not invoked here.

    Args:
        root: Absolute path of the metadata root directory.
        image_list_json: JSON file name or list of names (default
            ``['data_info.json']``).
        transform: Optional callable applied to the loaded image before the
            aspect-ratio resize.
        resolution: Stored on the instance; not otherwise used in this class.
        load_vae_feat: Must be falsy; loading VAE features is not supported.
        aspect_ratio_type: ``ASPECT_RATIO_1024`` or ``ASPECT_RATIO_512``.
        start_index, end_index: Slice applied to the collected samples.
        **kwargs: Accepted and ignored.

    Raises:
        ValueError: If ``load_vae_feat`` is truthy.
    """

    def __init__(self, root, image_list_json=None, transform=None, resolution=1024, load_vae_feat=False, aspect_ratio_type=None, start_index=0, end_index=100000000, **kwargs):
        if image_list_json is None:
            image_list_json = ['data_info.json']
        assert os.path.isabs(root), 'root must be a absolute path'
        self.root = root
        self.img_dir_name = 'InternalImgs'        # need to change to according to your data structure
        self.json_dir_name = 'InternalData'        # need to change to according to your data structure
        self.transform = transform
        self.load_vae_feat = load_vae_feat
        self.resolution = resolution
        self.meta_data_clean = []
        self.img_samples = []
        self.txt_feat_samples = []
        self.aspect_ratio = aspect_ratio_type
        assert self.aspect_ratio in [ASPECT_RATIO_1024, ASPECT_RATIO_512]
        self.ratio_index = {float(k): [] for k in self.aspect_ratio}    # used for self.getitem
        self.ratio_nums = {float(k): 0 for k in self.aspect_ratio}      # used for batch-sampler

        image_list_json = image_list_json if isinstance(image_list_json, list) else [image_list_json]
        img_root = self.root.replace(self.json_dir_name, self.img_dir_name)
        for json_file in image_list_json:
            meta_data = self.load_json(os.path.join(self.root, 'partition', json_file))
            meta_data_clean = [item for item in meta_data if item['ratio'] <= 4]
            self.meta_data_clean.extend(meta_data_clean)
            self.img_samples.extend(os.path.join(img_root, item['path']) for item in meta_data_clean)

        # Slice images and metadata together so that index ``i`` refers to the
        # same sample in both lists (get_data_info and the size check rely on it).
        self.img_samples = self.img_samples[start_index: end_index]
        self.meta_data_clean = self.meta_data_clean[start_index: end_index]
        # scan the dataset for ratio static (only the first third of the entries is scanned)
        for i, info in enumerate(self.meta_data_clean[:len(self.meta_data_clean)//3]):
            ori_h, ori_w = info['height'], info['width']
            closest_size, closest_ratio = get_closest_ratio(ori_h, ori_w, self.aspect_ratio)
            self.ratio_nums[closest_ratio] += 1
            # Only the first index seen for each ratio is recorded.
            if len(self.ratio_index[closest_ratio]) == 0:
                self.ratio_index[closest_ratio].append(i)

        # Set loader and extensions
        if self.load_vae_feat:
            raise ValueError("No VAE loader here")
        self.loader = default_loader

    def __getitem__(self, idx):
        """Return ``(image, name)`` for ``idx``.

        A PIL image is resized and center-cropped to its closest aspect-ratio
        bucket and converted to a normalized tensor. ``name`` joins the last
        two path components of the image path with ``'_'``. If loading or
        checking a sample fails, the error is printed and a random other index
        is tried, up to 20 attempts.

        Raises:
            RuntimeError: If all 20 attempts fail.
        """
        for _ in range(20):
            try:
                img_path = self.img_samples[idx]
                img = self.loader(img_path)
                if self.transform:
                    img = self.transform(img)
                # Calculate closest aspect ratio and resize & crop image[w, h]
                if isinstance(img, Image.Image):
                    h, w = (img.size[1], img.size[0])
                    meta = self.meta_data_clean[idx]
                    # Parenthesized on purpose: ``assert h, w == (...)`` would only
                    # test ``h`` and use the comparison as the assertion message.
                    assert (h, w) == (meta['height'], meta['width']), \
                        f"image size {(h, w)} does not match metadata {(meta['height'], meta['width'])} for {img_path}"
                    closest_size, closest_ratio = get_closest_ratio(h, w, self.aspect_ratio)
                    closest_size = [int(x) for x in closest_size]
                    transform = T.Compose([
                        T.Lambda(lambda img: img.convert('RGB')),
                        T.Resize(closest_size, interpolation=InterpolationMode.BICUBIC),  # Image.BICUBIC
                        T.CenterCrop(closest_size),
                        T.ToTensor(),
                        T.Normalize([.5], [.5]),
                    ])
                    img = transform(img)
                # change the path according to your data structure
                return img, '_'.join(self.img_samples[idx].rsplit('/', 2)[-2:]) # change from 'serial-number-of-dir/serial-number-of-image.png' ---> 'serial-number-of-dir_serial-number-of-image.png'
            except Exception as e:
                # Best effort: report the bad sample and retry with a random one.
                print(f"Error details: {str(e)}")
                idx = np.random.randint(len(self))
        raise RuntimeError('Too many bad data.')

    def get_data_info(self, idx):
        """Return ``{'height': ..., 'width': ...}`` from the metadata entry at ``idx``."""
        data_info = self.meta_data_clean[idx]
        return {'height': data_info['height'], 'width': data_info['width']}


def extract_caption_t5_do(q):
    """Worker loop: drain ``q`` and run ``extract_caption_t5_job`` on every item.

    Items are taken with a non-blocking ``get`` so that a worker which loses
    the race for the last item exits instead of blocking forever (checking
    ``q.empty()`` and then calling a blocking ``q.get()`` is not atomic).
    ``q.task_done()`` is always called, so ``q.join()`` cannot hang because a
    job raised.

    Args:
        q: A ``queue.Queue`` of items accepted by ``extract_caption_t5_job``.
    """
    from queue import Empty  # only ``Queue`` is imported at file level

    while True:
        try:
            item = q.get_nowait()
        except Empty:
            return
        try:
            extract_caption_t5_job(item)
        except Exception as e:
            # Keep the worker alive; a failed item must not stall the whole queue.
            print(e)
        finally:
            q.task_done()


def extract_caption_t5_job(item):
    """Compute and save the T5 embedding of one caption.

    Reads the module-level ``t5`` embedder, ``t5_save_dir`` and ``mutex`` that
    ``extract_caption_t5`` sets up. The result is written to
    ``<t5_save_dir>/<stem of item['path']>.npz`` with the arrays
    ``caption_feature`` and ``attention_mask``; items whose output file
    already exists are skipped. Errors during embedding or saving are printed
    and the item is skipped.

    Args:
        item: Mapping with a ``'prompt'`` string and a ``'path'`` entry.
    """
    global mutex
    global t5
    global t5_save_dir

    with torch.no_grad():
        caption = item['prompt'].strip()
        if isinstance(caption, str):
            caption = [caption]

        save_path = os.path.join(t5_save_dir, Path(item['path']).stem)
        if os.path.exists(f"{save_path}.npz"):
            return
        try:
            # ``with`` releases the lock even when the embedder raises; a manual
            # acquire()/release() pair would leave it held and block all workers.
            with mutex:
                caption_emb, emb_mask = t5.get_text_embeddings(caption)
            emb_dict = {
                'caption_feature': caption_emb.float().cpu().data.numpy(),
                'attention_mask': emb_mask.cpu().data.numpy(),
            }
            np.savez_compressed(save_path, **emb_dict)
        except Exception as e:
            print(e)


def extract_caption_t5():
    """Extract T5 caption features for the entries of ``args.json_path``.

    Sets the module-level ``t5``, ``t5_save_dir`` and ``mutex`` used by
    ``extract_caption_t5_job``, queues the entries in
    ``[args.start_index, args.end_index)`` and processes them with 20 worker
    threads running ``extract_caption_t5_do``. Returns once every queued item
    has been marked done.
    """
    global t5
    global t5_save_dir
    # global images_extension
    t5 = T5Embedder(device="cuda", local_cache=True, cache_dir=f'{args.pretrained_models_dir}/t5_ckpts', model_max_length=120)
    t5_save_dir = args.t5_save_root
    os.makedirs(t5_save_dir, exist_ok=True)

    # Close the metadata file deterministically instead of leaking the handle.
    with open(args.json_path, 'r') as f:
        train_data_json = json.load(f)
    train_data = train_data_json[args.start_index: args.end_index]

    global mutex
    mutex = threading.Lock()
    jobs = Queue()

    for item in tqdm(train_data):
        jobs.put(item)

    for _ in range(20):
        worker = threading.Thread(target=extract_caption_t5_do, args=(jobs,))
        worker.start()

    jobs.join()


def extract_img_vae_do(q):
    """Worker loop: drain ``q`` and run ``extract_img_vae_job`` on every item.

    Uses a non-blocking ``get`` so a worker that loses the race for the last
    item returns instead of blocking forever, and always calls
    ``q.task_done()`` so ``q.join()`` cannot hang because a job raised.

    Args:
        q: A ``queue.Queue`` of items accepted by ``extract_img_vae_job``.
    """
    from queue import Empty  # only ``Queue`` is imported at file level

    while True:
        try:
            item = q.get_nowait()
        except Empty:
            return
        try:
            extract_img_vae_job(item)
        except Exception as e:
            # Keep the worker alive; a failed item must not stall the whole queue.
            print(e)
        finally:
            q.task_done()


def extract_img_vae_job(item):
    """Placeholder per-item VAE job: ignores ``item`` and does nothing."""
    return None


def extract_img_vae():
    """Encode every image listed in ``args.json_path`` with the VAE at one resolution.

    Reads the module-level ``args``, ``device`` and ``image_resize``. The
    unique ``'path'`` entries are sorted, sliced with
    ``[args.start_index, args.end_index)``, resized and center-cropped to
    ``image_resize`` and encoded. For each image the concatenated posterior
    mean and std are saved as
    ``<args.vae_save_root>/<image_resize>resolution/noflip/<stem>.npy``.
    Images whose output file already exists are skipped; per-image failures
    are printed together with the image name and do not stop the run.
    """
    vae = AutoencoderKL.from_pretrained(f'{args.pretrained_models_dir}/sd-vae-ft-ema').to(device)

    # Close the metadata file deterministically instead of leaking the handle.
    with open(args.json_path, 'r') as f:
        train_data_json = json.load(f)

    vae_save_root = f'{args.vae_save_root}/{image_resize}resolution'
    os.umask(0o000)  # file permission: 666; dir permission: 777
    os.makedirs(vae_save_root, exist_ok=True)

    vae_save_dir = os.path.join(vae_save_root, 'noflip')
    os.makedirs(vae_save_dir, exist_ok=True)

    # De-duplicate image names, then take the requested slice. An empty slice
    # is fine: the loop below simply does nothing.
    lines = sorted({item['path'] for item in train_data_json})
    lines = lines[args.start_index: args.end_index]

    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB')),
        T.Resize(image_resize),  # Image.BICUBIC
        T.CenterCrop(image_resize),
        T.ToTensor(),
        T.Normalize([.5], [.5]),
    ])

    for image_name in tqdm(lines):
        save_path = os.path.join(vae_save_dir, Path(image_name).stem)
        if os.path.exists(f"{save_path}.npy"):
            continue
        try:
            # The transform converts the image into a tensor, so the file can
            # be closed as soon as the transform has run.
            with Image.open(f'{args.dataset_root}/{image_name}') as raw_img:
                img = transform(raw_img).to(device)[None]

            with torch.no_grad():
                posterior = vae.encode(img).latent_dist
                z = torch.cat([posterior.mean, posterior.std], dim=1).detach().cpu().numpy().squeeze()

            np.save(save_path, z)
        except Exception as e:
            print(e)
            print(image_name)


def save_results(results, paths, signature, work_dir):
    """Save one batch of arrays as ``.npy`` files and record their relative paths.

    Each result is written to ``<work_dir>/<signature>/<name>.npy`` where
    ``<name>`` is the part of the corresponding path before its first ``'.'``.
    The relative paths are appended, one per line, to
    ``<work_dir>/VAE-<signature>.txt`` so that repeated calls (one per batch)
    accumulate a full listing.

    Args:
        results: Iterable of arrays to save.
        paths: Iterable of file names, aligned with ``results``.
        signature: Name of the sub-folder (and listing suffix).
        work_dir: Output root directory.

    Raises:
        FileExistsError: If a target ``.npy`` file already exists. The check is
            per file: checking the folder would fail on every item after the
            first, because the folder is created by this very function.
    """
    timer = SimpleTimer(len(results), log_interval=100, desc="Saving Results")
    # save to npy
    new_paths = []
    os.umask(0o000)  # file permission: 666; dir permission: 777
    new_folder = signature
    save_folder = os.path.join(work_dir, new_folder)
    os.makedirs(save_folder, exist_ok=True)
    for res, p in zip(results, paths):
        file_name = p.split('.')[0] + '.npy'
        target = os.path.join(save_folder, file_name)
        if os.path.exists(target):
            raise FileExistsError(f"{target} exists. BE careful not to overwrite your files. Comment this error raising for overwriting!!")
        new_paths.append(os.path.join(new_folder, file_name))
        np.save(target, res)
        timer.log()
    # save paths (append: this function runs once per batch)
    if new_paths:
        with open(os.path.join(work_dir, f"VAE-{signature}.txt"), 'a') as f:
            f.write('\n'.join(new_paths) + '\n')


def inference(vae, dataloader, signature, work_dir):
    """Encode every batch of ``dataloader`` with ``vae`` and store the latents.

    For each batch, element 0 is encoded (without gradients, under CUDA
    autocast) and the posterior mean and std are concatenated along dim 1;
    element 1 is passed as the list of output names to ``save_results``.
    """
    timer = SimpleTimer(len(dataloader), log_interval=100, desc="VAE-Inference")

    for batch in dataloader:
        images, names = batch[0], batch[1]
        with torch.no_grad(), torch.cuda.amp.autocast(enabled=True):
            latent_dist = vae.encode(images).latent_dist
            stats = torch.cat([latent_dist.mean, latent_dist.std], dim=1)
            results = stats.detach().cpu().numpy()
        save_results(results, names, signature=signature, work_dir=work_dir)
        timer.log()


def extract_img_vae_multiscale(bs=1):
    """Extract VAE features with aspect-ratio bucketing (multi-scale mode).

    Reads the module-level ``args``, ``device`` and ``image_resize`` (which
    must be 512 or 1024). Builds a ``DatasetMS`` with the matching
    aspect-ratio table, batches it with ``AspectRatioBatchSampler`` and writes
    the latents under ``args.vae_save_root`` using the signature ``'ms'``.

    Args:
        bs: Batch size handed to the aspect-ratio batch sampler.
    """

    assert image_resize in (512, 1024)
    out_root = os.path.abspath(args.vae_save_root)
    os.umask(0o000)  # file permission: 666; dir permission: 777
    os.makedirs(out_root, exist_ok=True)
    accelerator = Accelerator(mixed_precision='fp16')
    vae_dir = f'{args.pretrained_models_dir}/sd-vae-ft-ema'
    vae = AutoencoderKL.from_pretrained(vae_dir).to(device)

    signature = 'ms'

    if image_resize == 1024:
        ratio_table = ASPECT_RATIO_1024
    else:
        ratio_table = ASPECT_RATIO_512
    dataset = DatasetMS(
        args.dataset_root,
        image_list_json=[args.json_file],
        transform=None,
        sample_subset=None,
        aspect_ratio_type=ratio_table,
        start_index=args.start_index,
        end_index=args.end_index,
    )

    # Group samples into batches that share an aspect-ratio bucket.
    batch_sampler = AspectRatioBatchSampler(
        sampler=RandomSampler(dataset),
        dataset=dataset,
        batch_size=bs,
        aspect_ratios=dataset.aspect_ratio,
        ratio_nums=dataset.ratio_nums,
    )

    # Wrap in a DataLoader and let accelerate prepare it.
    dataloader = DataLoader(dataset, batch_sampler=batch_sampler, num_workers=13, pin_memory=True)
    dataloader = accelerator.prepare(dataloader, )

    inference(vae, dataloader, signature=signature, work_dir=out_root)
    accelerator.wait_for_everyone()

    print('done')


def get_args():
    """Parse the command-line options of the feature-extraction script.

    Returns the ``argparse.Namespace`` produced from ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--multi_scale", action='store_true', default=False, help="multi-scale feature extraction")
    parser.add_argument("--img_size", default=512, type=int, help="image scale for multi-scale feature extraction")

    # Integer slice bounds applied to the list of samples.
    for flag, default in (('--start_index', 0), ('--end_index', 1000000)):
        parser.add_argument(flag, default=default, type=int)

    # String options; a default of None means "no preset value".
    # '--json_file' is the one used for multi-scale (ms) VAE feature extraction.
    string_options = (
        ('--json_path', None),
        ('--t5_save_root', 'data/data_toy/caption_feature_wmask'),
        ('--vae_save_root', 'data/data_toy/img_vae_features'),
        ('--dataset_root', 'data/data_toy'),
        ('--pretrained_models_dir', 'output/pretrained_models'),
        ('--json_file', None),
    )
    for flag, default in string_options:
        parser.add_argument(flag, default=default, type=str)
    return parser.parse_args()


if __name__ == '__main__':

    # These three names are read as module-level globals by the extraction functions.
    args = get_args()
    image_resize = args.img_size
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Caption (T5) features are always extracted first.
    extract_caption_t5()

    # Then image (VAE) features, either single-resolution or multi-scale.
    if not args.multi_scale:
        print(f'Extracting Single Image Resolution {image_resize}')
        extract_img_vae()
    else:
        print(f'Extracting Multi-scale Image Resolution based on {image_resize}')
        extract_img_vae_multiscale(bs=1)    # recommend bs = 1 for AspectRatioBatchSampler

================================================
FILE: PixArt-alpha-ToCa/train.sh
================================================
# Launch train_scripts/train_controlnet.py with torch.distributed.launch:
# 3 processes, restricted to GPUs 5, 6 and 7, rendezvous on port 26662,
# using the PixArt_xl2_img1024_controlHed_Half config and --work-dir output/debug.
CUDA_VISIBLE_DEVICES=5,6,7 python -m torch.distributed.launch --nproc_per_node=3 \
    --master_port=26662 train_scripts/train_controlnet.py \
    configs/pixart_app_config/PixArt_xl2_img1024_controlHed_Half.py \
    --work-dir output/debug

================================================
FILE: PixArt-alpha-ToCa/train_latents.py
================================================
import os

import sys
import types
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import argparse
import datetime
import time
import warnings
warnings.filterwarnings("ignore")  # ignore warning
import torch
import torch.nn as nn
from accelerate import Accelerator, InitProcessGroupKwargs
from accelerate.utils import DistributedType
from diffusers.models import AutoencoderKL
from torch.utils.data import RandomSampler
from mmcv.runner import LogBuffer
from copy import deepcopy
from PIL import Image
import numpy as np

from diffusion import IDDPM
from diffusion.utils.checkpoint import save_checkpoint, load_checkpoint
from diffusion.utils.dist_utils import synchronize, get_world_size, clip_grad_norm_
from diffusion.data.builder import build_dataset, build_dataloader, set_data_root
from diffusion.model.builder import build_model
from diffusion.utils.logger import get_root_logger
from diffusion.utils.misc import set_random_seed, read_config, init_random_seed, DebugUnderflowOverflow
from diffusion.utils.optimizer import build_optimizer, auto_scale_lr
from diffusion.utils.lr_scheduler import build_lr_scheduler
from diffusion.utils.data_sampler import AspectRatioBatchSampler, BalancedAspectRatioBatchSampler

def set_fsdp_env():
    """Set the environment variables that configure accelerate for FSDP.

    Enables FSDP, selects transformer-based auto wrapping around
    ``PixArtBlock`` and backward-pre prefetching.
    """
    os.environ.update({
        "ACCELERATE_USE_FSDP": 'true',
        "FSDP_AUTO_WRAP_POLICY": 'TRANSFORMER_BASED_WRAP',
        "FSDP_BACKWARD_PREFETCH": 'BACKWARD_PRE',
        "FSDP_TRANSFORMER_CLS_TO_WRAP": 'PixArtBlock',
    })


def ema_update(model_dest: nn.Module, model_src: nn.Module, rate):
    """Blend the parameters of ``model_src`` into ``model_dest`` in place.

    Every parameter of ``model_dest`` becomes
    ``rate * dest + (1 - rate) * src``, where ``src`` is the parameter of the
    same name in ``model_src``. The two models must not share parameter
    objects.
    """
    source_params = dict(model_src.named_parameters())
    for name, target in model_dest.named_parameters():
        source = source_params[name]
        assert source is not target
        target.data.mul_(rate)
        target.data.add_((1 - rate) * source.data)


def train():
    """Run the training loop, with periodic checkpointing and validation sampling.

    Takes no arguments: it reads names that are not defined inside the
    function (``config``, ``args``, ``model``, ``model_ema``, ``optimizer``,
    ``lr_scheduler``, ``train_dataloader``, ``train_diffusion``,
    ``accelerator``, ``logger``, ``start_epoch``), so they are presumably set
    at module level by the script entry point before this is called.

    NOTE(review): the validation section also uses ``val_txt_embs``,
    ``latent_size_h``, ``latent_size_w``, ``device``, ``hw``, ``ar``, ``vae``,
    ``DPMS``, ``SASolverSampler``, ``make_grid`` and ``make_image_grid``; none
    of them is imported or defined in the part of the file visible here --
    confirm they exist before enabling validation.
    """
    if config.get('debug_nan', False):
        DebugUnderflowOverflow(model)
        logger.info('NaN debugger registered. Start to detect overflow during training.')
    time_start, last_tic = time.time(), time.time()
    log_buffer = LogBuffer()

    # Steps already covered by the epochs before start_epoch; used for ETA and tracker step offsets.
    start_step = start_epoch * len(train_dataloader)
    global_step = 0
    total_steps = len(train_dataloader) * config.num_epochs

    # load_vae_feat = getattr(train_dataloader.dataset, 'load_vae_feat', False)
    # Now you train the model
    for epoch in range(start_epoch + 1, config.num_epochs + 1):
        data_time_start= time.time()
        data_time_all = 0
        for step, batch in enumerate(train_dataloader):
            data_time_all += time.time() - data_time_start
            # if load_vae_feat:
            # batch[0] is used directly as the latent; presumably pre-computed VAE features -- TODO confirm
            z = batch[0]
            # else:
            #     with torch.no_grad():
            #         with torch.cuda.amp.autocast(enabled=config.mixed_precision == 'fp16'):
            #             posterior = vae.encode(batch[0]).latent_dist
            #             if config.sample_posterior:
            #                 z = posterior.sample()
            #             else:
            #                 z = posterior.mode()
            clean_images = z * config.scale_factor
            y = batch[1]
            y_mask = batch[2]
            data_info = batch[3]

            # Sample a random timestep for each image
            bs = clean_images.shape[0]
            timesteps = torch.randint(0, config.train_sampling_steps, (bs,), device=clean_images.device).long()
            grad_norm = None
            with accelerator.accumulate(model):
                # Predict the noise residual
                optimizer.zero_grad()
                loss_term = train_diffusion.training_losses(model, clean_images, timesteps, model_kwargs=dict(y=y, mask=y_mask, data_info=data_info))
                loss = loss_term['loss'].mean()
                accelerator.backward(loss)
                # Clip gradients only when accelerate reports that gradients are synchronized.
                if accelerator.sync_gradients:
                    grad_norm = accelerator.clip_grad_norm_(model.parameters(), config.gradient_clip)
                optimizer.step()
                lr_scheduler.step()
                # The EMA copy is updated under the same sync_gradients condition.
                if accelerator.sync_gradients:
                    ema_update(model_ema, model, config.ema_rate)

            lr = lr_scheduler.get_last_lr()[0]
            logs = {args.loss_report_name: accelerator.gather(loss).mean().item()}
            if grad_norm is not None:
                logs.update(grad_norm=accelerator.gather(grad_norm).mean().item())
            log_buffer.update(logs)

            # logging on terminal
            if (step + 1) % config.log_interval == 0 or (step + 1) == 1:
                # Both averages divide by log_interval, including for the very first step.
                t = (time.time() - last_tic) / config.log_interval
                t_d = data_time_all / config.log_interval
                avg_time = (time.time() - time_start) / (global_step + 1)
                eta = str(datetime.timedelta(seconds=int(avg_time * (total_steps - start_step - global_step - 1))))
                eta_epoch = str(datetime.timedelta(seconds=int(avg_time * (len(train_dataloader) - step - 1))))
                # avg_loss = sum(loss_buffer) / len(loss_buffer)
                log_buffer.average()
                # model.module.h / .w: assumes the model is wrapped so that .module exists -- TODO confirm
                info = f"Step/Epoch [{(epoch-1)*len(train_dataloader)+step+1}/{epoch}][{step + 1}/{len(train_dataloader)}]:total_eta: {eta}, " \
                       f"epoch_eta:{eta_epoch}, time_all:{t:.3f}, time_data:{t_d:.3f}, lr:{lr:.3e}, s:({model.module.h}, {model.module.w}), "
                info += ', '.join([f"{k}:{v:.4f}" for k, v in log_buffer.output.items()])
                logger.info(info)
                last_tic = time.time()
                log_buffer.clear()
                data_time_all = 0
            logs.update(lr=lr)
            accelerator.log(logs, step=global_step + start_step)

            global_step += 1
            data_time_start= time.time()

            synchronize()
            # Step-based checkpoint, written by the main process only.
            if accelerator.is_main_process:
                if ((epoch - 1) * len(train_dataloader) + step + 1) % config.save_model_steps == 0:
                    os.umask(0o000)
                    save_checkpoint(os.path.join(config.work_dir, 'checkpoints'),
                                    epoch=epoch,
                                    step=(epoch - 1) * len(train_dataloader) + step + 1,
                                    model=accelerator.unwrap_model(model),
                                    model_ema=accelerator.unwrap_model(model_ema),
                                    optimizer=optimizer,
                                    lr_scheduler=lr_scheduler
                                    )
            synchronize()

        synchronize()
        if accelerator.is_main_process:
            # Epoch-based checkpoint.
            # NOTE(review): this uses config.output_dir while the step-based checkpoint above
            # uses config.work_dir -- confirm both are set and that the difference is intended.
            if epoch % config.save_model_epochs == 0 or epoch == config.num_epochs:
                os.umask(0o000)
                save_checkpoint(os.path.join(config.output_dir, 'checkpoints'),
                                epoch=epoch,
                                step=(epoch - 1) * len(train_dataloader) + step + 1,
                                model=accelerator.unwrap_model(model),
                                model_ema=accelerator.unwrap_model(model_ema),
                                optimizer=optimizer,
                                lr_scheduler=lr_scheduler
                                )
            ########### EVAL ###################
            if epoch % config.save_image_epochs == 0 or epoch == config.num_epochs:
                if config.validation_prompts is not None:
                    logger.info("Running inference for collecting generated images...")

                    assert config.eval_sampler in ['iddpm', 'dpm-solver', 'sa-solver']
                    # Default number of sampling steps per sampler, used when config.eval_steps == -1.
                    sample_steps_dict = {'iddpm': 100, 'dpm-solver': 20, 'sa-solver': 25}
                    sample_steps = config.eval_steps if config.eval_steps != -1 else sample_steps_dict[config.eval_sampler]
                    # base_ratios = eval(f'ASPECT_RATIO_{config.image_size}_TEST')

                    eval_dir = os.path.join(config.output_dir, 'eval')
                    os.makedirs(eval_dir, exist_ok=True)
                    save_path = os.path.join(eval_dir, f'{epoch}_{global_step}.png')

                    model.eval()
                    images = []
                    # device = t5.device
                    for ip, prompt in enumerate(config.validation_prompts):
                        prompts = [prompt]
                        # prompts = []
                        # prompt_clean, _, hw, ar, custom_hw = prepare_prompt_ar(prompt, base_ratios, device=device, show=False)  # ar for aspect ratio
                        # if config.image_size == 1024:
                            # latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8)
                        # else:
                        #     hw = torch.tensor([[config.image_size, config.image_size]], dtype=torch.float, device=device).repeat(bs, 1)
                        #     ar = torch.tensor([[1.]], device=device).repeat(bs, 1)
                        #     latent_size_h, latent_size_w = latent_size, latent_size
                        # prompts.append(prompt_clean.strip())
                        # Unconditional ("null") caption embedding, repeated once per prompt.
                        null_y = model.module.y_embedder.y_embedding[None].repeat(len(prompts), 1, 1)[:, None]

                        with torch.no_grad():
                            # Text embeddings are looked up by prompt index instead of being computed here.
                            caption_embs, emb_masks, len_prompts = val_txt_embs[ip]
                            # caption_embs, emb_masks = t5.get_text_embeddings(prompts)
                            # caption_embs = caption_embs.float()[:, None]
                            print(f'finish embedding')
                            n = len_prompts
                            if config.eval_sampler == 'iddpm':
                                # Create sampling noise:
                                z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device).repeat(2, 1, 1, 1)
                                model_kwargs = dict(y=torch.cat([caption_embs, null_y]),
                                                    cfg_scale=config.cfg_scale, data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks)
                                diffusion = IDDPM(str(sample_steps))
                                # Sample images:
                                samples = diffusion.p_sample_loop(
                                    model.module.forward_with_cfg, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True,
                                    device=device
                                )
                                samples, _ = samples.chunk(2, dim=0)  # Remove null class samples
                            elif config.eval_sampler == 'dpm-solver':
                                # Create sampling noise:
                                z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device)
                                model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks)
                                dpm_solver = DPMS(model.module.forward_with_dpmsolver,
                                                condition=caption_embs,
                                                uncondition=null_y,
                                                cfg_scale=config.cfg_scale,
                                                model_kwargs=model_kwargs)
                                samples = dpm_solver.sample(
                                    z,
                                    steps=sample_steps,
                                    order=2,
                                    skip_type="time_uniform",
                                    method="multistep",
                                )
                            elif config.eval_sampler == 'sa-solver':
                                # Create sampling noise:
                                # NOTE(review): S is hard-coded to 25 here, so config.eval_steps /
                                # sample_steps is not used for this sampler -- confirm this is intended.
                                model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=emb_masks)
                                sa_solver = SASolverSampler(model.module.forward_with_dpmsolver, device=device)
                                samples = sa_solver.sample(
                                    S=25,
                                    batch_size=n,
                                    shape=(4, latent_size_h, latent_size_w),
                                    eta=1,
                                    conditioning=caption_embs,
                                    unconditional_conditioning=null_y,
                                    unconditional_guidance_scale=config.cfg_scale,
                                    model_kwargs=model_kwargs,
                                )[0]
                        # NOTE(review): latents are divided by the literal 0.18215 here, while training
                        # multiplies by config.scale_factor -- confirm the two agree.
                        samples = vae.decode(samples / 0.18215).sample
                        # decode image
                        image = make_grid(samples, nrow=1, normalize=True, value_range=(-1, 1))
                        image = image.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()
                        image = Image.fromarray(image)
                        images.append(image)

                    # Combine the per-prompt images into one grid with 2 rows and len(images)//2 columns.
                    image_grid = make_image_grid(images, 2, len(images)//2)
                    image_grid.save(save_path)
                    for tracker in accelerator.trackers:
                        if tracker.name == "tensorboard":
                            np_images = np.stack([np.asarray(img) for img in images])
                            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
                        elif tracker.name == "comet_ml":
                            logger.info('Logging validation images')
                            tracker.writer.log_image(image_grid, name=f"{epoch}", step=global_step)
                        else:
                            logger.warn(f"image logging not implemented for {tracker.name}")

                    # Drop references to the validation outputs before emptying the CUDA cache.
                    del images, image, samples, image_grid
                    torch.cuda.empty_cache()

        # Back to training mode for the next epoch (validation may have switched to eval mode).
        model.train()
        synchronize()


def parse_args():
    """Parse this training script's command line (``sys.argv``).

    Returns:
        argparse.Namespace: the positional ``config`` path plus the optional
        run, checkpoint and tracker settings declared below.
    """
    cli = argparse.ArgumentParser(description="Process some integers.")

    # Config file and where to run / save / resume.
    cli.add_argument("config", type=str, help="config")
    cli.add_argument("--cloud", action='store_true', default=False, help="cloud or local machine")
    cli.add_argument('--work-dir', help='the dir to save logs and models')
    cli.add_argument('--resume-from', help='the dir to resume the training')
    cli.add_argument('--load-from', default=None, help='the dir to load a ckpt for training')

    # Both spellings are accepted and share the ``local_rank`` destination.
    for rank_flag in ('--local-rank', '--local_rank'):
        cli.add_argument(rank_flag, type=int, default=-1)
    cli.add_argument('--debug', action='store_true')

    # Experiment-tracking options.
    report_to_help = (
        'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
        ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
    )
    cli.add_argument("--report_to", type=str, default="tensorboard", help=report_to_help)

    project_name_help = (
        "The `project_name` argument passed to Accelerator.init_trackers for"
        " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
    )
    cli.add_argument("--tracker_project_name", type=str, default="text2image-fine-tune", help=project_name_help)
    cli.add_argument("--loss_report_name", type=str, default="loss")

    return cli.parse_args()


if __name__ == '__main__':
    # Script entry point: read the config, apply CLI overrides, build the
    # accelerator, model, data pipeline and optimizer as module-level names,
    # then hand over to train(), which reads those names as globals.
    args = parse_args()
    config = read_config(args.config)
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        config.work_dir = args.work_dir
    if args.cloud:
        config.data_root = '/data/data'
    if args.resume_from is not None:
        # Resuming takes precedence over loading a plain checkpoint.
        config.load_from = None
        config.resume_from = dict(
            checkpoint=args.resume_from,
            load_ema=False,
            resume_optimizer=True,
            resume_lr_scheduler=True)
    if args.debug:
        config.log_interval = 1
        config.train_batch_size = 8
        config.valid_num = 100

    os.umask(0o000)
    # The run directory name encodes the main hyper-parameters of this run.
    config.output_dir = os.path.join(config.work_dir,
                                     f"""{config.model}_{config.dataset_alias}_{config.image_size}_batch{config.train_batch_size}_{config.lr_schedule}_lr{config.optimizer['lr']}_warmup{config.lr_schedule_args['num_warmup_steps']}_gas{config.gradient_accumulation_steps}""")
    os.makedirs(config.output_dir, exist_ok=True)

    init_handler = InitProcessGroupKwargs()
    init_handler.timeout = datetime.timedelta(seconds=5400)  # change timeout to avoid a strange NCCL bug
    # Initialize accelerator and tensorboard logging
    if config.use_fsdp:
        init_train = 'FSDP'
        from accelerate import FullyShardedDataParallelPlugin
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
        set_fsdp_env()
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),)
    else:
        init_train = 'DDP'
        fsdp_plugin = None

    # Must be a plain bool. A trailing comma here would bind the tuple
    # ``(False,)``, which is truthy and would silently keep even batches on.
    even_batches = True
    if config.multi_scale:
        even_batches = False

    if args.report_to == "comet_ml":
        import comet_ml
        comet_ml.init(
            project_name=args.tracker_project_name,
        )

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with=args.report_to,
        project_dir=os.path.join(config.output_dir, "logs"),
        fsdp_plugin=fsdp_plugin,
        even_batches=even_batches,
        kwargs_handlers=[init_handler]
    )

    logger = get_root_logger(os.path.join(config.output_dir, 'train_log.log'))

    config.seed = init_random_seed(config.get('seed', None))
    set_random_seed(config.seed)

    if accelerator.is_main_process:
        config.dump(os.path.join(config.output_dir, 'config.py'))

    logger.info(f"Config: \n{config.pretty_text}")
    logger.info(f"World_size: {get_world_size()}, seed: {config.seed}")
    logger.info(f"Initializing: {init_train} for training")
    image_size = config.image_size  # @param [256, 512]
    latent_size = int(image_size) // 8
    pred_sigma = getattr(config, 'pred_sigma', True)
    learn_sigma = getattr(config, 'learn_sigma', True) and pred_sigma
    model_kwargs={"window_block_indexes": config.window_block_indexes, "window_size": config.window_size,
                  "use_rel_pos": config.use_rel_pos, "lewei_scale": config.lewei_scale, 'config':config,
                  'model_max_length': config.model_max_length}

    if config.validation_prompts is not None:
        # Embed the validation prompts once up front so the T5 encoder can be
        # released before the diffusion model is built.
        logger.info('Precompute validation prompt embeddings')
        from diffusion.model.utils import prepare_prompt_ar
        from diffusion import IDDPM, DPMS, SASolverSampler
        from diffusion.model.t5 import T5Embedder
        from diffusion.data.datasets import ASPECT_RATIO_256_TEST, ASPECT_RATIO_512_TEST, ASPECT_RATIO_1024_TEST
        from diffusers.utils import  make_image_grid
        from torchvision.utils import make_grid

        t5 = T5Embedder(device="cuda", local_cache=True, cache_dir='output/pretrained_models/t5_ckpts', torch_dtype=torch.float)
        device = t5.device
        # NOTE(review): eval() resolves one of the ASPECT_RATIO_*_TEST tables
        # imported above from config.image_size; this assumes the config file
        # is trusted local input.
        base_ratios = eval(f'ASPECT_RATIO_{config.image_size}_TEST')
        pbs = 1
        val_txt_embs = []
        for prompt in config.validation_prompts:
            prompts = []
            prompt_clean, _, hw, ar, custom_hw = prepare_prompt_ar(prompt, base_ratios, device=device, show=False)  # ar for aspect ratio
            if config.image_size == 1024:
                latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8)
            else:
                # Other resolutions use a fixed square size and aspect ratio 1.
                hw = torch.tensor([[config.image_size, config.image_size]], dtype=torch.float, device=device).repeat(pbs, 1)
                ar = torch.tensor([[1.]], device=device).repeat(pbs, 1)
                latent_size_h, latent_size_w = latent_size, latent_size
            prompts.append(prompt_clean.strip())

            with torch.no_grad():
                caption_embs, emb_masks = t5.get_text_embeddings(prompts)
                caption_embs = caption_embs.float()[:, None]
                val_txt_embs.append([caption_embs, emb_masks, len(prompts)])
        # Drop the text encoder and return its memory before building the model.
        del t5
        import gc         # garbage collect library
        gc.collect()
        torch.cuda.empty_cache()
        logger.info('[ DONE ]')

    # build models
    train_diffusion = IDDPM(str(config.train_sampling_steps), learn_sigma=learn_sigma, pred_sigma=pred_sigma, snr=config.snr_loss)
    model = build_model(config.model,
                        config.grad_checkpointing,
                        config.get('fp32_attention', False),
                        input_size=latent_size,
                        learn_sigma=learn_sigma,
                        pred_sigma=pred_sigma,
                        **model_kwargs).train()
    logger.info(f"{model.__class__.__name__} Model Parameters: {sum(p.numel() for p in model.parameters()):,}")
    logger.info(f"T5 max token length: {config.model_max_length}")
    model_ema = deepcopy(model).eval()

    if config.load_from is not None:
        if args.load_from is not None:
            config.load_from = args.load_from
        missing, unexpected = load_checkpoint(config.load_from, model, load_ema=config.get('load_ema', False))
        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')

    # rate=0 copies the (possibly just loaded) weights into the EMA model.
    ema_update(model_ema, model, 0.)
    if not config.data.load_vae_feat:
        vae = AutoencoderKL.from_pretrained(config.vae_pretrained).cuda()

    # prepare for FSDP clip grad norm calculation
    if accelerator.distributed_type == DistributedType.FSDP:
        for m in accelerator._models:
            m.clip_grad_norm_ = types.MethodType(clip_grad_norm_, m)

    # build dataloader
    set_data_root(config.data_root)
    dataset = build_dataset(config.data, resolution=image_size, aspect_ratio_type=config.aspect_ratio_type)
    if config.multi_scale:
        batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
                                                batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio, drop_last=True,
                                                ratio_nums=dataset.ratio_nums, config=config, valid_num=config.valid_num)
        # used for balanced sampling
        # batch_sampler = BalancedAspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
        #                                                 batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio,
        #                                                 ratio_nums=dataset.ratio_nums)
        train_dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=config.num_workers)
    else:
        logger.info(f'Batch size {config.train_batch_size}')
        train_dataloader = build_dataloader(dataset, num_workers=config.num_workers, batch_size=config.train_batch_size, shuffle=True)

    # build optimizer and lr scheduler
    lr_scale_ratio = 1
    if config.get('auto_lr', None):
        lr_scale_ratio = auto_scale_lr(config.train_batch_size * get_world_size() * config.gradient_accumulation_steps,
                                       config.optimizer, **config.auto_lr)
    optimizer = build_optimizer(model, config.optimizer)
    lr_scheduler = build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    if accelerator.is_main_process:
        tracker_config = dict(vars(config))
        accelerator.init_trackers(args.tracker_project_name, tracker_config)
        # Tag the run only on a comet_ml tracker if one is active; looking it
        # up unconditionally fails when reporting to another backend such as
        # the default tensorboard.
        for tracker in accelerator.trackers:
            if tracker.name == "comet_ml":
                tracker.writer.add_tags([config.model,
                                         config.dataset_alias,
                                         config.image_size,
                                         config.lr_schedule,
                                         f'bs{config.train_batch_size}',
                                         f'gs{config.gradient_accumulation_steps}'
                                         ])

    start_epoch = 0
    if config.resume_from is not None and config.resume_from['checkpoint'] is not None:
        start_epoch, missing, unexpected = load_checkpoint(**config.resume_from,
                                                           model=model,
                                                           model_ema=model_ema,
                                                           optimizer=optimizer,
                                                           lr_scheduler=lr_scheduler,
                                                           )

        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')
    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model, model_ema = accelerator.prepare(model, model_ema)
    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
    train()


================================================
FILE: PixArt-alpha-ToCa/train_scripts/train.py
================================================
import argparse
import datetime
import os
import sys
import time
import types
import warnings
from copy import deepcopy
from pathlib import Path

# Put the directory two levels above this file on sys.path *before* the
# project-local ``diffusion`` imports below, so they resolve when the script is
# launched directly. Inserting the path after those imports has no effect on
# them.
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))

import torch
import torch.nn as nn
from accelerate import Accelerator, InitProcessGroupKwargs
from accelerate.utils import DistributedType
from diffusers.models import AutoencoderKL
from mmcv.runner import LogBuffer
from torch.utils.data import RandomSampler

from diffusion import IDDPM
from diffusion.data.builder import build_dataset, build_dataloader, set_data_root
from diffusion.model.builder import build_model
from diffusion.utils.checkpoint import save_checkpoint, load_checkpoint
from diffusion.utils.data_sampler import AspectRatioBatchSampler, BalancedAspectRatioBatchSampler
from diffusion.utils.dist_utils import get_world_size, clip_grad_norm_
from diffusion.utils.logger import get_root_logger
from diffusion.utils.lr_scheduler import build_lr_scheduler
from diffusion.utils.misc import set_random_seed, read_config, init_random_seed, DebugUnderflowOverflow
from diffusion.utils.optimizer import build_optimizer, auto_scale_lr

warnings.filterwarnings("ignore")  # ignore warning


def set_fsdp_env():
    """Export the FSDP-related environment variables into ``os.environ``.

    Sets the use-FSDP switch, the auto-wrap policy, the backward-prefetch mode
    and the transformer class name to wrap.
    """
    fsdp_settings = {
        "ACCELERATE_USE_FSDP": 'true',
        "FSDP_AUTO_WRAP_POLICY": 'TRANSFORMER_BASED_WRAP',
        "FSDP_BACKWARD_PREFETCH": 'BACKWARD_PRE',
        "FSDP_TRANSFORMER_CLS_TO_WRAP": 'PixArtBlock',
    }
    os.environ.update(fsdp_settings)


def ema_update(model_dest: nn.Module, model_src: nn.Module, rate):
    """Blend the parameters of ``model_src`` into ``model_dest`` in place.

    Every parameter of ``model_dest`` becomes
    ``rate * dest + (1 - rate) * src``, with parameters matched by name.
    ``rate=0`` therefore copies the source weights.

    Args:
        model_dest: module whose parameters are overwritten (the EMA copy).
        model_src: module supplying the new values.
        rate: weight kept from the current destination value.

    Raises:
        KeyError: if a destination parameter name is missing from the source.
    """
    source_params = {name: param for name, param in model_src.named_parameters()}
    for name, target in model_dest.named_parameters():
        source = source_params[name]
        # The two models must not share parameter tensors.
        assert source is not target
        contribution = (1 - rate) * source.data
        target.data.mul_(rate)
        target.data.add_(contribution)

def train():
    """Run the training loop using the objects created in the ``__main__`` section.

    Reads module-level names rather than parameters: ``config``, ``args``,
    ``accelerator``, ``model``, ``model_ema``, ``optimizer``, ``lr_scheduler``,
    ``train_dataloader``, ``train_diffusion``, ``logger``, ``start_epoch`` and,
    when latents are not pre-extracted, ``vae``.

    For every batch it obtains a latent, samples random timesteps, takes one
    (possibly accumulated) optimizer step, updates the EMA model on
    gradient-sync steps, logs metrics, and saves checkpoints every
    ``config.save_model_steps`` steps and every ``config.save_model_epochs``
    epochs (plus the final epoch).
    """
    if config.get('debug_nan', False):
        DebugUnderflowOverflow(model)
        logger.info('NaN debugger registered. Start to detect overflow during training.')
    time_start, last_tic = time.time(), time.time()
    log_buffer = LogBuffer()

    # Steps attributed to epochs finished before a resume; used for the ETA
    # and as the offset of the tracker step.
    start_step = start_epoch * len(train_dataloader)
    global_step = 0
    total_steps = len(train_dataloader) * config.num_epochs

    load_vae_feat = getattr(train_dataloader.dataset, 'load_vae_feat', False)
    # Now you train the model
    for epoch in range(start_epoch + 1, config.num_epochs + 1):
        data_time_start = time.time()
        data_time_all = 0
        for step, batch in enumerate(train_dataloader):
            data_time_all += time.time() - data_time_start
            if load_vae_feat:
                # batch[0] is used directly as the latent.
                z = batch[0]
            else:
                # Encode batch[0] with the VAE; no gradients flow into it.
                with torch.no_grad():
                    with torch.cuda.amp.autocast(enabled=config.mixed_precision == 'fp16'):
                        posterior = vae.encode(batch[0]).latent_dist
                        if config.sample_posterior:
                            z = posterior.sample()
                        else:
                            z = posterior.mode()
            clean_images = z * config.scale_factor
            y = batch[1]
            y_mask = batch[2]
            data_info = batch[3]

            # Sample a random timestep for each image
            bs = clean_images.shape[0]
            timesteps = torch.randint(0, config.train_sampling_steps, (bs,), device=clean_images.device).long()
            grad_norm = None
            with accelerator.accumulate(model):
                # Predict the noise residual
                loss_term = train_diffusion.training_losses(model, clean_images, timesteps, model_kwargs=dict(y=y, mask=y_mask, data_info=data_info))
                loss = loss_term['loss'].mean()
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    grad_norm = accelerator.clip_grad_norm_(model.parameters(), config.gradient_clip)
                optimizer.step()
                lr_scheduler.step()
                # Clear gradients only after the optimizer step. The prepared
                # optimizer skips zero_grad() while gradients are still being
                # accumulated, so calling it before backward() would wipe the
                # accumulated gradients on exactly the step that applies them.
                optimizer.zero_grad()
                if accelerator.sync_gradients:
                    ema_update(model_ema, model, config.ema_rate)

            lr = lr_scheduler.get_last_lr()[0]
            logs = {args.loss_report_name: accelerator.gather(loss).mean().item()}
            if grad_norm is not None:
                logs.update(grad_norm=accelerator.gather(grad_norm).mean().item())
            log_buffer.update(logs)
            if (step + 1) % config.log_interval == 0 or (step + 1) == 1:
                t = (time.time() - last_tic) / config.log_interval
                t_d = data_time_all / config.log_interval
                avg_time = (time.time() - time_start) / (global_step + 1)
                eta = str(datetime.timedelta(seconds=int(avg_time * (total_steps - start_step - global_step - 1))))
                eta_epoch = str(datetime.timedelta(seconds=int(avg_time * (len(train_dataloader) - step - 1))))
                # avg_loss = sum(loss_buffer) / len(loss_buffer)
                log_buffer.average()
                # The prepared model only has a ``.module`` attribute when it
                # has been wrapped; fall back to the model itself otherwise.
                net = getattr(model, 'module', model)
                info = f"Step/Epoch [{(epoch-1)*len(train_dataloader)+step+1}/{epoch}][{step + 1}/{len(train_dataloader)}]:total_eta: {eta}, " \
                       f"epoch_eta:{eta_epoch}, time_all:{t:.3f}, time_data:{t_d:.3f}, lr:{lr:.3e}, s:({net.h}, {net.w}), "
                info += ', '.join([f"{k}:{v:.4f}" for k, v in log_buffer.output.items()])
                logger.info(info)
                last_tic = time.time()
                log_buffer.clear()
                data_time_all = 0
            logs.update(lr=lr)
            accelerator.log(logs, step=global_step + start_step)

            global_step += 1
            data_time_start = time.time()

            # Step-based checkpoint, keyed on the absolute step number.
            if ((epoch - 1) * len(train_dataloader) + step + 1) % config.save_model_steps == 0:
                accelerator.wait_for_everyone()
                if accelerator.is_main_process:
                    os.umask(0o000)
                    save_checkpoint(os.path.join(config.work_dir, 'checkpoints'),
                                    epoch=epoch,
                                    step=(epoch - 1) * len(train_dataloader) + step + 1,
                                    model=accelerator.unwrap_model(model),
                                    model_ema=accelerator.unwrap_model(model_ema),
                                    optimizer=optimizer,
                                    lr_scheduler=lr_scheduler
                                    )

        # Epoch-based checkpoint; the last epoch is always saved.
        if epoch % config.save_model_epochs == 0 or epoch == config.num_epochs:
            accelerator.wait_for_everyone()
            if accelerator.is_main_process:
                os.umask(0o000)
                save_checkpoint(os.path.join(config.work_dir, 'checkpoints'),
                                epoch=epoch,
                                step=(epoch - 1) * len(train_dataloader) + step + 1,
                                model=accelerator.unwrap_model(model),
                                model_ema=accelerator.unwrap_model(model_ema),
                                optimizer=optimizer,
                                lr_scheduler=lr_scheduler
                                )


def parse_args():
    """Parse the command line of the training script from ``sys.argv``.

    Returns:
        argparse.Namespace: the positional ``config`` path together with the
        optional run, checkpoint and tracker settings declared below.
    """
    arg_parser = argparse.ArgumentParser(description="Process some integers.")
    add = arg_parser.add_argument

    # Config file, run location and checkpoint sources.
    add("config", type=str, help="config")
    add("--cloud", action='store_true', default=False, help="cloud or local machine")
    add('--work-dir', help='the dir to save logs and models')
    add('--resume-from', help='the dir to resume the training')
    add('--load-from', default=None, help='the dir to load a ckpt for training')

    # Dash and underscore spellings both write to ``local_rank``.
    add('--local-rank', type=int, default=-1)
    add('--local_rank', type=int, default=-1)
    add('--debug', action='store_true')

    # Experiment-tracking options.
    add(
        "--report_to",
        type=str,
        default="tensorboard",
        help='The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
             ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.',
    )
    add(
        "--tracker_project_name",
        type=str,
        default="text2image-fine-tune",
        help="The `project_name` argument passed to Accelerator.init_trackers for"
             " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator",
    )
    add("--loss_report_name", type=str, default="loss")

    parsed = arg_parser.parse_args()
    return parsed


if __name__ == '__main__':
    # Script entry point: read the config, apply CLI overrides, build the
    # accelerator, model, data pipeline and optimizer as module-level names,
    # then hand over to train(), which reads those names as globals.
    args = parse_args()
    config = read_config(args.config)
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        config.work_dir = args.work_dir
    if args.cloud:
        config.data_root = '/data/data'
    if args.resume_from is not None:
        # Resuming takes precedence over loading a plain checkpoint.
        config.load_from = None
        config.resume_from = dict(
            checkpoint=args.resume_from,
            load_ema=False,
            resume_optimizer=True,
            resume_lr_scheduler=True)
    if args.debug:
        config.log_interval = 1
        config.train_batch_size = 8
        config.valid_num = 100

    os.umask(0o000)
    os.makedirs(config.work_dir, exist_ok=True)

    init_handler = InitProcessGroupKwargs()
    init_handler.timeout = datetime.timedelta(seconds=5400)  # change timeout to avoid a strange NCCL bug
    # Initialize accelerator and tensorboard logging
    if config.use_fsdp:
        init_train = 'FSDP'
        from accelerate import FullyShardedDataParallelPlugin
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
        set_fsdp_env()
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),)
    else:
        init_train = 'DDP'
        fsdp_plugin = None

    # Must be a plain bool. A trailing comma here would bind the tuple
    # ``(False,)``, which is truthy and would silently keep even batches on.
    even_batches = True
    if config.multi_scale:
        even_batches = False

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with=args.report_to,
        project_dir=os.path.join(config.work_dir, "logs"),
        fsdp_plugin=fsdp_plugin,
        even_batches=even_batches,
        kwargs_handlers=[init_handler]
    )

    logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))

    config.seed = init_random_seed(config.get('seed', None))
    set_random_seed(config.seed)

    if accelerator.is_main_process:
        config.dump(os.path.join(config.work_dir, 'config.py'))

    logger.info(f"Config: \n{config.pretty_text}")
    logger.info(f"World_size: {get_world_size()}, seed: {config.seed}")
    logger.info(f"Initializing: {init_train} for training")
    image_size = config.image_size  # @param [256, 512, 1024]
    latent_size = int(image_size) // 8
    pred_sigma = getattr(config, 'pred_sigma', True)
    learn_sigma = getattr(config, 'learn_sigma', True) and pred_sigma
    model_kwargs={"window_block_indexes": config.window_block_indexes, "window_size": config.window_size,
                  "use_rel_pos": config.use_rel_pos, "lewei_scale": config.lewei_scale, 'config':config,
                  'model_max_length': config.model_max_length}

    # build models
    train_diffusion = IDDPM(str(config.train_sampling_steps), learn_sigma=learn_sigma, pred_sigma=pred_sigma, snr=config.snr_loss)
    model = build_model(config.model,
                        config.grad_checkpointing,
                        config.get('fp32_attention', False),
                        input_size=latent_size,
                        learn_sigma=learn_sigma,
                        pred_sigma=pred_sigma,
                        **model_kwargs).train()
    logger.info(f"{model.__class__.__name__} Model Parameters: {sum(p.numel() for p in model.parameters()):,}")
    model_ema = deepcopy(model).eval()

    if config.load_from is not None:
        if args.load_from is not None:
            config.load_from = args.load_from
        missing, unexpected = load_checkpoint(config.load_from, model, load_ema=config.get('load_ema', False))
        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')

    # rate=0 copies the (possibly just loaded) weights into the EMA model.
    ema_update(model_ema, model, 0.)
    if not config.data.load_vae_feat:
        vae = AutoencoderKL.from_pretrained(config.vae_pretrained).cuda()

    # prepare for FSDP clip grad norm calculation
    if accelerator.distributed_type == DistributedType.FSDP:
        for m in accelerator._models:
            m.clip_grad_norm_ = types.MethodType(clip_grad_norm_, m)

    # build dataloader
    set_data_root(config.data_root)
    dataset = build_dataset(config.data, resolution=image_size, aspect_ratio_type=config.aspect_ratio_type)
    if config.multi_scale:
        batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
                                                batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio, drop_last=True,
                                                ratio_nums=dataset.ratio_nums, config=config, valid_num=config.valid_num)
        # used for balanced sampling
        # batch_sampler = BalancedAspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
        #                                                 batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio,
        #                                                 ratio_nums=dataset.ratio_nums)
        train_dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=config.num_workers)
    else:
        train_dataloader = build_dataloader(dataset, num_workers=config.num_workers, batch_size=config.train_batch_size, shuffle=True)

    # build optimizer and lr scheduler
    lr_scale_ratio = 1
    if config.get('auto_lr', None):
        lr_scale_ratio = auto_scale_lr(config.train_batch_size * get_world_size() * config.gradient_accumulation_steps,
                                       config.optimizer, **config.auto_lr)
    optimizer = build_optimizer(model, config.optimizer)
    lr_scheduler = build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    if accelerator.is_main_process:
        tracker_config = dict(vars(config))
        try:
            accelerator.init_trackers(args.tracker_project_name, tracker_config)
        except Exception as err:
            # Best-effort fallback: retry without the config payload under a
            # timestamped name. Catch Exception rather than everything, so that
            # KeyboardInterrupt/SystemExit still propagate, and record why.
            logger.warning(f'init_trackers with config failed ({err!r}); falling back to tb_{timestamp}')
            accelerator.init_trackers(f"tb_{timestamp}")

    start_epoch = 0
    if config.resume_from is not None and config.resume_from['checkpoint'] is not None:
        start_epoch, missing, unexpected = load_checkpoint(**config.resume_from,
                                                           model=model,
                                                           model_ema=model_ema,
                                                           optimizer=optimizer,
                                                           lr_scheduler=lr_scheduler,
                                                           )

        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')
    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model, model_ema = accelerator.prepare(model, model_ema)
    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
    train()


================================================
FILE: PixArt-alpha-ToCa/train_scripts/train_controlnet.py
================================================
import argparse
import datetime
import os
import sys
import time
import types
import warnings
from pathlib import Path

current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))

import torch
from accelerate import Accelerator, InitProcessGroupKwargs
from accelerate.utils import DistributedType
from mmcv.runner import LogBuffer
from torch.utils.data import RandomSampler

from diffusion import IDDPM
from diffusion.data.builder import build_dataset, build_dataloader, set_data_root
from diffusion.model.builder import build_model
from diffusion.model.nets import PixArtMS, ControlPixArtHalf, ControlPixArtMSHalf
from diffusion.utils.checkpoint import save_checkpoint, load_checkpoint
from diffusion.utils.data_sampler import AspectRatioBatchSampler, BalancedAspectRatioBatchSampler
from diffusion.utils.dist_utils import synchronize, get_world_size, clip_grad_norm_
from diffusion.utils.logger import get_root_logger
from diffusion.utils.lr_scheduler import build_lr_scheduler
from diffusion.utils.misc import set_random_seed, read_config, init_random_seed, DebugUnderflowOverflow
from diffusion.utils.optimizer import build_optimizer, auto_scale_lr

warnings.filterwarnings("ignore")  # ignore warning


def set_fsdp_env():
    """Write the FSDP-related settings into the process environment.

    Covers the use-FSDP switch, the auto-wrap policy, the backward-prefetch
    mode and the name of the transformer class to wrap.
    """
    for env_name, env_value in (
        ("ACCELERATE_USE_FSDP", 'true'),
        ("FSDP_AUTO_WRAP_POLICY", 'TRANSFORMER_BASED_WRAP'),
        ("FSDP_BACKWARD_PREFETCH", 'BACKWARD_PRE'),
        ("FSDP_TRANSFORMER_CLS_TO_WRAP", 'PixArtBlock'),
    ):
        os.environ[env_name] = env_value


def train():
    """Run the ControlNet training loop using the objects created in ``__main__``.

    Reads module-level names rather than parameters: ``config``,
    ``accelerator``, ``model``, ``optimizer``, ``lr_scheduler``,
    ``train_dataloader``, ``train_diffusion``, ``logger`` and ``start_epoch``.

    Only datasets that provide pre-extracted VAE features are supported. The
    condition passed to the model is ``data_info['condition']`` scaled by
    ``config.scale_factor``. Checkpoints are written by the main process every
    ``config.save_model_steps`` steps and every ``config.save_model_epochs``
    epochs (plus the final epoch).

    Raises:
        ValueError: if the dataset does not set ``load_vae_feat``.
    """
    if config.get('debug_nan', False):
        DebugUnderflowOverflow(model)
        logger.info('NaN debugger registered. Start to detect overflow during training.')
    time_start, last_tic = time.time(), time.time()
    log_buffer = LogBuffer()

    # Steps attributed to epochs finished before a resume; used for the ETA
    # and as the offset of the tracker step.
    start_step = start_epoch * len(train_dataloader)
    global_step = 0
    total_steps = len(train_dataloader) * config.num_epochs

    load_vae_feat = getattr(train_dataloader.dataset, 'load_vae_feat', False)
    if not load_vae_feat:
        raise ValueError("Only support load vae features for now.")
    # Now you train the model
    for epoch in range(start_epoch + 1, config.num_epochs + 1):
        data_time_start = time.time()
        data_time_all = 0
        for step, batch in enumerate(train_dataloader):
            data_time_all += time.time() - data_time_start
            z = batch[0]  # 4 x 4 x 128 x 128 z:vae output, 3x1024x1024->vae->4x128x128
            clean_images = z * config.scale_factor  # vae needed scale factor
            y = batch[1]  # 4 x 1 x 120 x 4096 # T5 extracted feature of caption, 120 token, 4096
            y_mask = batch[2]  # 4 x 1 x 1 x 120 # caption indicate whether valid
            data_info = batch[3]

            # Sample a random timestep for each image
            bs = clean_images.shape[0]
            timesteps = torch.randint(0, config.train_sampling_steps, (bs,), device=clean_images.device).long()
            grad_norm = None
            with accelerator.accumulate(model):
                # Predict the noise residual
                loss_term = train_diffusion.training_losses(model, clean_images, timesteps, model_kwargs=dict(y=y, mask=y_mask, data_info=data_info, c=data_info['condition'] * config.scale_factor))
                loss = loss_term['loss'].mean()
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    grad_norm = accelerator.clip_grad_norm_(model.parameters(), config.gradient_clip)
                optimizer.step()
                lr_scheduler.step()
                # Clear gradients only after the optimizer step. The prepared
                # optimizer skips zero_grad() while gradients are still being
                # accumulated, so calling it before backward() would wipe the
                # accumulated gradients on exactly the step that applies them.
                optimizer.zero_grad()

            lr = lr_scheduler.get_last_lr()[0]
            logs = {"loss": accelerator.gather(loss).mean().item()}
            if grad_norm is not None:
                logs.update(grad_norm=accelerator.gather(grad_norm).mean().item())
            log_buffer.update(logs)
            if (step + 1) % config.log_interval == 0 or (step + 1) == 1:
                t = (time.time() - last_tic) / config.log_interval
                t_d = data_time_all / config.log_interval
                avg_time = (time.time() - time_start) / (global_step + 1)
                eta = str(datetime.timedelta(seconds=int(avg_time * (total_steps - start_step - global_step - 1))))
                eta_epoch = str(datetime.timedelta(seconds=int(avg_time * (len(train_dataloader) - step - 1))))
                # avg_loss = sum(loss_buffer) / len(loss_buffer)
                log_buffer.average()
                info = f"Step/Epoch [{(epoch - 1) * len(train_dataloader) + step + 1}/{epoch}][{step + 1}/{len(train_dataloader)}]:total_eta: {eta}, " \
                       f"epoch_eta:{eta_epoch}, time_all:{t:.3f}, time_data:{t_d:.3f}, lr:{lr:.3e}, s:({data_info['img_hw'][0][0].item()}, {data_info['img_hw'][0][1].item()}), "
                info += ', '.join([f"{k}:{v:.4f}" for k, v in log_buffer.output.items()])
                logger.info(info)
                last_tic = time.time()
                log_buffer.clear()
                data_time_all = 0
            logs.update(lr=lr)
            accelerator.log(logs, step=global_step + start_step)

            if (global_step + 1) % 1000 == 0 and config.s3_work_dir is not None:
                logger.info(f"s3_work_dir: {config.s3_work_dir}")

            global_step += 1
            data_time_start = time.time()

            # Step-based checkpoint written by the main process, with a
            # synchronize() call on both sides of the save.
            synchronize()
            if accelerator.is_main_process:
                if ((epoch - 1) * len(train_dataloader) + step + 1) % config.save_model_steps == 0:
                    os.umask(0o000)  # file permission: 666; dir permission: 777
                    save_checkpoint(os.path.join(config.work_dir, 'checkpoints'),
                                    epoch=epoch,
                                    step=(epoch - 1) * len(train_dataloader) + step + 1,
                                    model=accelerator.unwrap_model(model),
                                    optimizer=optimizer,
                                    lr_scheduler=lr_scheduler
                                    )
            synchronize()

        synchronize()
        # After each epoch you optionally sample some demo images with evaluate() and save the model
        if accelerator.is_main_process:
            if epoch % config.save_model_epochs == 0 or epoch == config.num_epochs:
                os.umask(0o000)  # file permission: 666; dir permission: 777
                save_checkpoint(os.path.join(config.work_dir, 'checkpoints'),
                                epoch=epoch,
                                step=(epoch - 1) * len(train_dataloader) + step + 1,
                                model=accelerator.unwrap_model(model),
                                optimizer=optimizer,
                                lr_scheduler=lr_scheduler
                                )
        synchronize()


def parse_args():
    """Parse the command-line arguments of this training script.

    Returns:
        argparse.Namespace: the parsed options. ``config`` is the only
        positional argument; everything else is optional.
    """
    parser = argparse.ArgumentParser(description="Process some integers.")
    parser.add_argument("config", type=str, help="config")
    parser.add_argument("--cloud", action='store_true', default=False, help="cloud or local machine")
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    # The help text used to be a copy of the --work-dir one; this value is
    # the checkpoint handed to load_checkpoint() by the entry point.
    parser.add_argument('--resume_from', help='the checkpoint to resume the training from')
    # Both spellings are accepted and share the same destination (local_rank).
    parser.add_argument('--local-rank', type=int, default=-1)
    parser.add_argument('--local_rank', type=int, default=-1)
    parser.add_argument('--debug', action='store_true')
    parser.add_argument(
        "--report_to",
        type=str,
        default="tensorboard",
        help=(
            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
        ),
    )
    parser.add_argument(
        "--tracker_project_name",
        type=str,
        default="text2image-fine-tune",
        help=(
            "The `project_name` argument passed to Accelerator.init_trackers for"
            " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
        ),
    )
    parser.add_argument('--lr', type=float, default=2e-4)
    parser.add_argument('--data_root', type=str, default=None)
    parser.add_argument('--resume_optimizer', action='store_true')
    parser.add_argument('--resume_lr_scheduler', action='store_true')

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    # Script entry point: read the config, apply CLI overrides, build the
    # accelerator, model, data pipeline, optimizer and scheduler as module
    # globals, optionally restore a checkpoint, then call train().
    args = parse_args()
    config = read_config(args.config)
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        config.work_dir = args.work_dir
    if args.cloud:
        config.data_root = '/data/data'
    if args.data_root:
        # An explicit --data_root wins over the --cloud default above.
        config.data_root = args.data_root
    if args.resume_from is not None:
        # Resuming replaces the "load pretrained weights" path.
        config.load_from = None
        config.resume_from = dict(
            checkpoint=args.resume_from,
            load_ema=False,
            resume_optimizer=args.resume_optimizer,
            resume_lr_scheduler=args.resume_lr_scheduler)
    if args.debug:
        config.log_interval = 1
        config.train_batch_size = 6
        config.optimizer.update({'lr': args.lr})

    os.umask(0o000)  # file permission: 666; dir permission: 777
    os.makedirs(config.work_dir, exist_ok=True)

    init_handler = InitProcessGroupKwargs()
    init_handler.timeout = datetime.timedelta(seconds=9600)  # change timeout to avoid a strange NCCL bug
    # Initialize accelerator and tensorboard logging
    if config.use_fsdp:
        init_train = 'FSDP'
        from accelerate import FullyShardedDataParallelPlugin
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
        set_fsdp_env()
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),)
    else:
        init_train = 'DDP'
        fsdp_plugin = None

    # NOTE: this used to read `even_batches=False,` -- the trailing comma bound
    # the tuple `(False,)`, which is truthy, so multi-scale runs never really
    # passed a false value. Assign a real bool instead.
    # NOTE(review): with even_batches=False Accelerate no longer pads the last
    # batches, so ranks may see a different number of steps -- confirm the
    # training loop tolerates that before relying on it.
    even_batches = True
    if config.multi_scale:
        even_batches = False

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with=args.report_to,
        project_dir=os.path.join(config.work_dir, "logs"),
        fsdp_plugin=fsdp_plugin,
        even_batches=even_batches,
        kwargs_handlers=[init_handler]
    )

    logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))

    config.seed = init_random_seed(config.get('seed', None))
    set_random_seed(config.seed)

    if accelerator.is_main_process:
        config.dump(os.path.join(config.work_dir, 'config.py'))

    logger.info(f"Config: \n{config.pretty_text}")
    logger.info(f"World_size: {get_world_size()}, seed: {config.seed}")
    logger.info(f"Initializing: {init_train} for training")
    image_size = config.image_size  # @param [512, 1024]
    latent_size = int(image_size) // 8
    pred_sigma = getattr(config, 'pred_sigma', True)
    learn_sigma = getattr(config, 'learn_sigma', True) and pred_sigma
    model_kwargs={"window_block_indexes": config.window_block_indexes, "window_size": config.window_size,
                  "use_rel_pos": config.use_rel_pos, "lewei_scale": config.lewei_scale, 'config':config,
                  'model_max_length': config.model_max_length}

    # build models
    train_diffusion = IDDPM(str(config.train_sampling_steps))
    model: PixArtMS = build_model(config.model,
                                  config.grad_checkpointing,
                                  config.get('fp32_attention', False),
                                  input_size=latent_size,
                                  learn_sigma=learn_sigma,
                                  pred_sigma=pred_sigma,
                                  **model_kwargs)

    if config.load_from is not None and args.resume_from is None:
        # load from PixArt model
        missing, unexpected = load_checkpoint(config.load_from, model)
        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')

    # Wrap the base model with the ControlNet variant matching the resolution.
    if image_size == 1024:
        model: ControlPixArtMSHalf = ControlPixArtMSHalf(model, copy_blocks_num=config.copy_blocks_num).train()
    else:
        model: ControlPixArtHalf = ControlPixArtHalf(model, copy_blocks_num=config.copy_blocks_num).train()

    logger.info(f"{model.__class__.__name__} Model Parameters: {sum(p.numel() for p in model.parameters()):,}")
    logger.info(f"T5 max token length: {config.model_max_length}")

    # if args.local_rank == 0:
    #     for name, params in model.named_parameters():
    #         if params.requires_grad == False: logger.info(f"freeze param: {name}")
    #
    #     for name, params in model.named_parameters():
    #         if params.requires_grad == True: logger.info(f"trainable param: {name}")

    # prepare for FSDP clip grad norm calculation
    if accelerator.distributed_type == DistributedType.FSDP:
        for m in accelerator._models:
            m.clip_grad_norm_ = types.MethodType(clip_grad_norm_, m)

    # build dataloader
    set_data_root(config.data_root)
    dataset = build_dataset(config.data, resolution=image_size, aspect_ratio_type=config.aspect_ratio_type, train_ratio=config.train_ratio)
    if config.multi_scale:
        batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
                                                batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio, drop_last=True,
                                                ratio_nums=dataset.ratio_nums, config=config, valid_num=1)
        # batch_sampler = BalancedAspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
        #                                                 batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio,
        #                                                 ratio_nums=dataset.ratio_nums)
        train_dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=config.num_workers)
    else:
        train_dataloader = build_dataloader(dataset, num_workers=config.num_workers, batch_size=config.train_batch_size, shuffle=True)

    # build optimizer and lr scheduler
    lr_scale_ratio = 1
    if config.get('auto_lr', None):
        lr_scale_ratio = auto_scale_lr(config.train_batch_size * get_world_size() * config.gradient_accumulation_steps,
                                       config.optimizer, **config.auto_lr)
    # Only the ControlNet branch is handed to the optimizer.
    optimizer = build_optimizer(model.controlnet, config.optimizer)
    lr_scheduler = build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    if accelerator.is_main_process:
        tracker_config = dict(vars(config))
        try:
            accelerator.init_trackers(args.tracker_project_name, tracker_config)
        except Exception:
            # Fall back to a tracker without the config payload. Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            accelerator.init_trackers(f"tb_{timestamp}")

    start_epoch = 0
    if config.resume_from is not None and config.resume_from['checkpoint'] is not None:
        if not args.resume_optimizer or not args.resume_lr_scheduler:
            # Weights only: optimizer / scheduler state and epoch are not restored.
            missing, unexpected = load_checkpoint(args.resume_from, model)
        else:
            start_epoch, missing, unexpected = load_checkpoint(**config.resume_from,
                                                               model=model,
                                                               optimizer=optimizer,
                                                               lr_scheduler=lr_scheduler,
                                                               )

        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')
    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model = accelerator.prepare(model,)
    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
    train()


================================================
FILE: PixArt-alpha-ToCa/train_scripts/train_diffusers.py
================================================
import argparse
import datetime
import os
import sys
import time
import types
import warnings
from pathlib import Path

current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))

import accelerate
import gc
import numpy as np
import torch
import torch.nn as nn
from accelerate import Accelerator, InitProcessGroupKwargs
from accelerate.utils import DistributedType
from copy import deepcopy
from diffusers import AutoencoderKL, Transformer2DModel, PixArtAlphaPipeline, DPMSolverMultistepScheduler
from mmcv.runner import LogBuffer
from packaging import version
from torch.utils.data import RandomSampler
from transformers import T5Tokenizer, T5EncoderModel

from diffusion import IDDPM
from diffusion.data.builder import build_dataset, build_dataloader, set_data_root
from diffusion.utils.data_sampler import AspectRatioBatchSampler, BalancedAspectRatioBatchSampler
from diffusion.utils.dist_utils import get_world_size, clip_grad_norm_, flush
from diffusion.utils.logger import get_root_logger, rename_file_with_creation_time
from diffusion.utils.lr_scheduler import build_lr_scheduler
from diffusion.utils.misc import set_random_seed, read_config, init_random_seed, DebugUnderflowOverflow
from diffusion.utils.optimizer import build_optimizer, auto_scale_lr

warnings.filterwarnings("ignore")  # ignore warning


def set_fsdp_env():
    """Set the FSDP-related environment variables for this process.

    Writes four entries into ``os.environ``: the Accelerate FSDP switch, the
    auto-wrap policy, the backward prefetch mode and the class name to wrap
    (``Transformer2DModel``).
    """
    fsdp_settings = {
        "ACCELERATE_USE_FSDP": 'true',
        "FSDP_AUTO_WRAP_POLICY": 'TRANSFORMER_BASED_WRAP',
        "FSDP_BACKWARD_PREFETCH": 'BACKWARD_PRE',
        "FSDP_TRANSFORMER_CLS_TO_WRAP": 'Transformer2DModel',
    }
    os.environ.update(fsdp_settings)


def ema_update(model_dest: nn.Module, model_src: nn.Module, rate):
    """Blend the parameters of ``model_src`` into ``model_dest`` in place.

    Every parameter of ``model_dest`` becomes
    ``rate * dest + (1 - rate) * src``, where ``src`` is the parameter of the
    same name in ``model_src``.

    Args:
        model_dest: module whose parameters are overwritten.
        model_src: module providing the new values; it must expose every
            parameter name found in ``model_dest``.
        rate: weight kept from the current ``model_dest`` values.
    """
    source_params = dict(model_src.named_parameters())
    for name, target in model_dest.named_parameters():
        source = source_params[name]
        # The two modules must not share parameter tensors.
        assert source is not target
        target.data.mul_(rate)
        target.data.add_((1 - rate) * source.data)


def token_drop(y, y_mask, force_drop_ids=None):
    """
    Replace the conditioning of selected samples with the unconditional
    prompt, to enable classifier-free guidance.

    When ``force_drop_ids`` is given, the samples whose entry equals 1 are
    dropped; otherwise each sample is dropped at random with probability
    ``config.class_dropout_prob``. Returns the updated ``(y, y_mask)`` pair.
    """
    if force_drop_ids is not None:
        drop_ids = force_drop_ids == 1
    else:
        drop_ids = torch.rand(y.shape[0]).cuda() < config.class_dropout_prob
    # Broadcast the per-sample decision over the trailing dimensions.
    embeds_selector = drop_ids[:, None, None]
    mask_selector = drop_ids[:, None]
    dropped_y = torch.where(embeds_selector, uncond_prompt_embeds, y)
    dropped_mask = torch.where(mask_selector, uncond_prompt_attention_mask, y_mask)
    return dropped_y, dropped_mask


def get_null_embed(npz_file, max_length=120):
    """Return the text embedding and attention mask of the empty prompt.

    If ``npz_file`` exists (and ends in ``.npz`` or ``.pth``) the cached
    tensors are loaded from it. Otherwise the tokenizer and T5 encoder are
    loaded from ``args.pipeline_load_from``, the empty string is encoded, and
    the result is cached to ``npz_file`` for later runs.

    Args:
        npz_file: path of the cache file (read with ``torch.load``).
        max_length: token length the empty prompt is padded / truncated to.

    Returns:
        tuple: ``(uncond_prompt_embeds, uncond_prompt_attention_mask)``, both
        moved to ``accelerator.device``.
    """
    if os.path.exists(npz_file) and npz_file.endswith(('.npz', '.pth')):
        # The cache is written from CPU tensors; load on CPU, then move.
        data = torch.load(npz_file, map_location='cpu')
        uncond_prompt_embeds = data['uncond_prompt_embeds'].to(accelerator.device)
        uncond_prompt_attention_mask = data['uncond_prompt_attention_mask'].to(accelerator.device)
    else:
        tokenizer = T5Tokenizer.from_pretrained(args.pipeline_load_from, subfolder="tokenizer")
        text_encoder = T5EncoderModel.from_pretrained(args.pipeline_load_from, subfolder="text_encoder")
        uncond = tokenizer("", max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
        # Encode without autograd: the embedding is a constant, and keeping the
        # encoder graph attached would drag it into every training backward.
        with torch.no_grad():
            uncond_prompt_embeds = text_encoder(uncond.input_ids, attention_mask=uncond.attention_mask)[0]

        # Make sure the cache directory exists before writing to it.
        cache_dir = os.path.dirname(npz_file)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        torch.save({
            'uncond_prompt_embeds': uncond_prompt_embeds.cpu(),
            'uncond_prompt_attention_mask': uncond.attention_mask.cpu()
        }, npz_file)

        uncond_prompt_embeds = uncond_prompt_embeds.to(accelerator.device)
        uncond_prompt_attention_mask = uncond.attention_mask.to(accelerator.device)

    return uncond_prompt_embeds, uncond_prompt_attention_mask


def prepare_vis():
    """Cache the text embeddings of the validation prompts on disk.

    Runs on the main process only. For every validation prompt it expects a
    file ``output/tmp/{prompt}_{max_length}token.pth``; if any is missing, the
    tokenizer and T5 encoder are loaded from ``args.pipeline_load_from`` and
    all prompts are (re-)encoded and saved. ``log_validation`` later reads
    these files, so the text encoder is not needed during training.
    """
    if not accelerator.is_main_process:
        return

    # preparing embeddings for visualization. We put it here for saving GPU memory
    validation_prompts = [
        "dog",
        "portrait photo of a girl, photograph, highly detailed face, depth of field",
        "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
        "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece",
    ]
    logger.info("Preparing Visualization prompt embeddings...")
    logger.info(f"Loading text encoder and tokenizer from {args.pipeline_load_from} ...")
    # Encoding can be skipped only when every prompt already has a cache file.
    skip = all(
        os.path.exists(f'output/tmp/{prompt}_{max_length}token.pth')
        for prompt in validation_prompts
    )
    if not skip:
        print("Saving visualizate prompt text embedding at output/tmp/")
        # torch.save does not create missing directories.
        os.makedirs('output/tmp', exist_ok=True)
        tokenizer = T5Tokenizer.from_pretrained(args.pipeline_load_from, subfolder="tokenizer")
        text_encoder = T5EncoderModel.from_pretrained(args.pipeline_load_from, subfolder="text_encoder").to(accelerator.device)
        # Inference only: avoid building (and serializing tensors attached to)
        # an autograd graph.
        with torch.no_grad():
            for prompt in validation_prompts:
                caption_token = tokenizer(prompt, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt").to(accelerator.device)
                caption_emb = text_encoder(caption_token.input_ids, attention_mask=caption_token.attention_mask)[0]
                torch.save({'caption_embeds': caption_emb, 'emb_mask': caption_token.attention_mask}, f'output/tmp/{prompt}_{max_length}token.pth')
        # Drop the references before flush() so the encoder can be collected.
        del tokenizer, text_encoder
    flush()


@torch.inference_mode()
def log_validation(model, accelerator, weight_dtype, step):
    """Generate images for the fixed validation prompts and log them.

    A ``PixArtAlphaPipeline`` is built around the unwrapped ``model`` (no
    tokenizer / text encoder: the prompt embeddings are read from the
    ``output/tmp/{prompt}_{max_length}token.pth`` files). Latents are sampled
    for every prompt, decoded with the pipeline VAE and sent to each tracker
    registered on ``accelerator`` (tensorboard and wandb are supported).

    Args:
        model: the (possibly wrapped) transformer being trained.
        accelerator: the ``Accelerator`` providing device and trackers.
        weight_dtype: dtype used for the pipeline and the VAE decode.
        step: global step used as the x-axis value for tensorboard.

    Returns:
        list[dict]: one ``{"validation_prompt", "images"}`` entry per prompt.
    """
    logger.info("Running validation... ")

    model = accelerator.unwrap_model(model)
    pipeline = PixArtAlphaPipeline.from_pretrained(
        args.pipeline_load_from,
        transformer=model,
        tokenizer=None,
        text_encoder=None,
        torch_dtype=weight_dtype,
    )
    pipeline = pipeline.to(accelerator.device)
    pipeline.set_progress_bar_config(disable=True)

    # Fixed seed so successive validation runs are comparable.
    generator = torch.Generator(device=accelerator.device).manual_seed(0)

    validation_prompts = [
        "dog",
        "portrait photo of a girl, photograph, highly detailed face, depth of field",
        "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
        "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece",
    ]
    image_logs = []
    images = []
    latents = []
    for prompt in validation_prompts:
        embed = torch.load(f'output/tmp/{prompt}_{max_length}token.pth', map_location='cpu')
        caption_embs, emb_masks = embed['caption_embeds'].to(accelerator.device), embed['emb_mask'].to(accelerator.device)
        latents.append(pipeline(
            num_inference_steps=14,
            num_images_per_prompt=1,
            generator=generator,
            guidance_scale=4.5,
            prompt_embeds=caption_embs,
            prompt_attention_mask=emb_masks,
            negative_prompt=None,
            negative_prompt_embeds=uncond_prompt_embeds,
            negative_prompt_attention_mask=uncond_prompt_attention_mask,
            output_type="latent",
        ).images)

    flush()

    # Decode only after all latents have been sampled.
    for latent in latents:
        images.append(pipeline.vae.decode(latent.to(weight_dtype) / pipeline.vae.config.scaling_factor, return_dict=False)[0])
    for prompt, image in zip(validation_prompts, images):
        image = pipeline.image_processor.postprocess(image, output_type="pil")
        image_logs.append({"validation_prompt": prompt, "images": image})

    for tracker in accelerator.trackers:
        if tracker.name == "tensorboard":
            for log in image_logs:
                images = log["images"]
                validation_prompt = log["validation_prompt"]
                formatted_images = np.stack([np.asarray(image) for image in images])

                tracker.writer.add_images(validation_prompt, formatted_images, step, dataformats="NHWC")
        elif tracker.name == "wandb":
            import wandb
            formatted_images = []

            for log in image_logs:
                images = log["images"]
                validation_prompt = log["validation_prompt"]
                for image in images:
                    image = wandb.Image(image, caption=validation_prompt)
                    formatted_images.append(image)

            tracker.log({"validation": formatted_images})
        else:
            # Logger.warn is a deprecated alias (removed in Python 3.13).
            logger.warning(f"image logging not implemented for {tracker.name}")

    del pipeline
    gc.collect()
    torch.cuda.empty_cache()
    return image_logs


def train(model):
    """Run the training loop from ``start_epoch + 1`` to ``config.num_epochs``.

    Relies on module-level state created by the entry point (``config``,
    ``accelerator``, ``optimizer``, ``lr_scheduler``, ``train_dataloader``,
    ``train_diffusion``, ``weight_dtype``, ``start_step``, ``start_epoch``,
    ``total_steps`` and, when latents are not precomputed, ``vae``).

    Per step: obtain latents (precomputed or VAE-encoded), apply random
    caption dropping, compute the diffusion loss, back-propagate and update.
    The main process periodically saves the accelerator state and runs
    validation; at selected epoch ends the model is saved with
    ``save_pretrained``.

    Args:
        model: the model as returned by ``accelerator.prepare``.
    """
    if config.get('debug_nan', False):
        DebugUnderflowOverflow(model)
        logger.info('NaN debugger registered. Start to detect overflow during training.')
    time_start, last_tic = time.time(), time.time()
    log_buffer = LogBuffer()

    global_step = start_step + 1

    load_vae_feat = getattr(train_dataloader.dataset, 'load_vae_feat', False)

    # Now you train the model
    for epoch in range(start_epoch + 1, config.num_epochs + 1):
        data_time_start = time.time()
        data_time_all = 0
        for step, batch in enumerate(train_dataloader):
            data_time_all += time.time() - data_time_start
            if load_vae_feat:
                # The dataset already yields VAE latents.
                z = batch[0]
            else:
                with torch.no_grad():
                    with torch.cuda.amp.autocast(enabled=config.mixed_precision == 'fp16'):
                        posterior = vae.encode(batch[0]).latent_dist
                        if config.sample_posterior:
                            z = posterior.sample()
                        else:
                            z = posterior.mode()
            latents = (z * config.scale_factor).to(weight_dtype)
            y = batch[1].squeeze(1).to(weight_dtype)
            y_mask = batch[2].squeeze(1).squeeze(1).to(weight_dtype)
            y, y_mask = token_drop(y, y_mask)   # classifier-free guidance
            data_info = {'resolution': batch[3]['img_hw'].to(weight_dtype), 'aspect_ratio': batch[3]['aspect_ratio'].to(weight_dtype),}

            # Sample a random timestep for each image
            bs = latents.shape[0]
            timesteps = torch.randint(0, config.train_sampling_steps, (bs,), device=latents.device).long()
            grad_norm = None
            with accelerator.accumulate(model):
                # Predict the noise residual
                loss_term = train_diffusion.training_losses_diffusers(
                    model, latents, timesteps,
                    model_kwargs = dict(encoder_hidden_states=y, encoder_attention_mask=y_mask, added_cond_kwargs=data_info),
                )
                loss = loss_term['loss'].mean()
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    grad_norm = accelerator.clip_grad_norm_(model.parameters(), config.gradient_clip)
                optimizer.step()
                lr_scheduler.step()
                # Clear gradients AFTER the update. Zeroing at the top of the
                # block wiped the gradients accumulated over earlier
                # micro-batches right before the synchronizing step whenever
                # gradient_accumulation_steps > 1.
                optimizer.zero_grad()

                # if accelerator.sync_gradients:
                #     ema_update(model_ema, accelerator.unwrap_model(model), config.ema_rate)

            lr = lr_scheduler.get_last_lr()[0]
            logs = {args.loss_report_name: accelerator.gather(loss).mean().item()}
            if grad_norm is not None:
                logs.update(grad_norm=accelerator.gather(grad_norm).mean().item())
            log_buffer.update(logs)
            if (step + 1) % config.log_interval == 0 or (step + 1) == 1:
                t = (time.time() - last_tic) / config.log_interval
                t_d = data_time_all / config.log_interval
                avg_time = (time.time() - time_start) / (global_step - start_step)
                eta = str(datetime.timedelta(seconds=int(avg_time * (total_steps - global_step - 1))))
                eta_epoch = str(datetime.timedelta(seconds=int(avg_time * (len(train_dataloader) - step - 1))))
                # avg_loss = sum(loss_buffer) / len(loss_buffer)
                log_buffer.average()
                info = f"Step/Epoch [{global_step}/{epoch}][{step + 1}/{len(train_dataloader)}]:total_eta: {eta}, " \
                       f"epoch_eta:{eta_epoch}, time_all:{t:.3f}, time_data:{t_d:.3f}, lr:{lr:.3e}," \
                       f"s:({data_info['resolution'][0][0].item()}, {data_info['resolution'][0][1].item()}), "
                       # f"s:({data_info['resolution'][0][0].item() * relative_to_1024 // 8}, {data_info['resolution'][0][1].item() * relative_to_1024 // 8}), "
                info += ', '.join([f"{k}:{v:.4f}" for k, v in log_buffer.output.items()])
                logger.info(info)
                last_tic = time.time()
                log_buffer.clear()
                data_time_all = 0
            logs.update(lr=lr)
            accelerator.log(logs, step=global_step)

            global_step += 1
            data_time_start = time.time()

            accelerator.wait_for_everyone()
            if accelerator.is_main_process:
                if global_step % config.save_model_steps == 0:
                    save_path = os.path.join(os.path.join(config.work_dir, 'checkpoints'), f"checkpoint-{global_step}")
                    os.umask(0o000)
                    logger.info(f"Start to save state to {save_path}")
                    accelerator.save_state(save_path)
                    logger.info(f"Saved state to {save_path}")

                # Also validates right after the first step of every epoch.
                if global_step % config.eval_sampling_steps == 0 or (step + 1) == 1:
                    log_validation(model, accelerator, weight_dtype, global_step)

        accelerator.wait_for_everyone()
        if epoch % config.save_model_epochs == 0 or epoch == config.num_epochs:
            os.umask(0o000)
            save_path = os.path.join(os.path.join(config.work_dir, 'checkpoints'), f"checkpoint-{global_step}")
            logger.info(f"Start to save state to {save_path}")
            # Do not rebind `model`: the following epochs must keep training
            # the prepared (wrapped) module, not the bare unwrapped one.
            accelerator.unwrap_model(model).save_pretrained(save_path)
            logger.info(f"Saved state to {save_path}")


def parse_args():
    """Build the command-line parser for this script and parse ``sys.argv``.

    Returns:
        argparse.Namespace: the parsed options; ``config`` is the single
        positional argument.
    """
    cli = argparse.ArgumentParser(description="Process some integers.")
    add = cli.add_argument

    # Config file and run locations.
    add("config", type=str, help="config")
    add("--cloud", action='store_true', default=False, help="cloud or local machine")
    add('--work-dir', help='the dir to save logs and models')
    add('--resume-from', help='the dir to resume the training')
    add('--load-from', default=None, help='the dir to load a ckpt for training')

    # Both spellings are accepted and stored under the same attribute.
    for rank_flag in ('--local-rank', '--local_rank'):
        add(rank_flag, type=int, default=-1)

    add('--debug', action='store_true')
    add(
        "--pipeline_load_from",
        default='output/pretrained_models/pixart_omega_sdxl_256px_diffusers_from512',
        type=str,
        help="path for loading text_encoder, tokenizer and vae",
    )

    # Experiment tracking.
    report_help = (
        'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
        ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
    )
    add("--report_to", type=str, default="tensorboard", help=report_help)
    project_help = (
        "The `project_name` argument passed to Accelerator.init_trackers for"
        " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
    )
    add("--tracker_project_name", type=str, default="text2image-pixart-omega", help=project_help)
    add("--loss_report_name", type=str, default="loss")

    return cli.parse_args()


if __name__ == '__main__':
    # Script entry point: read the config, apply CLI overrides, create the
    # accelerator, cached prompt embeddings, model, data pipeline, optimizer
    # and scheduler as module globals, optionally resume, then call train().
    args = parse_args()
    config = read_config(args.config)
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        config.work_dir = args.work_dir
    if args.cloud:
        config.data_root = '/data/data'
    if args.resume_from is not None:
        config.resume_from = args.resume_from
    if args.debug:
        config.log_interval = 1
        config.train_batch_size = 32
        config.valid_num = 100

    os.umask(0o000)
    os.makedirs(config.work_dir, exist_ok=True)

    init_handler = InitProcessGroupKwargs()
    init_handler.timeout = datetime.timedelta(seconds=5400)  # change timeout to avoid a strange NCCL bug
    # Initialize accelerator and tensorboard logging
    if config.use_fsdp:
        init_train = 'FSDP'
        from accelerate import FullyShardedDataParallelPlugin
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
        set_fsdp_env()
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),)
    else:
        init_train = 'DDP'
        fsdp_plugin = None

    # NOTE: this used to read `even_batches=False,` -- the trailing comma bound
    # the tuple `(False,)`, which is truthy, so multi-scale runs never really
    # passed a false value. Assign a real bool instead.
    # NOTE(review): with even_batches=False Accelerate no longer pads the last
    # batches, so ranks may see a different number of steps -- confirm the
    # training loop tolerates that before relying on it.
    even_batches = True
    if config.multi_scale:
        even_batches = False

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with=args.report_to,
        project_dir=os.path.join(config.work_dir, "logs"),
        fsdp_plugin=fsdp_plugin,
        even_batches=even_batches,
        kwargs_handlers=[init_handler]
    )

    log_name = 'train_log.log'
    if accelerator.is_main_process:
        # Keep the previous log by renaming it before a new one is opened.
        if os.path.exists(os.path.join(config.work_dir, log_name)):
            rename_file_with_creation_time(os.path.join(config.work_dir, log_name))
    logger = get_root_logger(os.path.join(config.work_dir, log_name))

    logger.info(accelerator.state)
    config.seed = init_random_seed(config.get('seed', None))
    set_random_seed(config.seed)

    if accelerator.is_main_process:
        config.dump(os.path.join(config.work_dir, 'config.py'))

    logger.info(f"Config: \n{config.pretty_text}")
    logger.info(f"World_size: {get_world_size()}, seed: {config.seed}")
    logger.info(f"Initializing: {init_train} for training")
    image_size = config.image_size  # @param [256, 512, 1024]
    latent_size = int(image_size) // 8
    relative_to_1024 = float(image_size / 1024)
    pred_sigma = getattr(config, 'pred_sigma', True)
    learn_sigma = getattr(config, 'learn_sigma', True) and pred_sigma

    # Create for unconditional prompt embedding for classifier free guidance
    logger.info("Embedding for classifier free guidance")
    max_length = config.model_max_length
    uncond_prompt_embeds, uncond_prompt_attention_mask = get_null_embed(
        f'output/pretrained_models/null_embed_diffusers_{max_length}token.pth', max_length=max_length
    )
    # preparing embeddings for visualization. We put it here for saving GPU memory
    prepare_vis()

    # build models
    train_diffusion = IDDPM(str(config.train_sampling_steps), learn_sigma=learn_sigma, pred_sigma=pred_sigma, snr=config.snr_loss)
    model = Transformer2DModel.from_pretrained(config.load_from, subfolder="transformer").train()
    logger.info(f"{model.__class__.__name__} Model Parameters: {sum(p.numel() for p in model.parameters()):,}")
    logger.info(f"lewei scale: {model.pos_embed.interpolation_scale} base size: {model.pos_embed.base_size}")
    # model_ema = deepcopy(model).eval()

    # 9. Handle mixed precision and device placement
    # For mixed precision training we cast all non-trainable weigths to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    # 11. Enable optimizations
    # model.enable_xformers_memory_efficient_attention()    # not available for now

    # for name, params in model.named_parameters():
    #     if params.requires_grad == False: logger.info(f"freeze param: {name}")
    #
    # for name, params in model.named_parameters():
    #     if params.requires_grad == True: logger.info(f"trainable param: {name}")

    # 10. Handle saving and loading of checkpoints
    # `accelerate` 0.16.0 will have better support for customized saving
    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
        def save_model_hook(models, weights, output_dir):
            if accelerator.is_main_process:
                transformer_ = accelerator.unwrap_model(models[0])
                # save weights in peft format to be able to load them back
                transformer_.save_pretrained(output_dir)

                for _, model in enumerate(models):
                    # make sure to pop weight so that corresponding model is not saved again
                    weights.pop()

        def load_model_hook(models, input_dir):

            for i in range(len(models)):
                # pop models so that they are not loaded again
                model = models.pop()

                # load diffusers style into model
                load_model = Transformer2DModel.from_pretrained(input_dir)
                model.register_to_config(**load_model.config)

                model.load_state_dict(load_model.state_dict())
                del load_model

        accelerator.register_save_state_pre_hook(save_model_hook)
        accelerator.register_load_state_pre_hook(load_model_hook)

    if config.grad_checkpointing:
        model.enable_gradient_checkpointing()

    # The VAE is only needed when the dataset does not provide precomputed latents.
    if not config.data.load_vae_feat:
        vae = AutoencoderKL.from_pretrained(config.vae_pretrained).cuda()

    # prepare for FSDP clip grad norm calculation
    if accelerator.distributed_type == DistributedType.FSDP:
        for m in accelerator._models:
            m.clip_grad_norm_ = types.MethodType(clip_grad_norm_, m)

    # build dataloader
    set_data_root(config.data_root)
    logger.info(f"ratio of real user prompt: {config.real_prompt_ratio}")
    dataset = build_dataset(
        config.data, resolution=image_size, aspect_ratio_type=config.aspect_ratio_type,
        real_prompt_ratio=config.real_prompt_ratio, max_length=max_length, config=config,
    )
    if config.multi_scale:
        batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
                                                batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio, drop_last=True,
                                                ratio_nums=dataset.ratio_nums, config=config, valid_num=config.valid_num)
        # used for balanced sampling
        # batch_sampler = BalancedAspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
        #                                                 batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio,
        #                                                 ratio_nums=dataset.ratio_nums)
        train_dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=config.num_workers)
    else:
        train_dataloader = build_dataloader(dataset, num_workers=config.num_workers, batch_size=config.train_batch_size, shuffle=True)

    # build optimizer and lr scheduler
    lr_scale_ratio = 1
    if config.get('auto_lr', None):
        lr_scale_ratio = auto_scale_lr(config.train_batch_size * get_world_size() * config.gradient_accumulation_steps,
                                       config.optimizer, **config.auto_lr)
    optimizer = build_optimizer(model, config.optimizer)
    lr_scheduler = build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    if accelerator.is_main_process:
        tracker_config = dict(vars(config))
        accelerator.init_trackers(f"tb_{timestamp}_{args.tracker_project_name}")
        logger.info(f"Training tracker at tb_{timestamp}_{args.tracker_project_name}")

    start_epoch = 0
    start_step = 0
    # Computed before accelerator.prepare() wraps the dataloader.
    total_steps = len(train_dataloader) * config.num_epochs

    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    # model, model_ema = accelerator.prepare(model, model_ema)
    model = accelerator.prepare(model)
    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)

    if config.resume_from is not None:
        if config.resume_from != "latest":
            # Only the final path component is used; it is resolved against
            # <work_dir>/checkpoints below.
            path = os.path.basename(config.resume_from)
        else:
            # Get the most recent checkpoint
            dirs = os.listdir(os.path.join(config.work_dir, 'checkpoints'))
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
            path = dirs[-1] if len(dirs) > 0 else None

        if path is None:
            accelerator.print(f"Checkpoint '{config.resume_from}' does not exist. Starting a new training run.")
            config.resume_from = None
        else:
            accelerator.print(f"Resuming from checkpoint {path}")
            accelerator.load_state(os.path.join(config.work_dir, 'checkpoints', path))
            # Checkpoint directories are named "checkpoint-<global_step>".
            start_step = int(path.split("-")[1])
            start_epoch = start_step // len(train_dataloader)

    train(model)

================================================
FILE: PixArt-alpha-ToCa/train_scripts/train_dreambooth.py
================================================
import os
import sys
import types
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import argparse
import datetime
import time
import warnings
warnings.filterwarnings("ignore")  # ignore warning

from mmcv.runner import LogBuffer
from copy import deepcopy
from diffusion.utils.checkpoint import save_checkpoint, load_checkpoint

import torch
import torch.nn as nn
from accelerate import Accelerator, InitProcessGroupKwargs
from accelerate.utils import DistributedType
from torch.utils.data import RandomSampler

from diffusion import IDDPM
from diffusion.utils.dist_utils import synchronize, get_world_size, clip_grad_norm_
from diffusion.data.builder import build_dataset, build_dataloader, set_data_root
from diffusion.model.builder import build_model
from diffusion.utils.logger import get_root_logger
from diffusion.utils.misc import set_random_seed, read_config, init_random_seed, DebugUnderflowOverflow
from diffusion.utils.optimizer import build_optimizer, auto_scale_lr
from diffusion.utils.lr_scheduler import build_lr_scheduler
from diffusion.model.t5 import T5Embedder
from diffusion.utils.data_sampler import AspectRatioBatchSampler

def set_fsdp_env():
    """Write the FSDP-related settings used by this script into ``os.environ``.

    Four variables are set: ``ACCELERATE_USE_FSDP``, ``FSDP_AUTO_WRAP_POLICY``,
    ``FSDP_BACKWARD_PREFETCH`` and ``FSDP_TRANSFORMER_CLS_TO_WRAP``. Existing values
    for these keys are overwritten.
    """
    fsdp_settings = {
        "ACCELERATE_USE_FSDP": 'true',
        "FSDP_AUTO_WRAP_POLICY": 'TRANSFORMER_BASED_WRAP',
        "FSDP_BACKWARD_PREFETCH": 'BACKWARD_PRE',
        "FSDP_TRANSFORMER_CLS_TO_WRAP": 'PixArtBlock',
    }
    for variable, value in fsdp_settings.items():
        os.environ[variable] = value


def ema_update(model_dest: nn.Module, model_src: nn.Module, rate):
    """Blend the parameters of ``model_src`` into ``model_dest`` in place.

    Each destination parameter becomes ``rate * dest + (1 - rate) * src``, matched by
    parameter name. With ``rate=0`` the destination becomes a copy of the source.

    Args:
        model_dest: module whose parameters are overwritten (the averaged model).
        model_src: module providing the fresh parameter values.
        rate: weight kept from the destination's current value.

    Raises:
        KeyError: if ``model_dest`` has a parameter name missing from ``model_src``.
        AssertionError: if a source and destination parameter are the same object.
    """
    source_params = dict(model_src.named_parameters())
    for name, target in model_dest.named_parameters():
        source = source_params[name]
        assert source is not target
        # Scale the running value first, then add the weighted source value.
        target.data.mul_(rate)
        target.data.add_((1 - rate) * source.data)

def train():
    """Run the DreamBooth fine-tuning loop.

    Takes no arguments. It reads ``config``, ``model``, ``model_ema``, ``optimizer``,
    ``lr_scheduler``, ``train_dataloader``, ``train_diffusion``, ``accelerator``,
    ``logger`` and ``start_epoch`` from module scope; they are created in the
    ``__main__`` section of this script.

    The prompt(s) in ``config.data.prompt`` are embedded once with ``T5Embedder`` and
    the same embedding is used as the text condition for every batch. The main process
    writes a checkpoint every ``config.save_model_steps`` dataloader iterations, every
    ``config.save_model_epochs`` epochs, and after the last epoch.
    """
    if config.get('debug_nan', False):
        DebugUnderflowOverflow(model)
        logger.info('NaN debugger registered. Start to detect overflow during training.')
    time_start, last_tic = time.time(), time.time()
    log_buffer = LogBuffer()

    # start_step offsets the tracker step and the ETA when resuming from a later epoch.
    start_step = start_epoch * len(train_dataloader)
    global_step = 0
    total_steps = len(train_dataloader) * config.num_epochs
    # txt related
    # Normalise the prompt to a list, embed it on CPU, then drop the embedder.
    prompt = config.data.prompt if isinstance(config.data.prompt, list) else [config.data.prompt]
    llm_embed_model = T5Embedder(device="cpu", local_cache=True, cache_dir='output/pretrained_models/t5_ckpts', torch_dtype=torch.float)
    prompt_embs, attention_mask = llm_embed_model.get_text_embeddings(prompt)
    # [None] adds a leading axis before moving both tensors to the GPU.
    prompt_embs, attention_mask = prompt_embs[None].cuda(), attention_mask[None].cuda()
    del llm_embed_model

    # Now you train the model
    for epoch in range(start_epoch + 1, config.num_epochs + 1):
        data_time_start= time.time()
        data_time_all = 0
        for step, batch in enumerate(train_dataloader):
            # Time spent waiting for the dataloader since the previous iteration ended.
            data_time_all += time.time() - data_time_start
            z = batch[0]
            clean_images = z * config.scale_factor
            # Every batch is conditioned on the same pre-computed prompt embedding.
            y = prompt_embs
            y_mask = attention_mask
            data_info = batch[1]

            # Sample a random timestep for each image
            bs = clean_images.shape[0]
            timesteps = torch.randint(0, config.train_sampling_steps, (bs,), device=clean_images.device).long()
            grad_norm = None
            with accelerator.accumulate(model):
                # Predict the noise residual
                # NOTE(review): zero_grad() runs before backward() inside accumulate();
                # confirm this interacts as intended with gradient accumulation.
                optimizer.zero_grad()
                loss_term = train_diffusion.training_losses(model, clean_images, timesteps, model_kwargs=dict(y=y, mask=y_mask, data_info=data_info))
                loss = loss_term['loss'].mean()
                accelerator.backward(loss)
                # Clip and update the EMA copy only when accelerate reports a gradient sync.
                if accelerator.sync_gradients:
                    grad_norm = accelerator.clip_grad_norm_(model.parameters(), config.gradient_clip)
                optimizer.step()
                lr_scheduler.step()
                if accelerator.sync_gradients:
                    ema_update(model_ema, model, config.ema_rate)

            lr = lr_scheduler.get_last_lr()[0]
            logs = {"loss": accelerator.gather(loss).mean().item()}
            if grad_norm is not None:
                logs.update(grad_norm=accelerator.gather(grad_norm).mean().item())
            log_buffer.update(logs)
            # Emit an averaged log line every config.log_interval iterations of this epoch.
            if (step + 1) % config.log_interval == 0:
                t = (time.time() - last_tic) / config.log_interval
                t_d = data_time_all / config.log_interval
                avg_time = (time.time() - time_start) / (global_step + 1)
                eta = str(datetime.timedelta(seconds=int(avg_time * (total_steps - start_step - global_step - 1))))
                eta_epoch = str(datetime.timedelta(seconds=int(avg_time * (len(train_dataloader) - step - 1))))
                # avg_loss = sum(loss_buffer) / len(loss_buffer)
                log_buffer.average()
                # model.module.h / model.module.w are read from the wrapped model for the log line.
                info = f"Steps [{(epoch-1)*len(train_dataloader)+step+1}][{step + 1}/{len(train_dataloader)}]:total_eta: {eta}, " \
                       f"epoch_eta:{eta_epoch}, time_all:{t:.3f}, time_data:{t_d:.3f}, lr:{lr:.3e}, s:({model.module.h}, {model.module.w}), "
                info += ', '.join([f"{k}:{v:.4f}" for k, v in log_buffer.output.items()])
                logger.info(info)
                last_tic = time.time()
                log_buffer.clear()
                data_time_all = 0
            logs.update(lr=lr)
            accelerator.log(logs, step=global_step + start_step)

            global_step += 1
            data_time_start= time.time()

            # Step-based checkpoint: only the main process writes; the synchronize()
            # calls surround the save on every process.
            synchronize()
            if accelerator.is_main_process:
                if ((epoch - 1) * len(train_dataloader) + step + 1) % config.save_model_steps == 0:
                    os.umask(0o000)
                    save_checkpoint(os.path.join(config.work_dir, 'checkpoints'),
                                    epoch=epoch,
                                    step=(epoch - 1) * len(train_dataloader) + step + 1,
                                    model=accelerator.unwrap_model(model),
                                    model_ema=accelerator.unwrap_model(model_ema),
                                    optimizer=optimizer,
                                    lr_scheduler=lr_scheduler
                                    )
            synchronize()

        # Epoch-based checkpoint: every save_model_epochs epochs and after the final epoch.
        synchronize()
        if accelerator.is_main_process:
            if epoch % config.save_model_epochs == 0 or epoch == config.num_epochs:
                os.umask(0o000)
                save_checkpoint(os.path.join(config.work_dir, 'checkpoints'),
                                epoch=epoch,
                                step=(epoch - 1) * len(train_dataloader) + step + 1,
                                model=accelerator.unwrap_model(model),
                                model_ema=accelerator.unwrap_model(model_ema),
                                optimizer=optimizer,
                                lr_scheduler=lr_scheduler
                                )
        synchronize()


def parse_args():
    """Parse the command line of the DreamBooth training script.

    Returns:
        argparse.Namespace with ``config`` (positional), ``work_dir``, ``resume_from``,
        ``load_from``, ``local_rank``, ``debug``, ``save_step``, ``lr``, ``train_class``
        and ``prompt``.
    """
    cli = argparse.ArgumentParser(description="Process some integers.")

    # General training options, declared in the order they appear in --help.
    general_options = (
        (("config",), dict(type=str, help="config")),
        (('--work-dir',), dict(help='the dir to save logs and models')),
        (('--resume-from',), dict(help='the dir to resume the training')),
        (('--load-from',), dict(default=None, help='the dir to load a ckpt for training')),
        # Both spellings are accepted and share the ``local_rank`` destination.
        (('--local-rank',), dict(type=int, default=-1)),
        (('--local_rank',), dict(type=int, default=-1)),
        (('--debug',), dict(action='store_true')),
    )
    # DreamBooth-specific overrides.
    dreambooth_options = (
        (('--save_step',), dict(type=int, default=100)),
        (('--lr',), dict(type=float, default=5e-6)),
        (('--train_class',), dict(type=str)),
        (('--prompt',), dict(type=str, default='a photo of sks dog')),
    )
    for flags, options in general_options + dreambooth_options:
        cli.add_argument(*flags, **options)

    return cli.parse_args()


if __name__ == '__main__':
    # Script entry point: read the config, build the accelerator, models, data,
    # optimizer and scheduler as module-level names, then call train(), which
    # reads them from module scope.
    args = parse_args()
    config = read_config(args.config)
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        config.work_dir = args.work_dir
    if args.resume_from is not None:
        config.resume_from = dict(
            checkpoint=args.resume_from,
            load_ema=False,
            resume_optimizer=True,
            resume_lr_scheduler=True)
    if args.debug:
        config.log_interval = 1
        config.train_batch_size = 1

        # NOTE(review): these three overrides sit inside the `--debug` branch, so
        # --save_step, --prompt, --train_class and --lr only take effect together
        # with --debug. Confirm this nesting is intended.
        config.save_model_steps=args.save_step
        config.data.update({'prompt': [args.prompt], 'root': args.train_class})
        config.optimizer.update({'lr': args.lr})

    os.umask(0o000)
    os.makedirs(config.work_dir, exist_ok=True)

    init_handler = InitProcessGroupKwargs()
    init_handler.timeout = datetime.timedelta(seconds=5400)  # change timeout to avoid a strange NCCL bug
    # Initialize accelerator and tensorboard logging
    if config.use_fsdp:
        init_train = 'FSDP'
        from accelerate import FullyShardedDataParallelPlugin
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
        set_fsdp_env()
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),)
    else:
        init_train = 'DDP'
        fsdp_plugin = None

    # Multi-scale training uses a custom batch sampler, so even batches are turned off.
    # The original line read `even_batches=False,`; the trailing comma bound the
    # truthy tuple `(False,)`, so the flag was never actually disabled.
    even_batches = True
    if config.multi_scale:
        even_batches = False

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with="tensorboard",
        project_dir=os.path.join(config.work_dir, "logs"),
        fsdp_plugin=fsdp_plugin,
        even_batches=even_batches,
        kwargs_handlers=[init_handler]
    )

    logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))

    config.seed = init_random_seed(config.get('seed', None))
    set_random_seed(config.seed)

    # Only the main process dumps the resolved config next to the logs.
    if accelerator.is_main_process:
        config.dump(os.path.join(config.work_dir, 'config.py'))

    logger.info(f"Config: \n{config.pretty_text}")
    logger.info(f"World_size: {get_world_size()}, seed: {config.seed}")
    logger.info(f"Initializing: {init_train} for training")
    image_size = config.image_size  # @param [256, 512]
    # The model's input size is the image size divided by 8.
    latent_size = int(image_size) // 8
    pred_sigma = getattr(config, 'pred_sigma', True)
    learn_sigma = getattr(config, 'learn_sigma', True) and pred_sigma
    model_kwargs={"window_block_indexes": config.window_block_indexes, "window_size": config.window_size,
                  "use_rel_pos": config.use_rel_pos, "lewei_scale": config.lewei_scale, 'config':config,
                  'model_max_length': config.model_max_length}

    # build models
    train_diffusion = IDDPM(str(config.train_sampling_steps))
    eval_diffusion = IDDPM(str(config.eval_sampling_steps))

    model = build_model(config.model,
                        config.grad_checkpointing,
                        config.get('fp32_attention', False),
                        input_size=latent_size,
                        learn_sigma=learn_sigma,
                        pred_sigma=pred_sigma,
                        **model_kwargs).train()
    logger.info(f"{config.model} Model Parameters: {sum(p.numel() for p in model.parameters()):,}")
    model_ema = deepcopy(model).eval()

    # NOTE(review): --load-from is only honoured when the config already defines a
    # non-None load_from; confirm that is the intended precedence.
    if config.load_from is not None:
        if args.load_from is not None:
            config.load_from = args.load_from
        missing, unexpected = load_checkpoint(config.load_from, model, load_ema=config.get('load_ema', False))
        # model.reparametrize()
        if accelerator.is_main_process:
            print('Warning Missing keys: ', missing)
            print('Warning Unexpected keys', unexpected)

    # rate=0 copies the (possibly just loaded) weights of `model` into `model_ema`.
    ema_update(model_ema, model, 0.)

    # prepare for FSDP clip grad norm calculation
    if accelerator.distributed_type == DistributedType.FSDP:
        for m in accelerator._models:
            m.clip_grad_norm_ = types.MethodType(clip_grad_norm_, m)

    # build dataloader
    logger.warning(f"Training prompt: {config.data['prompt']}, Training data class: {config.data['root']}")
    set_data_root(config.data_root)
    dataset = build_dataset(config.data, resolution=image_size, aspect_ratio_type=config.aspect_ratio_type)
    if config.multi_scale:
        batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
                                                batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio, drop_last=True,
                                                ratio_nums=dataset.ratio_nums, config=config, valid_num=1)
        # batch_sampler = BalancedAspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
        #                                                 batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio,
        #                                                 ratio_nums=dataset.ratio_nums)
        train_dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=config.num_workers)
    else:
        train_dataloader = build_dataloader(dataset, num_workers=config.num_workers, batch_size=config.train_batch_size, shuffle=True)

    # build optimizer and lr scheduler
    lr_scale_ratio = 1
    if config.get('auto_lr', None):
        lr_scale_ratio = auto_scale_lr(config.train_batch_size * get_world_size() * config.gradient_accumulation_steps,
                                       config.optimizer,
                                       **config.auto_lr)
    optimizer = build_optimizer(model, config.optimizer)
    lr_scheduler = build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    if accelerator.is_main_process:
        accelerator.init_trackers(f"tb_{timestamp}")

    # Resume: load_checkpoint returns the epoch to continue from, plus key diagnostics.
    start_epoch = 0
    if config.resume_from is not None and config.resume_from['checkpoint'] is not None:
        start_epoch, missing, unexpected = load_checkpoint(**config.resume_from,
                                                           model=model,
                                                           model_ema=model_ema,
                                                           optimizer=optimizer,
                                                           lr_scheduler=lr_scheduler,
                                                           )

        if accelerator.is_main_process:
            print('Warning Missing keys: ', missing)
            print('Warning Unexpected keys', unexpected)
    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model, model_ema = accelerator.prepare(model, model_ema)
    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
    train()

================================================
FILE: PixArt-alpha-ToCa/train_scripts/train_pixart_lcm.py
================================================
import os
import sys
import types
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import argparse
import datetime
import time
import warnings
warnings.filterwarnings("ignore")  # ignore warning
import torch
import torch.nn as nn
from accelerate import Accelerator, InitProcessGroupKwargs
from accelerate.utils import DistributedType
from diffusers.models import AutoencoderKL
from torch.utils.data import RandomSampler
from mmcv.runner import LogBuffer
from copy import deepcopy
import numpy as np
import torch.nn.functional as F
from tqdm import tqdm

from diffusion import IDDPM
from diffusion.utils.checkpoint import save_checkpoint, load_checkpoint
from diffusion.utils.dist_utils import synchronize, get_world_size, clip_grad_norm_
from diffusion.data.builder import build_dataset, build_dataloader, set_data_root
from diffusion.model.builder import build_model
from diffusion.utils.logger import get_root_logger
from diffusion.utils.misc import set_random_seed, read_config, init_random_seed, DebugUnderflowOverflow
from diffusion.utils.optimizer import build_optimizer, auto_scale_lr
from diffusion.utils.lr_scheduler import build_lr_scheduler
from diffusion.utils.data_sampler import AspectRatioBatchSampler, BalancedAspectRatioBatchSampler
from diffusion.lcm_scheduler import LCMScheduler
from torchvision.utils import save_image


def set_fsdp_env():
    """Write the FSDP-related settings used by this script into ``os.environ``.

    Sets ``ACCELERATE_USE_FSDP``, ``FSDP_AUTO_WRAP_POLICY``,
    ``FSDP_BACKWARD_PREFETCH`` and ``FSDP_TRANSFORMER_CLS_TO_WRAP``, overwriting any
    values already present for those keys.
    """
    os.environ.update({
        "ACCELERATE_USE_FSDP": 'true',
        "FSDP_AUTO_WRAP_POLICY": 'TRANSFORMER_BASED_WRAP',
        "FSDP_BACKWARD_PREFETCH": 'BACKWARD_PRE',
        "FSDP_TRANSFORMER_CLS_TO_WRAP": 'PixArtBlock',
    })


def ema_update(model_dest: nn.Module, model_src: nn.Module, rate):
    """Update ``model_dest`` in place as a moving average of ``model_src``.

    For every parameter name in ``model_dest`` the new value is
    ``rate * dest + (1 - rate) * src``. ``rate=0`` copies the source weights.

    Args:
        model_dest: module whose parameters are modified.
        model_src: module supplying the new values, looked up by parameter name.
        rate: fraction of the destination's current value that is kept.

    Raises:
        KeyError: if a destination parameter name is absent from ``model_src``.
        AssertionError: if source and destination share the same parameter object.
    """
    lookup = dict(model_src.named_parameters())
    for param_name, averaged in model_dest.named_parameters():
        fresh = lookup[param_name]
        assert fresh is not averaged
        # Decay the running value, then mix in the weighted fresh value.
        averaged.data.mul_(rate)
        averaged.data.add_((1 - rate) * fresh.data)


def append_dims(x, target_dims):
    """Pad ``x`` with trailing singleton axes until it has ``target_dims`` dimensions.

    Args:
        x: tensor to expand.
        target_dims: desired number of dimensions; must be at least ``x.ndim``.

    Returns:
        ``x`` indexed with extra trailing axes of size 1.

    Raises:
        ValueError: if ``x`` already has more than ``target_dims`` dimensions.
    """
    missing = target_dims - x.ndim
    if missing < 0:
        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
    # Ellipsis keeps every existing axis; each None adds one new trailing axis.
    selector = (Ellipsis,) + (None,) * missing
    return x[selector]


# From LCMScheduler.get_scalings_for_boundary_condition_discrete
def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=10.0):
    """Compute the ``c_skip`` / ``c_out`` boundary-condition scalings for a timestep.

    With ``s = timestep * timestep_scaling``::

        c_skip = sigma_data**2 / (s**2 + sigma_data**2)
        c_out  = s / sqrt(s**2 + sigma_data**2)

    so ``timestep == 0`` gives ``c_skip == 1`` and ``c_out == 0``.

    Args:
        timestep: scalar or tensor of timesteps.
        sigma_data: data standard deviation used in the scalings.
        timestep_scaling: multiplier applied to ``timestep``. The default of 10.0
            matches the factor that was previously hard-coded as ``/ 0.1``.

    Returns:
        Tuple ``(c_skip, c_out)`` with the same shape as ``timestep``.
    """
    # Previously `timestep_scaling` was accepted but ignored in favour of a literal `/ 0.1`.
    scaled_timestep = timestep * timestep_scaling
    denominator = scaled_timestep**2 + sigma_data**2
    c_skip = sigma_data**2 / denominator
    c_out = scaled_timestep / denominator ** 0.5
    return c_skip, c_out


def extract_into_tensor(a, t, x_shape):
    """Gather ``a[t]`` and reshape it so it broadcasts against a tensor of ``x_shape``.

    Args:
        a: tensor of per-timestep values, gathered along its last axis.
        t: index tensor; its first dimension is taken as the batch size.
        x_shape: shape of the tensor the result will be combined with; only its
            length is used.

    Returns:
        Tensor of shape ``(batch, 1, ..., 1)`` with ``len(x_shape)`` dimensions.
    """
    batch, *_ = t.shape
    picked = a.gather(-1, t)
    # One singleton axis for each non-batch dimension of the target shape.
    singleton_axes = (1,) * (len(x_shape) - 1)
    return picked.reshape(batch, *singleton_axes)


class DDIMSolver:
    """DDIM step helper on an evenly strided subset of the training timesteps.

    Attributes (all torch tensors of length ``ddim_timesteps``):
        ddim_timesteps: selected timestep indices (``int64``).
        ddim_alpha_cumprods: ``alpha_cumprods`` at those indices.
        ddim_alpha_cumprods_prev: ``alpha_cumprods`` at the preceding selected index,
            with ``alpha_cumprods[0]`` used for the first entry.
    """

    def __init__(self, alpha_cumprods, timesteps=1000, ddim_timesteps=50):
        """Build the timestep grid and alpha tables.

        Args:
            alpha_cumprods: numpy array of cumulative alpha products, indexed by timestep.
            timesteps: number of training timesteps.
            ddim_timesteps: number of grid points to keep.
        """
        stride = timesteps // ddim_timesteps

        # Grid points are the last index of each stride-sized window:
        # stride - 1, 2 * stride - 1, ...
        grid = (np.arange(1, ddim_timesteps + 1) * stride).round().astype(np.int64) - 1
        alphas_at_grid = alpha_cumprods[grid]

        # Shift the table by one grid point; the first slot falls back to alpha_cumprods[0].
        prev_values = [alpha_cumprods[0]]
        prev_values += alpha_cumprods[grid[:-1]].tolist()
        alphas_prev = np.asarray(prev_values)

        # Store everything as torch tensors.
        self.ddim_timesteps = torch.from_numpy(grid).long()
        self.ddim_alpha_cumprods = torch.from_numpy(alphas_at_grid)
        self.ddim_alpha_cumprods_prev = torch.from_numpy(alphas_prev)

    def to(self, device):
        """Move the three tables to ``device`` and return ``self`` for chaining."""
        for table in ("ddim_timesteps", "ddim_alpha_cumprods", "ddim_alpha_cumprods_prev"):
            setattr(self, table, getattr(self, table).to(device))
        return self

    def ddim_step(self, pred_x0, pred_noise, timestep_index):
        """Return ``sqrt(a_prev) * pred_x0 + sqrt(1 - a_prev) * pred_noise``.

        ``a_prev`` is looked up in ``ddim_alpha_cumprods_prev`` at ``timestep_index``
        and reshaped to broadcast against ``pred_x0``.
        """
        prev_alpha = extract_into_tensor(self.ddim_alpha_cumprods_prev, timestep_index, pred_x0.shape)
        noise_direction = (1.0 - prev_alpha).sqrt() * pred_noise
        return prev_alpha.sqrt() * pred_x0 + noise_direction


@torch.no_grad()
def log_validation(model, step, device):
    """Sample one image with a 4-step LCM schedule and save it as a JPEG.

    Text embeddings are loaded from ``data/tmp/dog.pth`` and the decoded sample is
    written to ``output_cv/vis/{step}.jpg``. Relies on the module-level ``vae`` and
    ``logger``.

    Args:
        model: network to sample from; its ``.module`` is used when it has one.
        step: value used in the output file name.
        device: device for the embeddings, noise and timestep tensors.
    """
    # Call the underlying network directly when the model is wrapped.
    network = getattr(model, 'module', model)

    lcm_scheduler = LCMScheduler(beta_start=0.0001, beta_end=0.02, beta_schedule="linear", prediction_type="epsilon")
    lcm_scheduler.set_timesteps(4, 50)

    cached_prompt = torch.load('data/tmp/dog.pth', map_location='cpu')
    caption_embs = cached_prompt['dog_text'].to(device)
    emb_masks = cached_prompt['dog_mask'].to(device)
    hw = torch.tensor([[1024, 1024]], dtype=torch.float, device=device).repeat(1, 1)
    ar = torch.tensor([[1.]], device=device).repeat(1, 1)
    # Create sampling noise:
    latents = torch.randn(1, 4, 1024, 1024, device=device)
    conditioning = {'data_info': {'img_hw': hw, 'aspect_ratio': ar}, 'mask': emb_masks}
    logger.info("Running validation... ")

    # LCM multi-step sampling loop.
    for position, t in tqdm(list(enumerate(lcm_scheduler.timesteps))):
        ts = torch.full((1,), t, device=device, dtype=torch.long)

        # Keep only the first four output channels of the prediction.
        prediction = network(latents, ts, caption_embs, **conditioning)[:, :4]

        # Advance the noisy sample and keep the scheduler's denoised estimate.
        latents, denoised = lcm_scheduler.step(prediction, position, t, latents, return_dict=False)

    decoded = vae.decode(denoised / 0.18215).sample
    torch.cuda.empty_cache()
    save_image(decoded[0], f'output_cv/vis/{step}.jpg', nrow=1, normalize=True, value_range=(-1, 1))


def train():
    """Run the latent-consistency distillation training loop.

    Takes no arguments. It reads ``config``, ``model`` (student), ``model_teacher``,
    ``model_ema`` (target), ``solver``, ``vae``, ``optimizer``, ``lr_scheduler``,
    ``train_dataloader``, ``train_diffusion``, ``accelerator``, ``logger`` and
    ``start_epoch`` from module scope; none of them is defined inside this function.

    Per batch: the student predicts from a noised latent at ``start_timesteps``, the
    teacher's conditional and unconditional predictions are combined with a guidance
    weight and pushed one solver step back via ``solver.ddim_step``, the EMA model
    predicts from that result, and the student is trained to match the EMA output
    with an L2 or Huber-style loss. The main process writes a checkpoint every
    ``config.save_model_steps`` dataloader iterations, every
    ``config.save_model_epochs`` epochs, and after the last epoch.
    """
    if config.get('debug_nan', False):
        DebugUnderflowOverflow(model)
        logger.info('NaN debugger registered. Start to detect overflow during training.')
    time_start, last_tic = time.time(), time.time()
    log_buffer = LogBuffer()

    # start_step offsets the tracker step and the ETA when resuming from a later epoch.
    start_step = start_epoch * len(train_dataloader)
    global_step = 0
    total_steps = len(train_dataloader) * config.num_epochs

    # When the dataset already provides VAE features, batch[0] is used directly as the latent.
    load_vae_feat = getattr(train_dataloader.dataset, 'load_vae_feat', False)

    # Create uncond embeds for classifier free guidance
    # NOTE(review): repeated config.train_batch_size times; presumably every batch
    # has exactly that many samples -- confirm against the dataloader settings.
    uncond_prompt_embeds = model.module.y_embedder.y_embedding.repeat(config.train_batch_size, 1, 1, 1)

    # Now you train the model
    for epoch in range(start_epoch + 1, config.num_epochs + 1):
        data_time_start= time.time()
        data_time_all = 0
        for step, batch in enumerate(train_dataloader):
            # Time spent waiting for the dataloader since the previous iteration ended.
            data_time_all += time.time() - data_time_start
            if load_vae_feat:
                z = batch[0]
            else:
                # Encode images with the module-level VAE; no gradients flow into it.
                with torch.no_grad():
                    with torch.cuda.amp.autocast(enabled=config.mixed_precision == 'fp16'):
                        posterior = vae.encode(batch[0]).latent_dist
                        if config.sample_posterior:
                            z = posterior.sample()
                        else:
                            z = posterior.mode()
            latents = z * config.scale_factor
            y = batch[1]
            y_mask = batch[2]
            data_info = batch[3]

            # Sample a random timestep for each image
            grad_norm = None
            with accelerator.accumulate(model):
                # Predict the noise residual
                optimizer.zero_grad()
                # Sample noise that we'll add to the latents
                noise = torch.randn_like(latents)
                bsz = latents.shape[0]

                # Sample a random timestep for each image t_n ~ U[0, N - k - 1] without bias.
                # topk is the spacing between consecutive solver timesteps; the target
                # timestep is one spacing earlier, clamped at 0.
                topk = config.train_sampling_steps // config.num_ddim_timesteps
                index = torch.randint(0, config.num_ddim_timesteps, (bsz,), device=latents.device).long()
                start_timesteps = solver.ddim_timesteps[index]
                timesteps = start_timesteps - topk
                timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps)

                # Get boundary scalings for start_timesteps and (end) timesteps.
                c_skip_start, c_out_start = scalings_for_boundary_conditions(start_timesteps)
                c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]]
                c_skip, c_out = scalings_for_boundary_conditions(timesteps)
                c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]]

                # Sample a random guidance scale w from U[w_min, w_max] and embed it
                # w = (config.w_max - config.w_min) * torch.rand((bsz,)) + config.w_min
                # The random guidance scale above is disabled; a constant config.cfg_scale is used.
                w = config.cfg_scale * torch.ones((bsz,))
                w = w.reshape(bsz, 1, 1, 1)
                w = w.to(device=latents.device, dtype=latents.dtype)

                # Get online LCM prediction on z_{t_{n + k}}, w, c, t_{n + k}
                # training_losses is unpacked as a 3-tuple here; presumably
                # (model output, predicted x0, noised input) -- TODO confirm.
                _, pred_x_0, noisy_model_input = train_diffusion.training_losses(model, latents, start_timesteps, model_kwargs=dict(y=y, mask=y_mask, data_info=data_info), noise=noise)

                model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0

                # Use the ODE solver to predict the kth step in the augmented PF-ODE trajectory after
                # noisy_latents with both the conditioning embedding c and unconditional embedding 0
                # Get teacher model prediction on noisy_latents and conditional embedding
                with torch.no_grad():
                    with torch.autocast("cuda"):
                        cond_teacher_output, cond_pred_x0, _ = train_diffusion.training_losses(model_teacher, latents, start_timesteps, model_kwargs=dict(y=y, mask=y_mask, data_info=data_info), noise=noise)

                        # Get teacher model prediction on noisy_latents and unconditional embedding
                        uncond_teacher_output, uncond_pred_x0, _ = train_diffusion.training_losses(model_teacher, latents, start_timesteps, model_kwargs=dict(y=uncond_prompt_embeds, mask=y_mask, data_info=data_info), noise=noise)

                        # Perform "CFG" to get x_prev estimate (using the LCM paper's CFG formulation)
                        pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0)
                        pred_noise = cond_teacher_output + w * (cond_teacher_output - uncond_teacher_output)
                        x_prev = solver.ddim_step(pred_x0, pred_noise, index)

                # Get target LCM prediction on x_prev, w, c, t_n
                with torch.no_grad():
                    with torch.autocast("cuda", enabled=True):
                        _, pred_x_0, _ = train_diffusion.training_losses(model_ema, x_prev.float(), timesteps, model_kwargs=dict(y=y, mask=y_mask, data_info=data_info), skip_noise=True)

                    target = c_skip * x_prev + c_out * pred_x_0

                # Calculate loss
                # NOTE(review): any config.loss_type other than "l2" or "huber" leaves
                # `loss` unassigned, so the backward() call below would fail.
                if config.loss_type == "l2":
                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
                elif config.loss_type == "huber":
                    loss = torch.mean(torch.sqrt((model_pred.float() - target.float()) ** 2 + config.huber_c**2) - config.huber_c)

                # Backpropagation on the online student model (`model`)
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    grad_norm = accelerator.clip_grad_norm_(model.parameters(), config.gradient_clip)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad(set_to_none=True)

                # Refresh the EMA target network only when accelerate reports a gradient sync.
                if accelerator.sync_gradients:
                    ema_update(model_ema, model, config.ema_decay)

            lr = lr_scheduler.get_last_lr()[0]
            logs = {"loss": accelerator.gather(loss).mean().item()}
            if grad_norm is not None:
                logs.update(grad_norm=accelerator.gather(grad_norm).mean().item())
            log_buffer.update(logs)
            # Log on the first iteration of each epoch and then every config.log_interval iterations.
            if (step + 1) % config.log_interval == 0 or (step + 1) == 1:
                t = (time.time() - last_tic) / config.log_interval
                t_d = data_time_all / config.log_interval
                avg_time = (time.time() - time_start) / (global_step + 1)
                eta = str(datetime.timedelta(seconds=int(avg_time * (total_steps - start_step - global_step - 1))))
                eta_epoch = str(datetime.timedelta(seconds=int(avg_time * (len(train_dataloader) - step - 1))))
                # avg_loss = sum(loss_buffer) / len(loss_buffer)
                log_buffer.average()
                info = f"Step/Epoch [{(epoch-1)*len(train_dataloader)+step+1}/{epoch}][{step + 1}/{len(train_dataloader)}]:total_eta: {eta}, " \
                       f"epoch_eta:{eta_epoch}, time_all:{t:.3f}, time_data:{t_d:.3f}, lr:{lr:.3e}, s:({data_info['resolution'][0][0].item()}, {data_info['resolution'][0][1].item()}), "
                info += ', '.join([f"{k}:{v:.4f}" for k, v in log_buffer.output.items()])
                logger.info(info)
                last_tic = time.time()
                log_buffer.clear()
                data_time_all = 0
            logs.update(lr=lr)
            accelerator.log(logs, step=global_step + start_step)

            global_step += 1
            data_time_start= time.time()

            # Step-based checkpoint: only the main process writes; the synchronize()
            # calls surround the save on every process.
            synchronize()
            torch.cuda.empty_cache()
            if accelerator.is_main_process:
                # log_validation(model_ema, step, model.device)
                if ((epoch - 1) * len(train_dataloader) + step + 1) % config.save_model_steps == 0:
                    os.umask(0o000)
                    save_checkpoint(os.path.join(config.work_dir, 'checkpoints'),
                                    epoch=epoch,
                                    step=(epoch - 1) * len(train_dataloader) + step + 1,
                                    model=accelerator.unwrap_model(model),
                                    model_ema=accelerator.unwrap_model(model_ema),
                                    optimizer=optimizer,
                                    lr_scheduler=lr_scheduler
                                    )
            synchronize()

        # Epoch-based checkpoint: every save_model_epochs epochs and after the final epoch.
        synchronize()
        if accelerator.is_main_process:
            if epoch % config.save_model_epochs == 0 or epoch == config.num_epochs:
                os.umask(0o000)
                save_checkpoint(os.path.join(config.work_dir, 'checkpoints'),
                                epoch=epoch,
                                step=(epoch - 1) * len(train_dataloader) + step + 1,
                                model=accelerator.unwrap_model(model),
                                model_ema=accelerator.unwrap_model(model_ema),
                                optimizer=optimizer,
                                lr_scheduler=lr_scheduler
                                )
        synchronize()


def parse_args():
    """Parse the command line of the training script.

    Returns:
        argparse.Namespace: carries ``config`` (positional), ``cloud``,
        ``work_dir``, ``resume_from``, ``load_from``, ``local_rank`` and
        ``debug``.
    """
    cli = argparse.ArgumentParser(description="Process some integers.")

    # (flags, keyword options) in the order they are registered with argparse.
    option_specs = (
        (("config",), dict(type=str, help="config")),
        (("--cloud",), dict(action='store_true', default=False, help="cloud or local machine")),
        (('--work-dir',), dict(help='the dir to save logs and models')),
        (('--resume-from',), dict(help='the dir to resume the training')),
        (('--load-from',), dict(default=None, help='the dir to load a ckpt for training')),
        # Both spellings are registered; argparse stores either one as ``local_rank``.
        (('--local-rank',), dict(type=int, default=-1)),
        (('--local_rank',), dict(type=int, default=-1)),
        (('--debug',), dict(action='store_true')),
    )
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)

    return cli.parse_args()


if __name__ == '__main__':
    # Script entry point: read the config, set up Accelerate, build the student / EMA /
    # teacher models, the dataloader, the optimizer and the LR scheduler, optionally
    # resume from a checkpoint, then hand over to train().
    args = parse_args()
    config = read_config(args.config)
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        config.work_dir = args.work_dir
    if args.cloud:
        config.data_root = '/data/data'
    if args.resume_from is not None:
        # Resuming replaces a plain checkpoint load: load_from is cleared.
        config.load_from = None
        config.resume_from = dict(
            checkpoint=args.resume_from,
            load_ema=False,
            resume_optimizer=True,
            resume_lr_scheduler=True)
    if args.debug:
        # Debug overrides: log every step and skip loading a pretrained checkpoint.
        config.log_interval = 1
        config.train_batch_size = 11
        config.valid_num = 100
        config.load_from = None

    os.umask(0o000)
    os.makedirs(config.work_dir, exist_ok=True)

    init_handler = InitProcessGroupKwargs()
    init_handler.timeout = datetime.timedelta(seconds=5400)  # change timeout to avoid a strange NCCL bug
    # Initialize accelerator and tensorboard logging
    if config.use_fsdp:
        init_train = 'FSDP'
        from accelerate import FullyShardedDataParallelPlugin
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
        set_fsdp_env()
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),)
    else:
        init_train = 'DDP'
        fsdp_plugin = None

    # Even batching is switched off for multi-scale training.
    # NOTE: the previous code read ``even_batches=False,``; the trailing comma bound the
    # tuple ``(False,)``, which is truthy, so even batching was never actually disabled.
    even_batches = True
    if config.multi_scale:
        even_batches = False

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with="tensorboard",
        project_dir=os.path.join(config.work_dir, "logs"),
        fsdp_plugin=fsdp_plugin,
        even_batches=even_batches,
        kwargs_handlers=[init_handler]
    )

    logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))

    config.seed = init_random_seed(config.get('seed', None))
    set_random_seed(config.seed)

    if accelerator.is_main_process:
        # Keep a copy of the effective config next to the logs and checkpoints.
        config.dump(os.path.join(config.work_dir, 'config.py'))

    logger.info(f"Config: \n{config.pretty_text}")
    logger.info(f"World_size: {get_world_size()}, seed: {config.seed}")
    logger.info(f"Initializing: {init_train} for training")
    image_size = config.image_size  # @param [256, 512]
    latent_size = int(image_size) // 8
    pred_sigma = getattr(config, 'pred_sigma', True)
    # learn_sigma is only honoured when pred_sigma is enabled as well.
    learn_sigma = getattr(config, 'learn_sigma', True) and pred_sigma
    model_kwargs={"window_block_indexes": config.window_block_indexes, "window_size": config.window_size,
                  "use_rel_pos": config.use_rel_pos, "lewei_scale": config.lewei_scale, 'config':config,
                  'model_max_length': config.model_max_length}

    # build models
    train_diffusion = IDDPM(str(config.train_sampling_steps), learn_sigma=learn_sigma, pred_sigma=pred_sigma,
                            snr=config.snr_loss, return_startx=True)
    model = build_model(config.model,
                        config.grad_checkpointing,
                        config.get('fp32_attention', False),
                        input_size=latent_size,
                        learn_sigma=learn_sigma,
                        pred_sigma=pred_sigma,
                        **model_kwargs).train()
    logger.info(f"{model.__class__.__name__} Model Parameters: {sum(p.numel() for p in model.parameters()):,}")

    if config.load_from is not None:
        # The CLI --load-from only overrides a load_from that the config already sets.
        if args.load_from is not None:
            config.load_from = args.load_from
        missing, unexpected = load_checkpoint(config.load_from, model, load_ema=config.get('load_ema', False))
        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')

    # The EMA copy and the teacher both start as copies of the (possibly just loaded) model.
    model_ema = deepcopy(model).eval()
    model_teacher = deepcopy(model).eval()

    if not config.data.load_vae_feat:
        # The VAE is only created when the dataset does not provide precomputed VAE features.
        vae = AutoencoderKL.from_pretrained(config.vae_pretrained).cuda()

    # prepare for FSDP clip grad norm calculation
    if accelerator.distributed_type == DistributedType.FSDP:
        for m in accelerator._models:
            m.clip_grad_norm_ = types.MethodType(clip_grad_norm_, m)

    # build dataloader
    set_data_root(config.data_root)
    dataset = build_dataset(config.data, resolution=image_size, aspect_ratio_type=config.aspect_ratio_type)
    if config.multi_scale:
        batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
                                                batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio, drop_last=True,
                                                ratio_nums=dataset.ratio_nums, config=config, valid_num=config.valid_num)
        # used for balanced sampling
        # batch_sampler = BalancedAspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
        #                                                 batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio,
        #                                                 ratio_nums=dataset.ratio_nums)
        train_dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=config.num_workers)
    else:
        train_dataloader = build_dataloader(dataset, num_workers=config.num_workers, batch_size=config.train_batch_size, shuffle=True)

    # build optimizer and lr scheduler
    lr_scale_ratio = 1
    if config.get('auto_lr', None):
        # The effective batch size is per-device batch * world size * accumulation steps.
        lr_scale_ratio = auto_scale_lr(config.train_batch_size * get_world_size() * config.gradient_accumulation_steps,
                                       config.optimizer,
                                       **config.auto_lr)
    optimizer = build_optimizer(model, config.optimizer)
    lr_scheduler = build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    if accelerator.is_main_process:
        accelerator.init_trackers(f"tb_{timestamp}")

    start_epoch = 0
    if config.resume_from is not None and config.resume_from['checkpoint'] is not None:
        # Restore model, EMA, optimizer and scheduler state; training continues after start_epoch.
        start_epoch, missing, unexpected = load_checkpoint(**config.resume_from,
                                                           model=model,
                                                           model_ema=model_ema,
                                                           optimizer=optimizer,
                                                           lr_scheduler=lr_scheduler,
                                                           )

        logger.warning(f'Missing keys: {missing}')
        logger.warning(f'Unexpected keys: {unexpected}')

    solver = DDIMSolver(train_diffusion.alphas_cumprod, timesteps=config.train_sampling_steps, ddim_timesteps=config.num_ddim_timesteps)
    solver.to(accelerator.device)
    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model, model_ema, model_teacher = accelerator.prepare(model, model_ema, model_teacher)
    # model, model_ema = accelerator.prepare(model, model_ema)
    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
    train()


================================================
FILE: PixArt-alpha-ToCa/train_scripts/train_pixart_lcm_lora.py
================================================
import os
import sys
import types
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import argparse
import datetime
import time
import warnings
warnings.filterwarnings("ignore")  # ignore warning
import torch
from accelerate import Accelerator, InitProcessGroupKwargs
from accelerate.utils import DistributedType
from torch.utils.data import RandomSampler
from mmcv.runner import LogBuffer
import torch.nn.functional as F
import numpy as np
import re
from packaging import version
import accelerate

from diffusion import IDDPM
from diffusion.utils.dist_utils import get_world_size, clip_grad_norm_
from diffusion.data.builder import build_dataset, build_dataloader, set_data_root
from diffusion.utils.logger import get_root_logger
from diffusion.utils.misc import set_random_seed, read_config, init_random_seed, DebugUnderflowOverflow
from diffusion.utils.optimizer import build_optimizer, auto_scale_lr
from diffusion.utils.lr_scheduler import build_lr_scheduler
from diffusion.utils.data_sampler import AspectRatioBatchSampler, BalancedAspectRatioBatchSampler
from peft import LoraConfig, get_peft_model, get_peft_model_state_dict
from diffusers import AutoencoderKL, Transformer2DModel, StableDiffusionPipeline, PixArtAlphaPipeline


def set_fsdp_env():
    """Export the FSDP-related environment variables for this process.

    Sets the ``ACCELERATE_USE_FSDP`` flag plus the auto-wrap policy, the
    backward-prefetch mode and the transformer class to wrap (``PixArtBlock``).
    """
    fsdp_settings = {
        "ACCELERATE_USE_FSDP": 'true',
        "FSDP_AUTO_WRAP_POLICY": 'TRANSFORMER_BASED_WRAP',
        "FSDP_BACKWARD_PREFETCH": 'BACKWARD_PRE',
        "FSDP_TRANSFORMER_CLS_TO_WRAP": 'PixArtBlock',
    }
    os.environ.update(fsdp_settings)

def filter_keys(key_set):
    """Build a function that keeps only the dict entries whose key is in ``key_set``.

    The returned callable takes a dictionary and returns a new dict; the input
    dictionary is not modified and the original key order is kept.
    """
    def _keep_selected(dictionary):
        selected = {}
        for key, value in dictionary.items():
            if key in key_set:
                selected[key] = value
        return selected

    return _keep_selected


def append_dims(x, target_dims):
    """Append trailing singleton axes to ``x`` until it has ``target_dims`` dimensions.

    Raises:
        ValueError: if ``x`` already has more than ``target_dims`` dimensions.
    """
    missing = target_dims - x.ndim
    if missing < 0:
        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
    # Ellipsis keeps every existing axis; each ``None`` adds one new axis at the end.
    index = (Ellipsis, *([None] * missing))
    return x[index]


# From LCMScheduler.get_scalings_for_boundary_condition_discrete
def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=10.0):
    """Return the boundary-condition scalings ``(c_skip, c_out)`` for ``timestep``.

    With ``s = timestep * timestep_scaling``::

        c_skip = sigma_data**2 / (s**2 + sigma_data**2)
        c_out  = s / sqrt(s**2 + sigma_data**2)

    Args:
        timestep: scalar or tensor of timesteps; the arithmetic is elementwise.
        sigma_data: the sigma_data constant used in both scalings.
        timestep_scaling: multiplier applied to ``timestep`` before the scalings are
            computed. It used to be ignored in favour of a hard-coded ``/ 0.1``,
            which matches the default of 10.0.

    Returns:
        tuple: ``(c_skip, c_out)``, same shape as ``timestep``.
    """
    scaled_timestep = timestep * timestep_scaling
    denominator = scaled_timestep ** 2 + sigma_data ** 2
    c_skip = sigma_data ** 2 / denominator
    c_out = scaled_timestep / denominator ** 0.5
    return c_skip, c_out


# Compare LCMScheduler.step, Step 4
def predicted_origin(model_output, timesteps, sample, prediction_type, alphas, sigmas):
    """Recover the predicted clean sample x_0 from a model output.

    ``alphas`` and ``sigmas`` are looked up at ``timesteps`` and reshaped so they
    broadcast against ``sample``.

    Args:
        model_output: network output, interpreted according to ``prediction_type``.
        timesteps: index tensor used to pick the per-sample alpha / sigma.
        sample: the noisy sample the model output was computed for.
        prediction_type: ``"epsilon"`` or ``"v_prediction"``.
        alphas: 1-D table of alpha values.
        sigmas: 1-D table of sigma values.

    Raises:
        ValueError: for any other ``prediction_type``.
    """
    # Reject unknown types before touching any tensor.
    if prediction_type not in ("epsilon", "v_prediction"):
        raise ValueError(f"Prediction type {prediction_type} currently not supported.")

    sigma_t = extract_into_tensor(sigmas, timesteps, sample.shape)
    alpha_t = extract_into_tensor(alphas, timesteps, sample.shape)

    if prediction_type == "epsilon":
        return (sample - sigma_t * model_output) / alpha_t
    # v-prediction
    return alpha_t * sample - sigma_t * model_output


def extract_into_tensor(a, t, x_shape):
    """Gather ``a[t]`` and reshape it to ``(batch, 1, ..., 1)``.

    The result has as many dimensions as ``x_shape``, so it broadcasts against a
    tensor of that shape; ``batch`` is the first dimension of ``t``.
    """
    batch, *_unused = t.shape
    picked = a.gather(-1, t)
    singleton_axes = [1] * (len(x_shape) - 1)
    return picked.reshape(batch, *singleton_axes)


class DDIMSolver:
    """Lookup tables and the update step of a DDIM solver on a coarse timestep grid.

    Attributes (all torch tensors with ``ddim_timesteps`` entries):
        ddim_timesteps: the selected timestep indices (int64).
        ddim_alpha_cumprods: ``alpha_cumprods`` at those timesteps.
        ddim_alpha_cumprods_prev: the same values shifted by one grid step, with
            ``alpha_cumprods[0]`` as the first entry.
    """

    def __init__(self, alpha_cumprods, timesteps=1000, ddim_timesteps=50):
        # Spacing between two consecutive grid points.
        stride = timesteps // ddim_timesteps

        # Grid points are the last timestep of each stride: stride-1, 2*stride-1, ...
        grid = (np.arange(1, ddim_timesteps + 1) * stride).round().astype(np.int64) - 1
        current = alpha_cumprods[grid]
        previous = np.asarray(
            [alpha_cumprods[0]] + alpha_cumprods[grid[:-1]].tolist()
        )

        # Store everything as torch tensors.
        self.ddim_timesteps = torch.from_numpy(grid).long()
        self.ddim_alpha_cumprods = torch.from_numpy(current)
        self.ddim_alpha_cumprods_prev = torch.from_numpy(previous)

    def to(self, device):
        """Move all lookup tables to ``device`` and return ``self``."""
        for name in ("ddim_timesteps", "ddim_alpha_cumprods", "ddim_alpha_cumprods_prev"):
            setattr(self, name, getattr(self, name).to(device))
        return self

    def ddim_step(self, pred_x0, pred_noise, timestep_index):
        """Return x_prev built from the predicted x_0 and noise at ``timestep_index``."""
        prev_alpha = extract_into_tensor(self.ddim_alpha_cumprods_prev, timestep_index, pred_x0.shape)
        noise_direction = (1.0 - prev_alpha).sqrt() * pred_noise
        return prev_alpha.sqrt() * pred_x0 + noise_direction


def train(model):
    """Run the LCM distillation training loop for ``model``.

    Besides ``model`` the loop reads the objects created by the script's
    ``__main__`` section as module globals: ``config``, ``logger``, ``accelerator``,
    ``train_dataloader``, ``optimizer``, ``lr_scheduler``, ``train_diffusion``,
    ``solver``, ``model_teacher``, ``weight_dtype``, ``start_epoch``, ``start_step``,
    ``total_steps`` and, when VAE features are not precomputed, ``vae``.

    Per step it computes the online prediction at the start timestep, a
    guidance-combined teacher estimate stepped back with the DDIM solver, the
    no-grad target prediction on that estimate, and an l2 or huber loss between
    prediction and target. Accelerate state is saved every
    ``config.save_model_steps`` steps and the model plus its LoRA weights every
    ``config.save_model_epochs`` epochs (and after the last epoch).

    Args:
        model: the prepared (accelerator-wrapped) student model.

    Raises:
        ValueError: if ``config.loss_type`` is neither ``"l2"`` nor ``"huber"``.
    """
    if config.get('debug_nan', False):
        DebugUnderflowOverflow(model)
        logger.info('NaN debugger registered. Start to detect overflow during training.')
    time_start, last_tic = time.time(), time.time()
    log_buffer = LogBuffer()

    # Steps taken by THIS run only. ``start_step`` (steps done before a resume) is
    # added when logging and subtracted in the ETA below, so starting this counter
    # at ``start_step`` would count the resumed steps twice.
    global_step = 0

    load_vae_feat = getattr(train_dataloader.dataset, 'load_vae_feat', False)

    # Create uncond embeds for classifier free guidance
    uncond_prompt_embeds = torch.load('output/pretrained_models/null_embed.pth', map_location='cpu').to(accelerator.device).repeat(config.train_batch_size, 1, 1, 1)

    # Now you train the model
    for epoch in range(start_epoch + 1, config.num_epochs + 1):
        data_time_start= time.time()
        data_time_all = 0
        for step, batch in enumerate(train_dataloader):
            data_time_all += time.time() - data_time_start
            if load_vae_feat:
                # The dataset already yields VAE latents.
                z = batch[0]
            else:
                with torch.no_grad():
                    with torch.cuda.amp.autocast(enabled=config.mixed_precision == 'fp16'):
                        posterior = vae.encode(batch[0]).latent_dist
                        if config.sample_posterior:
                            z = posterior.sample()
                        else:
                            z = posterior.mode()
            latents = (z * config.scale_factor).to(weight_dtype)
            y = batch[1].squeeze(1).to(weight_dtype)
            y_mask = batch[2].squeeze(1).squeeze(1).to(weight_dtype)
            data_info = {'resolution': batch[3]['img_hw'].to(weight_dtype), 'aspect_ratio': batch[3]['aspect_ratio'].to(weight_dtype),}

            # Stays None on accumulation steps where gradients are not synchronised.
            grad_norm = None
            with accelerator.accumulate(model):
                # Predict the noise residual
                optimizer.zero_grad()
                # Sample noise that we'll add to the latents
                noise = torch.randn_like(latents)
                bsz = latents.shape[0]

                # Sample a random timestep for each image t_n ~ U[0, N - k - 1] without bias.
                topk = config.train_sampling_steps // config.num_ddim_timesteps
                index = torch.randint(0, config.num_ddim_timesteps, (bsz,), device=latents.device).long()
                start_timesteps = solver.ddim_timesteps[index]
                timesteps = start_timesteps - topk
                # Clamp at zero: the first grid point has no earlier neighbour.
                timesteps = torch.where(timesteps < 0, torch.zeros_like(timesteps), timesteps)

                # Get boundary scalings for start_timesteps and (end) timesteps.
                c_skip_start, c_out_start = scalings_for_boundary_conditions(start_timesteps)
                c_skip_start, c_out_start = [append_dims(x, latents.ndim) for x in [c_skip_start, c_out_start]]
                c_skip, c_out = scalings_for_boundary_conditions(timesteps)
                c_skip, c_out = [append_dims(x, latents.ndim) for x in [c_skip, c_out]]

                # Sample a random guidance scale w from U[w_min, w_max] and embed it
                # w = (config.w_max - config.w_min) * torch.rand((bsz,)) + config.w_min
                # (currently a fixed guidance scale taken from config.cfg_scale)
                w = config.cfg_scale * torch.ones((bsz,))
                w = w.reshape(bsz, 1, 1, 1)
                w = w.to(device=latents.device, dtype=latents.dtype)

                # Get online LCM prediction on z_{t_{n + k}}, w, c, t_{n + k}
                _, pred_x_0, noisy_model_input  = train_diffusion.training_losses_diffusers(
                    model, latents, start_timesteps,
                    model_kwargs=dict(encoder_hidden_states=y, encoder_attention_mask=y_mask, added_cond_kwargs=data_info),
                    noise=noise
                )
                model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0

                with torch.no_grad():
                    with torch.autocast("cuda"):
                        cond_teacher_output, cond_pred_x0, _ = train_diffusion.training_losses_diffusers(
                            model_teacher, latents, start_timesteps,
                            model_kwargs=dict(encoder_hidden_states=y, encoder_attention_mask=y_mask, added_cond_kwargs=data_info),
                            noise=noise
                        )
                        # Get teacher model prediction on noisy_latents and unconditional embedding
                        uncond_teacher_output, uncond_pred_x0, _ = train_diffusion.training_losses_diffusers(
                            model_teacher, latents, start_timesteps,
                            model_kwargs=dict(encoder_hidden_states=uncond_prompt_embeds, encoder_attention_mask=y_mask, added_cond_kwargs=data_info),
                            noise=noise
                        )

                        # Perform "CFG" to get x_prev estimate (using the LCM paper's CFG formulation)
                        pred_x0 = cond_pred_x0 + w * (cond_pred_x0 - uncond_pred_x0)
                        pred_noise = cond_teacher_output + w * (cond_teacher_output - uncond_teacher_output)
                        x_prev = solver.ddim_step(pred_x0, pred_noise, index)

                # Get target LCM prediction on x_prev, w, c, t_n
                with torch.no_grad():
                    with torch.autocast("cuda", enabled=True):
                        _, pred_x_0, _ = train_diffusion.training_losses_diffusers(
                            model, x_prev.float(), timesteps,
                            model_kwargs=dict(encoder_hidden_states=y, encoder_attention_mask=y_mask, added_cond_kwargs=data_info),
                            skip_noise=True
                        )

                    target = c_skip * x_prev + c_out * pred_x_0

                # Calculate loss
                if config.loss_type == "l2":
                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
                elif config.loss_type == "huber":
                    loss = torch.mean(torch.sqrt((model_pred.float() - target.float()) ** 2 + config.huber_c**2) - config.huber_c)
                else:
                    # Fail with a clear message instead of an unbound ``loss`` below.
                    raise ValueError(f"Loss type {config.loss_type} currently not supported.")

                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    grad_norm = accelerator.clip_grad_norm_(model.parameters(), config.gradient_clip)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad(set_to_none=True)

            lr = lr_scheduler.get_last_lr()[0]
            logs = {"loss": accelerator.gather(loss).mean().item()}
            if grad_norm is not None:
                logs.update(grad_norm=accelerator.gather(grad_norm).mean().item())
            log_buffer.update(logs)
            if (step + 1) % config.log_interval == 0 or (step + 1) == 1:
                t = (time.time() - last_tic) / config.log_interval
                t_d = data_time_all / config.log_interval
                avg_time = (time.time() - time_start) / (global_step + 1)
                eta = str(datetime.timedelta(seconds=int(avg_time * (total_steps - start_step - global_step - 1))))
                eta_epoch = str(datetime.timedelta(seconds=int(avg_time * (len(train_dataloader) - step - 1))))
                # avg_loss = sum(loss_buffer) / len(loss_buffer)
                log_buffer.average()
                info = f"Step/Epoch [{(epoch-1)*len(train_dataloader)+step+1}/{epoch}][{step + 1}/{len(train_dataloader)}]:total_eta: {eta}, " \
                       f"epoch_eta:{eta_epoch}, time_all:{t:.3f}, time_data:{t_d:.3f}, lr:{lr:.3e}, s:({data_info['resolution'][0][0].item()}, {data_info['resolution'][0][1].item()}), "
                info += ', '.join([f"{k}:{v:.4f}" for k, v in log_buffer.output.items()])
                logger.info(info)
                last_tic = time.time()
                log_buffer.clear()
                data_time_all = 0
            logs.update(lr=lr)
            accelerator.log(logs, step=global_step + start_step)

            global_step += 1
            data_time_start= time.time()

            accelerator.wait_for_everyone()
            if accelerator.is_main_process:
                if ((epoch - 1) * len(train_dataloader) + step + 1) % config.save_model_steps == 0:
                    save_path = os.path.join(os.path.join(config.work_dir, 'checkpoints'), f"checkpoint-{(epoch - 1) * len(train_dataloader) + step + 1}")
                    os.umask(0o000)
                    logger.info(f"Start to save state to {save_path}")
                    accelerator.save_state(save_path)
                    logger.info(f"Saved state to {save_path}")


        accelerator.wait_for_everyone()
        if epoch % config.save_model_epochs == 0 or epoch == config.num_epochs:
            os.umask(0o000)
            save_path = os.path.join(os.path.join(config.work_dir, 'checkpoints'), f"checkpoint-{(epoch - 1) * len(train_dataloader) + step + 1}")
            logger.info(f"Start to save state to {save_path}")
            # Unwrap into a separate name: rebinding ``model`` here would make every
            # following epoch train the unwrapped module instead of the prepared one.
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(save_path)
            lora_state_dict = get_peft_model_state_dict(unwrapped_model, adapter_name="default")
            StableDiffusionPipeline.save_lora_weights(os.path.join(save_path, "transformer_lora"), lora_state_dict)
            logger.info(f"Saved state to {save_path}")


def parse_args():
    """Parse the command line of the LCM-LoRA training script.

    Returns:
        argparse.Namespace: carries ``config`` (positional), ``cloud``,
        ``work_dir`` (default ``'output'``), ``resume_from``, ``local_rank``,
        ``debug`` and ``lora_rank`` (default 64).
    """
    parser = argparse.ArgumentParser(description="Process some integers.")
    parser.add_argument("config", type=str, help="config")
    parser.add_argument("--cloud", action='store_true', default=False, help="cloud or local machine")
    parser.add_argument("--work-dir", default='output', help='the dir to save logs and models')
    # The help text used to be a copy of the --work-dir one. The script accepts either
    # the literal "latest" or a path whose basename names a directory under
    # <work-dir>/checkpoints.
    parser.add_argument("--resume-from",
                        help='checkpoint to resume from: a checkpoint dir name under '
                             '<work-dir>/checkpoints, or "latest" for the most recent one')
    # Both spellings are registered; argparse stores either one as ``local_rank``.
    parser.add_argument("--local-rank", type=int, default=-1)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--debug", action='store_true')
    parser.add_argument("--lora_rank", type=int, default=64, help="The rank of the LoRA projection matrix.", )
    return parser.parse_args()


if __name__ == '__main__':
    # Script entry point: read the config, set up Accelerate, build the frozen teacher
    # and the LoRA-wrapped student, the dataloader, the optimizer and the LR scheduler,
    # optionally resume from a saved accelerator state, then hand over to train().
    args = parse_args()
    config = read_config(args.config)

    config.resume_from = None
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        config.work_dir = args.work_dir
    if args.cloud:
        config.data_root = '/data/data'
    if args.resume_from is not None:
        config.resume_from = args.resume_from
    if args.debug:
        # Debug overrides: log every step, small batches, frequent checkpoints.
        config.log_interval = 1
        config.train_batch_size = 4
        config.valid_num = 10
        config.save_model_steps = 10

    os.umask(0o000)
    os.makedirs(config.work_dir, exist_ok=True)

    init_handler = InitProcessGroupKwargs()
    init_handler.timeout = datetime.timedelta(seconds=5400)  # change timeout to avoid a strange NCCL bug
    # Initialize accelerator and tensorboard logging
    if config.use_fsdp:
        init_train = 'FSDP'
        from accelerate import FullyShardedDataParallelPlugin
        from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
        set_fsdp_env()
        fsdp_plugin = FullyShardedDataParallelPlugin(state_dict_config=FullStateDictConfig(offload_to_cpu=False, rank0_only=False),)
    else:
        init_train = 'DDP'
        fsdp_plugin = None

    # Even batching is switched off for multi-scale training.
    # NOTE: the previous code read ``even_batches=False,``; the trailing comma bound the
    # tuple ``(False,)``, which is truthy, so even batching was never actually disabled.
    even_batches = True
    if config.multi_scale:
        even_batches = False

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with="tensorboard",
        project_dir=os.path.join(config.work_dir, "logs"),
        fsdp_plugin=fsdp_plugin,
        even_batches=even_batches,
        kwargs_handlers=[init_handler]
    )

    logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))

    logger.info(accelerator.state)
    config.seed = init_random_seed(config.get('seed', None))
    set_random_seed(config.seed)

    if accelerator.is_main_process:
        # Keep a copy of the effective config next to the logs and checkpoints.
        config.dump(os.path.join(config.work_dir, 'config.py'))

    logger.info(f"Config: \n{config.pretty_text}")
    logger.info(f"World_size: {get_world_size()}, seed: {config.seed}")
    logger.info(f"Initializing: {init_train} for training")
    image_size = config.image_size  # @param [256, 512]
    latent_size = int(image_size) // 8
    pred_sigma = getattr(config, 'pred_sigma', True)
    # learn_sigma is only honoured when pred_sigma is enabled as well.
    learn_sigma = getattr(config, 'learn_sigma', True) and pred_sigma

    # prepare null_embedding for training
    if not os.path.exists('output/pretrained_models/null_embed.pth'):
        logger.info(f"Creating output/pretrained_models/null_embed.pth")
        os.makedirs('output/pretrained_models/', exist_ok=True)
        pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16, use_safetensors=True,).to("cuda")
        torch.save(pipe.encode_prompt(""), 'output/pretrained_models/null_embed.pth')
        # The pipeline is only needed to encode the empty prompt; free its GPU memory.
        del pipe
        torch.cuda.empty_cache()

    # build models
    train_diffusion = IDDPM(str(config.train_sampling_steps), learn_sigma=learn_sigma, pred_sigma=pred_sigma, return_startx=True)
    # Teacher and student are loaded from the same pretrained transformer; the teacher is frozen.
    model_teacher = Transformer2DModel.from_pretrained(config.load_from, subfolder="transformer")
    model_teacher.requires_grad_(False)
    model = Transformer2DModel.from_pretrained(config.load_from, subfolder="transformer").train()
    logger.info(f"{model.__class__.__name__} Model Parameters: {sum(p.numel() for p in model.parameters()):}")

    lora_config = LoraConfig(
        # The config value wins when present; otherwise fall back to the --lora_rank
        # CLI option, which was parsed but never used before.
        r=config.get('lora_rank', args.lora_rank),
        target_modules=[
            "to_q",
            "to_k",
            "to_v",
            "to_out.0",
            "proj_in",
            "proj_out",
            "ff.net.0.proj",
            "ff.net.2",
            "proj",
            "linear",
            "linear_1",
            "linear_2",
            # "scale_shift_table",      # not available due to the implementation in huggingface/peft, working on it.
        ],
    )
    print(lora_config)
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # 9. Handle mixed precision and device placement
    # For mixed precision training we cast all non-trainable weights to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    # 11. Enable optimizations
    # model.enable_xformers_memory_efficient_attention()
    # model_teacher.enable_xformers_memory_efficient_attention()

    lora_layers = filter(lambda p: p.requires_grad, model.parameters())

    # for name, params in model.named_parameters():
    #     if params.requires_grad == False: logger.info(f"freeze param: {name}")
    #
    # for name, params in model.named_parameters():
    #     if params.requires_grad == True: logger.info(f"trainable param: {name}")

    # 10. Handle saving and loading of checkpoints
    # `accelerate` 0.16.0 will have better support for customized saving
    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
        def save_model_hook(models, weights, output_dir):
            if accelerator.is_main_process:
                transformer_ = accelerator.unwrap_model(models[0])
                lora_state_dict = get_peft_model_state_dict(transformer_, adapter_name="default")
                StableDiffusionPipeline.save_lora_weights(os.path.join(output_dir, "transformer_lora"), lora_state_dict)
                # save weights in peft format to be able to load them back
                transformer_.save_pretrained(output_dir)

                for _, model in enumerate(models):
                    # make sure to pop weight so that corresponding model is not saved again
                    weights.pop()

        def load_model_hook(models, input_dir):
            # load the LoRA into the model
            transformer_ = accelerator.unwrap_model(models[0])
            transformer_.load_adapter(input_dir, "default", is_trainable=True)

            for _ in range(len(models)):
                # pop models so that they are not loaded again
                models.pop()

        accelerator.register_save_state_pre_hook(save_model_hook)
        accelerator.register_load_state_pre_hook(load_model_hook)

    if config.grad_checkpointing:
        model.enable_gradient_checkpointing()

    if not config.data.load_vae_feat:
        # The VAE is only created when the dataset does not provide precomputed VAE features.
        vae = AutoencoderKL.from_pretrained(config.vae_pretrained).cuda()

    # prepare for FSDP clip grad norm calculation
    if accelerator.distributed_type == DistributedType.FSDP:
        for m in accelerator._models:
            m.clip_grad_norm_ = types.MethodType(clip_grad_norm_, m)

    # build dataloader
    set_data_root(config.data_root)
    dataset = build_dataset(config.data, resolution=image_size, aspect_ratio_type=config.aspect_ratio_type)
    if config.multi_scale:
        batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
                                                batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio, drop_last=True,
                                                ratio_nums=dataset.ratio_nums, config=config, valid_num=config.valid_num)
        # used for balanced sampling
        # batch_sampler = BalancedAspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
        #                                                 batch_size=config.train_batch_size, aspect_ratios=dataset.aspect_ratio,
        #                                                 ratio_nums=dataset.ratio_nums)
        train_dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=config.num_workers)
    else:
        train_dataloader = build_dataloader(dataset, num_workers=config.num_workers, batch_size=config.train_batch_size, shuffle=True)

    # build optimizer and lr scheduler
    lr_scale_ratio = 1
    if config.get('auto_lr', None):
        # The effective batch size is per-device batch * world size * accumulation steps.
        lr_scale_ratio = auto_scale_lr(config.train_batch_size * get_world_size() * config.gradient_accumulation_steps,
                                       config.optimizer,
                                       **config.auto_lr)
    optimizer = build_optimizer(model, config.optimizer)
    lr_scheduler = build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    if accelerator.is_main_process:
        accelerator.init_trackers(f"tb_{timestamp}")

    start_epoch = 0
    start_step = 0
    total_steps = len(train_dataloader) * config.num_epochs

    solver = DDIMSolver(train_diffusion.alphas_cumprod, timesteps=config.train_sampling_steps, ddim_timesteps=config.num_ddim_timesteps)
    solver.to(accelerator.device)

    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model, model_teacher = accelerator.prepare(model, model_teacher)
    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)

    if config.resume_from is not None:
        if config.resume_from != "latest":
            # Only the directory name is used; it is looked up under <work_dir>/checkpoints.
            path = os.path.basename(config.resume_from)
        else:
            # Get the most recent checkpoint
            checkpoint_root = os.path.join(config.work_dir, 'checkpoints')
            # A missing checkpoints dir means there is nothing to resume from; treat it
            # like an empty one instead of failing in os.listdir.
            dirs = os.listdir(checkpoint_root) if os.path.isdir(checkpoint_root) else []
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
            path = dirs[-1] if len(dirs) > 0 else None

        if path is None:
            accelerator.print(f"Checkpoint '{config.resume_from}' does not exist. Starting a new training run.")
            config.resume_from = None
        else:
            accelerator.print(f"Resuming from checkpoint {path}")
            accelerator.load_state(os.path.join(config.work_dir, 'checkpoints', path))
            # Checkpoint directories are named "checkpoint-<step>".
            start_step = int(path.split("-")[1])
            start_epoch = start_step // len(train_dataloader)

    train(model)

================================================
FILE: PixArt-alpha-ToCa/train_scripts/train_pixart_lora_hf.py
================================================
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fine-tuning script for Stable Diffusion for text2image with support for LoRA."""

import argparse
import logging
import math
import os
import random
import shutil
from pathlib import Path
from typing import List, Union

import datasets
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
import transformers
import accelerate
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
from datasets import load_dataset
from huggingface_hub import create_repo, upload_folder
from packaging import version
from peft import LoraConfig, get_peft_model_state_dict, get_peft_model, PeftModel
from torchvision import transforms
from tqdm.auto import tqdm

import diffusers
from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, StableDiffusionPipeline, PixArtAlphaPipeline, Transformer2DModel
from transformers import T5EncoderModel, T5Tokenizer
from diffusers.optimization import get_scheduler
from diffusers.training_utils import compute_snr
from diffusers.utils import check_min_version, is_wandb_available
from diffusers.utils.import_utils import is_xformers_available


# Will error if the minimal version of diffusers is not installed. Remove at your own risk.
check_min_version("0.25.0.dev0")

# Module-level logger created through accelerate's `get_logger`, with its level set to INFO.
logger = get_logger(__name__, log_level="INFO")


# TODO: This function should be removed once training scripts are rewritten in PEFT
def text_encoder_lora_state_dict(text_encoder):
    """Collect the LoRA weights attached to a CLIP text encoder's attention layers.

    For each self-attention layer of a ``CLIPTextModel`` or
    ``CLIPTextModelWithProjection``, the ``lora_linear_layer`` state dict of the
    ``q_proj``, ``k_proj``, ``v_proj`` and ``out_proj`` projections is copied into
    one flat dict keyed by the full parameter path. Any other encoder type yields
    an empty dict.
    """
    from transformers import CLIPTextModel, CLIPTextModelWithProjection

    projection_names = ("q_proj", "k_proj", "v_proj", "out_proj")
    collected = {}

    if not isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)):
        return collected

    # Gather the attention modules first, then read their LoRA layers.
    attention_layers = [
        (f"text_model.encoder.layers.{idx}.self_attn", layer.self_attn)
        for idx, layer in enumerate(text_encoder.text_model.encoder.layers)
    ]

    for prefix, attention in attention_layers:
        for proj_name in projection_names:
            lora_weights = getattr(attention, proj_name).lora_linear_layer.state_dict()
            for key, tensor in lora_weights.items():
                collected[f"{prefix}.{proj_name}.lora_linear_layer.{key}"] = tensor

    return collected


def save_model_card(repo_id: str, images=None, base_model=str, dataset_name=str, repo_folder=None):
    """Write a model card (``README.md``) for a LoRA fine-tuning run into ``repo_folder``.

    Args:
        repo_id: Repository identifier shown in the card title.
        images: Optional iterable of objects exposing ``save(path)``. Each one is
            written to ``repo_folder`` as ``image_{i}.png`` and linked from the card.
            ``None`` (the default) produces a card without example images.
        base_model: Name of the base model, inserted into the YAML header and the
            card text. NOTE: the default is the ``str`` type object (kept for
            backward compatibility); callers should always pass a real name.
        dataset_name: Name of the training dataset, inserted into the card text.
            Same caveat about the default as ``base_model``.
        repo_folder: Existing directory that receives the images and ``README.md``.
    """
    # `images` defaults to None, so it must not be iterated unconditionally.
    example_images = () if images is None else images

    image_links = []
    for i, image in enumerate(example_images):
        image.save(os.path.join(repo_folder, f"image_{i}.png"))
        image_links.append(f"![img_{i}](./image_{i}.png)\n")
    img_str = "".join(image_links)

    yaml = f"""
---
license: creativeml-openrail-m
base_model: {base_model}
tags:
- stable-diffusion
- stable-diffusion-diffusers
- text-to-image
- diffusers
- lora
inference: true
---
    """
    model_card = f"""
# LoRA text2image fine-tuning - {repo_id}
These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
{img_str}
"""
    # Explicit UTF-8 so non-ASCII names do not depend on the platform default encoding.
    with open(os.path.join(repo_folder, "README.md"), "w", encoding="utf-8") as f:
        f.write(yaml + model_card)


def parse_args(input_args=None):
    """Build the command-line parser for the LoRA training script and parse arguments.

    Args:
        input_args: Optional list of argument strings to parse. When ``None``
            (the default) the process command line is parsed, exactly as before.

    Returns:
        The parsed ``argparse.Namespace``. ``local_rank`` is overridden by the
        ``LOCAL_RANK`` environment variable when that variable is set to a value
        other than ``-1``.

    Raises:
        ValueError: If neither ``--dataset_name`` nor ``--train_data_dir`` is given,
            or if ``--proportion_empty_prompts`` lies outside ``[0, 1]``.
    """
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default=None,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--revision",
        type=str,
        default=None,
        required=False,
        help="Revision of pretrained model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--variant",
        type=str,
        default=None,
        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        default=None,
        help=(
            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
            " or to a folder containing files that 🤗 Datasets can understand."
        ),
    )
    parser.add_argument(
        "--dataset_config_name",
        type=str,
        default=None,
        help="The config of the Dataset, leave as None if there's only one config.",
    )
    parser.add_argument(
        "--train_data_dir",
        type=str,
        default=None,
        help=(
            "A folder containing the training data. Folder contents must follow the structure described in"
            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
        ),
    )
    parser.add_argument(
        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
    )
    parser.add_argument(
        "--caption_column",
        type=str,
        default="text",
        help="The column of the dataset containing a caption or a list of captions.",
    )
    parser.add_argument(
        "--validation_prompt", type=str, default=None, help="A prompt that is sampled during training for inference."
    )
    parser.add_argument(
        "--num_validation_images",
        type=int,
        default=4,
        help="Number of images that should be generated during validation with `validation_prompt`.",
    )
    parser.add_argument(
        "--validation_epochs",
        type=int,
        default=1,
        help=(
            "Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
            " `args.validation_prompt` multiple times: `args.num_validation_images`."
        ),
    )
    parser.add_argument(
        "--max_train_samples",
        type=int,
        default=None,
        help=(
            "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        ),
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="sd-model-finetuned-lora",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--cache_dir",
        type=str,
        default=None,
        help="The directory where the downloaded models and datasets will be stored.",
    )
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument(
        "--resolution",
        type=int,
        default=512,
        help=(
            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
            " resolution"
        ),
    )
    parser.add_argument(
        "--center_crop",
        default=False,
        action="store_true",
        help=(
            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
            " cropped. The images will be resized to the resolution first before cropping."
        ),
    )
    parser.add_argument(
        "--random_flip",
        action="store_true",
        help="whether to randomly flip images horizontally",
    )
    parser.add_argument(
        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
    )
    parser.add_argument("--num_train_epochs", type=int, default=100)
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=None,
        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--gradient_checkpointing",
        action="store_true",
        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=1e-6,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--scale_lr",
        action="store_true",
        default=False,
        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
    )
    parser.add_argument(
        "--lr_scheduler",
        type=str,
        default="constant",
        help=(
            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
            ' "constant", "constant_with_warmup"]'
        ),
    )
    parser.add_argument(
        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
    )
    parser.add_argument(
        "--snr_gamma",
        type=float,
        default=None,
        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
        "More details here: https://arxiv.org/abs/2303.09556.",
    )
    parser.add_argument(
        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
    )
    parser.add_argument(
        "--use_dora",
        action="store_true",
        default=False,
        help="Whether or not to use Dora. For more information, see"
        " https://huggingface.co/docs/peft/package_reference/lora#peft.LoraConfig.use_dora"
    )
    parser.add_argument(
        "--use_rslora",
        action="store_true",
        default=False,
        help="Whether or not to use RS Lora. For more information, see"
        " https://huggingface.co/docs/peft/package_reference/lora#peft.LoraConfig.use_rslora"
    )
    parser.add_argument(
        "--allow_tf32",
        action="store_true",
        help=(
            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
        ),
    )
    parser.add_argument(
        "--dataloader_num_workers",
        type=int,
        default=0,
        help=(
            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
        ),
    )
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
    # ----Diffusion Training Arguments----
    parser.add_argument(
        "--proportion_empty_prompts",
        type=float,
        default=0,
        help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).",
    )
    parser.add_argument(
        "--prediction_type",
        type=str,
        default=None,
        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.",
    )
    parser.add_argument(
        "--hub_model_id",
        type=str,
        default=None,
        help="The name of the repository to keep in sync with the local `output_dir`.",
    )
    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help=(
            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
        ),
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
        ),
    )
    parser.add_argument(
        "--report_to",
        type=str,
        default="tensorboard",
        help=(
            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
        ),
    )
    # Both spellings store into `args.local_rank`; they were previously declared as
    # two separate arguments sharing that destination.
    parser.add_argument(
        "--local_rank", "--local-rank", type=int, default=-1, help="For distributed training: local_rank"
    )
    parser.add_argument(
        "--checkpointing_steps",
        type=int,
        default=500,
        help=(
            "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
            " training using `--resume_from_checkpoint`."
        ),
    )
    parser.add_argument(
        "--checkpoints_total_limit",
        type=int,
        default=None,
        help=("Max number of checkpoints to store."),
    )
    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help=(
            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
        ),
    )
    parser.add_argument(
        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
    )
    parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
    parser.add_argument(
        "--rank",
        type=int,
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )

    # `input_args=None` makes argparse fall back to the process command line.
    args = parser.parse_args(input_args)

    # A LOCAL_RANK environment variable, when set, wins over the command-line value.
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank

    # Sanity checks
    if args.dataset_name is None and args.train_data_dir is None:
        raise ValueError("Need either a dataset name or a training folder.")

    if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1:
        raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].")

    return args


# Known Hub datasets mapped to their (image column, caption column) names.
DATASET_NAME_MAPPING = {
    "lambdalabs/pokemon-blip-captions": ("image", "text"),
}


def main():
    args = parse_args()
    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)

    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
        log_with=args.report_to,
        project_config=accelerator_project_config,
    )
    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
        import wandb

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_warning()
        diffusers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
        diffusers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)

        if args.push_to_hub:
            repo_id = create_repo(repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token).repo_id

    # See Section 3.1. of the paper.
    max_length = 120

    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora transformer) to half-precision,
    # as these weights are only used for inference; keeping them in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    # Load scheduler, tokenizer and models.
    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler", torch_dtype=weight_dtype)
    tokenizer = T5Tokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, torch_dtype=weight_dtype)

    text_encoder = T5EncoderModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, torch_dtype=weight_dtype)
    text_encoder.requires_grad_(False)
    text_encoder.to(accelerator.device)

    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant, torch_dtype=weight_dtype)
    vae.requires_grad_(False)
    vae.to(accelerator.device)

    transformer = Transformer2DModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="transformer", torch_dtype=weight_dtype)

    # freeze parameters of models to save more memory
    transformer.requires_grad_(False)    
    
    # Freeze the transformer parameters before adding adapters
    for param in transformer.parameters():
        param.requires_grad_(False)

    lora_config = LoraConfig(
        r=args.rank,
        init_lora_weights="gaussian",
        target_modules=[
            "to_k",
            "to_q",
            "to_v",
            "to_out.0",
            "proj_in",
            "proj_out",
            "ff.net.0.proj",
            "ff.net.2",
            "proj",
            "linear",
            "linear_1",
            "linear_2",
            # "scale_shift_table",      # not available due to the implementation in huggingface/peft, working on it.
        ],
        use_dora = args.use_dora,
        use_rslora = args.use_rslora
    )

    # Move transformer, vae and text_encoder to device and cast to weight_dtype
    transformer.to(accelerator.device)
    
    def cast_training_params(model: Union[torch.nn.Module, List[torch.nn.Module]], dtype=torch.float32):
        """Cast every trainable parameter of ``model`` to ``dtype`` in place.

        ``model`` may be a single module or a list of modules. Parameters whose
        ``requires_grad`` is ``False`` are left untouched.
        """
        modules = model if isinstance(model, list) else [model]
        trainable = (p for module in modules for p in module.parameters() if p.requires_grad)
        for weight in trainable:
            # Only the underlying data is swapped, so the parameter object itself is kept.
            weight.data = weight.to(dtype)

    transformer = get_peft_model(transformer, lora_config)
    if args.mixed_precision == "fp16":
        # only upcast trainable parameters (LoRA) into fp32
        cast_training_params(transformer, dtype=torch.float32)

    transformer.print_trainable_parameters()

    # 10. Handle saving and loading of checkpoints
    # `accelerate` 0.16.0 will have better support for customized saving
    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
        def save_model_hook(models, weights, output_dir):
            """Save-state pre-hook that writes the transformer's LoRA adapter into ``output_dir``.

            All work, including draining ``weights``, happens only on the main process.

            Args:
                models: Models handed to the hook; only their count is used here.
                weights: List of weights pending save; one entry is popped per model.
                output_dir: Directory that receives the adapter files.
            """
            if accelerator.is_main_process:
                # The adapter is read from the enclosing `transformer`, not from `models`.
                transformer_ = accelerator.unwrap_model(transformer)
                lora_state_dict = get_peft_model_state_dict(transformer_, adapter_name="default")
                # Written under `<output_dir>/transformer_lora` via the diffusers LoRA writer.
                StableDiffusionPipeline.save_lora_weights(os.path.join(output_dir, "transformer_lora"), lora_state_dict)
                # save weights in peft format to be able to load them back
                transformer_.save_pretrained(output_dir)

                for _, model in enumerate(models):
                    # make sure to pop weight so that corresponding model is not saved again
                    weights.pop()

        def load_model_hook(models, input_dir):
            """Load-state pre-hook that restores the ``"default"`` LoRA adapter from ``input_dir``.

            Args:
                models: Models handed to the hook; the list is emptied in place.
                input_dir: Checkpoint directory containing the saved adapter.
            """
            # load the LoRA into the model
            # The adapter is loaded into the enclosing `transformer` and kept trainable.
            transformer_ = accelerator.unwrap_model(transformer)
            transformer_.load_adapter(input_dir, "default", is_trainable=True)

            for _ in range(len(models)):
                # pop models so that they are not loaded again
                models.pop()

        accelerator.register_save_state_pre_hook(save_model_hook)
        accelerator.register_load_state_pre_hook(load_model_hook)

    if args.enable_xformers_memory_efficient_attention:
        if is_xformers_available():
            import xformers

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
                logger.warn(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            transformer.enable_xformers_memory_efficient_attention()
        else:
            raise ValueError("xformers is not available. Make sure it is installed correctly")

    lora_layers = filter(lambda p: p.requires_grad, transformer.parameters())

    # Enable TF32 for faster training on Ampere GPUs,
    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
    if args.allow_tf32:
        torch.backends.cuda.matmul.allow_tf32 = True

    if args.gradient_checkpointing:
        transformer.enable_gradient_checkpointing()

    if args.scale_lr:
        args.learning_rate = args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes

    # Initialize the optimizer
    if args.use_8bit_adam:
        try:
            import bitsandbytes as bnb
        except ImportError:
            raise ImportError("Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`")

        optimizer_cls = bnb.optim.AdamW8bit
    else:
        optimizer_cls = torch.optim.AdamW

    optimizer = optimizer_cls(
        lora_layers,
        lr=args.learning_rate,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
    )

    # Get the datasets: you can either provide your own training and evaluation files (see below)
    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).

    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        dataset = load_dataset(
            args.dataset_name,
            args.dataset_config_name,
            cache_dir=args.cache_dir,
            data_dir=args.train_data_dir,
        )
    else:
        data_files = {}
        if args.train_data_dir is not None:
            data_files["train"] = os.path.join(args.train_data_dir, "**")
        dataset = load_dataset(
            "imagefolder",
            data_files=data_files,
            cache_dir=args.cache_dir,
        )
        # See more about loading custom images at
        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder

    # Preprocessing the datasets.
    # We need to tokenize inputs and targets.
    column_names = dataset["train"].column_names

    # 6. Get the column names for input/target.
    dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
    if args.image_column is None:
        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
    else:
        image_column = args.image_column
        if image_column not in column_names:
            raise ValueError(
                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
            )
    if args.caption_column is None:
        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
    else:
        caption_column = args.caption_column
        if caption_column not in column_names:
            raise ValueError(
                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
            )

    # Preprocessing the datasets.
    # We need to tokenize input captions and transform the images.
    def tokenize_captions(examples, is_train=True, proportion_empty_prompts=0., max_length=120):
        """Choose one caption per example and tokenize the batch.

        Each caption is replaced by an empty string with probability
        ``proportion_empty_prompts``. Otherwise a string is used as is, and a
        list/array of captions contributes a random element when ``is_train``
        is true or its first element when it is false.

        Returns:
            A tuple ``(input_ids, attention_mask)`` taken from the tokenizer output,
            padded/truncated to ``max_length``.

        Raises:
            ValueError: If a caption is neither a string nor a list/array.
        """
        def pick_caption(entry):
            # The dropout draw comes first for every entry, whatever its type.
            if random.random() < proportion_empty_prompts:
                return ""
            if isinstance(entry, str):
                return entry
            if isinstance(entry, (list, np.ndarray)):
                # take a random caption if there are multiple
                return random.choice(entry) if is_train else entry[0]
            raise ValueError(
                f"Caption column `{caption_column}` should contain either strings or lists of strings."
            )

        prompts = [pick_caption(entry) for entry in examples[caption_column]]
        encoded = tokenizer(prompts, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
        return encoded.input_ids, encoded.attention_mask

    # Preprocessing the datasets.
    train_transforms = transforms.Compose(
        [
            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ]
    )

    def preprocess_train(examples):
        """Add ``pixel_values``, ``input_ids`` and ``prompt_attention_mask`` to a batch of examples.

        Images are converted to RGB and passed through ``train_transforms``; captions
        are tokenized with the configured empty-prompt proportion and maximum length.
        The same ``examples`` mapping is updated and returned.
        """
        rgb_images = []
        for raw_image in examples[image_column]:
            rgb_images.append(raw_image.convert("RGB"))

        pixel_tensors = []
        for rgb_image in rgb_images:
            pixel_tensors.append(train_transforms(rgb_image))
        examples["pixel_values"] = pixel_tensors

        token_ids, token_mask = tokenize_captions(
            examples,
            proportion_empty_prompts=args.proportion_empty_prompts,
            max_length=max_length,
        )
        examples["input_ids"] = token_ids
        examples['prompt_attention_mask'] = token_mask
        return examples

    with accelerator.main_process_first():
        if args.max_train_samples is not None:
            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
        # Set the training transforms
        train_dataset = dataset["train"].with_transform(preprocess_train)

    def collate_fn(examples):
        """Stack the per-example tensors of ``examples`` into a single batch dict.

        ``pixel_values`` is additionally made contiguous and cast to float32.
        """
        def stack(field):
            return torch.stack([sample[field] for sample in examples])

        return {
            "pixel_values": stack("pixel_values").to(memory_format=torch.contiguous_format).float(),
            "input_ids": stack("input_ids"),
            'prompt_attention_mask': stack("prompt_attention_mask"),
        }

    # DataLoaders creation:
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=collate_fn,
        batch_size=args.train_batch_size,
        num_workers=args.dataloader_num_workers,
    )

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

    lr_scheduler = get_scheduler(
        args.lr_scheduler,
        optimizer=optimizer,
        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
        num_training_steps=args.max_train_steps * accelerator.num_processes,
    )

    # Prepare everything with our `accelerator`.
    transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(transformer, optimizer, train_dataloader, lr_scheduler)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initializes automatically on the main process.
    if accelerator.is_main_process:
        accelerator.init_trackers("text2image-fine-tune", config=vars(args))

    # Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    global_step = 0
    first_epoch = 0

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        if args.resume_from_checkpoint != "latest":
            path = os.path.basename(args.resume_from_checkpoint)
        else:
            # Get the most recent checkpoint
            dirs = os.listdir(args.output_dir)
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
            path = dirs[-1] if len(dirs) > 0 else None

        if path is None:
            accelerator.print(
                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
            )
            args.resume_from_checkpoint = None
            initial_global_step = 0
        else:
            accelerator.print(f"Resuming from checkpoint {path}")
            accelerator.load_state(os.path.join(args.output_dir, path))
            global_step = int(path.split("-")[1])

            initial_global_step = global_step
            first_epoch = global_step // num_update_steps_per_epoch
    else:
        initial_global_step = 0

    progress_bar = tqdm(
        range(0, args.max_train_steps),
        initial=initial_global_step,
        desc="Steps",
        # Only show the progress bar once on each machine.
        disable=not accelerator.is_local_main_process,
    )

    for epoch in range(first_epoch, args.num_train_epochs):
        transformer.train()
        train_loss = 0.0
        for step, batch in enumerate(train_dataloader):
            with accelerator.accumulate(transformer):
                # Convert images to latent space
                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
                latents = latents * vae.config.scaling_factor

                # Sample noise that we'll add to the latents
                noise = torch.randn_like(latents)
                if args.noise_offset:
                    # https://www.crosslabs.org//blog/diffusion-with-offset-noise
                    noise += args.noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1), device=latents.device)

                bsz = latents.shape[0]
                # Sample a random timestep for each image
                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
                timesteps = timesteps.long()

                # Add noise to the latents according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                # Get the text embedding for conditioning
                prompt_embeds = text_encoder(batch["input_ids"], attention_mask=batch['prompt_attention_mask'])[0]
                prompt_attention_mask = batch['prompt_attention_mask']
                # Get the target for loss depending on the prediction type
                if args.prediction_type is not None:
                    # set prediction_type of scheduler if defined
                    noise_scheduler.register_to_config(prediction_type=args.prediction_type)

                if noise_scheduler.config.prediction_type == "epsilon":
                    target = noise
                elif noise_scheduler.config.prediction_type == "v_prediction":
                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
                else:
                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")

                # Prepare micro-conditions.
                added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
                if getattr(transformer, 'module', transformer).config.sample_size == 128:
                    resolution = torch.tensor([args.resolution, args.resolution]).repeat(bsz, 1)
                    aspect_ratio = torch.tensor([float(args.resolution / args.resolution)]).repeat(bsz, 1)
                    resolution = resolution.to(dtype=weight_dtype, device=latents.device)
                    aspect_ratio = aspect_ratio.to(dtype=weight_dtype, device=latents.device)
                    added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio}

                # Predict the noise residual and compute loss
                model_pred = transformer(noisy_latents,
                                         encoder_hidden_states=prompt_embeds,
                                         encoder_attention_mask=prompt_attention_mask,
                                         timestep=timesteps,
                                         added_cond_kwargs=added_cond_kwargs).sample.chunk(2, 1)[0]

                if args.snr_gamma is None:
                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
                else:
                    # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
                    # This is discussed in Section 4.2 of the same paper.
                    snr = compute_snr(noise_scheduler, timesteps)
                    if noise_scheduler.config.prediction_type == "v_prediction":
                        # Velocity objective requires that we add one to SNR values before we divide by them.
                        snr = snr + 1
                    mse_loss_weights = (torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr)

                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
                    loss = loss.mean()

                # Gather the losses across all processes for logging (if we use distributed training).
                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
                train_loss += avg_loss.item() / args.gradient_accumulation_steps

                # Backpropagate
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    params_to_clip = lora_layers
                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                progress_bar.update(1)
                global_step += 1
                accelerator.log({"train_loss": train_loss}, step=global_step)
                train_loss = 0.0

                if global_step % args.checkpointing_steps == 0:
                    if accelerator.is_main_process:
                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
                        if args.checkpoints_total_limit is not None:
                            checkpoints = os.listdir(args.output_dir)
                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))

                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
                            if len(checkpoints) >= args.checkpoints_total_limit:
                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
                                removing_checkpoints = checkpoints[0:num_to_remove]

                                logger.info(f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints")
                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")

                                for removing_checkpoint in removing_checkpoints:
                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
                                    shutil.rmtree(removing_checkpoint)

                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
                        accelerator.save_state(save_path)

                        unwrapped_transformer = accelerator.unwrap_model(transformer, keep_fp32_wrapper=False)
                        transformer_lora_state_dict = get_peft_model_state_dict(unwrapped_transformer)

                        StableDiffusionPipeline.save_lora_weights(
                            save_directory=save_path,
                            unet_lora_layers=transformer_lora_state_dict,
                            safe_serialization=True,
                        )

                        logger.info(f"Saved state to {save_path}")

            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
            progress_bar.set_postfix(**logs)

            if global_step >= args.max_train_steps:
                break

        if accelerator.is_main_process:
            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
                logger.info(
                    f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
                    f" {args.validation_prompt}."
                )
                # create pipeline
                pipeline = DiffusionPipeline.from_pretrained(
                    args.pretrained_model_name_or_path,
                    transformer=accelerator.unwrap_model(transformer, keep_fp32_wrapper=False),
                    text_encoder=text_encoder, vae=vae,
                    torch_dtype=weight_dtype,
                )
                pipeline = pipeline.to(accelerator.device)
                pipeline.set_progress_bar_config(disable=True)

                # run inference
                generator = torch.Generator(device=accelerator.device)
                if args.seed is not None:
                    generator = generator.manual_seed(args.seed)
                images = []
                for _ in range(args.num_validation_images):
                    images.append(pipeline(args.validation_prompt, num_inference_steps=20, generator=generator).images[0])

                for tracker in accelerator.trackers:
                    if tracker.name == "tensorboard":
                        np_images = np.stack([np.asarray(img) for img in images])
                        tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
                    if tracker.name == "wandb":
                        tracker.log(
                            {
                                "validation": [wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)]
                            }
                        )

                del pipeline
                torch.cuda.empty_cache()

    # Save the lora layers
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        transformer = accelerator.unwrap_model(transformer, keep_fp32_wrapper=False)
        transformer.save_pretrained(args.output_dir)
        lora_state_dict = get_peft_model_state_dict(transformer)
        StableDiffusionPipeline.save_lora_weights(os.path.join(args.output_dir, "transformer_lora"), lora_state_dict)

        if args.push_to_hub:
            save_model_card(
                repo_id,
                images=images,
                base_model=args.pretrained_model_name_or_path,
                dataset_name=args.dataset_name,
                repo_folder=args.output_dir,
            )
            upload_folder(
                repo_id=repo_id,
                folder_path=args.output_dir,
                commit_message="End of training",
                ignore_patterns=["step_*", "epoch_*"],
            )

    
    # Final inference
    # Load previous transformer
    transformer = Transformer2DModel.from_pretrained(args.pretrained_model_name_or_path, subfolder='transformer', torch_dtype=weight_dtype)
    # load lora weight
    transformer = PeftModel.from_pretrained(transformer, args.output_dir)
    # Load previous pipeline
    pipeline = DiffusionPipeline.from_pretrained(args.pretrained_model_name_or_path, transformer=transformer, text_encoder=text_encoder, vae=vae, torch_dtype=weight_dtype,)
    pipeline = pipeline.to(accelerator.device)

    del transformer
    torch.cuda.empty_cache()

    # run inference
    generator = torch.Generator(device=accelerator.device)
    if args.seed is not None:
        generator = generator.manual_seed(args.seed)
    images = []
    for _ in range(args.num_validation_images):
        images.append(pipeline(args.validation_prompt, num_inference_steps=20, generator=generator).images[0])

    if accelerator.is_main_process:
        for tracker in accelerator.trackers:
            if len(images) != 0:
                if tracker.name == "tensorboard":
                    np_images = np.stack([np.asarray(img) for img in images])
                    tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
                if tracker.name == "wandb":
                    tracker.log(
                        {
                            "test": [
                                wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
                                for i, image in enumerate(images)
                            ]
                        }
                    )

    accelerator.end_training()


# Script entry point: run the training routine only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


================================================
FILE: PixArt-alpha-ToCa-tools/clip_score.py
================================================
import os
import torch
from PIL import Image
from torchvision.transforms import ToTensor
from torchmetrics.multimodal.clip_score import CLIPScore
from tqdm import tqdm
import torch.multiprocessing as mp

# Load prompts file
def load_prompts(txt_file):
    """Read a prompt file and return its lines as a list of strings.

    Args:
        txt_file: Path to a text file containing one prompt per line.

    Returns:
        A list with one entry per line, line terminators removed. Blank lines
        are kept as empty strings so that list positions still correspond to
        line numbers in the file.
    """
    # Decode explicitly as UTF-8 so captions with non-ASCII characters are
    # read identically regardless of the platform's default locale encoding.
    with open(txt_file, "r", encoding="utf-8") as f:
        return f.read().splitlines()

# Find matching image file: first, directly use the prompt as the filename, 
# and if not found, match using a prefix
def find_image_file(image_folder, prompt):
    """Locate the image file that corresponds to a prompt.

    The lookup first tries the exact file name ``{prompt}.jpg``. If that file
    does not exist, it falls back to the first file (in sorted order) whose
    name starts with the first 20 characters of the prompt.

    Args:
        image_folder: Directory that holds the generated images.
        prompt: Prompt text used to derive the file name.

    Returns:
        The path of the matching file, or ``None`` when nothing matches.
    """
    img_path = os.path.join(image_folder, prompt + ".jpg")  # Assume filename is {prompt}.jpg
    if os.path.exists(img_path):
        return img_path

    # An empty prefix would match every file in the folder and silently pair
    # an empty prompt with an unrelated image, so treat it as "no match".
    prefix = prompt[:20]
    if not prefix:
        return None

    # os.listdir() returns entries in arbitrary order; sort so that the chosen
    # fallback file is deterministic when several files share the prefix.
    for file in sorted(os.listdir(image_folder)):
        if file.startswith(prefix):
            return os.path.join(image_folder, file)

    return None

# Load a batch of images and convert them to Tensors
def load_images(image_folder, prompts_batch):
    """Load the images for a batch of prompts and stack them into one tensor.

    Prompts with no matching file, or whose file fails to load, are reported
    on stdout and skipped, so the returned prompt list may be shorter than
    ``prompts_batch``.

    Args:
        image_folder: Directory that holds the generated images.
        prompts_batch: Iterable of prompt strings.

    Returns:
        ``(images_tensor, valid_prompts)`` where ``images_tensor`` stacks the
        ``ToTensor()`` output of every loaded image along a new first
        dimension and ``valid_prompts`` lists the prompts in the same order;
        ``(None, None)`` when no image could be loaded.
    """
    to_tensor = ToTensor()  # build the transform once instead of per image
    images = []
    valid_prompts = []

    for prompt in prompts_batch:
        img_path = find_image_file(image_folder, prompt)
        if not img_path:
            print(f"No image found for prompt: {prompt}")
            continue

        try:
            # Use a context manager so the underlying file handle is closed
            # as soon as the pixel data has been converted.
            with Image.open(img_path) as image:
                image_tensor = to_tensor(image.convert("RGB"))
        except Exception as e:
            # Best effort: a single unreadable file must not abort the batch.
            print(f"Error loading image {img_path}: {e}")
            continue

        images.append(image_tensor)
        valid_prompts.append(prompt)

    if not images:
        return None, None

    # Stacking per-image tensors gives the same (N, C, H, W) batch as
    # concatenating tensors that were each unsqueezed to (1, C, H, W).
    return torch.stack(images, dim=0), valid_prompts

# Single task: process a batch of prompts and corresponding images, and calculate CLIP Score
# Per-process cache of CLIPScore metrics keyed by (model_path, device string).
_CLIP_METRIC_CACHE = {}


def _get_clip_metric(model_path, device):
    """Return the CLIPScore metric for this model/device, building it once per process."""
    key = (model_path, str(device))
    metric = _CLIP_METRIC_CACHE.get(key)
    if metric is None:
        metric = CLIPScore(model_name_or_path=model_path).to(device)
        _CLIP_METRIC_CACHE[key] = metric
    return metric


def process_batch(prompts_batch, image_folder, model_path, device):
    """Compute the CLIP score of one batch of prompts and their images.

    Args:
        prompts_batch: Iterable of prompt strings.
        image_folder: Directory that holds the generated images.
        model_path: Name or path of the CLIP model passed to ``CLIPScore``.
        device: Device on which the metric and the images are placed.

    Returns:
        The value of ``CLIPScore.compute()`` over the images that could be
        loaded for this batch, as a Python float, or ``None`` when no image
        was found.

    NOTE(review): images are fed as ``ToTensor()`` floats (range 0-1); the
    torchmetrics CLIPScore examples use 0-255 valued images. Confirm which
    range the metric expects before comparing scores across tools.
    """
    # Load the images first so an empty batch never pays for model loading.
    images_tensor, valid_prompts = load_images(image_folder, prompts_batch)
    if images_tensor is None:
        return None

    # Reuse the already-loaded model instead of re-reading the checkpoint for
    # every batch; reset() clears state accumulated by a previous batch.
    clip_score_metric = _get_clip_metric(model_path, device)
    clip_score_metric.reset()

    images_tensor = images_tensor.to(device)

    with torch.no_grad():  # Avoid building computation graph, reducing memory consumption
        # Accumulate the score of each (image, prompt) pair
        for i, prompt in enumerate(valid_prompts):
            clip_score_metric.update(images_tensor[i].unsqueeze(0).float(), prompt)

    # Release memory held by the image batch
    del images_tensor
    torch.cuda.empty_cache()

    return clip_score_metric.compute().item()

# Split data into batches
def chunked(iterable, batch_size):
    """Lazily yield consecutive slices of ``iterable`` holding at most ``batch_size`` items.

    ``iterable`` must support ``len()`` and slicing; the last slice may be
    shorter than ``batch_size``.
    """
    total = len(iterable)
    offsets = range(0, total, batch_size)
    yield from (iterable[offset:offset + batch_size] for offset in offsets)

# Main processing function
def main_worker(rank, prompts, image_folder, model_path, device, batch_size, queue):
    """Score this worker's share of the prompts and report through ``queue``.

    Protocol seen by the parent process: the integer ``1`` is put on the
    queue after every processed batch (a progress tick), and once all batches
    are done a list with the per-batch scores is put as the final message.
    Batches for which ``process_batch`` returns ``None`` add no score but
    still emit a tick. ``rank`` is accepted but not used here.
    """
    batch_scores = []

    for prompt_chunk in chunked(prompts, batch_size):
        batch_score = process_batch(prompt_chunk, image_folder, model_path, device)
        if batch_score is not None:
            batch_scores.append(batch_score)
        # Progress tick: one batch finished
        queue.put(1)

    # Final message: every score collected by this worker
    queue.put(batch_scores)

def main(prompt_file="prompts.txt", image_folder="images", batch_size=64, num_workers=4,
         model_path="/root/autodl-tmp/pretrained_models/clip-vit-large-patch14"):
    """Compute the average CLIP score of a folder of images against a prompt file.

    The prompts are split across ``num_workers`` processes; each process runs
    ``main_worker`` and reports progress ticks and a final list of per-batch
    scores through a shared queue.

    Args:
        prompt_file: Text file with one prompt per line.
        image_folder: Directory that holds the generated images.
        batch_size: Number of prompts handled per batch inside a worker.
        num_workers: Number of worker processes to start.
        model_path: Name or path of the CLIP model handed to the workers.

    Raises:
        ValueError: If ``batch_size`` or ``num_workers`` is smaller than 1.

    NOTE: the final value is the unweighted mean of the per-batch scores, so
    a short trailing batch weighs as much as a full one.
    """
    # Local import: `Empty` is what mp.Queue.get(timeout=...) raises.
    from queue import Empty

    if num_workers < 1:
        raise ValueError(f"num_workers must be >= 1, got {num_workers}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Load prompts
    prompts = load_prompts(prompt_file)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create multiprocessing queue
    queue = mp.Queue()

    # Split the prompts as evenly as possible. The first `extra` workers take
    # one additional prompt so the remainder of the division is not dropped.
    base, extra = divmod(len(prompts), num_workers)
    processes = []
    total_batches = 0
    start = 0
    for rank in range(num_workers):
        end = start + base + (1 if rank < extra else 0)
        worker_prompts = prompts[start:end]
        start = end
        if not worker_prompts:
            continue  # nothing to do: do not spawn an idle process

        # Each worker batches its own slice, so the progress total is the sum
        # of the per-worker batch counts, not ceil(len(prompts) / batch_size).
        total_batches += (len(worker_prompts) + batch_size - 1) // batch_size

        p = mp.Process(target=main_worker, args=(rank, worker_prompts, image_folder, model_path, device, batch_size, queue))
        p.start()
        processes.append(p)

    all_scores = []
    pending_workers = len(processes)

    # Use tqdm to create a progress bar
    with tqdm(total=total_batches, desc="Processing batches") as pbar:
        # Every worker ends with exactly one list message, so wait for one
        # list per worker instead of counting batch ticks; this also drains
        # the queue completely before the processes are joined.
        while pending_workers > 0:
            try:
                result = queue.get(timeout=1)
            except Empty:
                if any(p.is_alive() for p in processes):
                    continue
                # All workers have exited. Make one last attempt to pick up a
                # message flushed between the timeout and the liveness check.
                try:
                    result = queue.get(timeout=1)
                except Empty:
                    print(f"{pending_workers} worker(s) exited without reporting results.")
                    break

            if isinstance(result, list):  # If it's a list, it means final scores
                all_scores.extend(result)
                pending_workers -= 1
            else:
                pbar.update(1)  # Update progress bar

    # Wait for subprocesses to end
    for p in processes:
        p.join()

    # Calculate final result
    if all_scores:
        final_clip_score = sum(all_scores) / len(all_scores)
        print(f"Final averaged CLIP Score for folder '{image_folder}': {final_clip_score}")
    else:
        print(f"No valid images found in folder '{image_folder}'.")

if __name__ == "__main__":
    import argparse

    # Command-line interface: every option maps one-to-one onto a keyword
    # argument of main().
    parser = argparse.ArgumentParser(
        description="Calculate CLIP Score for images and prompts with batch parallel processing."
    )
    parser.add_argument(
        "--prompt_file",
        type=str,
        default="/root/autodl-tmp/COCO/COCO_caption_prompts_30k.txt",
        help="Path to the prompts text file.",
    )
    parser.add_argument(
        "--image_folder",
        type=str,
        default="/root/autodl-tmp/vis/2024-09-04_custom_epochunknown_stepunknown_scale4.5_step20_size256_bs100_sampdpm-solver_seed0",
        help="Path to the folder containing images.",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=64,
        help="Number of images to process in each batch.",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=4,
        help="Number of parallel workers.",
    )
    args = parser.parse_args()

    # CUDA cannot be used in forked children, so start workers with 'spawn'.
    mp.set_start_method('spawn', force=True)

    main(**vars(args))


================================================
FILE: README.md
================================================
<div align=center>
  
# **[ICLR 2025]** *ToCa*: Accelerating Diffusion Transformers with *To*ken-wise Feature *Ca*ching

<p>
<a href='https://arxiv.org/abs/2410.05317'><img src='https://img.shields.io/badge/Paper-arXiv-red'></a>
<a href='https://toca2024.github.io/ToCa/'><img src='https://img.shields.io/badge/Project-Page-blue'></a>
</p>

</div>

## 🔥 News

* `2025/03/10` 🚀🚀 Our latest work "From Reusing to Forecasting: Accelerating Diffusion Models with TaylorSeers" is released! The code is available at [TaylorSeer](https://github.com/Shenyi-Z/TaylorSeer)! TaylorSeer supports lossless compression at a rate of 4.99x on FLUX.1-dev (with a latency speedup of 3.53x) and high-quality acceleration at a compression rate of 5.00x on HunyuanVideo (with a latency speedup of 4.65x)! We hope *TaylorSeer* can move the paradigm of feature caching methods from reusing to forecasting. For more details, please refer to our latest research [paper](https://arxiv.org/abs/2503.06923).
* `2025/02/19` 🚀🚀 ToCa solution for **FLUX** has been officially released after adjustments, now achieving up to **3.14× lossless acceleration**!
* `2025/01/22` 💥💥 ToCa is honored to be accepted by ICLR 2025!
* `2024/12/29` 🚀🚀 We release our work [DuCa](https://arxiv.org/abs/2412.18911) about accelerating diffusion transformers for FREE, which achieves nearly lossless acceleration of **2.50×** on [OpenSora](https://github.com/hpcaitech/Open-Sora)! 🎉 **DuCa also overcomes the limitation of ToCa by fully supporting FlashAttention, enabling broader compatibility and efficiency improvements.**
* `2024/12/24` 🤗🤗 We release an open-source repo "[Awesome-Token-Reduction-for-Model-Compression](https://github.com/xuyang-liu16/Awesome-Token-Reduction-for-Model-Compression)", which collects recent awesome token reduction papers! Feel free to contribute your suggestions!
* `2024/12/20` 💥💥 Our ToCa has achieved nearly lossless acceleration of **1.51×** on [FLUX](https://huggingface.co/spaces/black-forest-labs/FLUX.1-schnell), feel free to check the latest version of our [paper](https://arxiv.org/pdf/2410.05317#page=19)!
* `2024/12/10` 💥💥 Our team's recent work, **SiTo** (https://github.com/EvelynZhang-epiclab/SiTo), has been accepted to **AAAI 2025**. It accelerates diffusion models through adaptive **Token Pruning**.
* `2024/10/16` 🤗🤗 Users with autodl accounts can now quickly experience [OpenSora-ToCa](https://www.codewithgpu.com/i/Shenyi-Z/ToCa/OpenSora-ToCa) by directly using our publicly available image!
* `2024/10/12` 🚀🚀 We release our work [ToCa](https://arxiv.org/abs/2410.05317) about accelerating diffusion transformers for FREE, which achieves nearly lossless acceleration of **2.36×** on [OpenSora](https://github.com/hpcaitech/Open-Sora)!
* `2024/07/15` 🤗🤗 We release an open-source repo "[Awesome-Generation-Acceleration](https://github.com/xuyang-liu16/Awesome-Generation-Acceleration)", which collects recent awesome generation acceleration papers! Feel free to contribute your suggestions!

## TODO:

- [x] Support for FLOPs calculation
- [x] Add the FLUX version of ToCa
- [ ] Further optimize the code logic to reduce the time consumption of tensor operations


##  Dependencies
``` cmd
Python>=3.9
CUDA>=11.8
```

## 🛠 Installation

``` cmd
git clone https://github.com/Shenyi-Z/ToCa.git
```

### Environment Settings

#### Original Models (recommended)

We evaluated our method in the same environments as the original models,
so you can set up each environment by following the requirements of the corresponding original model listed below.

Links:

| Original  Models |                     urls                     |
| :--------------: | :------------------------------------------: |
|       DiT        |   https://github.com/facebookresearch/DiT    |
|     PixArt-α     | https://github.com/PixArt-alpha/PixArt-alpha |
|     OpenSora     |    https://github.com/hpcaitech/Open-Sora    |
|       FLUX       |  https://github.com/black-forest-labs/flux   |

Besides, we provide a replica for our environment here:

<details>
<summary>From our environment.yaml</summary>

##### DiT

  ```bash
  cd DiT-ToCa
  conda env create -f environment-dit.yml
  ```

##### PixArt-α

  ```bash
  cd PixArt-alpha-ToCa
  conda env create -f environment-pixart.yml
  ```

##### OpenSora

  ```bash
  cd Open-Sora
  conda env create -f environment-opensora.yml
  pip install -v . # for development mode, `pip install -v -e .`
  ```

</details>

## 🚀 Run and evaluation

### Run DiT-ToCa

#### DDPM-250 Steps

sample images for **visualization**

```bash
cd DiT-ToCa
python sample.py --image-size 256 --num-sampling-steps 250 --cache-type attention --fresh-threshold 4 --fresh-ratio 0.07 --ratio-scheduler ToCa-ddpm250  --force-fresh global --soft-fresh-weight 0.25
```

sample images for **evaluation** (e.g., 50k)

```bash
cd DiT-ToCa
torchrun --nnodes=1 --nproc_per_node=6 sample_ddp.py --model DiT-XL/2 --per-proc-batch-size 150 --image-size 256 --cfg-scale 1.5 --num-sampling-steps 250 --cache-type attention --fresh-ratio 0.07 --ratio-scheduler ToCa-ddpm250 --force-fresh global --fresh-threshold 4 --soft-fresh-weight 0.25 --num-fid-samples 50000
```

#### DDIM-50 Steps

sample images for **visualization**

```bash
cd DiT-ToCa
python sample.py --image-size 256 --num-sampling-steps 50 --cache-type attention --fresh-threshold 3 --fresh-ratio 0.07 --ratio-scheduler ToCa-ddim50  --force-fresh global --soft-fresh-weight 0.25 --ddim-sample
```

sample images for **evaluation** (e.g., 50k)

```bash
cd DiT-ToCa
torchrun --nnodes=1 --nproc_per_node=6 sample_ddp.py --model DiT-XL/2 --per-proc-batch-size 150 --image-size 256 --cfg-scale 1.5 --num-sampling-steps 50 --cache-type attention --fresh-ratio 0.07 --ratio-scheduler ToCa-ddim50 --force-fresh global --fresh-threshold 3 --soft-fresh-weight 0.25 --num-fid-samples 50000 --ddim-sample
```

#### test FLOPs

Just add `--test-FLOPs`. Here is an example:

```bash
cd DiT-ToCa
python sample.py --image-size 256 --num-sampling-steps 50 --cache-type attention --fresh-threshold 3 --fresh-ratio 0.07 --ratio-scheduler ToCa-ddim50  --force-fresh global --soft-fresh-weight 0.25 --ddim-sample --test-FLOPs
```

### Run PixArt-α-ToCa

sample images for **visualization**

```bash
cd PixArt-alpha-ToCa
python scripts/inference.py --model_path /root/autodl-tmp/pretrained_models/PixArt-XL-2-256x256.pth --image_size 256 --bs 100 --txt_file /root/autodl-tmp/test.txt --fresh_threshold 3 --fresh_ratio 0.30 --cache_type attention --force_fresh global --soft_fresh_weight 0.25 --ratio_scheduler ToCa
```

sample images for **evaluation** (e.g., 30k for COCO, 1.6k for PartiPrompts)

```bash
cd PixArt-alpha-ToCa
torchrun --nproc_per_node=6 scripts/inference_ddp.py --model_path /root/autodl-tmp/pretrained_models/PixArt-XL-2-256x256.pth --image_size 256 --bs 100 --txt_file /root/autodl-tmp/COCO/COCO_caption_prompts_30k.txt --fresh_threshold 3 --fresh_ratio 0.30 --cache_type attention --force_fresh global --soft_fresh_weight 0.25 --ratio_scheduler ToCa
```

(If you need our npz file, it is available here: https://drive.google.com/file/d/1vUdoSgdIvtXo1cAS_aOFCJ1-XC_i1KEQ/view?usp=sharing)

### Run OpenSora-ToCa

sample video for **visualization**

```bash
cd Open-Sora
python scripts/inference.py configs/opensora-v1-2/inference/sample.py   --num-frames 2s --resolution 480p --aspect-ratio 9:16   --prompt "a beautiful waterfall"
```

sample video for **VBench evaluation**

```bash
cd Open-Sora
bash eval/vbench/launch.sh /root/autodl-tmp/pretrained_models/hpcai-tech/OpenSora-STDiT-v3/model.safetensors 51 opensora-ToCa 480p 9:16
```

(Remember to replace "/root/autodl-tmp/pretrained_models/hpcai-tech/OpenSora-STDiT-v3/model.safetensors" with your own path!)

### Run FLUX-ToCa

First, you need to enter the environment adapted for FLUX. While the official documentation uses `venv` to build the environment, you can also set it up using `conda`, which you might be more familiar with.

<details>
<summary>How to build a conda environment for FLUX?</summary>

```bash
cd flux-ToCa
conda create -n flux python=3.10
conda activate flux
pip install -e ".[all]"
```

</details>

For interactive sampling run

```bash
python -m flux --name <name> --loop
```

Or to generate a single sample run

```bash
python -m flux --name <name> \
  --height <height> --width <width> \
  --prompt "<prompt>"
```

Typically, `<name>` should be set to `flux-dev`.

Generate image samples with a txt file

```bash
python src/sample.py --prompt_file </path/to/your/prompt.txt> --width 1024 --height 1024 --model_name flux-dev --add_sampling_metadata --output_dir </path/to/your/generated/samples/folder> --num_steps 50
```

The `--add_sampling_metadata` parameter is used to control whether the prompt is added to the image's EXIF metadata.
We also provide a FLOPs-testing mode, but **in this mode, no generated samples are produced**.

```bash
python src/sample.py --prompt_file </path/to/your/test/prompt.txt> --width 1024 --height 1024 --model_name flux-dev --add_sampling_metadata --output_dir </path/to/your/generated/samples/folder> --num_steps 50 --test_FLOPs
```

Use the framework of Geneval for evaluation


```bash
python src/geneval_flux.py /root/geneval/prompts/evaluation_metadata.jsonl --model_name flux-dev --n_samples 4 --steps 50 --width 1024 --height 1024 --seed 42 --output_dir /root/autodl-tmp/samples/flux-ToCa
```

<details>
<summary>How to prepare environment for geneval?</summary>

The environment required for Geneval's metric computation is somewhat specific. As of February 2025, it is not yet possible to set up the environment directly using the default method provided in the project. However, we can follow the guidance in this Geneval issue [https://github.com/djghosh13/geneval/issues/12](https://github.com/djghosh13/geneval/issues/12) to set up the environment. The instructions are very detailed.

</details>

#### Awesome acceleration results for the Latest Version of ToCa on FLUX


| Method       | Geneval $\uparrow$<br />overall score | ImageReward $\uparrow$<br />DrawBench200 | FLOPs $\downarrow$ | Latency $\downarrow$ | Compress Ratio $\uparrow$ | Speed Up $\uparrow$ |
| ------------ | :-----------------------------------: | :-------------------------------------: | :----------------: | :------------------: | :-----------------------: | :-----------------: |
| **original** |                0.6752                 |                 0.9898                  |      3719.50       |        33.87s        |           1.00            |        1.00         |
| 60% steps    |                0.6700                 |                 0.9739                  |      2231.70       |        20.49s        |           1.67            |        1.65         |
| 50% steps    |                0.6656                 |                 0.9429                  |      1859.75       |        17.12s        |           2.00            |        1.98         |
| 40% steps    |                0.6606                 |                 0.9317                  |      1487.80       |        13.77s        |           2.62            |        2.45         |
| **FORA3**    |                0.6594                 |                 0.9227                  |      1320.07       |        12.98s        |           2.82            |        2.61         |
| **ToCa4-01** |                0.6748                 |               **0.9798**                |      1263.22       |        11.91s        |           2.94            |        2.84         |
| **ToCa5-01** |              **0.6750**               |                 0.9731                  |      1126.76       |        10.80s        |           3.30            |        3.14         |
| **ToCa6-01** |                0.6653                 |                 0.9493                  |       990.30       |        9.48s         |           3.76            |        3.57         |


<details>
<summary>Explanation of the Improved ToCa</summary>

The **acceleration effect has significantly improved while maintaining generation quality** compared with the previous version. This is because, in the current version of the code, we have further optimized ToCa and adopted more reliable metrics (Image Reward on DrawBench200, Geneval).

</details>

## 👍 Acknowledgements

- Thanks to [DiT](https://github.com/facebookresearch/DiT) for their great work and codebase upon which we build DiT-ToCa.
- Thanks to [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) for their great work and codebase upon which we build PixArt-α-ToCa.
- Thanks to [OpenSora](https://github.com/hpcaitech/Open-Sora) for their great work and codebase upon which we build OpenSora-ToCa.
- Thanks to [FLUX](https://github.com/black-forest-labs/flux) for their great work and codebase upon which we build FLUX-ToCa.

## 📌 Citation

```bibtex
@article{zou2024accelerating,
  title={Accelerating Diffusion Transformers with Token-wise Feature Caching},
  author={Zou, Chang and Liu, Xuyang and Liu, Ting and Huang, Siteng and Zhang, Linfeng},
  journal={arXiv preprint arXiv:2410.05317},
  year={2024}
}
```

## :e-mail: Contact

If you have any questions, please email [`shenyizou@outlook.com`](mailto:shenyizou@outlook.com).


================================================
FILE: flux-ToCa/.gitignore
================================================
# Created by https://www.toptal.com/developers/gitignore/api/linux,windows,macos,visualstudiocode,python
# Edit at https://www.toptal.com/developers/gitignore?templates=linux,windows,macos,visualstudiocode,python

### Linux ###
*~

# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*

# KDE directory preferences
.directory

# Linux trash folder which might appear on any partition or disk
.Trash-*

# .nfs files are created when an open file is removed but is still being accessed
.nfs*

### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon


# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace

# Local History for Visual Studio Code
.history/

### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide

### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db

# Dump file
*.stackdump

# Folder config file
[Dd]esktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp

# Windows shortcuts
*.lnk

# End of https://www.toptal.com/developers/gitignore/api/linux,windows,macos,visualstudiocode,python


================================================
FILE: flux-ToCa/LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: flux-ToCa/README.md
================================================
# FLUX
by Black Forest Labs: https://blackforestlabs.ai. Documentation for our API can be found here: [docs.bfl.ml](https://docs.bfl.ml/).

![grid](assets/grid.jpg)

This repo contains minimal inference code to run image generation & editing with our Flux models.

## Local installation

```bash
cd $HOME && git clone https://github.com/black-forest-labs/flux
cd $HOME/flux

# Using venv
python3.10 -m venv .venv
source .venv/bin/activate
pip install -e ".[all]"
```

### Models

We are offering an extensive suite of models. For more information about the individual models, please refer to the link under **Usage**.

| Name                        | Usage                                                      | HuggingFace repo                                               | License                                                               |
| --------------------------- | ---------------------------------------------------------- | -------------------------------------------------------------- | --------------------------------------------------------------------- |
| `FLUX.1 [schnell]`          | [Text to Image](docs/text-to-image.md)                     | https://huggingface.co/black-forest-labs/FLUX.1-schnell        | [apache-2.0](model_licenses/LICENSE-FLUX1-schnell)                    |
| `FLUX.1 [dev]`              | [Text to Image](docs/text-to-image.md)                     | https://huggingface.co/black-forest-labs/FLUX.1-dev            | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) |
| `FLUX.1 Fill [dev]`         | [In/Out-painting](docs/fill.md)                            | https://huggingface.co/black-forest-labs/FLUX.1-Fill-dev       | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) |
| `FLUX.1 Canny [dev]`        | [Structural Conditioning](docs/structural-conditioning.md) | https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev      | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) |
| `FLUX.1 Depth [dev]`        | [Structural Conditioning](docs/structural-conditioning.md) | https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev      | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) |
| `FLUX.1 Canny [dev] LoRA`   | [Structural Conditioning](docs/structural-conditioning.md) | https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev-lora | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) |
| `FLUX.1 Depth [dev] LoRA`   | [Structural Conditioning](docs/structural-conditioning.md) | https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev-lora | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) |
| `FLUX.1 Redux [dev]`        | [Image variation](docs/image-variation.md)                 | https://huggingface.co/black-forest-labs/FLUX.1-Redux-dev      | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) |
| `FLUX.1 [pro]`              | [Text to Image](docs/text-to-image.md)                     | [Available in our API.](https://docs.bfl.ml/)                  |                                                                       |
| `FLUX1.1 [pro]`             | [Text to Image](docs/text-to-image.md)                     | [Available in our API.](https://docs.bfl.ml/)                  |                                                                       |
| `FLUX1.1 [pro] Ultra/raw`   | [Text to Image](docs/text-to-image.md)                     | [Available in our API.](https://docs.bfl.ml/)                  |                                                                       |
| `FLUX.1 Fill [pro]`         | [In/Out-painting](docs/fill.md)                            | [Available in our API.](https://docs.bfl.ml/)                  |                                                                       |
| `FLUX.1 Canny [pro]`        | [Structural Conditioning](docs/structural-conditioning.md) | [Available in our API.](https://docs.bfl.ml/)                  |                                                                       |
| `FLUX.1 Depth [pro]`        | [Structural Conditioning](docs/structural-conditioning.md) | [Available in our API.](https://docs.bfl.ml/)                  |                                                                       |
| `FLUX1.1 Redux [pro]`       | [Image variation](docs/image-variation.md)                 | [Available in our API.](https://docs.bfl.ml/)                  |                                                                       |
| `FLUX1.1 Redux [pro] Ultra` | [Image variation](docs/image-variation.md)                 | [Available in our API.](https://docs.bfl.ml/)                  |                                                                       |

The weights of the autoencoder are also released under [apache-2.0](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md) and can be found in the HuggingFace repos above.

## API usage

Our API offers access to our models. It is documented here:
[docs.bfl.ml](https://docs.bfl.ml/).

In this repository we also offer an easy Python interface. To use this, you
first need to register with the API on [api.bfl.ml](https://api.bfl.ml/), and
create a new API key.

To use the API key either run `export BFL_API_KEY=<your_key_here>` or provide
it via the `api_key=<your_key_here>` parameter. It is also expected that you
have installed the package as above.

Usage from Python:

```python
from flux.api import ImageRequest

# this will create an api request directly but not block until the generation is finished
request = ImageRequest("A beautiful beach", name="flux.1.1-pro")
# or: request = ImageRequest("A beautiful beach", name="flux.1.1-pro", api_key="your_key_here")

# any of the following will block until the generation is finished
request.url
# -> https:<...>/sample.jpg
request.bytes
# -> b"..." bytes for the generated image
request.save("outputs/api.jpg")
# saves the sample to local storage
request.image
# -> a PIL image
```

Usage from the command line:

```bash
$ python -m flux.api --prompt="A beautiful beach" url
https:<...>/sample.jpg

# generate and save the result
$ python -m flux.api --prompt="A beautiful beach" save outputs/api

# open the image directly
$ python -m flux.api --prompt="A beautiful beach" image show
```

## Citation

If you find the provided code or models useful for your research, consider citing them as:

```bib
@misc{flux2024,
    author={Black Forest Labs},
    title={FLUX},
    year={2024},
    howpublished={\url{https://github.com/black-forest-labs/flux}},
}
```


================================================
FILE: flux-ToCa/demo_gr.py
================================================
import os
import time
import uuid

import gradio as gr
import numpy as np
import torch
from einops import rearrange
from PIL import ExifTags, Image
from transformers import pipeline

from flux.cli import SamplingOptions
from flux.sampling import denoise, get_noise, get_schedule, prepare, unpack
from flux.ideas import denoise_cache
from flux.util import configs, embed_watermark, load_ae, load_clip, load_flow_model, load_t5

NSFW_THRESHOLD = 0.85


def get_models(name: str, device: torch.device, offload: bool, is_schnell: bool):
    """Load every model the Gradio demo needs.

    Args:
        name: Model name forwarded to ``load_flow_model`` and ``load_ae``.
        device: Device used for the T5/CLIP encoders and the NSFW classifier.
        offload: When True the flow model and autoencoder are loaded on
            ``"cpu"`` instead of ``device``.
        is_schnell: Selects a T5 ``max_length`` of 256 (True) or 512 (False).

    Returns:
        Tuple ``(model, ae, t5, clip, nsfw_classifier)``.
    """
    t5_max_length = 256 if is_schnell else 512
    # The two large networks live on CPU when offloading is requested.
    weights_device = "cpu" if offload else device

    text_encoder = load_t5(device, max_length=t5_max_length)
    clip_encoder = load_clip(device)
    flow_model = load_flow_model(name, device=weights_device)
    autoencoder = load_ae(name, device=weights_device)
    nsfw_classifier = pipeline(
        "image-classification",
        model="Falconsai/nsfw_image_detection",
        device=device,
    )
    return flow_model, autoencoder, text_encoder, clip_encoder, nsfw_classifier


class FluxGenerator:
    """Owns the loaded FLUX models and serves generation requests for the Gradio demo."""

    def __init__(self, model_name: str, device: str, offload: bool):
        """Load all models once so that every request can reuse them.

        Args:
            model_name: Name of the FLUX variant; ``"flux-schnell"`` enables the
                schnell-specific settings (shorter T5 context, no schedule shift).
            device: Device string the models run on.
            offload: When True, the flow model and autoencoder are loaded on CPU
                and moved to ``device`` only for the stage that needs them.
        """
        self.device = torch.device(device)
        self.offload = offload
        self.model_name = model_name
        self.is_schnell = model_name == "flux-schnell"
        self.model, self.ae, self.t5, self.clip, self.nsfw_classifier = get_models(
            model_name,
            device=self.device,
            offload=self.offload,
            is_schnell=self.is_schnell,
        )

    @torch.inference_mode()
    def generate_image(
        self,
        width,
        height,
        num_steps,
        guidance,
        seed,
        prompt,
        init_image=None,
        image2image_strength=0.0,
        add_sampling_metadata=True,
    ):
        """Generate one image (text-to-image, or image-to-image when ``init_image`` is given).

        Args:
            width: Output width in pixels.
            height: Output height in pixels.
            num_steps: Number of sampling steps.
            guidance: Guidance value forwarded to the denoiser.
            seed: Anything ``int()`` accepts; ``-1`` requests a random seed.
            prompt: Text prompt.
            init_image: Optional start image. A ``numpy.ndarray`` is treated as a
                channel-last 0-255 image and rescaled to [-1, 1]; any other value
                is used as-is (presumably an already normalized NCHW tensor --
                confirm against callers).
            image2image_strength: Fraction of the schedule that is still sampled
                when ``init_image`` is given (1.0 starts from pure noise).
            add_sampling_metadata: When True the prompt is written to the EXIF
                ``ImageDescription`` tag of the saved file.

        Returns:
            ``(image, seed, filename, warning)``. When the NSFW score is below
            ``NSFW_THRESHOLD`` this is ``(PIL image, str(seed), saved path, None)``;
            otherwise ``(None, str(seed), None, warning message)``.
        """
        seed = int(seed)
        if seed == -1:
            seed = None

        opts = SamplingOptions(
            prompt=prompt,
            width=width,
            height=height,
            num_steps=num_steps,
            guidance=guidance,
            seed=seed,
        )

        if opts.seed is None:
            opts.seed = torch.Generator(device="cpu").seed()
        print(f"Generating '{opts.prompt}' with seed {opts.seed}")
        t0 = time.perf_counter()

        if init_image is not None:
            if isinstance(init_image, np.ndarray):
                # HWC 0-255 -> NCHW in [-1, 1]. The decode path below maps
                # [-1, 1] back to [0, 255], so the encoder input has to use the
                # same range (the previous `/ 255.0` produced [0, 1]).
                init_image = torch.from_numpy(init_image).permute(2, 0, 1).float() / 127.5 - 1.0
                init_image = init_image.unsqueeze(0)
            init_image = init_image.to(self.device)
            init_image = torch.nn.functional.interpolate(init_image, (opts.height, opts.width))
            if self.offload:
                self.ae.encoder.to(self.device)
            init_image = self.ae.encode(init_image)
            if self.offload:
                self.ae = self.ae.cpu()
                torch.cuda.empty_cache()

        # prepare input
        x = get_noise(
            1,
            opts.height,
            opts.width,
            device=self.device,
            dtype=torch.bfloat16,
            seed=opts.seed,
        )
        timesteps = get_schedule(
            opts.num_steps,
            x.shape[-1] * x.shape[-2] // 4,
            shift=(not self.is_schnell),
        )
        if init_image is not None:
            # Skip the first part of the schedule and start from a blend of
            # noise and the encoded init image at the matching noise level.
            t_idx = int((1 - image2image_strength) * num_steps)
            t = timesteps[t_idx]
            timesteps = timesteps[t_idx:]
            x = t * x + (1.0 - t) * init_image.to(x.dtype)

        if self.offload:
            self.t5, self.clip = self.t5.to(self.device), self.clip.to(self.device)
        inp = prepare(t5=self.t5, clip=self.clip, img=x, prompt=opts.prompt)

        # offload TEs to CPU, load model to gpu
        if self.offload:
            self.t5, self.clip = self.t5.cpu(), self.clip.cpu()
            torch.cuda.empty_cache()
            self.model = self.model.to(self.device)

        # denoise initial noise
        x = denoise_cache(self.model, **inp, timesteps=timesteps, guidance=opts.guidance)

        # offload model, load autoencoder to gpu
        if self.offload:
            self.model.cpu()
            torch.cuda.empty_cache()
            self.ae.decoder.to(x.device)

        # decode latents to pixel space
        x = unpack(x.float(), opts.height, opts.width)
        with torch.autocast(device_type=self.device.type, dtype=torch.bfloat16):
            x = self.ae.decode(x)

        if self.offload:
            self.ae.decoder.cpu()
            torch.cuda.empty_cache()

        t1 = time.perf_counter()

        print(f"Done in {t1 - t0:.1f}s.")
        # bring into PIL format
        x = x.clamp(-1, 1)
        x = embed_watermark(x.float())
        x = rearrange(x[0], "c h w -> h w c")

        img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy())
        # Use a distinct loop name so the image tensor `x` is not shadowed.
        nsfw_score = [
            result["score"] for result in self.nsfw_classifier(img) if result["label"] == "nsfw"
        ][0]

        if nsfw_score < NSFW_THRESHOLD:
            filename = f"output/gradio/{uuid.uuid4()}.jpg"
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            exif_data = Image.Exif()
            if init_image is None:
                exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux"
            else:
                exif_data[ExifTags.Base.Software] = "AI generated;img2img;flux"
            exif_data[ExifTags.Base.Make] = "Black Forest Labs"
            exif_data[ExifTags.Base.Model] = self.model_name
            if add_sampling_metadata:
                exif_data[ExifTags.Base.ImageDescription] = prompt

            img.save(filename, format="jpeg", exif=exif_data, quality=95, subsampling=0)

            return img, str(opts.seed), filename, None
        else:
            return None, str(opts.seed), None, "Your generated image may contain NSFW content."


def create_demo(
    model_name: str, device: str = "cuda" if torch.cuda.is_available() else "cpu", offload: bool = False
):
    """Build the Gradio Blocks UI for one FLUX model.

    Args:
        model_name: Name of the model to load; ``"flux-schnell"`` disables the
            image-to-image and guidance controls and defaults to 4 steps.
        device: Device string passed to ``FluxGenerator``. The default is
            evaluated once, when this module is imported.
        offload: Passed to ``FluxGenerator`` to keep idle models on CPU.

    Returns:
        The ``gr.Blocks`` app; the caller is expected to ``launch()`` it.
    """
    generator = FluxGenerator(model_name, device, offload)
    is_schnell = model_name == "flux-schnell"

    with gr.Blocks() as demo:
        gr.Markdown(f"# Flux Image Generation Demo - Model: {model_name}")

        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(
                    label="Prompt",
                    value='a photo of a forest with mist swirling around the tree trunks. The word "FLUX" is painted over it in big, red brush strokes with visible texture',
                )
                do_img2img = gr.Checkbox(label="Image to Image", value=False, interactive=not is_schnell)
                init_image = gr.Image(label="Input Image", visible=False)
                image2image_strength = gr.Slider(
                    0.0, 1.0, 0.8, step=0.1, label="Noising strength", visible=False
                )

                with gr.Accordion("Advanced Options", open=False):
                    width = gr.Slider(128, 8192, 1360, step=16, label="Width")
                    height = gr.Slider(128, 8192, 768, step=16, label="Height")
                    num_steps = gr.Slider(1, 50, 4 if is_schnell else 50, step=1, label="Number of steps")
                    guidance = gr.Slider(
                        1.0, 10.0, 3.5, step=0.1, label="Guidance", interactive=not is_schnell
                    )
                    seed = gr.Textbox(-1, label="Seed (-1 for random)")
                    add_sampling_metadata = gr.Checkbox(
                        label="Add sampling parameters to metadata?", value=True
                    )

                generate_btn = gr.Button("Generate")

            with gr.Column():
                output_image = gr.Image(label="Generated Image")
                seed_output = gr.Number(label="Used Seed")
                warning_text = gr.Textbox(label="Warning", visible=False)
                download_btn = gr.File(label="Download full-resolution")

        def update_img2img(do_img2img):
            # Show the image-to-image controls only while the checkbox is ticked.
            return {
                init_image: gr.update(visible=do_img2img),
                image2image_strength: gr.update(visible=do_img2img),
            }

        def run_generation(
            width,
            height,
            num_steps,
            guidance,
            seed,
            prompt,
            use_img2img,
            init_image,
            image2image_strength,
            add_sampling_metadata,
        ):
            # A hidden image component keeps its last upload, so drop the image
            # explicitly when image-to-image is switched off.
            if not use_img2img:
                init_image = None
            img, used_seed, filename, warning = generator.generate_image(
                width,
                height,
                num_steps,
                guidance,
                seed,
                prompt,
                init_image,
                image2image_strength,
                add_sampling_metadata,
            )
            # The warning box starts hidden; reveal it only when there is a
            # message, otherwise the NSFW notice would never reach the user.
            return img, used_seed, filename, gr.update(value=warning, visible=warning is not None)

        do_img2img.change(update_img2img, do_img2img, [init_image, image2image_strength])

        generate_btn.click(
            fn=run_generation,
            inputs=[
                width,
                height,
                num_steps,
                guidance,
                seed,
                prompt,
                do_img2img,
                init_image,
                image2image_strength,
                add_sampling_metadata,
            ],
            outputs=[output_image, seed_output, download_btn, warning_text],
        )

    return demo


if __name__ == "__main__":
    import argparse

    # Fall back to CPU when torch cannot see a CUDA device.
    default_device = "cuda" if torch.cuda.is_available() else "cpu"

    # Command-line options: model choice, device, CPU offloading, public share link.
    parser = argparse.ArgumentParser(description="Flux")
    parser.add_argument(
        "--name",
        type=str,
        default="flux-schnell",
        choices=list(configs.keys()),
        help="Model name",
    )
    parser.add_argument("--device", type=str, default=default_device, help="Device to use")
    parser.add_argument(
        "--offload",
        action="store_true",
        help="Offload model to CPU when not in use",
    )
    parser.add_argument(
        "--share",
        action="store_true",
        help="Create a public link to your demo",
    )
    args = parser.parse_args()

    # Build the UI for the selected model and start serving it.
    demo = create_demo(model_name=args.name, device=args.device, offload=args.offload)
    demo.launch(share=args.share)


================================================
FILE: flux-ToCa/demo_st.py
================================================
import os
import re
import time
from glob import iglob
from io import BytesIO

import streamlit as st
import torch
from einops import rearrange
from fire import Fire
from PIL import ExifTags, Image
from st_keyup import st_keyup
from torchvision import transforms
from transformers import pipeline

from flux.cli import SamplingOptions
from flux.sampling import denoise, get_noise, get_schedule, prepare, unpack
from flux.ideas import denoise_cache
from flux.util import (
    configs,
    embed_watermark,
    load_ae,
    load_clip,
    load_flow_model,
    load_t5,
)

NSFW_THRESHOLD = 0.85


@st.cache_resource()
def get_models(name: str, device: torch.device, offload: bool, is_schnell: bool):
    """Load all networks used by the demo.

    Args:
        name: Model name handed to ``load_flow_model`` and ``load_ae``.
        device: Device for the text encoders and the NSFW classifier.
        offload: If true, the flow model and autoencoder are loaded on the CPU
            instead of ``device``.
        is_schnell: Selects the T5 ``max_length`` (256 when true, 512 otherwise).

    Returns:
        Tuple ``(model, ae, t5, clip, nsfw_classifier)``.
    """
    # The two large networks share one placement decision.
    weights_device = "cpu" if offload else device
    t5_max_length = 256 if is_schnell else 512

    text_t5 = load_t5(device, max_length=t5_max_length)
    text_clip = load_clip(device)
    flow_model = load_flow_model(name, device=weights_device)
    autoencoder = load_ae(name, device=weights_device)
    nsfw = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)
    return flow_model, autoencoder, text_t5, text_clip, nsfw


def get_image() -> torch.Tensor | None:
    """Show a file uploader and return the uploaded picture as a batched tensor.

    Returns:
        ``None`` while nothing is uploaded; otherwise the RGB image converted
        with ``ToTensor``, rescaled as ``2 * x - 1`` and given a leading batch
        dimension.
    """
    upload = st.file_uploader("Input", type=["jpg", "JPEG", "png"])
    if upload is None:
        return None

    rgb = Image.open(upload).convert("RGB")
    pipeline_steps = [
        transforms.ToTensor(),
        transforms.Lambda(lambda t: 2.0 * t - 1.0),
    ]
    pixels: torch.Tensor = transforms.Compose(pipeline_steps)(rgb)
    return pixels[None, ...]


@torch.inference_mode()
def main(
    device: str = "cuda" if torch.cuda.is_available() else "cpu",
    offload: bool = False,
    output_dir: str = "output",
):
    """Render the Streamlit text-to-image / image-to-image demo and sample on request.

    The function lays out all widgets. When sampling is triggered it runs
    ``denoise_cache`` on the selected flow model, decodes the latents with the
    autoencoder, embeds a watermark, screens the image with the NSFW classifier
    and stores the result in ``st.session_state["samples"]`` so it can be shown
    and downloaded.

    Args:
        device: Torch device string used for sampling.
        offload: If true, the text encoders, flow model and autoencoder are moved
            between the CPU and ``device`` around the stage that needs them.
        output_dir: Directory in which ``img_<idx>.jpg`` samples are written;
            it is created when missing.
    """
    torch_device = torch.device(device)
    names = list(configs.keys())
    name = st.selectbox("Which model to load?", names)
    # Nothing is loaded until the user explicitly ticks the checkbox.
    if name is None or not st.checkbox("Load model", False):
        return

    is_schnell = name == "flux-schnell"
    model, ae, t5, clip, nsfw_classifier = get_models(
        name,
        device=torch_device,
        offload=offload,
        is_schnell=is_schnell,
    )

    # Image-to-image is never active for flux-schnell: the checkbox is disabled
    # and its value is additionally masked by `not is_schnell`.
    do_img2img = (
        st.checkbox(
            "Image to Image",
            False,
            disabled=is_schnell,
            help="Partially noise an image and denoise again to get variations.\n\nOnly works for flux-dev",
        )
        and not is_schnell
    )
    if do_img2img:
        init_image = get_image()
        if init_image is None:
            st.warning("Please add an image to do image to image")
        image2image_strength = st.number_input("Noising strength", min_value=0.0, max_value=1.0, value=0.8)
        if init_image is not None:
            h, w = init_image.shape[-2:]
            st.write(f"Got image of size {w}x{h} ({h*w/1e6:.2f}MP)")
        # Without an uploaded image the width/height inputs below stay enabled.
        resize_img = st.checkbox("Resize image", False) or init_image is None
    else:
        init_image = None
        resize_img = True
        image2image_strength = 0.0

    # allow for packing and conversion to latent space
    width = int(
        16 * (st.number_input("Width", min_value=128, value=1360, step=16, disabled=not resize_img) // 16)
    )
    height = int(
        16 * (st.number_input("Height", min_value=128, value=768, step=16, disabled=not resize_img) // 16)
    )
    num_steps = int(st.number_input("Number of steps", min_value=1, value=(4 if is_schnell else 50)))
    guidance = float(st.number_input("Guidance", min_value=1.0, value=3.5, disabled=is_schnell))
    seed_str = st.text_input("Seed", disabled=is_schnell)
    # Only a string made of decimal digits is accepted as a seed.
    if seed_str.isdecimal():
        seed = int(seed_str)
    else:
        st.info("No seed set, set to positive integer to enable")
        seed = None
    save_samples = st.checkbox("Save samples?", not is_schnell)
    add_sampling_metadata = st.checkbox("Add sampling parameters to metadata?", True)

    default_prompt = (
        "a photo of a forest with mist swirling around the tree trunks. The word "
        '"FLUX" is painted over it in big, red brush strokes with visible texture'
    )
    prompt = st_keyup("Enter a prompt", value=default_prompt, debounce=300, key="interactive_text")

    # Pick the next free sample index: one past the highest existing img_<n>.jpg.
    output_name = os.path.join(output_dir, "img_{idx}.jpg")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        idx = 0
    else:
        fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)]
        if len(fns) > 0:
            idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1
        else:
            idx = 0

    rng = torch.Generator(device="cpu")

    # Seed kept in the session state across reruns; it is used for flux-schnell
    # and stepped by the two arrow buttons below.
    if "seed" not in st.session_state:
        st.session_state.seed = rng.seed()

    def increment_counter():
        st.session_state.seed += 1

    def decrement_counter():
        # Never step below zero.
        if st.session_state.seed > 0:
            st.session_state.seed -= 1

    opts = SamplingOptions(
        prompt=prompt,
        width=width,
        height=height,
        num_steps=num_steps,
        guidance=guidance,
        seed=seed,
    )

    if name == "flux-schnell":
        cols = st.columns([5, 1, 1, 5])
        with cols[1]:
            st.button("↩", on_click=increment_counter)
        with cols[2]:
            st.button("↪", on_click=decrement_counter)
    # flux-schnell samples on every script run; other models wait for the button.
    if is_schnell or st.button("Sample"):
        if is_schnell:
            opts.seed = st.session_state.seed
        elif opts.seed is None:
            opts.seed = rng.seed()
        print(f"Generating '{opts.prompt}' with seed {opts.seed}")
        t0 = time.perf_counter()

        if init_image is not None:
            if resize_img:
                init_image = torch.nn.functional.interpolate(init_image, (opts.height, opts.width))
            else:
                # Crop to multiples of 16 and adopt the image size as the output size.
                h, w = init_image.shape[-2:]
                init_image = init_image[..., : 16 * (h // 16), : 16 * (w // 16)]
                opts.height = init_image.shape[-2]
                opts.width = init_image.shape[-1]
            if offload:
                ae.encoder.to(torch_device)
            # From here on `init_image` holds the autoencoder's encoding of the image.
            init_image = ae.encode(init_image.to(torch_device))
            if offload:
                ae = ae.cpu()
                torch.cuda.empty_cache()

        # prepare input
        x = get_noise(
            1,
            opts.height,
            opts.width,
            device=torch_device,
            dtype=torch.bfloat16,
            seed=opts.seed,
        )
        # divide pixel space by 16**2 to account for latent space conversion
        timesteps = get_schedule(
            opts.num_steps,
            (x.shape[-1] * x.shape[-2]) // 4,
            shift=(not is_schnell),
        )
        if init_image is not None:
            # Skip the first part of the schedule (the lower the strength, the more
            # is skipped) and blend the noise with the encoded image at the first
            # remaining timestep.
            t_idx = int((1 - image2image_strength) * num_steps)
            t = timesteps[t_idx]
            timesteps = timesteps[t_idx:]
            x = t * x + (1.0 - t) * init_image.to(x.dtype)

        if offload:
            t5, clip = t5.to(torch_device), clip.to(torch_device)
        inp = prepare(t5=t5, clip=clip, img=x, prompt=opts.prompt)

        # offload TEs to CPU, load model to gpu
        if offload:
            t5, clip = t5.cpu(), clip.cpu()
            torch.cuda.empty_cache()
            model = model.to(torch_device)

        # denoise initial noise
        x = denoise_cache(model, **inp, timesteps=timesteps, guidance=opts.guidance)

        # offload model, load autoencoder to gpu
        if offload:
            model.cpu()
            torch.cuda.empty_cache()
            ae.decoder.to(x.device)

        # decode latents to pixel space
        x = unpack(x.float(), opts.height, opts.width)
        with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
            x = ae.decode(x)

        if offload:
            ae.decoder.cpu()
            torch.cuda.empty_cache()

        t1 = time.perf_counter()

        fn = output_name.format(idx=idx)
        print(f"Done in {t1 - t0:.1f}s.")
        # bring into PIL format and save
        x = x.clamp(-1, 1)
        x = embed_watermark(x.float())
        x = rearrange(x[0], "c h w -> h w c")

        # Map [-1, 1] to [0, 255] bytes.
        img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy())
        # Score reported for the "nsfw" label; the comprehension's `x` is local to
        # it and does not rebind the tensor above.
        nsfw_score = [x["score"] for x in nsfw_classifier(img) if x["label"] == "nsfw"][0]

        if nsfw_score < NSFW_THRESHOLD:
            buffer = BytesIO()
            exif_data = Image.Exif()
            if init_image is None:
                exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux"
            else:
                exif_data[ExifTags.Base.Software] = "AI generated;img2img;flux"
            exif_data[ExifTags.Base.Make] = "Black Forest Labs"
            exif_data[ExifTags.Base.Model] = name
            if add_sampling_metadata:
                exif_data[ExifTags.Base.ImageDescription] = prompt
            img.save(buffer, format="jpeg", exif=exif_data, quality=95, subsampling=0)

            img_bytes = buffer.getvalue()
            if save_samples:
                print(f"Saving {fn}")
                with open(fn, "wb") as file:
                    file.write(img_bytes)
                # NOTE(review): `idx` is recomputed from disk on every run, so this
                # increment does not appear to be read again - confirm.
                idx += 1

            # Keep the result in the session so it survives later reruns.
            st.session_state["samples"] = {
                "prompt": opts.prompt,
                "img": img,
                "seed": opts.seed,
                "bytes": img_bytes,
            }
            # NOTE(review): `opts` is rebuilt on every run, so this reset looks
            # like it has no lasting effect - confirm.
            opts.seed = None
        else:
            st.warning("Your generated image may contain NSFW content.")
            st.session_state["samples"] = None

    # Show the most recent accepted sample, if any.
    samples = st.session_state.get("samples", None)
    if samples is not None:
        st.image(samples["img"], caption=samples["prompt"])
        st.download_button(
            "Download full-resolution",
            samples["bytes"],
            file_name="generated.jpg",
            mime="image/jpg",
        )
        st.write(f"Seed: {samples['seed']}")


def app():
    """Expose ``main`` as a command-line interface through ``Fire``."""
    Fire(component=main)


# Start the command-line wrapper when this file is executed as a script.
if __name__ == "__main__":
    app()


================================================
FILE: flux-ToCa/demo_st_fill.py
================================================
import os
import re
import tempfile
import time
from glob import iglob
from io import BytesIO

import numpy as np
import streamlit as st
import torch
from einops import rearrange
from PIL import ExifTags, Image
from st_keyup import st_keyup
from streamlit_drawable_canvas import st_canvas
from transformers import pipeline

from flux.sampling import denoise, get_noise, get_schedule, prepare_fill, unpack
from flux.ideas import denoise_cache
from flux.util import embed_watermark, load_ae, load_clip, load_flow_model, load_t5

NSFW_THRESHOLD = 0.85


def add_border_and_mask(image, zoom_all=1.0, zoom_left=0, zoom_right=0, zoom_up=0, zoom_down=0, overlap=0):
    """Pad ``image`` with a black border and build the matching outpainting mask.

    Args:
        image: Source image; only ``image.size`` is read and it is pasted onto
            the new canvas.
        zoom_all: Values above 1.0 add ``(zoom_all - 1) / 2`` of the original
            width/height to each side.
        zoom_left, zoom_right, zoom_up, zoom_down: Extra padding for one side,
            as a fraction of the original width (left/right) or height (up/down).
        overlap: Fraction of the original width/height by which the black
            (kept) mask region is shrunk on every side.

    Returns:
        ``(bordered_image, mask)``: an RGB canvas whose sides are multiples of
        32, and an ``"L"`` mask that is white everywhere except the kept
        rectangle.
    """
    src_w, src_h = image.size

    # Per-side padding in pixels, each a fraction of the original size.
    pad_l, pad_r = int(src_w * zoom_left), int(src_w * zoom_right)
    pad_t, pad_b = int(src_h * zoom_up), int(src_h * zoom_down)

    # The uniform zoom-out is split evenly between opposite sides and added on top.
    if zoom_all > 1.0:
        half_extra = (zoom_all - 1.0) / 2
        extra_w = int(src_w * half_extra)
        extra_h = int(src_h * half_extra)
        pad_l += extra_w
        pad_r += extra_w
        pad_t += extra_h
        pad_b += extra_h

    # Distance by which the kept region is pulled in from the image edges.
    inset_x = int(src_w * overlap)
    inset_y = int(src_h * overlap)

    # Canvas size snapped to the nearest multiple of 32.
    canvas_size = (
        32 * round((src_w + pad_l + pad_r) / 32),
        32 * round((src_h + pad_t + pad_b) / 32),
    )

    # Black canvas with the original pasted at the top-left padding offset.
    canvas = Image.new("RGB", canvas_size, (0, 0, 0))
    canvas.paste(image, (pad_l, pad_t))

    # Start fully white, then blacken the (inset) area covered by the original.
    fill_mask = Image.new("L", canvas_size, 255)
    keep_box = (
        pad_l + inset_x,
        pad_t + inset_y,
        pad_l + src_w - inset_x,
        pad_t + src_h - inset_y,
    )
    fill_mask.paste(0, keep_box)

    return canvas, fill_mask


@st.cache_resource()
def get_models(name: str, device: torch.device, offload: bool):
    """Load all networks used by the fill demo.

    Args:
        name: Model name handed to ``load_flow_model`` and ``load_ae``.
        device: Device for the text encoders and the NSFW classifier.
        offload: If true, the flow model and autoencoder are loaded on the CPU
            instead of ``device``.

    Returns:
        Tuple ``(model, ae, t5, clip, nsfw_classifier)``.
    """
    # The two large networks share one placement decision.
    weights_device = "cpu" if offload else device

    text_t5 = load_t5(device, max_length=128)
    text_clip = load_clip(device)
    flow_model = load_flow_model(name, device=weights_device)
    autoencoder = load_ae(name, device=weights_device)
    nsfw = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)
    return flow_model, autoencoder, text_t5, text_clip, nsfw


def resize(img: Image.Image, min_mp: float = 0.5, max_mp: float = 2.0) -> Image.Image:
    """Bring ``img`` into the megapixel range with sides that are multiples of 32.

    Args:
        img: Image to check.
        min_mp: Lower megapixel bound.
        max_mp: Upper megapixel bound.

    Returns:
        ``img`` itself when it is already within bounds and 32-aligned,
        otherwise a LANCZOS-resampled copy.
    """

    def snap(value):
        # Nearest multiple of 32.
        return int(32 * round(value / 32))

    w, h = img.size
    megapixels = (w * h) / 1_000_000

    if min_mp <= megapixels <= max_mp:
        # Size is acceptable; only the 32-pixel alignment may need fixing.
        target = (snap(w), snap(h))
        if target == (w, h):
            return img
    else:
        # Scale both sides by the square root of the area ratio to the violated bound.
        bound = min_mp if megapixels < min_mp else max_mp
        factor = (bound / megapixels) ** 0.5
        target = (snap(w * factor), snap(h * factor))

    return img.resize(target, Image.Resampling.LANCZOS)


def clear_canvas_state():
    """Remove the canvas-related entries from the session state, if present."""
    for stale_key in ("canvas", "last_image_dims"):
        if stale_key not in st.session_state:
            continue
        del st.session_state[stale_key]


def set_new_image(img: Image.Image):
    """Make ``img`` the current image, reset canvas state and rerun the script."""
    # Canvas entries belong to the previous image and must not carry over.
    clear_canvas_state()
    st.session_state["current_image"] = img
    st.rerun()


def downscale_image(img: Image.Image, scale_factor: float) -> Image.Image:
    """Shrink ``img`` by ``scale_factor``, keeping both sides multiples of 32.

    Args:
        img: Image to shrink.
        scale_factor: Multiplier for both sides; values of 1.0 or more leave
            the image untouched.

    Returns:
        ``img`` itself when no downscaling is requested, otherwise a
        LANCZOS-resampled copy whose sides are at least 64 pixels.
    """
    if scale_factor >= 1.0:
        return img

    w, h = img.size
    # Snap each scaled side to a multiple of 32, but never go below 64 pixels.
    target = tuple(max(64, int(32 * round(side * scale_factor / 32))) for side in (w, h))
    return img.resize(target, Image.Resampling.LANCZOS)


@torch.inference_mode()
def main(
    device: str = "cuda" if torch.cuda.is_available() else "cpu",
    offload: bool = False,
    output_dir: str = "output",
):
    """Render the Streamlit inpainting/outpainting demo for ``flux-dev-fill``.

    The page lets the user upload an image, optionally downscale it, and then
    either draw a mask on a canvas (inpainting) or add a border to be filled
    (outpainting). On "Generate" the image and mask are written to temporary
    PNG files, passed to ``prepare_fill``, denoised with ``denoise_cache``,
    decoded, watermarked, screened by the NSFW classifier and stored in
    ``st.session_state["samples"]`` for display and download.

    Args:
        device: Torch device string used for sampling.
        offload: If true, the text encoders, flow model and autoencoder are
            moved between the CPU and ``device`` around the stage that needs them.
        output_dir: Directory in which ``img_<idx>.jpg`` samples are written;
            it is created when missing.
    """
    torch_device = torch.device(device)
    st.title("Flux Fill: Inpainting & Outpainting")

    # Model selection and loading
    name = "flux-dev-fill"
    if not st.checkbox("Load model", False):
        return

    try:
        model, ae, t5, clip, nsfw_classifier = get_models(
            name,
            device=torch_device,
            offload=offload,
        )
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return

    # Mode selection
    mode = st.radio("Select Mode", ["Inpainting", "Outpainting"])

    # Image handling - either from previous generation or new upload
    if "input_image" in st.session_state:
        image = st.session_state["input_image"]
        del st.session_state["input_image"]
        set_new_image(image)
        st.write("Continuing from previous result")
    else:
        uploaded_image = st.file_uploader("Upload image", type=["jpg", "jpeg", "png"])
        if uploaded_image is None:
            st.warning("Please upload an image")
            return

        if (
            "current_image_name" not in st.session_state
            or st.session_state["current_image_name"] != uploaded_image.name
        ):
            # A file with a new name was uploaded: decode it and start over.
            try:
                image = Image.open(uploaded_image).convert("RGB")
                st.session_state["current_image_name"] = uploaded_image.name
                set_new_image(image)
            except Exception as e:
                st.error(f"Error loading image: {e}")
                return
        else:
            # Same upload as before: keep working on the image held in the session
            # (which may be a downscaled or previously generated version).
            image = st.session_state.get("current_image")
            if image is None:
                st.error("Error: Image state is invalid. Please reupload the image.")
                clear_canvas_state()
                return

    # Add downscale control
    with st.expander("Image Size Control"):
        current_mp = (image.size[0] * image.size[1]) / 1_000_000
        st.write(f"Current image size: {image.size[0]}x{image.size[1]} ({current_mp:.1f}MP)")

        scale_factor = st.slider(
            "Downscale Factor",
            min_value=0.1,
            max_value=1.0,
            value=1.0,
            step=0.1,
            help="1.0 = original size, 0.5 = half size, etc.",
        )

        if scale_factor < 1.0 and st.button("Apply Downscaling"):
            image = downscale_image(image, scale_factor)
            # set_new_image() stores the image and already requests the rerun.
            set_new_image(image)

    # Resize image with validation
    try:
        original_mp = (image.size[0] * image.size[1]) / 1_000_000
        image = resize(image)
        width, height = image.size
        current_mp = (width * height) / 1_000_000

        if width % 32 != 0 or height % 32 != 0:
            st.error("Error: Image dimensions must be multiples of 32")
            return

        st.write(f"Image dimensions: {width}x{height} pixels")
        if original_mp != current_mp:
            st.write(
                f"Image has been resized from {original_mp:.1f}MP to {current_mp:.1f}MP to stay within bounds (0.5MP - 2MP)"
            )
    except Exception as e:
        st.error(f"Error processing image: {e}")
        return

    if mode == "Outpainting":
        # Outpainting controls
        zoom_all = st.slider("Zoom Out Amount (All Sides)", min_value=1.0, max_value=3.0, value=1.0, step=0.1)

        with st.expander("Advanced Zoom Controls"):
            st.info("These controls add additional zoom to specific sides")
            col1, col2 = st.columns(2)
            with col1:
                zoom_left = st.slider("Left", min_value=0.0, max_value=1.0, value=0.0, step=0.1)
                zoom_right = st.slider("Right", min_value=0.0, max_value=1.0, value=0.0, step=0.1)
            with col2:
                zoom_up = st.slider("Up", min_value=0.0, max_value=1.0, value=0.0, step=0.1)
                zoom_down = st.slider("Down", min_value=0.0, max_value=1.0, value=0.0, step=0.1)

        overlap = st.slider("Overlap", min_value=0.01, max_value=0.25, value=0.01, step=0.01)

        # Generate bordered image and mask
        image_for_generation, mask = add_border_and_mask(
            image,
            zoom_all=zoom_all,
            zoom_left=zoom_left,
            zoom_right=zoom_right,
            zoom_up=zoom_up,
            zoom_down=zoom_down,
            overlap=overlap,
        )
        # Sampling runs at the size of the bordered image, not the original.
        width, height = image_for_generation.size

        # Show preview
        col1, col2 = st.columns(2)
        with col1:
            st.image(image_for_generation, caption="Image with Border")
        with col2:
            st.image(mask, caption="Mask (white areas will be generated)")

    else:  # Inpainting mode
        # Canvas setup with dimension tracking: the widget key includes the size,
        # so a differently sized image gets a fresh canvas.
        canvas_key = f"canvas_{width}_{height}"
        if "last_image_dims" not in st.session_state:
            st.session_state.last_image_dims = (width, height)
        elif st.session_state.last_image_dims != (width, height):
            clear_canvas_state()
            st.session_state.last_image_dims = (width, height)
            st.rerun()

        try:
            canvas_result = st_canvas(
                fill_color="rgba(255, 255, 255, 0.0)",
                stroke_width=st.slider("Brush size", 1, 500, 50),
                stroke_color="#fff",
                background_image=image,
                height=height,
                width=width,
                drawing_mode="freedraw",
                key=canvas_key,
                display_toolbar=True,
            )
        except Exception as e:
            st.error(f"Error creating canvas: {e}")
            clear_canvas_state()
            st.rerun()
            return

    # Sampling parameters
    num_steps = int(st.number_input("Number of steps", min_value=1, value=50))
    guidance = float(st.number_input("Guidance", min_value=1.0, value=30.0))
    seed_str = st.text_input("Seed")
    # Only a string made of decimal digits is accepted as a seed.
    if seed_str.isdecimal():
        seed = int(seed_str)
    else:
        st.info("No seed set, using random seed")
        seed = None

    save_samples = st.checkbox("Save samples?", True)
    add_sampling_metadata = st.checkbox("Add sampling parameters to metadata?", True)

    # Prompt input
    prompt = st_keyup("Enter a prompt", value="", debounce=300, key="interactive_text")

    # Setup output path
    output_name = os.path.join(output_dir, "img_{idx}.jpg")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        idx = 0
    else:
        # Continue one past the highest existing index. Counting the files would
        # reuse (and overwrite) an index whenever the numbering has gaps.
        index_pattern = re.compile(r"img_([0-9]+)\.jpg$")
        taken = [
            int(found.group(1))
            for fn in iglob(output_name.format(idx="*"))
            if (found := index_pattern.search(fn))
        ]
        idx = max(taken, default=-1) + 1

    if st.button("Generate"):
        valid_input = False

        # In outpainting mode `canvas_result` does not exist; the mode check
        # short-circuits before it is touched.
        if mode == "Inpainting" and canvas_result.image_data is not None:
            valid_input = True
            # Create mask from canvas: every pixel with non-zero alpha was painted.
            try:
                mask = Image.fromarray(canvas_result.image_data)
                mask = mask.getchannel("A")  # Get alpha channel
                mask_array = np.array(mask)
                mask_array = (mask_array > 0).astype(np.uint8) * 255
                mask = Image.fromarray(mask_array)
                image_for_generation = image
            except Exception as e:
                st.error(f"Error creating mask: {e}")
                return

        elif mode == "Outpainting":
            valid_input = True
            # image_for_generation and mask are already set above

        if not valid_input:
            st.error("Please draw a mask or configure outpainting settings")
            return

        # prepare_fill() takes file paths, so the image and mask go through
        # temporary PNG files. One try/finally covers every exit path so the
        # files are removed even when saving or generation fails.
        tmp_paths = []
        try:
            for _ in range(2):
                # Close each handle right away: only the reserved name is needed,
                # and an open handle can block reopening the file by name.
                with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
                    tmp_paths.append(tmp_file.name)
            tmp_img_path, tmp_mask_path = tmp_paths

            try:
                image_for_generation.save(tmp_img_path)
                mask.save(tmp_mask_path)
            except Exception as e:
                st.error(f"Error saving temporary files: {e}")
                return

            try:
                # Generate inpainting/outpainting
                rng = torch.Generator(device="cpu")
                if seed is None:
                    seed = rng.seed()

                print(f"Generating with seed {seed}:\n{prompt}")
                t0 = time.perf_counter()

                x = get_noise(
                    1,
                    height,
                    width,
                    device=torch_device,
                    dtype=torch.bfloat16,
                    seed=seed,
                )

                if offload:
                    t5, clip, ae = t5.to(torch_device), clip.to(torch_device), ae.to(torch_device)

                inp = prepare_fill(
                    t5,
                    clip,
                    x,
                    prompt=prompt,
                    ae=ae,
                    img_cond_path=tmp_img_path,
                    mask_path=tmp_mask_path,
                )

                timesteps = get_schedule(num_steps, inp["img"].shape[1], shift=True)

                # Encoders are done: move them off the device before the flow model.
                if offload:
                    t5, clip, ae = t5.cpu(), clip.cpu(), ae.cpu()
                    torch.cuda.empty_cache()
                    model = model.to(torch_device)

                x = denoise_cache(model, **inp, timesteps=timesteps, guidance=guidance)

                # Swap the flow model for the decoder.
                if offload:
                    model.cpu()
                    torch.cuda.empty_cache()
                    ae.decoder.to(x.device)

                x = unpack(x.float(), height, width)
                with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
                    x = ae.decode(x)

                # Release the decoder again so nothing stays on the device
                # between generations when offloading is enabled.
                if offload:
                    ae.decoder.cpu()
                    torch.cuda.empty_cache()

                t1 = time.perf_counter()
                print(f"Done in {t1 - t0:.1f}s")

                # Process and display result
                x = x.clamp(-1, 1)
                x = embed_watermark(x.float())
                x = rearrange(x[0], "c h w -> h w c")
                # Map [-1, 1] to [0, 255] bytes.
                img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy())

                # Score reported for the "nsfw" label.
                nsfw_score = [entry["score"] for entry in nsfw_classifier(img) if entry["label"] == "nsfw"][0]

                if nsfw_score < NSFW_THRESHOLD:
                    buffer = BytesIO()
                    exif_data = Image.Exif()
                    exif_data[ExifTags.Base.Software] = "AI generated;inpainting;flux"
                    exif_data[ExifTags.Base.Make] = "Black Forest Labs"
                    exif_data[ExifTags.Base.Model] = name
                    if add_sampling_metadata:
                        exif_data[ExifTags.Base.ImageDescription] = prompt
                    img.save(buffer, format="jpeg", exif=exif_data, quality=95, subsampling=0)

                    img_bytes = buffer.getvalue()
                    if save_samples:
                        fn = output_name.format(idx=idx)
                        print(f"Saving {fn}")
                        with open(fn, "wb") as file:
                            file.write(img_bytes)

                    # Keep the result in the session so it survives later reruns.
                    st.session_state["samples"] = {
                        "prompt": prompt,
                        "img": img,
                        "seed": seed,
                        "bytes": img_bytes,
                    }
                else:
                    st.warning("Your generated image may contain NSFW content.")
                    st.session_state["samples"] = None

            except Exception as e:
                st.error(f"Error during generation: {e}")
                return
        finally:
            # Clean up temporary files; handle each one separately so a failure
            # on the first does not leave the second behind.
            for tmp_path in tmp_paths:
                try:
                    os.unlink(tmp_path)
                except OSError as e:
                    print(f"Error cleaning up temporary files: {e}")

    # Display results
    samples = st.session_state.get("samples", None)
    if samples is not None:
        st.image(samples["img"], caption=samples["prompt"])
        col1, col2 = st.columns(2)
        with col1:
            st.download_button(
                "Download full-resolution",
                samples["bytes"],
                file_name="generated.jpg",
                mime="image/jpg",
            )
        with col2:
            if st.button("Continue from this image"):
                # Store the generated image
                new_image = samples["img"]
                # Clear ALL canvas state
                clear_canvas_state()
                if "samples" in st.session_state:
                    del st.session_state["samples"]
                # Set as current image
                st.session_state["current_image"] = new_image
                st.rerun()

        st.write(f"Seed: {samples['seed']}")


if __name__ == "__main__":
    # Select the wide page layout, then render the app.
    st.set_page_config(layout="wide")
    main()


================================================
FILE: flux-ToCa/docs/fill.md
================================================
## Models

FLUX.1 Fill introduces advanced inpainting and outpainting capabilities. It allows for seamless edits that integrate naturally with existing images.

| Name                | HuggingFace repo                                         | License                                                               | sha256sum                                                        |
| ------------------- | -------------------------------------------------------- | --------------------------------------------------------------------- | ---------------------------------------------------------------- |
| `FLUX.1 Fill [dev]` | https://huggingface.co/black-forest-labs/FLUX.1-Fill-dev | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | 03e289f530df51d014f48e675a9ffa2141bc003259bf5f25d75b957e920a41ca |
| `FLUX.1 Fill [pro]` | Only available in our API.                               |                                                                       |                                                                  |

## Examples

![inpainting](../assets/docs/inpainting.png)
![outpainting](../assets/docs/outpainting.png)

## Open-weights usage

The weights will be downloaded automatically from HuggingFace once you start one of the demos. To download `FLUX.1 Fill [dev]`, you will need to be logged in, see [here](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-login). Alternatively, if you have downloaded the model weights manually from [here](https://huggingface.co/black-forest-labs/FLUX.1-Fill-dev), you can specify the downloaded paths via environment variables:

```bash
export FLUX_DEV_FILL=<path_to_flux_dev_fill_sft_file>
export AE=<path_to_ae_sft_file>
```

For interactive sampling run

```bash
python -m src.flux.cli_fill --loop
```

Or to generate a single sample run

```bash
python -m src.flux.cli_fill \
  --img_cond_path <path_to_input_image> \
  --img_mask_path <path_to_input_mask>
```

The input_mask should be an image of the same size as the conditioning image that only contains black and white pixels; see [an example mask](../assets/cup_mask.png) for [this image](../assets/cup.png).

We also provide an interactive streamlit demo. The demo can be run via

```bash
streamlit run demo_st_fill.py
```


================================================
FILE: flux-ToCa/docs/image-variation.md
================================================
## Models

FLUX.1 Redux is an adapter for the FLUX.1 text-to-image base models, FLUX.1 [dev] and FLUX.1 [schnell], which can be used to generate image variations. 
In addition, FLUX.1 Redux [pro] is available in our API and, augmenting the [dev] adapter, the API endpoint allows users to modify an image given a textual description. The feature is supported in our latest model FLUX1.1 [pro] Ultra, allowing for combining input images and text prompts to create high-quality 4-megapixel outputs with flexible aspect ratios.

| Name                        | HuggingFace repo                                                                                | License                                                               | sha256sum                                                        |
| --------------------------- | ----------------------------------------------------------------------------------------------- | --------------------------------------------------------------------- | ---------------------------------------------------------------- |
| `FLUX.1 Redux [dev]`        | https://huggingface.co/black-forest-labs/FLUX.1-Redux-dev                                       | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | a1b3bdcb4bdc58ce04874b9ca776d61fc3e914bb6beab41efb63e4e2694dca45 |
| `FLUX.1 Redux [pro]`        | [Available in our API.](https://docs.bfl.ml/) Supports image variations.                        |                                                                       |                                                                  |
| `FLUX1.1 Redux [pro] Ultra` | [Available in our API.](https://docs.bfl.ml/) Supports image variations based on a text prompt. |                                                                       |                                                                  |

## Examples

![redux](../assets/docs/redux.png)

## Open-weights usage

The text-to-image base model weights and the autoencoder weights will be downloaded automatically from HuggingFace once you start the demo. To download `FLUX.1 [dev]`, you will need to be logged in, see [here](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-login). You need to manually download the adapter weights from [here](https://huggingface.co/black-forest-labs/FLUX.1-Redux-dev) and specify them via an environment variable `export FLUX_REDUX=<path_to_flux_redux_sft_file>`. In general, you may specify any manually downloaded weights via environment variables:

```bash
export FLUX_REDUX=<path_to_flux_redux_sft_file>
export FLUX_SCHNELL=<path_to_flux_schnell_sft_file>
export FLUX_DEV=<path_to_flux_dev_sft_file>
export AE=<path_to_ae_sft_file>
```

For interactive sampling run

```bash
python -m src.flux.cli_redux --loop --name <name>
```

where `name` is one of `flux-dev` or `flux-schnell`.


================================================
FILE: flux-ToCa/docs/structural-conditioning.md
================================================
## Models

Structural conditioning uses canny edge or depth detection to maintain precise control during image transformations. By preserving the original image's structure through edge or depth maps, users can make text-guided edits while keeping the core composition intact. This is particularly effective for retexturing images. We release four variations: two based on edge maps (full model and LoRA for FLUX.1 [dev]) and two based on depth maps (full model and LoRA for FLUX.1 [dev]).

| Name                      | HuggingFace repo                                               | License                                                               | sha256sum                                                        |
| ------------------------- | -------------------------------------------------------------- | --------------------------------------------------------------------- | ---------------------------------------------------------------- |
| `FLUX.1 Canny [dev]`      | https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev      | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | 996876670169591cb412b937fbd46ea14cbed6933aef17c48a2dcd9685c98cdb |
| `FLUX.1 Depth [dev]`      | https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev      | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | 41360d1662f44ca45bc1b665fe6387e91802f53911001630d970a4f8be8dac21 |
| `FLUX.1 Canny [dev] LoRA` | https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev-lora | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | 8eaa21b9c43d5e7242844deb64b8cf22ae9010f813f955ca8c05f240b8a98f7e |
| `FLUX.1 Depth [dev] LoRA` | https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev-lora | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | 1938b38ea0fdd98080fa3e48beb2bedfbc7ad102d8b65e6614de704a46d8b907 | 
| `FLUX.1 Canny [pro]`      | [Available in our API](https://docs.bfl.ml/).                  |
| `FLUX.1 Depth [pro]`      | [Available in our API](https://docs.bfl.ml/).                  |

## Examples

![canny](../assets/docs/canny.png)
![depth](../assets/docs/depth.png)

## Open-weights usage

The full model weights (`FLUX.1 Canny [dev]`, `FLUX.1 Depth [dev]`, `FLUX.1 [dev]`, and the autoencoder) will be downloaded automatically from HuggingFace once you start one of the demos. To download them, you will need to be logged in, see [here](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-login). The LoRA weights are not downloaded automatically, but can be downloaded manually [here (Canny)](https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev-lora) and [here (Depth)](https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev-lora). You may specify any manually downloaded weights via environment variables (**necessary for LoRAs**):

```bash
export FLUX_DEV_DEPTH=<path_to_flux_dev_depth_sft_file>
export FLUX_DEV_CANNY=<path_to_flux_dev_canny_sft_file>
export FLUX_DEV_DEPTH_LORA=<path_to_flux_dev_depth_lora_sft_file>
export FLUX_DEV_CANNY_LORA=<path_to_flux_dev_canny_lora_sft_file>
export FLUX_REDUX=<path_to_flux_redux_sft_file>
export FLUX_SCHNELL=<path_to_flux_schnell_sft_file>
export FLUX_DEV=<path_to_flux_dev_sft_file>
export AE=<path_to_ae_sft_file>
```

For interactive sampling run

```bash
python -m src.flux.cli_control --loop --name <name>
```

where `name` is one of `flux-dev-canny`, `flux-dev-depth`, `flux-dev-canny-lora`, or `flux-dev-depth-lora`.

## Diffusers usage

Flux Control (including the LoRAs) is also compatible with the `diffusers` Python library. Check out the [documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux) to learn more.


================================================
FILE: flux-ToCa/docs/text-to-image.md
================================================
## Models

We currently offer four text-to-image models. `FLUX1.1 [pro]` is our most capable model which can generate images at up to 4MP while maintaining an impressive generation time of only 10 seconds per sample.

| Name                      | HuggingFace repo                                        | License                                                               | sha256sum                                                        |
| ------------------------- | ------------------------------------------------------- | --------------------------------------------------------------------- | ---------------------------------------------------------------- |
| `FLUX.1 [schnell]`        | https://huggingface.co/black-forest-labs/FLUX.1-schnell | [apache-2.0](model_licenses/LICENSE-FLUX1-schnell)                    | 9403429e0052277ac2a87ad800adece5481eecefd9ed334e1f348723621d2a0a |
| `FLUX.1 [dev]`            | https://huggingface.co/black-forest-labs/FLUX.1-dev     | [FLUX.1-dev Non-Commercial License](model_licenses/LICENSE-FLUX1-dev) | 4610115bb0c89560703c892c59ac2742fa821e60ef5871b33493ba544683abd7 |
| `FLUX.1 [pro]`            | [Available in our API](https://docs.bfl.ml/).           |
| `FLUX1.1 [pro]`           | [Available in our API](https://docs.bfl.ml/).           |
| `FLUX1.1 [pro] Ultra/raw` | [Available in our API](https://docs.bfl.ml/).           |

## Open-weights usage

The weights will be downloaded automatically from HuggingFace once you start one of the demos. To download `FLUX.1 [dev]`, you will need to be logged in, see [here](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-login).
If you have downloaded the model weights manually, you can specify the downloaded paths via environment variables:

```bash
export FLUX_SCHNELL=<path_to_flux_schnell_sft_file>
export FLUX_DEV=<path_to_flux_dev_sft_file>
export AE=<path_to_ae_sft_file>
```

For interactive sampling run

```bash
python -m flux --name <name> --loop
```

Or to generate a single sample run

```bash
python -m flux --name <name> \
  --height <height> --width <width> \
  --prompt "<prompt>"
```

We also provide a streamlit demo that does both text-to-image and image-to-image. The demo can be run via

```bash
streamlit run demo_st.py
```

We also offer a Gradio-based demo for an interactive experience. To run the Gradio demo:

```bash
python demo_gr.py --name flux-schnell --device cuda
```

Options:

- `--name`: Choose the model to use (options: "flux-schnell", "flux-dev")
- `--device`: Specify the device to use (default: "cuda" if available, otherwise "cpu")
- `--offload`: Offload model to CPU when not in use
- `--share`: Create a public link to your demo

To run the demo with the dev model and create a public link:

```bash
python demo_gr.py --name flux-dev --share
```

## Diffusers integration

`FLUX.1 [schnell]` and `FLUX.1 [dev]` are integrated with the [🧨 diffusers](https://github.com/huggingface/diffusers) library. To use it with diffusers, install it:

```shell
pip install git+https://github.com/huggingface/diffusers.git
```

Then you can use `FluxPipeline` to run the model

```python
import torch
from diffusers import FluxPipeline

model_id = "black-forest-labs/FLUX.1-schnell" #you can also use `black-forest-labs/FLUX.1-dev`

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload() #save some VRAM by offloading the model to CPU. Remove this if you have enough GPU power

prompt = "A cat holding a sign that says hello world"
seed = 42
image = pipe(
    prompt,
    output_type="pil",
    num_inference_steps=4, #use a larger number if you are using [dev]
    generator=torch.Generator("cpu").manual_seed(seed)
).images[0]
image.save("flux-schnell.png")
```

To learn more, check out the [diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux) documentation.


================================================
FILE: flux-ToCa/model_cards/FLUX.1-dev.md
================================================
![FLUX.1 [dev] Grid](../assets/dev_grid.jpg)

`FLUX.1 [dev]` is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions.
For more information, please read our [blog post](https://blackforestlabs.ai/announcing-black-forest-labs/).

# Key Features
1. Cutting-edge output quality, second only to our state-of-the-art model `FLUX.1 [pro]`.
2. Competitive prompt following, matching the performance of closed source alternatives.
3. Trained using guidance distillation, making `FLUX.1 [dev]` more efficient.
4. Open weights to drive new scientific research, and empower artists to develop innovative workflows.
5. Generated outputs can be used for personal, scientific, and commercial purposes, as described in the [flux-1-dev-non-commercial-license](./licence.md).

# Usage
We provide a reference implementation of `FLUX.1 [dev]`, as well as sampling code, in a dedicated [github repository](https://github.com/black-forest-labs/flux).
Developers and creatives looking to build on top of `FLUX.1 [dev]` are encouraged to use this as a starting point.

## API Endpoints
The FLUX.1 models are also available via API from the following sources
1. [bfl.ml](https://docs.bfl.ml/) (currently `FLUX.1 [pro]`)
2. [replicate.com](https://replicate.com/collections/flux)
3. [fal.ai](https://fal.ai/models/fal-ai/flux/dev)

## ComfyUI
`FLUX.1 [dev]` is also available in [Comfy UI](https://github.com/comfyanonymous/ComfyUI) for local inference with a node-based workflow.

---
# Limitations
- This model is not intended or able to provide factual information.
- As a statistical model this checkpoint might amplify existing societal biases.
- The model may fail to generate output that matches the prompts.
- Prompt following is heavily influenced by the prompting-style.

# Out-of-Scope Use
The model and its derivatives may not be used

- In any way that violates any applicable national, federal, state, local or international law or regulation.
- For the purpose of exploiting, harming or attempting to exploit or harm minors in any way; including but not limited to the solicitation, creation, acquisition, or dissemination of child exploitative content.
- To generate or disseminate verifiably false information and/or content with the purpose of harming others.
- To generate or disseminate personal identifiable information that can be used to harm an individual.
- To harass, abuse, threaten, stalk, or bully individuals or groups of individuals.
- To create non-consensual nudity or illegal pornographic content.
- For fully automated decision making that adversely impacts an individual's legal rights or otherwise creates or modifies a binding, enforceable obligation.
- Generating or facilitating large-scale disinformation campaigns.

# License
This model falls under the [`FLUX.1 [dev]` Non-Commercial License](https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/LICENSE.md).


================================================
FILE: flux-ToCa/model_cards/FLUX.1-schnell.md
================================================
![FLUX.1 [schnell] Grid](../assets/schnell_grid.jpg)

`FLUX.1 [schnell]` is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions.
For more information, please read our [blog post](https://blackforestlabs.ai/announcing-black-forest-labs/).

# Key Features
1. Cutting-edge output quality and competitive prompt following, matching the performance of closed source alternatives.
2. Trained using latent adversarial diffusion distillation, `FLUX.1 [schnell]` can generate high-quality images in only 1 to 4 steps.
3. Released under the `apache-2.0` licence, the model can be used for personal, scientific, and commercial purposes.

# Usage
We provide a reference implementation of `FLUX.1 [schnell]`, as well as sampling code, in a dedicated [github repository](https://github.com/black-forest-labs/flux).
Developers and creatives looking to build on top of `FLUX.1 [schnell]` are encouraged to use this as a starting point.

## API Endpoints
The FLUX.1 models are also available via API from the following sources
1. [bfl.ml](https://docs.bfl.ml/) (currently `FLUX.1 [pro]`)
2. [replicate.com](https://replicate.com/collections/flux)
3. [fal.ai](https://fal.ai/models/fal-ai/flux/schnell)

## ComfyUI
`FLUX.1 [schnell]` is also available in [Comfy UI](https://github.com/comfyanonymous/ComfyUI) for local inference with a node-based workflow.

---
# Limitations
- This model is not intended or able to provide factual information.
- As a statistical model this checkpoint might amplify existing societal biases.
- The model may fail to generate output that matches the prompts.
- Prompt following is heavily influenced by the prompting-style.

# Out-of-Scope Use
The model and its derivatives may not be used

- In any way that violates any applicable national, federal, state, local or international law or regulation.
- For the purpose of exploiting, harming or attempting to exploit or harm minors in any way; including but not limited to the solicitation, creation, acquisition, or dissemination of child exploitative content.
- To generate or disseminate verifiably false information and/or content with the purpose of harming others.
- To generate or disseminate personal identifiable information that can be used to harm an individual.
- To harass, abuse, threaten, stalk, or bully individuals or groups of individuals.
- To create non-consensual nudity or illegal pornographic content.
- For fully automated decision making that adversely impacts an individual's legal rights or otherwise creates or modifies a binding, enforceable obligation.
- Generating or facilitating large-scale disinformation campaigns.


================================================
FILE: flux-ToCa/model_licenses/LICENSE-FLUX1-dev
================================================
FLUX.1 [dev] Non-Commercial License 
Black Forest Labs, Inc. (“we” or “our” or “Company”) is pleased to make available the weights, parameters and inference code for the FLUX.1 [dev] Model (as defined below) freely available for your non-commercial and non-production use as set forth in this FLUX.1 [dev] Non-Commercial License (“License”).  The “FLUX.1 [dev] Model” means the FLUX.1 [dev] AI models, including FLUX.1 [dev], FLUX.1 Fill [dev], FLUX.1 Depth [dev], FLUX.1 Canny [dev], FLUX.1 Redux [dev], FLUX.1 Canny [dev] LoRA and FLUX.1 Depth [dev] LoRA, and their elements which includes algorithms, software, checkpoints, parameters, source code (inference code, evaluation code, and if applicable, fine-tuning code) and any other materials associated with the FLUX.1 [dev] AI models made available by Company under this License, including if any, the technical documentation, manuals and instructions for the use and operation thereof (collectively, “FLUX.1 [dev] Model”).
By downloading, accessing, use, Distributing (as defined below), or creating a Derivative (as defined below) of the FLUX.1 [dev] Model, you agree to the terms of this License. If you do not agree to this License, then you do not have any rights to access, use, Distribute or create a Derivative of the FLUX.1 [dev] Model and you must immediately cease using the FLUX.1 [dev] Model. If you are agreeing to be bound by the terms of this License on behalf of your employer or other entity, you represent and warrant to us that you have full legal authority to bind your employer or such entity to this License. If you do not have the requisite authority, you may not accept the License or access the FLUX.1 [dev] Model on behalf of your employer or other entity.
    1. Definitions. Capitalized terms used in this License but not defined herein have the following meanings:
        a. “Derivative”  means any (i) modified version of the FLUX.1 [dev] Model (including but not limited to any customized or fine-tuned version thereof), (ii) work based on the FLUX.1 [dev] Model, or (iii) any other derivative work thereof. For the avoidance of doubt, Outputs are not considered Derivatives under this License. 
        b. “Distribution” or “Distribute” or “Distributing” means providing or making available, by any means, a copy of the FLUX.1 [dev] Models and/or the Derivatives as the case may be. 
        c. “Non-Commercial Purpose” means any of the following uses, but only so far as you do not receive any direct or indirect payment arising from the use of the model or its output: (i) personal use for research, experiment, and testing for the benefit of public knowledge, personal study, private entertainment, hobby projects, or otherwise not directly or indirectly connected to any commercial activities, business operations, or employment responsibilities; (ii) use by commercial or for-profit entities for testing, evaluation, or non-commercial research and development in a non-production environment, (iii) use by any charitable organization for charitable purposes, or for testing or evaluation. For clarity, use for revenue-generating activity or direct interactions with or impacts on end users, or use to train, fine tune or distill other models for commercial use is not a Non-Commercial purpose.
        d. “Outputs” means any content generated by the operation of the FLUX.1 [dev] Models or the Derivatives from a prompt (i.e., text instructions) provided by users. For the avoidance of doubt, Outputs do not include any components of a FLUX.1 [dev] Models, such as any fine-tuned versions of the FLUX.1 [dev] Models, the weights, or parameters. 
        e.   “you” or “your” means the individual or entity entering into this License with Company.
    2. License Grant.
        a. License. Subject to your compliance with this License, Company grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license to access, use, create Derivatives of, and Distribute the FLUX.1 [dev] Models solely for your Non-Commercial Purposes. The foregoing license is personal to you, and you may not assign or sublicense this License or any other rights or obligations under this License without Company’s prior written consent; any such assignment or sublicense will be void and will automatically and immediately terminate this License.  Any restrictions set forth herein in regarding the FLUX.1 [dev] Model also applies to any Derivative you create or that are created on your behalf.
        b. Non-Commercial Use Only.  You may only access, use, Distribute, or creative Derivatives of or the FLUX.1 [dev] Model or Derivatives for Non-Commercial Purposes.  If You want to use a FLUX.1 [dev] Model a Derivative for any purpose that is not expressly authorized under this License, such as for a commercial activity, you must request a license from Company, which Company may grant to you in Company’s sole discretion and which additional use may be subject to a fee, royalty or other revenue share. Please contact Company at the following e-mail address if you want to discuss such a license: info@blackforestlabs.ai. 
        c. Reserved Rights. The grant of rights expressly set forth in this License are the complete grant of rights to you in the FLUX.1 [dev] Model, and no other licenses are granted, whether by waiver, estoppel, implication, equity or otherwise. Company and its licensors reserve all rights not expressly granted by this License. 
        d. Outputs. We claim no ownership rights in and to the Outputs. You are solely responsible for the Outputs you generate and their subsequent uses in accordance with this License.  You may use Output for any purpose (including for commercial purposes), except as expressly prohibited herein.  You may not use the Output to train, fine-tune or distill a model that is competitive with the FLUX.1 [dev] Model.
    3. Distribution. Subject to this License, you may Distribute copies of the FLUX.1 [dev] Model and/or Derivatives made by you, under the following conditions: 
        a. you must make available a copy of this License to third-party recipients of the FLUX.1 [dev] Models and/or Derivatives you Distribute, and specify that any rights to use the FLUX.1 [dev] Models and/or Derivatives shall be directly granted by Company to said third-party recipients pursuant to this License; 
        b. you must make prominently display the following notice alongside the Distribution of the FLUX.1 [dev] Model or Derivative (such as via a “Notice” text file distributed as part of such FLUX.1 [dev] Model or Derivative) (the “Attribution Notice”): 
“The FLUX.1 [dev] Model is licensed by Black Forest Labs. Inc. under the FLUX.1 [dev] Non-Commercial License. Copyright Black Forest Labs. Inc. 
IN NO EVENT SHALL BLACK FOREST LABS, INC. BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH USE OF THIS MODEL.”
        c. in the case of Distribution of Derivatives made by you, you must also include in the Attribution Notice a statement that you have modified the applicable FLUX.1 [dev] Model; and
        d. in the case of Distribution of Derivatives made by you, any terms and conditions you impose on any third-party recipients relating to Derivatives made by or for you shall neither limit such third-party recipients’ use of the FLUX.1 [dev] Model or any Derivatives made by or for Company in accordance with this License nor conflict with any of its terms and conditions. 
        e. In the case of Distribution of Derivatives made by you, you must not misrepresent or imply, through any means, that the Derivatives made by or for you and/or any modified version of the FLUX.1 [dev] Model you Distribute under your name and responsibility is an official product of the Company or has been endorsed, approved or validated by the Company, unless you are authorized by Company to do so in writing.
    4. Restrictions.  You will not, and will not permit, assist or cause any third party to 
        a. use, modify, copy, reproduce, create Derivatives of, or Distribute the FLUX.1 [dev] Model (or any Derivative thereof, or any data produced by the FLUX.1 [dev] Model), in whole or in part, for (i) any commercial or production purposes, (ii) military purposes, (iii) purposes of surveillance, including any research or development relating to surveillance, (iv) biometric processing, (v) in any manner that infringes, misappropriates, or otherwise violates any third-party rights, or (vi) in any manner that violates any applicable law and violating any privacy or security laws, rules, regulations, directives, or governmental requirements (including the General Data Privacy Regulation (Regulation (EU) 2016/679), the California Consumer Privacy Act, and any and all laws governing the processing of biometric information), as well as all amendments and successor laws to any of the foregoing;
        b. alter or remove copyright and other proprietary notices which appear on or in any portion of the FLUX.1 [dev] Model;
        c. utilize any equipment, device, software, or other means to circumvent or remove any security or protection used by Company in connection with the FLUX.1 [dev] Model, or to circumvent or remove any usage restrictions, or to enable functionality disabled by FLUX.1 [dev] Model; or
        d. offer or impose any terms on the FLUX.1 [dev] Model that alter, restrict, or are inconsistent with the terms of this License.
        e. violate any applicable U.S. and non-U.S. export control and trade sanctions laws (“Export Laws”) in connection with your use or Distribution of any FLUX.1 [dev] Model;
        f. directly or indirectly Distribute, export, or otherwise transfer FLUX.1 [dev] Model  (a) to any individual, entity, or country prohibited by Export Laws; (b) to anyone on U.S. or non-U.S. government restricted parties lists; or (c) for any purpose prohibited by Export Laws, including nuclear, chemical or biological weapons, or missile technology applications; 3) use or download FLUX.1 [dev] Model if you or they are  (a) located in a comprehensively sanctioned jurisdiction, (b) currently listed on any U.S. or non-U.S. restricted parties list, or (c) for any purpose prohibited by Export Laws; and (4) will not disguise your location through IP proxying or other methods.
    5. DISCLAIMERS.  THE FLUX.1 [dev] MODEL IS PROVIDED “AS IS” AND “WITH ALL FAULTS” WITH NO WARRANTY OF ANY KIND, EXPRESS OR IMPLIED. COMPANY EXPRESSLY DISCLAIMS ALL REPRESENTATIONS AND WARRANTIES, EXPRESS OR IMPLIED, WHETHER BY STATUTE, CUSTOM, USAGE OR OTHERWISE AS TO ANY MATTERS RELATED TO THE FLUX.1 [dev] MODEL, INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, SATISFACTORY QUALITY, OR NON-INFRINGEMENT. COMPANY MAKES NO WARRANTIES OR REPRESENTATIONS THAT THE FLUX.1 [dev] MODEL WILL BE ERROR FREE OR FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS, OR PRODUCE ANY PARTICULAR RESULTS.
    6. LIMITATION OF LIABILITY.  TO THE FULLEST EXTENT PERMITTED BY LAW, IN NO EVENT WILL COMPANY BE LIABLE TO YOU OR YOUR EMPLOYEES, AFFILIATES, USERS, OFFICERS OR DIRECTORS (A) UNDER ANY THEORY OF LIABILITY, WHETHER BASED IN CONTRACT, TORT, NEGLIGENCE, STRICT LIABILITY, WARRANTY, OR OTHERWISE UNDER THIS LICENSE, OR (B) FOR ANY INDIRECT, CONSEQUENTIAL, EXEMPLARY, INCIDENTAL, PUNITIVE OR SPECIAL DAMAGES OR LOST PROFITS, EVEN IF COMPANY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. THE FLUX.1 [dev] MODEL, ITS CONSTITUENT COMPONENTS, AND ANY OUTPUT (COLLECTIVELY, “MODEL MATERIALS”) ARE NOT DESIGNED OR INTENDED FOR USE IN ANY APPLICATION OR SITUATION WHERE FAILURE OR FAULT OF THE MODEL MATERIALS COULD REASONABLY BE ANTICIPATED TO LEAD TO SERIOUS INJURY OF ANY PERSON, INCLUDING POTENTIAL DISCRIMINATION OR VIOLATION OF AN INDIVIDUAL’S PRIVACY RIGHTS, OR TO SEVERE PHYSICAL, PROPERTY, OR ENVIRONMENTAL DAMAGE (EACH, A “HIGH-RISK USE”). IF YOU ELECT TO USE ANY OF THE MODEL MATERIALS FOR A HIGH-RISK USE, YOU DO SO AT YOUR OWN RISK. YOU AGREE TO DESIGN AND IMPLEMENT APPROPRIATE DECISION-MAKING AND RISK-MITIGATION PROCEDURES AND POLICIES IN CONNECTION WITH A HIGH-RISK USE SUCH THAT EVEN IF THERE IS A FAILURE OR FAULT IN ANY OF THE MODEL MATERIALS, THE SAFETY OF PERSONS OR PROPERTY AFFECTED BY THE ACTIVITY STAYS AT A LEVEL THAT IS REASONABLE, APPROPRIATE, AND LAWFUL FOR THE FIELD OF THE HIGH-RISK USE.
    7. INDEMNIFICATION

You will indemnify, defend and hold harmless Company and our subsidiaries and affiliates, and each of our respective shareholders, directors, officers, employees, agents, successors, and assigns (collectively, the “Company Parties”) from and against any losses, liabilities, damages, fines, penalties, and expenses (including reasonable attorneys’ fees) incurred by any Company Party in connection with any claim, demand, allegation, lawsuit, proceeding, or investigation (collectively, “Claims”) arising out of or related to  (a) your access to or use of the FLUX.1 [dev] Model (as well as any Output, results or data generated from such access or use), including any High-Risk Use (defined below); (b) your violation of this License; or (c) your violation, misappropriation or infringement of any rights of another (including intellectual property or other proprietary rights and privacy rights). You will promptly notify the Company Parties of any such Claims, and cooperate with Company Parties in defending such Claims. You will also grant the Company Parties sole control of the defense or settlement, at Company’s sole option, of any Claims. This indemnity is in addition to, and not in lieu of, any other indemnities or remedies set forth in a written agreement between you and Company or the other Company Parties.
    8. Termination; Survival.
        a. This License will automatically terminate upon any breach by you of the terms of this License.
        b. We may terminate this License, in whole or in part, at any time upon notice (including electronic) to you.
        c. If You initiate any legal action or proceedings against Company or any other entity (including a cross-claim or counterclaim in a lawsuit), alleging that the FLUX.1 [dev] Model or any Derivative, or any part thereof, infringe upon intellectual property or other rights owned or licensable by you, then any licenses granted to you under this License will immediately terminate as of the date such legal action or claim is filed or initiated.
        d. Upon termination of this License, you must cease all use, access or Distribution of the FLUX.1 [dev] Model and any Derivatives.  The following sections survive termination of this License  2(c), 2(d), 4-11.  
    9. Third Party Materials. The FLUX.1 [dev] Model may contain third-party software or other components (including free and open source software) (all of the foregoing, “Third Party Materials”), which are subject to the license terms of the respective third-party licensors. Your dealings or correspondence with third parties and your use of or interaction with any Third Party Materials are solely between you and the third party. Company does not control or endorse, and makes no representations or warranties regarding, any Third Party Materials, and your access to and use of such Third Party Materials are at your own risk.
    10. Trademarks. You have not been granted any trademark license as part of this License and may not use any name or mark associated with Company without the prior written permission of Company, except to the extent necessary to make the reference required in the Attribution Notice as specified above or as is reasonably necessary in describing the FLUX.1 [dev] Model and its creators.  
    11. General. This License will be governed and construed under the laws of the State of Delaware without regard to conflicts of law provisions. If any provision or part of a provision of this License is unlawful, void or unenforceable, that provision or part of the provision is deemed severed from this License, and will not affect the validity and enforceability of any remaining provisions. The failure of Company to exercise or enforce any right or provision of this License will not operate as a waiver of such right or provision. This License does not confer any third-party beneficiary rights upon any other person or entity. This License, together with the Documentation, contains the entire understanding between you and Company regarding the subject matter of this License, and supersedes all other written or oral agreements and understandings between you and Company regarding such subject matter. No change or addition to any provision of this License will be binding unless it is in writing and signed by an authorized representative of both you and Company.

================================================
FILE: flux-ToCa/model_licenses/LICENSE-FLUX1-schnell
================================================


Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.

"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:

    You must give any other recipients of the Work or Derivative Works a copy of this License; and
    You must cause any modified files to carry prominent notices stating that You changed the files; and
    You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
    If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.

You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS


================================================
FILE: flux-ToCa/pyproject.toml
================================================
[project]
name = "flux"
authors = [
  { name = "Black Forest Labs", email = "support@blackforestlabs.ai" },
]
description = "Inference codebase for FLUX"
readme = "README.md"
requires-python = ">=3.10"
license = { file = "LICENSE.md" }
dynamic = ["version"]
# Runtime dependencies. torch and ruff are pinned to exact versions; the
# remaining packages are unpinned (fire only has a lower bound).
# NOTE(review): ruff is a linter/formatter (configured under [tool.ruff]) and
# is listed here among the runtime dependencies — confirm this is intended.
dependencies = [
  "torch == 2.5.1",
  "torchvision",
  "einops",
  "fire >= 0.6.0",
  "huggingface-hub",
  "safetensors",
  "sentencepiece",
  "transformers",
  "tokenizers",
  "protobuf",
  "requests",
  "invisible-watermark",
  "ruff == 0.6.8",
]

[project.optional-dependencies]
streamlit = [
  "streamlit",
  "streamlit-drawable-canvas",
  "streamlit-keyup",
]
gradio = [
  "gradio",
]
all = [
  "flux[streamlit]",
  "flux[gradio]",
]

[project.scripts]
flux = "flux.cli:app"

[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools>=64", "wheel", "setuptools_scm>=8"]

[tool.ruff]
line-length = 110
target-version = "py310"
extend-exclude = ["/usr/lib/*"]

[tool.ruff.lint]
ignore = [
  "E501", # line too long - will be fixed in format
]

[tool.ruff.format]
quote-style = "double"
indent-style = "space"
line-ending = "auto"
skip-magic-trailing-comma = false
docstring-code-format = true
exclude = [
  "src/flux/_version.py", # generated by setuptools_scm
]

[tool.ruff.lint.isort]
combine-as-imports = true
force-wrap-aliases = true
known-local-folder = ["src"]
known-first-party = ["flux"]

[tool.pyright]
include = ["src"]
exclude = [
  "**/__pycache__", # cache directories
  "./typings",      # generated type stubs
]
stubPath = "./typings"

[tool.tomlsort]
in_place = true
no_sort_tables = true
spaces_before_inline_comment = 1
spaces_indent_inline_array = 2
trailing_comma_inline_array = true
sort_first = [
  "project",
  "build-system",
  "tool.setuptools",
]

# needs to be last for CI reasons
[tool.setuptools_scm]
write_to = "src/flux/_version.py"
parentdir_prefix_version = "flux-"
fallback_version = "0.0.0"
version_scheme = "post-release"


================================================
FILE: flux-ToCa/setup.py
================================================
# Minimal setuptools entry point: setup() is called without arguments, so the
# package metadata has to come from the declarative configuration
# (pyproject.toml) rather than from this file.
import setuptools

setuptools.setup()


================================================
FILE: flux-ToCa/src/flux/__init__.py
================================================
# Expose the package version. `_version.py` is a generated file that may be
# missing, so fall back to placeholder values when it cannot be imported.
try:
    from ._version import (
        version as __version__,  # type: ignore
        version_tuple,
    )
except ImportError:
    __version__ = "unknown (no version information available)"
    version_tuple = (0, 0, "unknown", "noinfo")

from pathlib import Path

# Package name with underscores replaced by hyphens.
PACKAGE = __package__.replace("_", "-")
# Directory that contains this package's source files.
PACKAGE_ROOT = Path(__file__).parent


================================================
FILE: flux-ToCa/src/flux/__main__.py
================================================
# Module entry point: running the package as a script starts the CLI app.
from .cli import app

if __name__ == "__main__":
    app()


================================================
FILE: flux-ToCa/src/flux/_version.py
================================================
# file generated by setuptools_scm
# don't change, don't track in version control
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple, Union
    VERSION_TUPLE = Tuple[Union[int, str], ...]
else:
    VERSION_TUPLE = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE

__version__ = version = '0.0.post49+gd06f828.d20250206'
__version_tuple__ = version_tuple = (0, 0, 'gd06f828.d20250206')


================================================
FILE: flux-ToCa/src/flux/api.py
================================================
import io
import os
import time
from pathlib import Path

import requests
from PIL import Image

# Base URL of the hosted image-generation API.
API_URL = "https://api.bfl.ml"
# Maps the model names accepted by `ImageRequest` to the endpoint path segment
# that is appended to `{API_URL}/v1/` when the request is posted.
API_ENDPOINTS = {
    "flux.1-pro": "flux-pro",
    "flux.1-dev": "flux-dev",
    "flux.1.1-pro": "flux-pro-1.1",
}


class ApiException(Exception):
    def __init__(self, status_code: int, detail: str | list[dict] | None = None):
        super().__init__()
        self.detail = detail
        self.status_code = status_code

    def __str__(self) -> str:
        return self.__repr__()

    def __repr__(self) -> str:
        if self.detail is None:
            message = None
        elif isinstance(self.detail, str):
            message = self.detail
        else:
            message = "[" + ",".join(d["msg"] for d in self.detail) + "]"
        return f"ApiException({self.status_code=}, {message=}, detail={self.detail})"


class ImageRequest:
    """
    One image-generation request against the hosted API.

    The request is posted by `request()`, polled by `retrieve()`, and the
    resulting image is exposed lazily through the `url`, `bytes` and `image`
    properties. Each of these caches its result on the instance.
    """

    def __init__(
        self,
        # api inputs
        prompt: str,
        name: str = "flux.1.1-pro",
        width: int | None = None,
        height: int | None = None,
        num_steps: int | None = None,
        prompt_upsampling: bool | None = None,
        seed: int | None = None,
        guidance: float | None = None,
        interval: float | None = None,
        safety_tolerance: int | None = None,
        # behavior of this class
        validate: bool = True,
        launch: bool = True,
        api_key: str | None = None,
    ):
        """
        Manages an image generation request to the API.

        All parameters not specified will use the API defaults.

        Args:
            prompt: Text prompt for image generation.
            width: Width of the generated image in pixels. Must be a multiple of 32
                between 256 and 1440.
            height: Height of the generated image in pixels. Must be a multiple of 32
                between 256 and 1440.
            name: Which model version to use
            num_steps: Number of steps for the image generation process (1 to 50).
            prompt_upsampling: Whether to perform upsampling on the prompt.
            seed: Optional seed for reproducibility.
            guidance: Guidance scale for image generation (1.5 to 5).
            interval: Interval parameter forwarded to the API (1 to 4). Not
                supported for flux.1-dev and flux.1.1-pro.
            safety_tolerance: Tolerance level for input and output moderation.
                 Between 0 and 6, 0 being most strict, 6 being least strict.
            validate: Run input validation
            launch: Directly launches request
            api_key: Your API key if not provided by the environment
                (read from `BFL_API_KEY` otherwise)

        Raises:
            ValueError: For invalid input, when `validate`
            ApiException: For errors raised from the API
        """
        if validate:
            if name not in API_ENDPOINTS:
                raise ValueError(f"Invalid model {name}")
            elif width is not None and width % 32 != 0:
                raise ValueError(f"width must be divisible by 32, got {width}")
            elif width is not None and not (256 <= width <= 1440):
                raise ValueError(f"width must be between 256 and 1440, got {width}")
            elif height is not None and height % 32 != 0:
                raise ValueError(f"height must be divisible by 32, got {height}")
            elif height is not None and not (256 <= height <= 1440):
                raise ValueError(f"height must be between 256 and 1440, got {height}")
            elif num_steps is not None and not (1 <= num_steps <= 50):
                raise ValueError(f"steps must be between 1 and 50, got {num_steps}")
            elif guidance is not None and not (1.5 <= guidance <= 5.0):
                # message now states the bound that is actually enforced
                raise ValueError(f"guidance must be between 1.5 and 5, got {guidance}")
            elif interval is not None and not (1.0 <= interval <= 4.0):
                raise ValueError(f"interval must be between 1 and 4, got {interval}")
            elif safety_tolerance is not None and not (0 <= safety_tolerance <= 6.0):
                # report the offending value itself (previously printed `interval`)
                raise ValueError(f"safety_tolerance must be between 0 and 6, got {safety_tolerance}")

            # model-specific restrictions
            if name == "flux.1-dev":
                if interval is not None:
                    raise ValueError("Interval is not supported for flux.1-dev")
            if name == "flux.1.1-pro":
                if interval is not None or num_steps is not None or guidance is not None:
                    raise ValueError("Interval, num_steps and guidance are not supported for flux.1.1-pro")

        self.name = name
        request_json = {
            "prompt": prompt,
            "width": width,
            "height": height,
            "steps": num_steps,
            "prompt_upsampling": prompt_upsampling,
            "seed": seed,
            "guidance": guidance,
            "interval": interval,
            "safety_tolerance": safety_tolerance,
        }
        # Unset parameters are omitted so that the API applies its own defaults.
        self.request_json = {key: value for key, value in request_json.items() if value is not None}

        self.request_id: str | None = None
        self.result: dict | None = None
        self._image_bytes: bytes | None = None
        self._url: str | None = None
        if api_key is None:
            self.api_key = os.environ.get("BFL_API_KEY")
        else:
            self.api_key = api_key

        if launch:
            self.request()

    def request(self):
        """
        Request to generate the image.

        Does nothing if the request was already posted.

        Raises:
            ApiException: If the API answers with a status code other than 200.
        """
        if self.request_id is not None:
            return
        response = requests.post(
            f"{API_URL}/v1/{API_ENDPOINTS[self.name]}",
            headers={
                "accept": "application/json",
                "x-key": self.api_key,
                "Content-Type": "application/json",
            },
            json=self.request_json,
        )
        result = response.json()
        if response.status_code != 200:
            raise ApiException(status_code=response.status_code, detail=result.get("detail"))
        # reuse the body parsed above instead of decoding the JSON a second time
        self.request_id = result["id"]

    def retrieve(self) -> dict:
        """
        Wait for the generation to finish and retrieve response.

        Polls the result endpoint every 0.5 seconds for as long as the API
        reports the status "Pending"; there is no overall timeout.

        Raises:
            ApiException: If the response carries no status, or a status other
                than "Ready" or "Pending".
        """
        if self.request_id is None:
            self.request()
        while self.result is None:
            response = requests.get(
                f"{API_URL}/v1/get_result",
                headers={
                    "accept": "application/json",
                    "x-key": self.api_key,
                },
                params={
                    "id": self.request_id,
                },
            )
            result = response.json()
            if "status" not in result:
                raise ApiException(status_code=response.status_code, detail=result.get("detail"))
            elif result["status"] == "Ready":
                self.result = result["result"]
            elif result["status"] == "Pending":
                time.sleep(0.5)
            else:
                raise ApiException(status_code=200, detail=f"API returned status '{result['status']}'")
        return self.result

    @property
    def bytes(self) -> bytes:
        """
        Generated image as bytes.

        Downloaded from `url` on first access and cached afterwards.

        Raises:
            ApiException: If the download does not return status code 200.
        """
        if self._image_bytes is None:
            response = requests.get(self.url)
            if response.status_code == 200:
                self._image_bytes = response.content
            else:
                raise ApiException(status_code=response.status_code)
        return self._image_bytes

    @property
    def url(self) -> str:
        """
        Public url to retrieve the image from

        Blocks (via `retrieve`) until the generation has finished.
        """
        if self._url is None:
            result = self.retrieve()
            self._url = result["sample"]
        return self._url

    @property
    def image(self) -> Image.Image:
        """
        Load the image as a PIL Image
        """
        return Image.open(io.BytesIO(self.bytes))

    def save(self, path: str):
        """
        Save the generated image to a local path

        The file extension of the image url is appended to `path` unless
        `path` already ends with it. Missing parent directories are created.
        """
        from urllib.parse import urlparse

        # Take the suffix from the path component only: a query string or
        # fragment on the url must not leak into the file extension.
        suffix = Path(urlparse(self.url).path).suffix
        if not path.endswith(suffix):
            path = path + suffix
        Path(path).resolve().parent.mkdir(parents=True, exist_ok=True)
        with open(path, "wb") as file:
            file.write(self.bytes)


if __name__ == "__main__":
    # Script usage: hand the ImageRequest class to python-fire to build a
    # command-line interface from it.
    from fire import Fire

    Fire(ImageRequest)


================================================
FILE: flux-ToCa/src/flux/cli.py
================================================
import os
import re
import time
from dataclasses import dataclass
from glob import iglob

import torch
from fire import Fire
from transformers import pipeline

from flux.sampling import denoise, get_noise, get_schedule, prepare, unpack
from flux.ideas import denoise_cache
from flux.util import configs, load_ae, load_clip, load_flow_model, load_t5, save_image

# NOTE(review): this constant is not referenced anywhere in this module —
# presumably the NSFW score cut-off is applied inside `save_image`; confirm
# before relying on or removing it.
NSFW_THRESHOLD = 0.85


@dataclass
class SamplingOptions:
    """Settings for one sampling run; `parse_prompt` edits them in place between samples."""

    prompt: str  # text prompt for the next sample
    width: int  # output width in pixels
    height: int  # output height in pixels
    num_steps: int  # number of sampling steps
    guidance: float  # guidance value handed to the denoiser
    seed: int | None  # seed for the next sample; None makes `main` draw a fresh one


def parse_prompt(options: SamplingOptions) -> SamplingOptions | None:
    """
    Interactively read the next prompt, applying slash commands first.

    Every input line that starts with "/" is treated as a command that updates
    `options` in place; the function keeps asking until a line that is not a
    command is entered. An empty line keeps the previous prompt.

    A bare '/h' prints the usage text, while '/h <height>' sets the height.
    Malformed commands (wrong number of arguments or a value that cannot be
    converted) are reported and ignored instead of aborting the session.

    Args:
        options: Current sampling options; mutated in place.

    Returns:
        The updated `options`, or None if the user asked to quit with '/q'.
    """
    user_question = "Next prompt (write /h for help, /q to quit and leave empty to repeat):\n"
    usage = (
        "Usage: Either write your prompt directly, leave this field empty "
        "to repeat the prompt or write a command starting with a slash:\n"
        "- '/w <width>' will set the width of the generated image\n"
        "- '/h <height>' will set the height of the generated image\n"
        "- '/s <seed>' sets the next seed\n"
        "- '/g <guidance>' sets the guidance (flux-dev only)\n"
        "- '/n <steps>' sets the number of steps\n"
        "- '/q' to quit"
    )

    def read_value(command: str, convert):
        """Return the single argument of `command` converted by `convert`, or None if malformed."""
        parts = command.split()
        # exactly one space separating the command from exactly one argument
        if command.count(" ") != 1 or len(parts) != 2:
            return None
        try:
            return convert(parts[1])
        except ValueError:
            return None

    while (prompt := input(user_question)).startswith("/"):
        invalid = f"Got invalid command '{prompt}'\n{usage}"
        if prompt.strip() == "/h":
            # A bare '/h' is the help command; '/h <height>' is handled below.
            print(usage)
        elif prompt.startswith(("/w", "/h")):
            size = read_value(prompt, int)
            if size is None:
                print(invalid)
            else:
                # round down to a multiple of 16
                size = 16 * (size // 16)
                if prompt.startswith("/w"):
                    options.width = size
                else:
                    options.height = size
                print(
                    f"Setting resolution to {options.width} x {options.height} "
                    f"({options.height * options.width / 1e6:.2f}MP)"
                )
        elif prompt.startswith("/g"):
            guidance = read_value(prompt, float)
            if guidance is None:
                print(invalid)
            else:
                options.guidance = guidance
                print(f"Setting guidance to {options.guidance}")
        elif prompt.startswith("/s"):
            seed = read_value(prompt, int)
            if seed is None:
                print(invalid)
            else:
                options.seed = seed
                print(f"Setting seed to {options.seed}")
        elif prompt.startswith("/n"):
            steps = read_value(prompt, int)
            if steps is None:
                print(invalid)
            else:
                options.num_steps = steps
                print(f"Setting number of steps to {options.num_steps}")
        elif prompt.startswith("/q"):
            print("Quitting")
            return None
        else:
            # the message already contains the usage text; print it only once
            print(invalid)
    if prompt != "":
        options.prompt = prompt
    return options


@torch.inference_mode()
def main(
    name: str = "flux-schnell",
    width: int = 1360,
    height: int = 768,
    seed: int | None = None,
    prompt: str = (
        "a photo of a forest with mist swirling around the tree trunks. The word "
        '"FLUX" is painted over it in big, red brush strokes with visible texture'
    ),
    device: str = "cuda" if torch.cuda.is_available() else "cpu",
    num_steps: int | None = None,
    loop: bool = False,
    guidance: float = 3.5,
    offload: bool = False,
    output_dir: str = "output",
    add_sampling_metadata: bool = True,
):
    """
    Sample the flux model. Either interactively (set `--loop`) or run for a
    single image.

    Args:
        name: Name of the model to load
        width: width of the sample in pixels (rounded down to a multiple of 16)
        height: height of the sample in pixels (rounded down to a multiple of 16)
        seed: Set a seed for sampling
        prompt: Prompt used for sampling
        device: Pytorch device
        num_steps: number of sampling steps (default 4 for schnell, 50 for guidance distilled)
        loop: start an interactive session and sample multiple times
        guidance: guidance value used for guidance distillation
        offload: load the flow model and autoencoder on the CPU and move the
            text encoders, the model and the decoder to `device` only for the
            stage that needs them
        output_dir: directory the images are written to as `img_{idx}.jpg`;
            numbering continues after the highest index already present
        add_sampling_metadata: Add the prompt to the image Exif metadata

    Raises:
        ValueError: If `name` is not a known model configuration
    """
    # Validate the model name first so that a typo fails before the NSFW
    # classifier is loaded.
    if name not in configs:
        available = ", ".join(configs.keys())
        raise ValueError(f"Got unknown model name: {name}, chose from {available}")

    nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)

    torch_device = torch.device(device)
    if num_steps is None:
        num_steps = 4 if name == "flux-schnell" else 50

    # allow for packing and conversion to latent space
    height = 16 * (height // 16)
    width = 16 * (width // 16)

    # continue numbering after the images that already exist in output_dir
    output_name = os.path.join(output_dir, "img_{idx}.jpg")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        idx = 0
    else:
        fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)]
        if len(fns) > 0:
            idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1
        else:
            idx = 0

    # init all components
    t5 = load_t5(torch_device, max_length=256 if name == "flux-schnell" else 512)
    clip = load_clip(torch_device)
    model = load_flow_model(name, device="cpu" if offload else torch_device)
    ae = load_ae(name, device="cpu" if offload else torch_device)

    rng = torch.Generator(device="cpu")
    opts = SamplingOptions(
        prompt=prompt,
        width=width,
        height=height,
        num_steps=num_steps,
        guidance=guidance,
        seed=seed,
    )

    if loop:
        opts = parse_prompt(opts)

    while opts is not None:
        if opts.seed is None:
            opts.seed = rng.seed()
        print(f"Generating with seed {opts.seed}:\n{opts.prompt}")
        t0 = time.perf_counter()

        # prepare input
        x = get_noise(
            1,
            opts.height,
            opts.width,
            device=torch_device,
            dtype=torch.bfloat16,
            seed=opts.seed,
        )
        # the seed is single-use: the next sample draws a new one unless '/s' sets it
        opts.seed = None
        if offload:
            ae = ae.cpu()
            torch.cuda.empty_cache()
            t5, clip = t5.to(torch_device), clip.to(torch_device)
        inp = prepare(t5, clip, x, prompt=opts.prompt)
        timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell"))

        # offload TEs to CPU, load model to gpu
        if offload:
            t5, clip = t5.cpu(), clip.cpu()
            torch.cuda.empty_cache()
            model = model.to(torch_device)

        # denoise initial noise
        x = denoise_cache(model, **inp, timesteps=timesteps, guidance=opts.guidance)

        # offload model, load autoencoder to gpu
        if offload:
            model.cpu()
            torch.cuda.empty_cache()
            ae.decoder.to(x.device)

        # decode latents to pixel space
        x = unpack(x.float(), opts.height, opts.width)
        with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
            x = ae.decode(x)

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        t1 = time.perf_counter()

        fn = output_name.format(idx=idx)
        print(f"Done in {t1 - t0:.1f}s. Saving {fn}")

        # Pass the prompt that was actually sampled: in loop mode `opts.prompt`
        # may differ from the initial `prompt` argument.
        idx = save_image(nsfw_classifier, name, output_name, idx, x, add_sampling_metadata, opts.prompt)

        if loop:
            print("-" * 80)
            opts = parse_prompt(opts)
        else:
            opts = None


def app():
    """Run `main` as a command-line application via python-fire."""
    Fire(main)


if __name__ == "__main__":
    app()


================================================
FILE: flux-ToCa/src/flux/cli_control.py
================================================
import os
import re
import time
from dataclasses import dataclass
from glob import iglob

import torch
from fire import Fire
from transformers import pipeline

from flux.modules.image_embedders import CannyImageEncoder, DepthImageEncoder
from flux.sampling import denoise, get_noise, get_schedule, prepare_control, unpack
from flux.ideas import denoise_cache
from flux.util import configs, load_ae, load_clip, load_flow_model, load_t5, save_image


@dataclass
class SamplingOptions:
    """Settings for one sampling run; the `parse_*` helpers edit them in place."""

    prompt: str  # text prompt for the next sample
    width: int  # output width in pixels; '/w' rounds it down to a multiple of 16
    height: int  # output height in pixels; '/h' rounds it down to a multiple of 16
    num_steps: int  # number of sampling steps, set by '/n'
    guidance: float  # guidance value, set by '/g'
    seed: int | None  # seed for the next sample, set by '/s'
    img_cond_path: str  # path of the conditioning image file
    lora_scale: float | None  # presumably the LoRA strength, None when unused — TODO confirm


def parse_prompt(options: SamplingOptions) -> SamplingOptions | None:
    """
    Interactively read the next prompt, applying slash commands first.

    Every input line that starts with "/" is treated as a command that updates
    `options` in place; the function keeps asking until a line that is not a
    command is entered. An empty line keeps the previous prompt.

    A bare '/h' prints the usage text, while '/h <height>' sets the height.
    Malformed commands (wrong number of arguments or a value that cannot be
    converted) are reported and ignored instead of aborting the session.

    Args:
        options: Current sampling options; mutated in place.

    Returns:
        The updated `options`, or None if the user asked to quit with '/q'.
    """
    user_question = "Next prompt (write /h for help, /q to quit and leave empty to repeat):\n"
    usage = (
        "Usage: Either write your prompt directly, leave this field empty "
        "to repeat the prompt or write a command starting with a slash:\n"
        "- '/w <width>' will set the width of the generated image\n"
        "- '/h <height>' will set the height of the generated image\n"
        "- '/s <seed>' sets the next seed\n"
        "- '/g <guidance>' sets the guidance (flux-dev only)\n"
        "- '/n <steps>' sets the number of steps\n"
        "- '/q' to quit"
    )

    def read_value(command: str, convert):
        """Return the single argument of `command` converted by `convert`, or None if malformed."""
        parts = command.split()
        # exactly one space separating the command from exactly one argument
        if command.count(" ") != 1 or len(parts) != 2:
            return None
        try:
            return convert(parts[1])
        except ValueError:
            return None

    while (prompt := input(user_question)).startswith("/"):
        invalid = f"Got invalid command '{prompt}'\n{usage}"
        if prompt.strip() == "/h":
            # A bare '/h' is the help command; '/h <height>' is handled below.
            print(usage)
        elif prompt.startswith(("/w", "/h")):
            size = read_value(prompt, int)
            if size is None:
                print(invalid)
            else:
                # round down to a multiple of 16
                size = 16 * (size // 16)
                if prompt.startswith("/w"):
                    options.width = size
                else:
                    options.height = size
                print(
                    f"Setting resolution to {options.width} x {options.height} "
                    f"({options.height * options.width / 1e6:.2f}MP)"
                )
        elif prompt.startswith("/g"):
            guidance = read_value(prompt, float)
            if guidance is None:
                print(invalid)
            else:
                options.guidance = guidance
                print(f"Setting guidance to {options.guidance}")
        elif prompt.startswith("/s"):
            seed = read_value(prompt, int)
            if seed is None:
                print(invalid)
            else:
                options.seed = seed
                print(f"Setting seed to {options.seed}")
        elif prompt.startswith("/n"):
            steps = read_value(prompt, int)
            if steps is None:
                print(invalid)
            else:
                options.num_steps = steps
                print(f"Setting number of steps to {options.num_steps}")
        elif prompt.startswith("/q"):
            print("Quitting")
            return None
        else:
            # the message already contains the usage text; print it only once
            print(invalid)
    if prompt != "":
        options.prompt = prompt
    return options


def parse_img_cond_path(options: SamplingOptions | None) -> SamplingOptions | None:
    """
    Interactively ask for the next conditioning image.

    An empty answer keeps the current `options.img_cond_path`. `/q` quits and
    any other slash command prints the usage text. An answer that names an
    existing image file is always treated as a path, even when it starts with
    a slash, so absolute POSIX paths can be used.

    Args:
        options: current sampling options, or None if an earlier prompt
            already ended the session

    Returns:
        The (possibly updated) options, or None if the user quit or
        `options` was None.
    """
    if options is None:
        return None

    user_question = "Next conditioning image (write /h for help, /q to quit and leave empty to repeat):\n"
    usage = (
        "Usage: Either write your prompt directly, leave this field empty "
        "to repeat the conditioning image or write a command starting with a slash:\n"
        "- '/q' to quit"
    )
    image_suffixes = (".jpg", ".jpeg", ".png", ".webp")

    while True:
        img_cond_path = input(user_question)

        if img_cond_path == "":
            break

        is_image_file = os.path.isfile(img_cond_path) and img_cond_path.lower().endswith(image_suffixes)

        # A leading slash only marks a command when the input is not an
        # existing image file; otherwise absolute paths could never be entered.
        if img_cond_path.startswith("/") and not is_image_file:
            if img_cond_path.startswith("/q"):
                print("Quitting")
                return None
            if img_cond_path.startswith("/h"):
                print(usage)
            else:
                # the message already contains the usage text
                print(f"Got invalid command '{img_cond_path}'\n{usage}")
            continue

        if not is_image_file:
            print(f"File '{img_cond_path}' does not exist or is not a valid image file")
            continue

        options.img_cond_path = img_cond_path
        break

    return options


def parse_lora_scale(options: SamplingOptions | None) -> tuple[SamplingOptions | None, bool]:
    """
    Interactively ask for the next LoRA scale.

    An empty answer keeps the current scale, `/q` quits and any other slash
    command prints the usage text. An answer that is not a number is rejected
    and the question is asked again instead of raising.

    Args:
        options: current sampling options, or None if an earlier prompt
            already ended the session

    Returns:
        A tuple `(options, changed)`. `options` is None if the user quit or
        the input was None; `changed` is True only if a new scale was
        stored in `options.lora_scale`.
    """
    changed = False

    if options is None:
        return None, changed

    user_question = "Next lora scale (write /h for help, /q to quit and leave empty to repeat):\n"
    usage = (
        "Usage: Either write your prompt directly, leave this field empty "
        "to repeat the lora scale or write a command starting with a slash:\n"
        "- '/q' to quit"
    )

    while True:
        prompt = input(user_question)

        if prompt.startswith("/"):
            if prompt.startswith("/q"):
                print("Quitting")
                return None, changed
            if prompt.startswith("/h"):
                print(usage)
            else:
                # the message already contains the usage text
                print(f"Got invalid command '{prompt}'\n{usage}")
            continue

        if prompt == "":
            break

        try:
            options.lora_scale = float(prompt)
        except ValueError:
            # a typo must not terminate the interactive session
            print(f"Got invalid lora scale '{prompt}', expected a number")
            continue
        changed = True
        break

    return options, changed


def _set_lora_scale(model, lora_scale: float) -> None:
    """Pass `lora_scale` to every submodule of `model` that has a `set_scale` method."""
    for _, module in model.named_modules():
        if hasattr(module, "set_scale"):
            module.set_scale(lora_scale)


def _ask_next_control_options(opts, name: str, model):
    """
    Run the interactive prompts (text prompt, conditioning image and, for
    LoRA models, LoRA scale). Returns the updated options or None on quit.
    """
    opts = parse_prompt(opts)
    opts = parse_img_cond_path(opts)
    if "lora" in name:
        opts, changed = parse_lora_scale(opts)
        if changed:
            # update the lora scale:
            _set_lora_scale(model, opts.lora_scale)
    return opts


@torch.inference_mode()
def main(
    name: str,
    width: int = 1024,
    height: int = 1024,
    seed: int | None = None,
    prompt: str = "a robot made out of gold",
    device: str = "cuda" if torch.cuda.is_available() else "cpu",
    num_steps: int = 50,
    loop: bool = False,
    guidance: float | None = None,
    offload: bool = False,
    output_dir: str = "output",
    add_sampling_metadata: bool = True,
    img_cond_path: str = "assets/robot.webp",
    lora_scale: float | None = 0.85,
):
    """
    Sample the flux model. Either interactively (set `--loop`) or run for a
    single image.

    Args:
        name: Name of the model to load, one of `flux-dev-canny`,
            `flux-dev-depth`, `flux-dev-canny-lora`, `flux-dev-depth-lora`
        height: height of the sample in pixels (should be a multiple of 16)
        width: width of the sample in pixels (should be a multiple of 16)
        seed: Set a seed for sampling
        prompt: Prompt used for sampling
        device: Pytorch device
        num_steps: number of sampling steps
        loop: start an interactive session and sample multiple times
        guidance: guidance value used for guidance distillation; when not
            given, 30.0 is used for the canny models and 10.0 for the depth
            models
        offload: keep the components that are not currently needed on the CPU
        output_dir: directory the images are written to as `img_{idx}.jpg`;
            numbering continues after the highest existing index
        add_sampling_metadata: Add the prompt to the image Exif metadata
        img_cond_path: path to conditioning image (jpeg/png/webp)
        lora_scale: scale applied to the LoRA modules (LoRA models only)
    """
    nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)

    assert name in [
        "flux-dev-canny",
        "flux-dev-depth",
        "flux-dev-canny-lora",
        "flux-dev-depth-lora",
    ], f"Got unknown model name: {name}"
    if guidance is None:
        if name in ["flux-dev-canny", "flux-dev-canny-lora"]:
            guidance = 30.0
        elif name in ["flux-dev-depth", "flux-dev-depth-lora"]:
            guidance = 10.0
        else:
            raise NotImplementedError()

    if name not in configs:
        available = ", ".join(configs.keys())
        raise ValueError(f"Got unknown model name: {name}, chose from {available}")

    torch_device = torch.device(device)

    output_name = os.path.join(output_dir, "img_{idx}.jpg")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        idx = 0
    else:
        fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)]
        if len(fns) > 0:
            idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1
        else:
            idx = 0

    # init all components
    t5 = load_t5(torch_device, max_length=512)
    clip = load_clip(torch_device)
    model = load_flow_model(name, device="cpu" if offload else torch_device)
    ae = load_ae(name, device="cpu" if offload else torch_device)

    # set lora scale
    if "lora" in name and lora_scale is not None:
        _set_lora_scale(model, lora_scale)

    if name in ["flux-dev-depth", "flux-dev-depth-lora"]:
        img_embedder = DepthImageEncoder(torch_device)
    elif name in ["flux-dev-canny", "flux-dev-canny-lora"]:
        img_embedder = CannyImageEncoder(torch_device)
    else:
        raise NotImplementedError()

    rng = torch.Generator(device="cpu")
    opts = SamplingOptions(
        prompt=prompt,
        width=width,
        height=height,
        num_steps=num_steps,
        guidance=guidance,
        seed=seed,
        img_cond_path=img_cond_path,
        lora_scale=lora_scale,
    )

    if loop:
        opts = _ask_next_control_options(opts, name, model)

    while opts is not None:
        if opts.seed is None:
            opts.seed = rng.seed()
        print(f"Generating with seed {opts.seed}:\n{opts.prompt}")
        t0 = time.perf_counter()

        # prepare input
        x = get_noise(
            1,
            opts.height,
            opts.width,
            device=torch_device,
            dtype=torch.bfloat16,
            seed=opts.seed,
        )
        # a seed is only used once; the next sample draws a fresh one
        opts.seed = None
        if offload:
            t5, clip, ae = t5.to(torch_device), clip.to(torch_device), ae.to(torch_device)
        inp = prepare_control(
            t5,
            clip,
            x,
            prompt=opts.prompt,
            ae=ae,
            encoder=img_embedder,
            img_cond_path=opts.img_cond_path,
        )
        timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell"))

        # offload TEs and AE to CPU, load model to gpu
        if offload:
            t5, clip, ae = t5.cpu(), clip.cpu(), ae.cpu()
            torch.cuda.empty_cache()
            model = model.to(torch_device)

        # denoise initial noise
        x = denoise_cache(model, **inp, timesteps=timesteps, guidance=opts.guidance)

        # offload model, load autoencoder to gpu
        if offload:
            model.cpu()
            torch.cuda.empty_cache()
            ae.decoder.to(x.device)

        # decode latents to pixel space
        x = unpack(x.float(), opts.height, opts.width)
        with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
            x = ae.decode(x)

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        t1 = time.perf_counter()
        print(f"Done in {t1 - t0:.1f}s")

        # pass the prompt this sample was generated with; in loop mode it may
        # differ from the `prompt` argument
        idx = save_image(nsfw_classifier, name, output_name, idx, x, add_sampling_metadata, opts.prompt)

        if loop:
            print("-" * 80)
            opts = _ask_next_control_options(opts, name, model)
        else:
            opts = None


def app():
    """Expose `main` as a command line interface through `Fire`."""
    Fire(main)


# Only start the command line interface when this file is run as a script.
if __name__ == "__main__":
    app()


================================================
FILE: flux-ToCa/src/flux/cli_fill.py
================================================
import os
import re
import time
from dataclasses import dataclass
from glob import iglob

import torch
from fire import Fire
from PIL import Image
from transformers import pipeline

from flux.sampling import denoise, get_noise, get_schedule, prepare_fill, unpack
from flux.ideas import denoise_cache
from flux.util import configs, load_ae, load_clip, load_flow_model, load_t5, save_image


@dataclass
class SamplingOptions:
    """Settings for one fill sample; the interactive prompts update them in place."""

    prompt: str  # text prompt used for sampling
    width: int  # set by `main` from the conditioning image size
    height: int  # set by `main` from the conditioning image size
    num_steps: int  # number of sampling steps
    guidance: float  # guidance value handed to the denoiser
    seed: int | None  # None makes `main` draw a fresh seed for the next sample
    img_cond_path: str  # path to the conditioning image
    img_mask_path: str  # path to the conditioning mask


def parse_prompt(options: SamplingOptions) -> SamplingOptions | None:
    """
    Interactively ask for the next prompt and handle slash commands.

    Commands (`/s`, `/g`, `/n`) update `options` in place and the question is
    asked again. A malformed command, including one whose value is not a
    number, is reported and ignored instead of raising. Any answer that does
    not start with a slash ends the dialogue; a non-empty answer becomes the
    new prompt.

    Args:
        options: sampling options to update

    Returns:
        The updated options, or None if the user entered `/q`.
    """
    user_question = "Next prompt (write /h for help, /q to quit and leave empty to repeat):\n"
    usage = (
        "Usage: Either write your prompt directly, leave this field empty "
        "to repeat the prompt or write a command starting with a slash:\n"
        "- '/s <seed>' sets the next seed\n"
        "- '/g <guidance>' sets the guidance (flux-dev only)\n"
        "- '/n <steps>' sets the number of steps\n"
        "- '/q' to quit"
    )

    while (prompt := input(user_question)).startswith("/"):
        if prompt.startswith("/q"):
            print("Quitting")
            return None

        if not prompt.startswith(("/g", "/s", "/n")):
            if prompt.startswith("/h"):
                print(usage)
            else:
                # the message already contains the usage text
                print(f"Got invalid command '{prompt}'\n{usage}")
            continue

        # exactly "<command> <value>": one space and two non-empty parts
        parts = prompt.split()
        if prompt.count(" ") != 1 or len(parts) != 2:
            print(f"Got invalid command '{prompt}'\n{usage}")
            continue
        value = parts[1]

        try:
            if prompt.startswith("/g"):
                options.guidance = float(value)
                print(f"Setting guidance to {options.guidance}")
            elif prompt.startswith("/s"):
                options.seed = int(value)
                print(f"Setting seed to {options.seed}")
            else:
                options.num_steps = int(value)
                print(f"Setting number of steps to {options.num_steps}")
        except ValueError:
            # non-numeric value: keep the old setting and ask again
            print(f"Got invalid command '{prompt}'\n{usage}")

    if prompt != "":
        options.prompt = prompt
    return options


def parse_img_cond_path(options: SamplingOptions | None) -> SamplingOptions | None:
    """
    Interactively ask for the next conditioning image.

    An empty answer keeps the current `options.img_cond_path`. `/q` quits and
    any other slash command prints the usage text. An answer that names an
    existing image file is always treated as a path, even when it starts with
    a slash, so absolute POSIX paths can be used. The image is only accepted
    if it can be opened and both dimensions are divisible by 32.

    Args:
        options: current sampling options, or None if an earlier prompt
            already ended the session

    Returns:
        The (possibly updated) options, or None if the user quit or
        `options` was None.
    """
    if options is None:
        return None

    user_question = "Next conditioning image (write /h for help, /q to quit and leave empty to repeat):\n"
    usage = (
        "Usage: Either write your prompt directly, leave this field empty "
        "to repeat the conditioning image or write a command starting with a slash:\n"
        "- '/q' to quit"
    )
    image_suffixes = (".jpg", ".jpeg", ".png", ".webp")

    while True:
        img_cond_path = input(user_question)

        if img_cond_path == "":
            break

        is_image_file = os.path.isfile(img_cond_path) and img_cond_path.lower().endswith(image_suffixes)

        # A leading slash only marks a command when the input is not an
        # existing image file; otherwise absolute paths could never be entered.
        if img_cond_path.startswith("/") and not is_image_file:
            if img_cond_path.startswith("/q"):
                print("Quitting")
                return None
            if img_cond_path.startswith("/h"):
                print(usage)
            else:
                # the message already contains the usage text
                print(f"Got invalid command '{img_cond_path}'\n{usage}")
            continue

        if not is_image_file:
            print(f"File '{img_cond_path}' does not exist or is not a valid image file")
            continue

        try:
            with Image.open(img_cond_path) as img:
                width, height = img.size
        except OSError:
            # right extension but unreadable content: ask again instead of crashing
            print(f"File '{img_cond_path}' does not exist or is not a valid image file")
            continue

        if width % 32 != 0 or height % 32 != 0:
            print(f"Image dimensions must be divisible by 32, got {width}x{height}")
            continue

        options.img_cond_path = img_cond_path
        break

    return options


def parse_img_mask_path(options: SamplingOptions | None) -> SamplingOptions | None:
    """
    Interactively ask for the next conditioning mask.

    An empty answer keeps the current `options.img_mask_path`. `/q` quits and
    any other slash command prints the usage text. An answer that names an
    existing image file is always treated as a path, even when it starts with
    a slash, so absolute POSIX paths can be used. The mask is only accepted
    if it can be opened, both dimensions are divisible by 32 and its size
    equals the size of the image at `options.img_cond_path`.

    Args:
        options: current sampling options, or None if an earlier prompt
            already ended the session

    Returns:
        The (possibly updated) options, or None if the user quit or
        `options` was None.
    """
    if options is None:
        return None

    user_question = "Next conditioning mask (write /h for help, /q to quit and leave empty to repeat):\n"
    usage = (
        "Usage: Either write your prompt directly, leave this field empty "
        "to repeat the conditioning mask or write a command starting with a slash:\n"
        "- '/q' to quit"
    )
    image_suffixes = (".jpg", ".jpeg", ".png", ".webp")

    while True:
        img_mask_path = input(user_question)

        if img_mask_path == "":
            break

        is_image_file = os.path.isfile(img_mask_path) and img_mask_path.lower().endswith(image_suffixes)

        # A leading slash only marks a command when the input is not an
        # existing image file; otherwise absolute paths could never be entered.
        if img_mask_path.startswith("/") and not is_image_file:
            if img_mask_path.startswith("/q"):
                print("Quitting")
                return None
            if img_mask_path.startswith("/h"):
                print(usage)
            else:
                # the message already contains the usage text
                print(f"Got invalid command '{img_mask_path}'\n{usage}")
            continue

        if not is_image_file:
            print(f"File '{img_mask_path}' does not exist or is not a valid image file")
            continue

        try:
            with Image.open(img_mask_path) as img:
                width, height = img.size
        except OSError:
            # right extension but unreadable content: ask again instead of crashing
            print(f"File '{img_mask_path}' does not exist or is not a valid image file")
            continue

        if width % 32 != 0 or height % 32 != 0:
            print(f"Image dimensions must be divisible by 32, got {width}x{height}")
            continue

        with Image.open(options.img_cond_path) as img_cond:
            img_cond_width, img_cond_height = img_cond.size

        if width != img_cond_width or height != img_cond_height:
            print(
                f"Mask dimensions must match conditioning image, got {width}x{height} and {img_cond_width}x{img_cond_height}"
            )
            continue

        options.img_mask_path = img_mask_path
        break

    return options


def _ask_next_fill_options(opts):
    """
    Run the interactive prompts (text prompt, conditioning image, mask) and
    sync width/height with the chosen image. Returns None on quit.
    """
    opts = parse_prompt(opts)
    opts = parse_img_cond_path(opts)
    if opts is None:
        # the user quit; there is no image path left to open
        return None

    with Image.open(opts.img_cond_path) as img:
        width, height = img.size
    opts.height = height
    opts.width = width

    return parse_img_mask_path(opts)


@torch.inference_mode()
def main(
    seed: int | None = None,
    prompt: str = "a white paper cup",
    device: str = "cuda" if torch.cuda.is_available() else "cpu",
    num_steps: int = 50,
    loop: bool = False,
    guidance: float = 30.0,
    offload: bool = False,
    output_dir: str = "output",
    add_sampling_metadata: bool = True,
    img_cond_path: str = "assets/cup.png",
    img_mask_path: str = "assets/cup_mask.png",
):
    """
    Sample the flux model. Either interactively (set `--loop`) or run for a
    single image. This demo assumes that the conditioning image and mask have
    the same shape and that height and width are divisible by 32.

    Args:
        seed: Set a seed for sampling
        prompt: Prompt used for sampling
        device: Pytorch device
        num_steps: number of sampling steps
        loop: start an interactive session and sample multiple times
        guidance: guidance value used for guidance distillation
        offload: keep the components that are not currently needed on the CPU
        output_dir: directory the images are written to as `img_{idx}.jpg`;
            numbering continues after the highest existing index
        add_sampling_metadata: Add the prompt to the image Exif metadata
        img_cond_path: path to conditioning image (jpeg/png/webp)
        img_mask_path: path to conditioning mask (jpeg/png/webp)
    """
    nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)

    name = "flux-dev-fill"
    if name not in configs:
        available = ", ".join(configs.keys())
        raise ValueError(f"Got unknown model name: {name}, chose from {available}")

    torch_device = torch.device(device)

    output_name = os.path.join(output_dir, "img_{idx}.jpg")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        idx = 0
    else:
        fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)]
        if len(fns) > 0:
            idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1
        else:
            idx = 0

    # init all components
    t5 = load_t5(torch_device, max_length=128)
    clip = load_clip(torch_device)
    model = load_flow_model(name, device="cpu" if offload else torch_device)
    ae = load_ae(name, device="cpu" if offload else torch_device)

    rng = torch.Generator(device="cpu")
    # the sample takes its size from the conditioning image
    with Image.open(img_cond_path) as img:
        width, height = img.size
    opts = SamplingOptions(
        prompt=prompt,
        width=width,
        height=height,
        num_steps=num_steps,
        guidance=guidance,
        seed=seed,
        img_cond_path=img_cond_path,
        img_mask_path=img_mask_path,
    )

    if loop:
        opts = _ask_next_fill_options(opts)

    while opts is not None:
        if opts.seed is None:
            opts.seed = rng.seed()
        print(f"Generating with seed {opts.seed}:\n{opts.prompt}")
        t0 = time.perf_counter()

        # prepare input
        x = get_noise(
            1,
            opts.height,
            opts.width,
            device=torch_device,
            dtype=torch.bfloat16,
            seed=opts.seed,
        )
        # a seed is only used once; the next sample draws a fresh one
        opts.seed = None
        if offload:
            t5, clip, ae = t5.to(torch_device), clip.to(torch_device), ae.to(torch_device)
        inp = prepare_fill(
            t5,
            clip,
            x,
            prompt=opts.prompt,
            ae=ae,
            img_cond_path=opts.img_cond_path,
            mask_path=opts.img_mask_path,
        )

        timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell"))

        # offload TEs and AE to CPU, load model to gpu
        if offload:
            t5, clip, ae = t5.cpu(), clip.cpu(), ae.cpu()
            torch.cuda.empty_cache()
            model = model.to(torch_device)

        # denoise initial noise
        x = denoise_cache(model, **inp, timesteps=timesteps, guidance=opts.guidance)

        # offload model, load autoencoder to gpu
        if offload:
            model.cpu()
            torch.cuda.empty_cache()
            ae.decoder.to(x.device)

        # decode latents to pixel space
        x = unpack(x.float(), opts.height, opts.width)
        with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
            x = ae.decode(x)

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        t1 = time.perf_counter()
        print(f"Done in {t1 - t0:.1f}s")

        # pass the prompt this sample was generated with; in loop mode it may
        # differ from the `prompt` argument
        idx = save_image(nsfw_classifier, name, output_name, idx, x, add_sampling_metadata, opts.prompt)

        if loop:
            print("-" * 80)
            opts = _ask_next_fill_options(opts)
        else:
            opts = None


def app():
    """Expose `main` as a command line interface through `Fire`."""
    Fire(main)


# Only start the command line interface when this file is run as a script.
if __name__ == "__main__":
    app()


================================================
FILE: flux-ToCa/src/flux/cli_redux.py
================================================
import os
import re
import time
from dataclasses import dataclass
from glob import iglob

import torch
from fire import Fire
from transformers import pipeline

from flux.modules.image_embedders import ReduxImageEncoder
from flux.sampling import denoise, get_noise, get_schedule, prepare_redux, unpack
from flux.ideas import denoise_cache
from flux.util import configs, load_ae, load_clip, load_flow_model, load_t5, save_image


@dataclass
class SamplingOptions:
    """Settings for one redux sample; the interactive prompts update them in place."""

    prompt: str  # text prompt; `main` initialises it with an empty string
    width: int  # sample width in pixels; `/w` rounds it down to a multiple of 16
    height: int  # sample height in pixels; `/h` rounds it down to a multiple of 16
    num_steps: int  # number of sampling steps
    guidance: float  # guidance value handed to the denoiser
    seed: int | None  # None makes `main` draw a fresh seed for the next sample
    img_cond_path: str  # path to the conditioning image


def parse_prompt(options: SamplingOptions) -> SamplingOptions | None:
    """
    Interactively handle slash commands that change the sampling options.

    Commands (`/w`, `/h`, `/s`, `/g`, `/n`) update `options` in place and the
    question is asked again. A bare `/h` prints the usage text. A malformed
    command, including one whose value is not a number, is reported and
    ignored instead of raising. Any answer that does not start with a slash
    ends the dialogue and leaves `options` untouched.

    Args:
        options: sampling options to update

    Returns:
        The updated options, or None if the user entered `/q`.
    """
    user_question = "Write /h for help, /q to quit and leave empty to repeat):\n"
    usage = (
        "Usage: Leave this field empty to do nothing "
        "or write a command starting with a slash:\n"
        "- '/w <width>' will set the width of the generated image\n"
        "- '/h <height>' will set the height of the generated image\n"
        "- '/s <seed>' sets the next seed\n"
        "- '/g <guidance>' sets the guidance (flux-dev only)\n"
        "- '/n <steps>' sets the number of steps\n"
        "- '/q' to quit"
    )

    while (prompt := input(user_question)).startswith("/"):
        if prompt.startswith("/q"):
            print("Quitting")
            return None

        # "/h" without a value is the help command, not a broken height command
        if prompt.strip() == "/h":
            print(usage)
            continue

        if not prompt.startswith(("/w", "/h", "/g", "/s", "/n")):
            # the message already contains the usage text
            print(f"Got invalid command '{prompt}'\n{usage}")
            continue

        # exactly "<command> <value>": one space and two non-empty parts
        parts = prompt.split()
        if prompt.count(" ") != 1 or len(parts) != 2:
            print(f"Got invalid command '{prompt}'\n{usage}")
            continue
        value = parts[1]

        try:
            if prompt.startswith("/w"):
                # round down to a multiple of 16
                options.width = 16 * (int(value) // 16)
                print(
                    f"Setting resolution to {options.width} x {options.height} "
                    f"({options.height *options.width/1e6:.2f}MP)"
                )
            elif prompt.startswith("/h"):
                options.height = 16 * (int(value) // 16)
                print(
                    f"Setting resolution to {options.width} x {options.height} "
                    f"({options.height *options.width/1e6:.2f}MP)"
                )
            elif prompt.startswith("/g"):
                options.guidance = float(value)
                print(f"Setting guidance to {options.guidance}")
            elif prompt.startswith("/s"):
                options.seed = int(value)
                print(f"Setting seed to {options.seed}")
            else:
                options.num_steps = int(value)
                print(f"Setting number of steps to {options.num_steps}")
        except ValueError:
            # non-numeric value: keep the old setting and ask again
            print(f"Got invalid command '{prompt}'\n{usage}")

    return options


def parse_img_cond_path(options: SamplingOptions | None) -> SamplingOptions | None:
    """
    Interactively ask for the next conditioning image.

    An empty answer keeps the current `options.img_cond_path`. `/q` quits and
    any other slash command prints the usage text. An answer that names an
    existing image file is always treated as a path, even when it starts with
    a slash, so absolute POSIX paths can be used.

    Args:
        options: current sampling options, or None if an earlier prompt
            already ended the session

    Returns:
        The (possibly updated) options, or None if the user quit or
        `options` was None.
    """
    if options is None:
        return None

    user_question = "Next conditioning image (write /h for help, /q to quit and leave empty to repeat):\n"
    usage = (
        "Usage: Either write your prompt directly, leave this field empty "
        "to repeat the conditioning image or write a command starting with a slash:\n"
        "- '/q' to quit"
    )
    image_suffixes = (".jpg", ".jpeg", ".png", ".webp")

    while True:
        img_cond_path = input(user_question)

        if img_cond_path == "":
            break

        is_image_file = os.path.isfile(img_cond_path) and img_cond_path.lower().endswith(image_suffixes)

        # A leading slash only marks a command when the input is not an
        # existing image file; otherwise absolute paths could never be entered.
        if img_cond_path.startswith("/") and not is_image_file:
            if img_cond_path.startswith("/q"):
                print("Quitting")
                return None
            if img_cond_path.startswith("/h"):
                print(usage)
            else:
                # the message already contains the usage text
                print(f"Got invalid command '{img_cond_path}'\n{usage}")
            continue

        if not is_image_file:
            print(f"File '{img_cond_path}' does not exist or is not a valid image file")
            continue

        options.img_cond_path = img_cond_path
        break

    return options


@torch.inference_mode()
def main(
    name: str = "flux-dev",
    width: int = 1360,
    height: int = 768,
    seed: int | None = None,
    device: str = "cuda" if torch.cuda.is_available() else "cpu",
    num_steps: int | None = None,
    loop: bool = False,
    guidance: float = 2.5,
    offload: bool = False,
    output_dir: str = "output",
    add_sampling_metadata: bool = True,
    img_cond_path: str = "assets/robot.webp",
):
    """
    Sample the flux model. Either interactively (set `--loop`) or run for a
    single image.

    Args:
        name: Name of the model to load
        height: height of the sample in pixels (should be a multiple of 16)
        width: width of the sample in pixels (should be a multiple of 16)
        seed: Set a seed for sampling
        device: Pytorch device
        num_steps: number of sampling steps (default 4 for schnell, 50 otherwise)
        loop: start an interactive session and sample multiple times
        guidance: guidance value used for guidance distillation
        offload: keep the components that are not currently needed on the CPU
        output_dir: directory the images are written to as `img_{idx}.jpg`;
            numbering continues after the highest existing index
        add_sampling_metadata: Add the prompt to the image Exif metadata
        img_cond_path: path to conditioning image (jpeg/png/webp)
    """
    nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)

    if name not in configs:
        available = ", ".join(configs.keys())
        raise ValueError(f"Got unknown model name: {name}, chose from {available}")

    torch_device = torch.device(device)
    if num_steps is None:
        num_steps = 4 if name == "flux-schnell" else 50

    # continue numbering after the highest existing img_<n>.jpg in output_dir
    output_name = os.path.join(output_dir, "img_{idx}.jpg")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        idx = 0
    else:
        fns = [fn for fn in iglob(output_name.format(idx="*")) if re.search(r"img_[0-9]+\.jpg$", fn)]
        if len(fns) > 0:
            idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1
        else:
            idx = 0

    # init all components
    t5 = load_t5(torch_device, max_length=256 if name == "flux-schnell" else 512)
    clip = load_clip(torch_device)
    model = load_flow_model(name, device="cpu" if offload else torch_device)
    ae = load_ae(name, device="cpu" if offload else torch_device)
    img_embedder = ReduxImageEncoder(torch_device)

    rng = torch.Generator(device="cpu")
    # this entry point has no prompt argument; the text prompt stays empty
    prompt = ""
    opts = SamplingOptions(
        prompt=prompt,
        width=width,
        height=height,
        num_steps=num_steps,
        guidance=guidance,
        seed=seed,
        img_cond_path=img_cond_path,
    )

    if loop:
        opts = parse_prompt(opts)
        opts = parse_img_cond_path(opts)

    # opts becomes None when the user quits or after the single non-loop sample
    while opts is not None:
        if opts.seed is None:
            opts.seed = rng.seed()
        print(f"Generating with seed {opts.seed}:\n{opts.prompt}")
        t0 = time.perf_counter()

        # prepare input
        x = get_noise(
            1,
            opts.height,
            opts.width,
            device=torch_device,
            dtype=torch.bfloat16,
            seed=opts.seed,
        )
        # a seed is only used once; the next sample draws a fresh one
        opts.seed = None
        if offload:
            ae = ae.cpu()
            torch.cuda.empty_cache()
            t5, clip = t5.to(torch_device), clip.to(torch_device)
        inp = prepare_redux(
            t5,
            clip,
            x,
            prompt=opts.prompt,
            encoder=img_embedder,
            img_cond_path=opts.img_cond_path,
        )
        timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell"))

        # offload TEs to CPU, load model to gpu
        if offload:
            t5, clip = t5.cpu(), clip.cpu()
            torch.cuda.empty_cache()
            model = model.to(torch_device)

        # denoise initial noise
        x = denoise_cache(model, **inp, timesteps=timesteps, guidance=opts.guidance)

        # offload model, load autoencoder to gpu
        if offload:
            model.cpu()
            torch.cuda.empty_cache()
            ae.decoder.to(x.device)

        # decode latents to pixel space
        x = unpack(x.float(), opts.height, opts.width)
        with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
            x = ae.decode(x)

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        t1 = time.perf_counter()
        print(f"Done in {t1 - t0:.1f}s")

        idx = save_image(nsfw_classifier, name, output_name, idx, x, add_sampling_metadata, prompt)

        if loop:
            print("-" * 80)
            opts = parse_prompt(opts)
            opts = parse_img_cond_path(opts)
        else:
            opts = None


def app():
    """Expose `main` as a command line interface through `Fire`."""
    Fire(main)


# Only start the command line interface when this file is run as a script.
if __name__ == "__main__":
    app()


================================================
FILE: flux-ToCa/src/flux/ideas/__init__.py
================================================
from .cache_denoise import denoise_cache

================================================
FILE: flux-ToCa/src/flux/ideas/cache_denoise.py
================================================
import torch
from ..model import Flux
from torch import Tensor
from ..modules.cache_functions import cache_init

def denoise_cache(
    model: Flux,
    # model input
    img: Tensor,
    img_ids: Tensor,
    txt: Tensor,
    txt_ids: Tensor,
    vec: Tensor,
    # sampling parameters
    timesteps: list[float],
    guidance: float = 4.0,
):
    """
    Run the sampling loop while threading the cache state through the model.

    For every consecutive pair `(t_curr, t_prev)` of `timesteps` the model is
    evaluated at `t_curr` and `img` is advanced by `(t_prev - t_curr) * pred`.
    The dictionaries returned by `cache_init` are handed to every model call;
    `current` carries the step index, the total number of steps and the
    current time.

    Returns:
        The image tensor after the last step.
    """
    # init cache
    cache_dic, current = cache_init(timesteps)

    # this is ignored for schnell
    guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)

    total_steps = len(timesteps) - 1
    current['step'] = 0
    current['num_steps'] = total_steps

    for index in range(total_steps):
        t_curr = timesteps[index]
        t_prev = timesteps[index + 1]
        current['t'] = t_curr

        # built from the current img on every step, so dtype/device follow it
        t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)

        pred = model(
            img=img,
            img_ids=img_ids,
            txt=txt,
            txt_ids=txt_ids,
            y=vec,
            timesteps=t_vec,
            cache_dic=cache_dic,
            current=current,
            guidance=guidance_vec,
        )

        step_size = t_prev - t_curr
        img = img + step_size * pred
        current['step'] += 1

    return img


================================================
FILE: flux-ToCa/src/flux/math.py
================================================
import torch
from einops import rearrange
from torch import Tensor


def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, **kwargs) -> Tensor:
    """
    Apply rotary embeddings to `q`/`k`, run dot-product attention and merge
    the head dimension into the feature dimension ("B H L D -> B L (H D)").

    Optional keyword arguments:
        cache_dic: cache dictionary; when its 'cache_type' is 'attention' the
            attention score is stored in cache_dic['attn_map'] under the
            stream and layer named by `current`
        current: dictionary with the 'stream' and 'layer' being processed
    """
    cache_dic = kwargs.get('cache_dic', None)
    current = kwargs.get('current', None)

    q, k = apply_rope(q, k, pe)

    # decide before the attention call, as the branch order did originally
    keep_score = cache_dic is not None and cache_dic['cache_type'] == 'attention'

    # the explicit implementation is used in every case because it also
    # returns the score
    out, score = dot_product_attention(q, k, v)
    if keep_score:
        cache_dic['attn_map'][-1][current['stream']][current['layer']]['total'] = score

    return rearrange(out, "B H L D -> B L (H D)")

def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    """Build 2x2 rotation matrices for rotary position embeddings.

    `dim` must be even. For every position and each of the dim/2 frequencies
    the matrix [[cos, -sin], [sin, cos]] is produced; the result is returned
    as float32 with two trailing axes of size 2.
    """
    assert dim % 2 == 0
    exponents = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
    inv_freq = 1.0 / (theta**exponents)
    angles = torch.einsum("...n,d->...nd", pos, inv_freq)
    cos, sin = torch.cos(angles), torch.sin(angles)
    # Flattened row-major entries of the rotation matrix.
    flat = torch.stack([cos, -sin, sin, cos], dim=-1)
    matrices = rearrange(flat, "b n d (i j) -> b n d i j", i=2, j=2)
    return matrices.float()


def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
    """Rotate query and key tensors with the rotation matrices in `freqs_cis`.

    The last axis of each input is viewed as consecutive pairs; each pair is
    multiplied by the matching 2x2 matrix. The computation runs in float32 and
    the results are cast back to the dtype of the corresponding input.
    """

    def _rotate(x: Tensor) -> Tensor:
        # (..., D) -> (..., D/2, 1, 2): one pair per rotation matrix.
        pairs = x.float().reshape(*x.shape[:-1], -1, 1, 2)
        first, second = pairs[..., 0], pairs[..., 1]
        rotated = freqs_cis[..., 0] * first + freqs_cis[..., 1] * second
        return rotated.reshape(*x.shape).type_as(x)

    return _rotate(xq), _rotate(xk)

############################################################################################################

import math

def dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0,
        is_causal=False, scale=None, enable_gqa=False) -> tuple[torch.Tensor, torch.Tensor]:
    """Explicit scaled dot-product attention that also returns a token score.

    Mirrors the reference implementation of
    torch.nn.functional.scaled_dot_product_attention, but materialises the
    attention map so it can be reused for token-importance scoring.

    Args:
        query, key, value: attention inputs; the last two axes are
            (sequence, head_dim).
        attn_mask: optional mask. Boolean masks mark positions that may be
            attended to; other dtypes are added to the attention logits.
            Must be broadcastable to the logits.
        dropout_p: dropout probability applied to the attention map. Dropout
            is always applied in training mode, so pass 0.0 for inference.
        is_causal: apply a lower-triangular causal mask (requires
            attn_mask to be None).
        scale: logit scale; defaults to 1 / sqrt(head_dim).
        enable_gqa: repeat key/value heads to match the number of query heads.

    Returns:
        (output, score): the attention output, and the attention map averaged
        over axis 1 twice (heads, then queries, for 4-D inputs).
    """
    L, S = query.size(-2), key.size(-2)
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
    attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
    if is_causal:
        assert attn_mask is None
        # Build the mask on the same device as the bias; a CPU mask cannot be
        # used to fill a CUDA tensor.
        temp_mask = torch.ones(L, S, dtype=torch.bool, device=query.device).tril(diagonal=0)
        attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))

    if attn_mask is not None:
        # Out-of-place so that masks with extra broadcast (batch/head) axes
        # are accepted, not just (L, S) masks.
        if attn_mask.dtype == torch.bool:
            attn_bias = attn_bias.masked_fill(attn_mask.logical_not(), float("-inf"))
        else:
            attn_bias = attn_bias + attn_mask

    if enable_gqa:
        key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
        value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)

    attn_weight = torch.matmul(query, key.transpose(-2, -1)) * scale_factor
    attn_weight += attn_bias

    attn_map = torch.softmax(attn_weight, dim=-1)
    attn_weight = torch.dropout(attn_map, dropout_p, train=True)
    return torch.matmul(attn_weight, value), attn_map.mean(dim=1).mean(dim=1)

================================================
FILE: flux-ToCa/src/flux/model.py
================================================
from dataclasses import dataclass

import torch
from torch import Tensor, nn

from flux.modules.layers import (
    DoubleStreamBlock,
    EmbedND,
    LastLayer,
    MLPEmbedder,
    SingleStreamBlock,
    timestep_embedding,
)
from flux.modules.lora import LinearLora, replace_linear_with_lora
from flux.modules.cache_functions import cal_type

@dataclass
class FluxParams:
    """Hyper-parameters consumed by `Flux.__init__`."""

    # Input feature size of the image-token projection (`img_in`).
    in_channels: int
    # Output channel count handed to the final layer.
    out_channels: int
    # Input size of the `vector_in` embedder (applied to `y` in forward).
    vec_in_dim: int
    # Input feature size of the text-token projection (`txt_in`).
    context_in_dim: int
    # Transformer width; must be divisible by `num_heads`.
    hidden_size: int
    # MLP expansion ratio forwarded to every double/single stream block.
    mlp_ratio: float
    # Number of attention heads.
    num_heads: int
    # Number of DoubleStreamBlock layers.
    depth: int
    # Number of SingleStreamBlock layers.
    depth_single_blocks: int
    # Per-axis positional-embedding sizes; must sum to hidden_size // num_heads.
    axes_dim: list[int]
    # Theta value forwarded to the positional embedder (EmbedND).
    theta: int
    # Forwarded as `qkv_bias` to the double stream blocks.
    qkv_bias: bool
    # When True a guidance embedder is built and `guidance` is required in forward.
    guidance_embed: bool


class Flux(nn.Module):
    """
    Transformer model for flow matching on sequences.

    The forward pass threads a cache dictionary (`cache_dic`) and a progress
    dictionary (`current`) through every block so that the token-caching
    helpers can decide per step and per layer what to recompute.
    """

    def __init__(self, params: FluxParams):
        """Build embedders, the double/single stream block stacks and the final layer.

        Raises:
            ValueError: if hidden_size is not divisible by num_heads, or if
                sum(axes_dim) differs from hidden_size // num_heads.
        """
        super().__init__()

        self.params = params
        self.in_channels = params.in_channels
        self.out_channels = params.out_channels
        if params.hidden_size % params.num_heads != 0:
            raise ValueError(
                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
            )
        # Per-head width; the positional embedding must cover exactly this many dims.
        pe_dim = params.hidden_size // params.num_heads
        if sum(params.axes_dim) != pe_dim:
            raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
        self.hidden_size = params.hidden_size
        self.num_heads = params.num_heads
        self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
        self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
        # Timestep (and guidance) values are first expanded to 256 features
        # by timestep_embedding in forward().
        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
        self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
        # Identity placeholder when the model has no guidance embedding.
        self.guidance_in = (
            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity()
        )
        self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)

        self.double_blocks = nn.ModuleList(
            [
                DoubleStreamBlock(
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=params.mlp_ratio,
                    qkv_bias=params.qkv_bias,
                )
                for _ in range(params.depth)
            ]
        )

        self.single_blocks = nn.ModuleList(
            [
                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio)
                for _ in range(params.depth_single_blocks)
            ]
        )

        self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)

    def forward(
        self,
        img: Tensor,
        img_ids: Tensor,
        txt: Tensor,
        txt_ids: Tensor,
        timesteps: Tensor,
        y: Tensor,
        guidance: Tensor | None = None,
        *args,
        **kwargs,
    ) -> Tensor:
        """Run one model evaluation.

        Args:
            img, txt: 3-D image and text token tensors.
            img_ids, txt_ids: position ids, concatenated (text first) along
                dim 1 before positional embedding.
            timesteps: timestep values, embedded to 256 features.
            y: vector conditioning passed through `vector_in`.
            guidance: guidance strength; required when
                `params.guidance_embed` is True.
            **kwargs: `cache_dic` and `current` are read from here.
                NOTE(review): both default to None, but `cal_type` and the
                block loops below subscript them, so as written they must
                always be supplied.

        Raises:
            ValueError: if img or txt is not 3-D, or if guidance is missing
                for a guidance-embedding model.
        """
        if img.ndim != 3 or txt.ndim != 3:
            raise ValueError("Input img and txt tensors must have 3 dimensions.")
        
        cache_dic = kwargs.get('cache_dic', None)
        current = kwargs.get('current', None)
        
        # running on sequences img
        img = self.img_in(img)
        vec = self.time_in(timestep_embedding(timesteps, 256))
        if self.params.guidance_embed:
            if guidance is None:
                raise ValueError("Didn't get guidance strength for guidance distilled model.")
            vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
        vec = vec + self.vector_in(y)
        txt = self.txt_in(txt)

        # Text ids come first, matching the (txt, img) concatenation below.
        ids = torch.cat((txt_ids, img_ids), dim=1)
        pe = self.pe_embedder(ids)

        # Decide whether this step is a full computation or a cached ('ToCa')
        # one; the result is written to current['type'].
        cal_type(cache_dic=cache_dic, current=current)

        for i, block in enumerate(self.double_blocks):
            current['layer'] = i
            img, txt = block(img=img, txt=txt, vec=vec, pe=pe, cache_dic=cache_dic, current=current)

        img = torch.cat((txt, img), 1)
        # The layer index restarts at 0 for the single-stream stack.
        for i, block in enumerate(self.single_blocks):
            current['layer'] = i
            img = block(img, vec=vec, pe=pe, cache_dic=cache_dic, current=current)
        # Drop the text tokens that were prepended above.
        img = img[:, txt.shape[1] :, ...]

        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
        return img


class FluxLoraWrapper(Flux):
    """Flux model whose linear layers are swapped for LoRA layers after construction."""

    def __init__(
        self,
        lora_rank: int = 128,
        lora_scale: float = 1.0,
        *args,
        **kwargs,
    ) -> None:
        """Build the base model, then replace its linear layers with LoRA variants.

        Remaining positional and keyword arguments are forwarded to `Flux`.
        """
        super().__init__(*args, **kwargs)
        self.lora_rank = lora_rank
        replace_linear_with_lora(self, max_rank=lora_rank, scale=lora_scale)

    def set_lora_scale(self, scale: float) -> None:
        """Set `scale` on every LinearLora submodule."""
        lora_layers = (m for m in self.modules() if isinstance(m, LinearLora))
        for layer in lora_layers:
            layer.set_scale(scale=scale)


================================================
FILE: flux-ToCa/src/flux/modules/autoencoder.py
================================================
from dataclasses import dataclass

import torch
from einops import rearrange
from torch import Tensor, nn


@dataclass
class AutoEncoderParams:
    """Configuration consumed by `AutoEncoder.__init__`."""

    # Forwarded to Encoder and Decoder as `resolution`.
    resolution: int
    # Channel count of the encoder's input convolution.
    in_channels: int
    # Base channel width; each level uses ch * ch_mult[level] channels.
    ch: int
    # Channel count of the decoder's output convolution.
    out_ch: int
    # Per-resolution channel multipliers; its length is the number of levels.
    ch_mult: list[int]
    # Residual blocks per level (the decoder builds one extra per level).
    num_res_blocks: int
    # Latent channels; the encoder emits 2 * z_channels (split into mean / logvar).
    z_channels: int
    # encode(): z = scale_factor * (z - shift_factor); decode() inverts this.
    scale_factor: float
    shift_factor: float


def swish(x: Tensor) -> Tensor:
    """Swish activation: the input multiplied by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate


class AttnBlock(nn.Module):
    """Residual single-head self-attention over the spatial positions of a feature map."""

    def __init__(self, in_channels: int):
        super().__init__()
        self.in_channels = in_channels

        self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)

        # 1x1 convolutions acting as per-pixel linear projections.
        # Creation order (q, k, v, proj_out) is kept so registration matches.
        def pointwise() -> nn.Conv2d:
            return nn.Conv2d(in_channels, in_channels, kernel_size=1)

        self.q = pointwise()
        self.k = pointwise()
        self.v = pointwise()
        self.proj_out = pointwise()

    def attention(self, h_: Tensor) -> Tensor:
        """Normalise the input and attend across all h*w positions; returns (b, c, h, w)."""
        normed = self.norm(h_)
        query = self.q(normed)
        key = self.k(normed)
        value = self.v(normed)

        b, c, h, w = query.shape
        # Flatten the spatial grid into a sequence with a singleton head axis.
        to_sequence = "b c h w -> b 1 (h w) c"
        query = rearrange(query, to_sequence).contiguous()
        key = rearrange(key, to_sequence).contiguous()
        value = rearrange(value, to_sequence).contiguous()
        attended = nn.functional.scaled_dot_product_attention(query, key, value)

        return rearrange(attended, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)

    def forward(self, x: Tensor) -> Tensor:
        """Return the input plus the projected attention output."""
        update = self.proj_out(self.attention(x))
        return x + update


class ResnetBlock(nn.Module):
    """Two (GroupNorm -> swish -> 3x3 conv) stages with a residual connection.

    When the channel count changes, the skip path goes through a 1x1
    convolution (`nin_shortcut`).
    """

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        # None means "keep the input width".
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels

        self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        if self.in_channels != self.out_channels:
            self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        """Return skip(x) + residual(x)."""
        residual = self.conv1(swish(self.norm1(x)))
        residual = self.conv2(swish(self.norm2(residual)))

        # Project the skip path only when the widths differ.
        skip = self.nin_shortcut(x) if self.in_channels != self.out_channels else x
        return skip + residual


class Downsample(nn.Module):
    """Stride-2 3x3 convolution preceded by manual right/bottom zero padding."""

    def __init__(self, in_channels: int):
        super().__init__()
        # torch convs cannot pad asymmetrically, so padding is applied by hand in forward().
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)

    def forward(self, x: Tensor):
        """Pad one zero column on the right and one zero row at the bottom, then convolve."""
        padded = nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.conv(padded)


class Upsample(nn.Module):
    """Nearest-neighbour 2x upsampling followed by a 3x3 convolution."""

    def __init__(self, in_channels: int):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x: Tensor):
        """Double the spatial size, then convolve."""
        enlarged = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        return self.conv(enlarged)


class Encoder(nn.Module):
    """Convolutional encoder: input conv, per-level ResNet stacks with
    downsampling between levels, a ResNet/attention/ResNet middle, and an
    output conv producing 2 * z_channels channels."""

    def __init__(
        self,
        resolution: int,
        in_channels: int,
        ch: int,
        ch_mult: list[int],
        num_res_blocks: int,
        z_channels: int,
    ):
        """
        Args:
            resolution: stored on the module; only used here to track
                `curr_res` during construction.
            in_channels: channels of the input tensor.
            ch: base channel width.
            ch_mult: per-level multipliers; level i outputs ch * ch_mult[i] channels.
            num_res_blocks: ResnetBlocks per level.
            z_channels: latent channels; the output has twice as many.
        """
        super().__init__()
        self.ch = ch
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        # downsampling
        self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)

        # curr_res is bookkeeping only; nothing below reads it.
        curr_res = resolution
        # Input multiplier of level i is the output multiplier of level i-1 (1 for the first level).
        in_ch_mult = (1,) + tuple(ch_mult)
        self.in_ch_mult = in_ch_mult
        self.down = nn.ModuleList()
        block_in = self.ch
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            # No attention layers are ever appended here, so `attn` stays empty.
            attn = nn.ModuleList()
            block_in = ch * in_ch_mult[i_level]
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks):
                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
                block_in = block_out
            # Plain nn.Module used as a named container for this level.
            down = nn.Module()
            down.block = block
            down.attn = attn
            # Every level except the last halves the spatial size.
            if i_level != self.num_resolutions - 1:
                down.downsample = Downsample(block_in)
                curr_res = curr_res // 2
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)

        # end
        self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
        self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x: Tensor) -> Tensor:
        """Encode `x`; the result has 2 * z_channels channels."""
        # downsampling
        # `hs` collects every intermediate activation; only hs[-1] is consumed.
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
                h = self.down[i_level].block[i_block](hs[-1])
                # Skipped in practice: __init__ leaves the attn lists empty.
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
                hs.append(h)
            if i_level != self.num_resolutions - 1:
                hs.append(self.down[i_level].downsample(hs[-1]))

        # middle
        h = hs[-1]
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)
        # end
        h = self.norm_out(h)
        h = swish(h)
        h = self.conv_out(h)
        return h


class Decoder(nn.Module):
    """Convolutional decoder: input conv from the latent, a
    ResNet/attention/ResNet middle, per-level ResNet stacks with upsampling
    between levels, and an output conv producing `out_ch` channels."""

    def __init__(
        self,
        ch: int,
        out_ch: int,
        ch_mult: list[int],
        num_res_blocks: int,
        in_channels: int,
        resolution: int,
        z_channels: int,
    ):
        """
        Args:
            ch: base channel width.
            out_ch: channels of the decoded output.
            ch_mult: per-level multipliers; level i uses ch * ch_mult[i] channels.
            num_res_blocks: each level builds num_res_blocks + 1 ResnetBlocks.
            in_channels: stored on the module; not otherwise used here.
            resolution: used to derive the latent spatial size in `z_shape`.
            z_channels: channels of the latent input.
        """
        super().__init__()
        self.ch = ch
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        # Total spatial upscaling factor: one 2x upsample per level except level 0.
        self.ffactor = 2 ** (self.num_resolutions - 1)

        # compute in_ch_mult, block_in and curr_res at lowest res
        block_in = ch * ch_mult[self.num_resolutions - 1]
        curr_res = resolution // 2 ** (self.num_resolutions - 1)
        self.z_shape = (1, z_channels, curr_res, curr_res)

        # z to block_in
        self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)

        # upsampling
        self.up = nn.ModuleList()
        # Levels are built from the lowest resolution (highest index) upward.
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            # No attention layers are ever appended here, so `attn` stays empty.
            attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks + 1):
                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
                block_in = block_out
            # Plain nn.Module used as a named container for this level.
            up = nn.Module()
            up.block = block
            up.attn = attn
            # Every level except level 0 doubles the spatial size.
            if i_level != 0:
                up.upsample = Upsample(block_in)
                curr_res = curr_res * 2
            self.up.insert(0, up)  # prepend to get consistent order

        # end
        self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
        self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)

    def forward(self, z: Tensor) -> Tensor:
        """Decode latent `z` into a tensor with `out_ch` channels."""
        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)

        # upsampling
        # self.up is indexed by level, so iterate from the highest index down,
        # mirroring the construction order.
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks + 1):
                h = self.up[i_level].block[i_block](h)
                # Skipped in practice: __init__ leaves the attn lists empty.
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h)
            if i_level != 0:
                h = self.up[i_level].upsample(h)

        # end
        h = self.norm_out(h)
        h = swish(h)
        h = self.conv_out(h)
        return h


class DiagonalGaussian(nn.Module):
    """Split the input into (mean, logvar) halves and optionally draw a sample.

    With `sample=True` the output is mean + exp(0.5 * logvar) * noise;
    otherwise the mean is returned unchanged.
    """

    def __init__(self, sample: bool = True, chunk_dim: int = 1):
        super().__init__()
        self.sample = sample
        self.chunk_dim = chunk_dim

    def forward(self, z: Tensor) -> Tensor:
        mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
        if not self.sample:
            return mean
        std = torch.exp(0.5 * logvar)
        noise = torch.randn_like(mean)
        return mean + std * noise


class AutoEncoder(nn.Module):
    """Encoder/decoder pair with a diagonal-Gaussian latent and affine latent rescaling."""

    def __init__(self, params: AutoEncoderParams):
        super().__init__()
        # Arguments common to both halves of the autoencoder.
        shared = dict(
            resolution=params.resolution,
            in_channels=params.in_channels,
            ch=params.ch,
            ch_mult=params.ch_mult,
            num_res_blocks=params.num_res_blocks,
            z_channels=params.z_channels,
        )
        self.encoder = Encoder(**shared)
        self.decoder = Decoder(out_ch=params.out_ch, **shared)
        self.reg = DiagonalGaussian()

        self.scale_factor = params.scale_factor
        self.shift_factor = params.shift_factor

    def encode(self, x: Tensor) -> Tensor:
        """Encode `x`, pass it through the latent regulariser, then shift and scale."""
        latent = self.reg(self.encoder(x))
        return self.scale_factor * (latent - self.shift_factor)

    def decode(self, z: Tensor) -> Tensor:
        """Undo the latent scaling/shift and run the decoder."""
        unscaled = z / self.scale_factor + self.shift_factor
        return self.decoder(unscaled)

    def forward(self, x: Tensor) -> Tensor:
        """Encode followed by decode."""
        latent = self.encode(x)
        return self.decode(latent)


================================================
FILE: flux-ToCa/src/flux/modules/cache_functions/__init__.py
================================================
from .cache_cutfresh import cache_cutfresh
from .fresh_ratio_scheduler import fresh_ratio_scheduler
from .score_evaluate import score_evaluate
from .global_force_fresh import global_force_fresh
from .cache_cutfresh import cache_cutfresh
from .update_cache import update_cache
from .force_init import force_init
from .attention import cached_attention_forward
from .cache_init import cache_init
from .cal_type import cal_type
from .force_scheduler import force_scheduler
from .support_set_selection import support_set_selection

================================================
FILE: flux-ToCa/src/flux/modules/cache_functions/attention.py
================================================
# Besides, re-arrange the attention module
from torch.jit import Final
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Union
#from xformers.ops.fmha.attn_bias import BlockDiagonalMask
def cached_attention_forward(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    #attn_bias: Optional[Union[torch.Tensor, BlockDiagonalMask]] = None,
    attn_bias,
    p: float = 0.0,
    scale: Optional[float] = None
) -> tuple[torch.Tensor, torch.Tensor]:
    """Explicit attention that also returns the head-averaged attention map.

    Args:
        query, key, value: tensors whose axes 1 and 2 are swapped before the
            matmul (presumably (B, N, H, D) inputs — confirm against callers).
        attn_bias: None, an object exposing `materialize(shape, dtype, device)`
            (e.g. an xformers attention bias), or a plain tensor that is
            broadcast-added to the attention logits.
        p: dropout probability applied to the attention map (F.dropout's
            default training mode is used).
        scale: multiplier applied to the query; defaults to
            1 / sqrt(query.shape[-1]) when None.

    Returns:
        (output, attn_map): the attention output with axes 1 and 2 swapped
        back, and the softmax attention map averaged over dim 1.
    """
    # Honour an explicit scale; previously the argument was always overwritten.
    if scale is None:
        scale = 1.0 / query.shape[-1] ** 0.5
    query = query * scale
    query = query.transpose(1, 2)
    key = key.transpose(1, 2)
    value = value.transpose(1, 2)
    attn = query @ key.transpose(-2, -1)
    if attn_bias is not None:
        # Mask objects must be materialised into a dense bias first; plain
        # tensors can be added as they are.
        if hasattr(attn_bias, 'materialize'):
            attn_bias = attn_bias.materialize(shape= attn.shape, dtype= attn.dtype, device= attn.device)
        attn = attn + attn_bias
    attn_map = attn.softmax(-1)
    attn = F.dropout(attn_map, p)
    attn = attn @ value

    return attn.transpose(1, 2).contiguous(), attn_map.mean(dim=1)

================================================
FILE: flux-ToCa/src/flux/modules/cache_functions/cache_cutfresh.py
================================================
from .fresh_ratio_scheduler import fresh_ratio_scheduler
from .score_evaluate import score_evaluate
#from .token_merge import token_merge
from .support_set_selection import support_set_selection
import torch
def cache_cutfresh(cache_dic, tokens, current):
    '''
    Cut fresh tokens from the input tokens and update the cache counter.

    cache_dic: dict, the cache dictionary containing cache(main extra memory cost), indices and some other information.
    tokens: torch.Tensor, the input tokens to be cut; gathered along dim 1 with indices expanded over the last dim.
    current: dict, the current step, layer, and module information. Particularly convenient for debugging.

    Returns (fresh_indices, fresh_tokens): the indices of the highest-scoring
    tokens for each batch element and the corresponding rows of `tokens`.
    '''
    layer = current['layer']
    module = current['module']

    # Scheduled ratio for this step/layer, clamped to [0, 1].
    fresh_ratio = fresh_ratio_scheduler(cache_dic, current)
    fresh_ratio = torch.clamp(torch.tensor(fresh_ratio, device = tokens.device), min=0, max=1)

    # Generate the index tensor for fresh tokens
    score = score_evaluate(cache_dic, tokens, current) # s1, s2, s3 mentioned in the paper
    #score = local_selection_with_bonus(score, 0.4, 4) # Uniform Spatial Distribution s4 mentioned in the paper
    indices = score.argsort(dim=-1, descending=True)
    topk = int(fresh_ratio * score.shape[1])
    fresh_indices = indices[:, :topk]

    #fresh_indices = support_set_selection(tokens, fresh_ratio, 0.4, current, cache_dic) # (B, fresh_ratio * N) # 0.4

    # Updating the Cache Frequency Score s3 mentioned in the paper
    # stale tokens index + 1 in each ***module***, fresh tokens index = 0
    cache_dic['cache_index'][-1][layer][module] += 1
    # Scalar scatter: reset the counters of the refreshed tokens without
    # allocating a zero tensor.
    cache_dic['cache_index'][-1][layer][module].scatter_(dim=1, index=fresh_indices, value=0)

    # Expand the (B, topk) indices across the feature axis for gather.
    fresh_indices_expand = fresh_indices.unsqueeze(-1).expand(-1, -1, tokens.shape[-1])

    fresh_tokens = torch.gather(input = tokens, dim = 1, index = fresh_indices_expand)
    return fresh_indices, fresh_tokens
    
def local_selection_with_bonus(score, bonus_ratio, grid_size=2):
    '''
    Boost the highest-scoring token of every grid_size x grid_size block.

    score: torch.Tensor of shape (batch_size, num_tokens); the tokens are
        treated as a row-major square image.
    bonus_ratio: the block maximum receives an extra max_score * bonus_ratio.
    grid_size: side length of the square blocks.

    Returns a tensor of the same shape as `score` with the bonus applied.

    Raises:
        AssertionError: if num_tokens is not divisible by grid_size ** 2.
        ValueError: if num_tokens is not a perfect square, or the image side
            is not divisible by grid_size.
    '''
    batch_size, num_tokens = score.shape
    image_size = round(num_tokens ** 0.5)
    block_size = grid_size * grid_size

    assert num_tokens % block_size == 0, "The number of tokens must be divisible by the block size."
    # The divisibility check alone lets through layouts that cannot be tiled
    # (e.g. 8 tokens with grid_size 2); fail early with a clear message
    # instead of an opaque view() error.
    if image_size * image_size != num_tokens:
        raise ValueError(f"The number of tokens ({num_tokens}) must be a perfect square.")
    if image_size % grid_size != 0:
        raise ValueError(f"The image side ({image_size}) must be divisible by grid_size ({grid_size}).")

    blocks_per_side = image_size // grid_size

    # Step 1: Reshape score to group it by blocks
    score_reshaped = score.view(batch_size, blocks_per_side, grid_size, blocks_per_side, grid_size)
    score_reshaped = score_reshaped.permute(0, 1, 3, 2, 4).contiguous()
    score_reshaped = score_reshaped.view(batch_size, -1, block_size)  # [batch_size, num_blocks, block_size]

    # Step 2: Find the max token in each block
    max_scores, max_indices = score_reshaped.max(dim=-1, keepdim=True)  # [batch_size, num_blocks, 1]

    # Step 3: Create a mask to identify max score tokens
    mask = torch.zeros_like(score_reshaped)
    mask.scatter_(-1, max_indices, 1)  # Set mask to 1 at the max indices

    # Step 4: Apply the bonus only to the max score tokens
    score_reshaped = score_reshaped + (mask * max_scores * bonus_ratio)

    # Step 5: Reshape the score back to its original (row-major image) layout
    score_modified = score_reshaped.view(batch_size, blocks_per_side, blocks_per_side, grid_size, grid_size)
    score_modified = score_modified.permute(0, 1, 3, 2, 4).contiguous()
    score_modified = score_modified.view(batch_size, num_tokens)

    return score_modified

================================================
FILE: flux-ToCa/src/flux/modules/cache_functions/cache_init.py
================================================
def cache_init(timesteps, model_kwargs=None, num_double_layers=19, num_single_layers=38, mode='ToCa'):
    '''
    Initialization for cache.

    timesteps: sequence of sampling timesteps (at least two entries); the
        second-to-last one is recorded as current['final_time'].
    model_kwargs: accepted for interface compatibility; currently unused.
    num_double_layers: number of double-stream layers to allocate slots for.
    num_single_layers: number of single-stream layers to allocate slots for.
    mode: 'ToCa' for attention-score based token caching, or 'original' for
        the configuration with fresh_ratio 0.0.

    Returns (cache_dic, current): the cache dictionary with empty per-layer
    containers plus the caching hyper-parameters, and the progress dictionary.

    Raises ValueError for an unrecognized mode.
    '''
    if mode not in ('original', 'ToCa'):
        raise ValueError(f"unrecognized cache mode {mode!r}")

    streams = ('double_stream', 'single_stream')
    cache = {-1: {stream: {} for stream in streams}}
    cache_index = {-1: {}, 'layer_index': {}}

    cache_dic = {}
    for stat in ('attn_map', 'k-norm', 'v-norm'):
        cache_dic[stat] = {-1: {stream: {} for stream in streams}}
    cache_dic['cross_attn_map'] = {-1: {}}
    cache_dic['cache_counter'] = 0

    for j in range(num_double_layers):
        cache[-1]['double_stream'][j] = {}
        cache_index[-1][j] = {}
        cache_dic['attn_map'][-1]['double_stream'][j] = {'total': {}, 'txt_mlp': {}, 'img_mlp': {}}
        cache_dic['k-norm'][-1]['double_stream'][j] = {'txt_mlp': {}, 'img_mlp': {}}
        cache_dic['v-norm'][-1]['double_stream'][j] = {'txt_mlp': {}, 'img_mlp': {}}

    for j in range(num_single_layers):
        cache[-1]['single_stream'][j] = {}
        # The index table is keyed by layer number only, so both streams
        # share the entries for overlapping layer numbers.
        cache_index[-1][j] = {}
        cache_dic['attn_map'][-1]['single_stream'][j] = {'total': {}}
        cache_dic['k-norm'][-1]['single_stream'][j] = {'total': {}}
        cache_dic['v-norm'][-1]['single_stream'][j] = {'total': {}}

    if mode == 'original':
        cache_dic['cache_type'] = 'random'              # model_kwargs['cache_type'] # no use
        cache_dic['cache_index'] = cache_index
        cache_dic['cache'] = cache
        cache_dic['fresh_ratio_schedule'] = 'ToCa'      # model_kwargs['ratio_scheduler']
        cache_dic['fresh_ratio'] = 0.0                  # model_kwargs['fresh_ratio']
        cache_dic['fresh_threshold'] = 1                # model_kwargs['fresh_threshold']
        cache_dic['force_fresh'] = 'global'             # model_kwargs['force_fresh']
        cache_dic['soft_fresh_weight'] = 0.0            # model_kwargs['soft_fresh_weight']
    else:
        # 'ToCa': use the self-attention weights to evaluate the importance of each token.
        cache_dic['cache_type'] = 'attention'
        cache_dic['cache_index'] = cache_index
        cache_dic['cache'] = cache
        cache_dic['fresh_ratio_schedule'] = 'ToCa'
        cache_dic['fresh_ratio'] = 0.1
        cache_dic['fresh_threshold'] = 4
        cache_dic['force_fresh'] = 'global'
        cache_dic['soft_fresh_weight'] = 0.25

    current = {'final_time': timesteps[-2]}
    return cache_dic, current


================================================
FILE: flux-ToCa/src/flux/modules/cache_functions/cal_type.py
================================================
from .force_scheduler import force_scheduler

def cal_type(cache_dic, current):
    '''
    Decide whether the current step is a full computation or a cached ('ToCa') one.

    Writes the decision to current['type'] and maintains
    cache_dic['cache_counter']; on a full step the counter is reset and the
    force scheduler refreshes cache_dic['cal_threshold'].
    '''
    # FORA (fresh_ratio == 0.0) only forces step 0; ToCa forces the first three steps.
    warmup = (current['step'] == 0) if cache_dic['fresh_ratio'] == 0.0 else (current['step'] <= 2)

    # Looked up as in the original flow; the value is not used in this function.
    _force_mode = cache_dic['force_fresh']

    # During warm-up 'cal_threshold' may not have been scheduled yet, so fall
    # back to the configured threshold.
    interval = cache_dic['fresh_threshold'] if warmup else cache_dic['cal_threshold']

    if not warmup and not (cache_dic['cache_counter'] == interval - 1):
        # ToCa: partial computation, one more step since the last full one.
        cache_dic['cache_counter'] += 1
        current['type'] = 'ToCa'
        return

    current['type'] = 'full'
    cache_dic['cache_counter'] = 0
    force_scheduler(cache_dic, current)

######################################################################
    #if (current['step'] in [3,2,1,0]):
    #    current['type'] = 'full'

================================================
FILE: flux-ToCa/src/flux/modules/cache_functions/force_init.py
================================================
import torch

def force_init(cache_dic, current, tokens):
    '''
    Reset the per-token cache counters for the current layer and module.

    Stores a zero int tensor shaped like the first two dims of `tokens`, on
    the same device, under cache_dic['cache_index'][-1][layer][module].
    '''
    batch_size, num_tokens = tokens.shape[0], tokens.shape[1]
    counters = torch.zeros(batch_size, num_tokens, dtype=torch.int, device=tokens.device)
    layer_slot = cache_dic['cache_index'][-1][current['layer']]
    layer_slot[current['module']] = counters

    #if current['layer'] == 0:
    #    cache_dic['cache_index']['layer_index'][current['module']] = torch.zeros(tokens.shape[0], tokens.shape[1], dtype=torch.int, device=tokens.device)

================================================
FILE: flux-ToCa/src/flux/modules/cache_functions/force_scheduler.py
================================================
import torch
def force_scheduler(cache_dic, current):
    '''
    Compute the forced-refresh interval for the current step and store it
    (as a tensor) in cache_dic['cal_threshold'].
    '''
    # Both FORA (fresh_ratio == 0) and TokenCache currently use a zero linear
    # weight; the branch is kept as a tuning hook.
    linear_step_weight = 0.0 if cache_dic['fresh_ratio'] == 0 else 0.0

    progress_factor = 1 - linear_step_weight + 2 * linear_step_weight * current['step'] / current['num_steps']
    step_factor = torch.tensor(progress_factor)

    # no force constrain for sensitive steps, cause the performance is good enough.
    # you may have a try.
    cache_dic['cal_threshold'] = torch.round(cache_dic['fresh_threshold'] / step_factor)
    #return threshold

================================================
FILE: flux-ToCa/src/flux/modules/cache_functions/fresh_ratio_scheduler.py
================================================
import torch
def fresh_ratio_scheduler(cache_dic, current):
    '''
    Return the fresh ratio for the current step.

    The base ratio cache_dic['fresh_ratio'] is modulated according to
    cache_dic['fresh_ratio_schedule']: 'constant', 'linear', 'exp',
    'linear-mode', 'layerwise', 'linear-layerwise' or 'ToCa'.
    Any other schedule name raises ValueError.
    '''
    base_ratio = cache_dic['fresh_ratio']
    schedule = cache_dic['fresh_ratio_schedule']
    step = current['step']
    num_steps = current['num_steps']
    threshold = cache_dic['fresh_threshold']
    weight = 0.9

    if schedule == 'constant':
        return base_ratio

    if schedule == 'linear':
        return base_ratio * (1 + weight - 2 * weight * step / num_steps)

    if schedule == 'exp':
        return base_ratio * (weight ** (step / num_steps))

    if schedule == 'linear-mode':
        # Position of the step inside the refresh period, centred around zero.
        mode = (step % threshold)/threshold - 0.5
        mode_weight = 0.1
        return base_ratio * (1 + weight - 2 * weight * step / num_steps + mode_weight * mode)

    if schedule == 'layerwise':
        return base_ratio * (1 + weight - 2 * weight * current['layer'] / 27)

    if schedule == 'linear-layerwise':
        step_weight = -0.9
        step_factor = 1 - step_weight + 2 * step_weight * step / num_steps

        layer_weight = 0.6
        layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27

        module_weight = 1.0
        module_time_weight = 0.6
        if current['module']=='cross-attn':
            module_factor = (1 - (1-module_time_weight) * module_weight)
        else:
            module_factor = (1 + module_time_weight * module_weight)

        return base_ratio * layer_factor * step_factor * module_factor

    if schedule == 'ToCa':
        step_weight = 0.0
        step_factor = 1 - step_weight + 2 * step_weight * step / num_steps

        layer_weight = 0.5
        layer_factor = 1 + layer_weight - 2 * layer_weight * current['layer'] / 27

        # Double-stream blocks get a smaller fresh ratio, single-stream blocks a larger one.
        stream_weight = 0.6
        if current['stream']=='double_stream':
            stream_factor = (1 - stream_weight)
        else:
            stream_factor = (1 + stream_weight)
        return base_ratio * layer_factor * step_factor * stream_factor

    raise ValueError("unrecognized fresh ratio schedule", schedule)


================================================
FILE: flux-ToCa/src/flux/modules/cache_functions/global_force_fresh.py
================================================
from .force_scheduler import force_scheduler
def global_force_fresh(cache_dic, current):
    '''
    Return whether every token must be recomputed at the current step.

    Step 0 always forces a refresh. With the 'global' strategy, later steps
    also refresh whenever the step index is a multiple of
    cache_dic['cal_threshold']. The 'local' and 'none' strategies refresh on
    step 0 only. Any other strategy raises ValueError.
    '''
    step = current['step']
    is_first_step = step == 0
    strategy = cache_dic['force_fresh']

    # The refresh period is looked up for every strategy (so a missing key
    # still surfaces), although only the 'global' strategy consumes it.
    if is_first_step:
        period = cache_dic['fresh_threshold']
    else:
        period = cache_dic['cal_threshold']

    if strategy in ('local', 'none'):
        return is_first_step
    if strategy == 'global':
        return is_first_step or step % period == 0
    raise ValueError("unrecognized force fresh strategy", strategy)

================================================
FILE: flux-ToCa/src/flux/modules/cache_functions/score_evaluate.py
================================================
import torch
import torch.nn as nn
from .scores import attn_score, similarity_score, norm_score, k_norm_score, v_norm_score
def score_evaluate(cache_dic, tokens, current) -> torch.Tensor:
    '''
    Return the score tensor (B, N) for the given tokens.

    The scoring rule is chosen by cache_dic['cache_type']:
    'random', 'straight' (all ones), 'attention', 'similarity', 'norm',
    'k-norm', 'v-norm' or 'compress'. When cache_dic['force_fresh'] is
    'global', a staleness bonus proportional to the per-token cache index is
    added on top of the base score.

    Raises ValueError for an unrecognized cache type (previously this fell
    through and failed later with an unbound `score`).
    '''

    #if ((not current['is_force_fresh']) and (cache_dic['force_fresh'] == 'local')):
    #    # abandoned branch, if you want to explore the local force fresh strategy, this may help.
    #    force_fresh_mask = torch.as_tensor((cache_dic['cache_index'][-1][current['layer']][current['module']] >= 2 * cache_dic['fresh_threshold']), dtype = int) # 2 because the threshold is for step, not module
    #    force_len = force_fresh_mask.sum(dim=1)
    #    force_indices = force_fresh_mask.argsort(dim = -1, descending = True)[:, :force_len.min()]
    #    force_indices = force_indices[:, torch.randperm(force_indices.shape[1])]

    # Just see more explanation in the version of DiT-ToCa if needed.

    cache_type = cache_dic['cache_type']
    batch_size, num_tokens = tokens.shape[0], tokens.shape[1]

    if cache_type == 'random':
        score = torch.rand(batch_size, num_tokens, device=tokens.device)

    elif cache_type == 'straight':
        # Allocate directly on the target device instead of CPU + copy.
        score = torch.ones(batch_size, num_tokens, device=tokens.device)

    elif cache_type == 'attention':
        # cache_dic['attn_map'][step][layer] (B, N, N), the last dimension has been softmaxed
        score = attn_score(cache_dic, current)
        #score = score + 0.0 * torch.rand_like(score, device= score.device)

    elif cache_type == 'similarity':
        score = similarity_score(cache_dic, current, tokens)

    elif cache_type == 'norm':
        score = norm_score(cache_dic, current, tokens)

    elif cache_type == 'k-norm':
        score = k_norm_score(cache_dic, current)

    elif cache_type == 'v-norm':
        score = v_norm_score(cache_dic, current)

    elif cache_type == 'compress':
        # Same random scores for both halves of the batch, blended with a
        # normalized attention column-sum.
        # NOTE(review): the attn_map lookup here has no stream level, unlike
        # attn_score in scores.py -- confirm this branch is still supported.
        score1 = torch.rand(int(tokens.shape[0]*0.5), tokens.shape[1])
        score1 = torch.cat([score1, score1], dim=0).to(tokens.device)
        score2 = cache_dic['attn_map'][-1][current['layer']].sum(dim=1)#.mean(dim=0) # (B, N)
        # normalize
        score2 = score2 / score2.max(dim=1, keepdim=True)[0]
        score = 0.5 * score1 + 0.5 * score2

    else:
        raise ValueError("unrecognized cache type", cache_type)

    # abandoned the branch, if you want to explore the local force fresh strategy, this may help.
    #if ((not current['is_force_fresh']) and (cache_dic['force_fresh'] == 'local')): # current['is_force_fresh'] is False, cause when it is True, no cut and fresh are needed
    #        #print(torch.ones_like(force_indices, dtype=float, device=force_indices.device).dtype)
    #    score.scatter_(dim=1, index=force_indices, src=torch.ones_like(force_indices, dtype=torch.float32,
    #                                                                       device=force_indices.device))

    if cache_dic['force_fresh'] == 'global':
        # Soft staleness bonus: cache index divided by the refresh threshold,
        # weighted by cache_dic['soft_fresh_weight'].
        soft_step_score = cache_dic['cache_index'][-1][current['layer']][current['module']].float() / (cache_dic['fresh_threshold'])
        #soft_layer_score = cache_dic['cache_index']['layer_index'][current['module']].float() / (27)
        score = score + cache_dic['soft_fresh_weight'] * soft_step_score #+ 0.1 *soft_layer_score

    return score.to(tokens.device)

================================================
FILE: flux-ToCa/src/flux/modules/cache_functions/scores.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

def attn_score(cache_dic, current):
    """
    Return the L2-normalized (over the last dim) attention score for the
    current stream/layer.

    For 'double_stream' the map stored under current['module'] is used; for
    'single_stream' the map stored under 'total' is used.

    Raises ValueError for any other stream (previously this fell through and
    failed with an unbound `score`).
    """
    #self_attn_score = 1- cache_dic['attn_map'][-1][current['layer']].diagonal(dim1=1, dim2=2)
    #self_attn_score = F.normalize(self_attn_score, dim=1, p=2)
    #attention_score = F.normalize(cache_dic['attn_map'][-1][current['layer']].sum(dim=1), dim=1, p=2)
    #cross_attn_map = F.threshold(cache_dic['cross_attn_map'][-1][current['layer']],threshold=0.0, value=0.0)
    #cross_attention_score = F.normalize(cross_attn_map.sum(dim=-1), dim=-1, p=2)

    # Note: It is important to use the same selection method for cfg and no cfg,
    # because the influence of **Cross-Attention** in text-conditional models makes cfg and no cfg a BIG difference.

    # Same selection for cfg and no cfg
    #cond_cmap, uncond_cmap = torch.split(cache_dic['attn_map'][-1][current['layer']], len(cache_dic['cross_attn_map'][-1][current['layer']]) // 2, dim=0)
    #cond_weight = 0.5
    #cmap = cond_weight * cond_cmap + (1 - cond_weight) * uncond_cmap

    ## Entropy score
    #cross_attention_entropy = -torch.sum(cmap * torch.log(cmap + 1e-7), dim=-1)
    #cross_attention_score   = F.normalize(1 + cross_attention_entropy, dim=1, p=2) # Note here "1" does not influence the sorted sequence, but provides stability.
    #score = cross_attention_score.repeat(2, 1)
    stream = current['stream']
    if stream == 'double_stream':
        map_key = current['module']
    elif stream == 'single_stream':
        map_key = 'total'
    else:
        raise ValueError("unrecognized stream", stream)

    score = F.normalize(cache_dic['attn_map'][-1][stream][current['layer']][map_key], dim=-1, p=2)

    # You can try combining the self_attention_score (s1) and cross_attention_score (s2) as the final score, there exists a balance.
    #cross_weight = 0.0
    #score =  (1-cross_weight) * attention_score + cross_weight * cross_attention_score
    return score

def similarity_score(cache_dic, current, tokens):
    """
    Score tokens by how much they differ from their cached counterpart.

    Returns the L2-normalized (over the last dim) value of
    1 - cosine_similarity(tokens, cached_tokens).

    The cached tensor is read from
    cache_dic['cache'][-1][stream][layer][module], the same layout that
    update_cache writes to; the lookup previously skipped the stream level.
    """
    cached_tokens = cache_dic['cache'][-1][current['stream']][current['layer']][current['module']]
    cosine_sim = F.cosine_similarity(tokens, cached_tokens, dim=-1)

    return F.normalize(1 - cosine_sim, dim=-1, p=2)

def norm_score(cache_dic, current, tokens):
    """
    Score each token by its L2 norm over the last dim, then L2-normalize the
    resulting scores along the last remaining dim.

    cache_dic and current are accepted for signature parity with the other
    score functions and are not read.
    """
    token_norms = torch.norm(tokens, p=2, dim=-1)
    score = F.normalize(token_norms, p=2, dim=-1)
    return score

def kv_norm_score(cache_dic, current):
    """
    Score tokens from the cached 'v_norm' of the current layer.

    The cached tensor is split into two halves along dim 0, averaged with
    equal weight, inverted (1 - value), summed over the last dim,
    L2-normalized and finally repeated twice along dim 0.
    """
    # (B, N, num_heads) according to the original note -- TODO confirm
    cached_v_norm = cache_dic['cache'][-1][current['layer']]['v_norm']
    half_batch = len(cached_v_norm) // 2
    #cond_k_norm, uncond_k_norm = torch.split(cache_dic['cache'][-1][current['layer']]['k_norm'], len(cache_dic['cache'][-1][current['layer']]['k_norm']) // 2, dim=0)
    cond_v_norm, uncond_v_norm = torch.split(cached_v_norm, half_batch, dim=0)

    cond_weight = 0.5
    #k_norm = cond_weight * cond_k_norm + (1 - cond_weight) * uncond_k_norm
    blended_v_norm = cond_weight * cond_v_norm + (1 - cond_weight) * uncond_v_norm
    inverted = 1 - blended_v_norm

    ## Compute, for the (B/2, N) tensor, the absolute difference between each element and the mean along N
    #kv_norm_mean = kv_norm.mean(dim=-2, keepdim=True)
    #kv_norm_diff = torch.abs(kv_norm - kv_norm_mean)

    per_token = inverted.sum(dim=-1)
    normalized = F.normalize(per_token, p=2)
    return normalized.repeat(2, 1)

def k_norm_score(cache_dic, current):
    """
    Return the L2-normalized (over the last dim) key-norm score, noted as
    (B, N) in the original code.

    For 'double_stream' the tensor stored under current['module'] is used;
    for 'single_stream' the tensor stored under 'total' is used.

    Raises ValueError for any other stream (previously this fell through and
    failed with an unbound `score`).
    """
    stream = current['stream']
    if stream == 'double_stream':
        norm_key = current['module']
    elif stream == 'single_stream':
        norm_key = 'total'
    else:
        raise ValueError("unrecognized stream", stream)

    return F.normalize(cache_dic['k-norm'][-1][stream][current['layer']][norm_key], dim=-1, p=2)

def v_norm_score(cache_dic, current):
    """
    Return the L2-normalized (over the last dim) value-norm score, noted as
    (B, N) in the original code.

    For 'double_stream' the tensor stored under current['module'] is used;
    for 'single_stream' the tensor stored under 'total' is used.

    Raises ValueError for any other stream (previously this fell through and
    failed with an unbound `score`).
    """
    stream = current['stream']
    if stream == 'double_stream':
        norm_key = current['module']
    elif stream == 'single_stream':
        norm_key = 'total'
    else:
        raise ValueError("unrecognized stream", stream)

    return F.normalize(cache_dic['v-norm'][-1][stream][current['layer']][norm_key], dim=-1, p=2)


================================================
FILE: flux-ToCa/src/flux/modules/cache_functions/support_set_selection.py
================================================
import torch
from typing import Dict

def support_set_selection(x: torch.Tensor, fresh_ratio: float, base_ratio: float, current: Dict, cache_dic: Dict) -> torch.Tensor:
    """
    Pick a set of token indices to refresh: a random base set plus the tokens
    least similar to that base set.

    Parameters:
      - x: token tensor of shape (B, N, H).
      - fresh_ratio: fraction of the N tokens to select per batch element.
      - base_ratio: fraction of the selected tokens that are drawn at random;
        the same random positions are used for every batch element.
      - current, cache_dic: accepted for interface parity; not read here.

    Returns:
      A (B, int(fresh_ratio * N)) long tensor of token indices. The first
      base_count columns are the random base indices, the remainder are the
      additional picks.

    Compared to the earlier version, the base tokens are excluded from the
    additional picks (so every row holds distinct indices), degenerate counts
    no longer crash, and zero-norm tokens no longer produce NaN similarities.
    """
    #selection_start = 0
    #
    #if current['stream'] == 'single_stream':
    #    # only select from the img tokens
    #    x = x[:, cache_dic['txt_shape'] :]
    #    selection_start = cache_dic['txt_shape']

    B, N, _ = x.shape
    num_total = min(int(fresh_ratio * N), N)  # tokens selected per batch element
    if num_total <= 0:
        return torch.empty(B, 0, dtype=torch.long, device=x.device)

    # At least one base token is required to measure similarity against, and
    # the base set can never exceed the requested total.
    base_count = min(max(int(base_ratio * num_total), 1), num_total)
    add_count = num_total - base_count  # tokens still to be picked from the candidates

    # 1. Randomly choose base_count positions (shared across the batch).
    random_indices = torch.randperm(N, device=x.device)
    base_indices = random_indices[:base_count]

    # 2. Cosine similarity between every token and every base token.
    #    F.normalize clamps the norm, so all-zero tokens do not produce NaN.
    x_norm = torch.nn.functional.normalize(x, p=2, dim=-1)
    base_tokens = x_norm[:, base_indices]  # (B, base_count, H)
    similarity = torch.einsum('bnd,bmd->bnm', base_tokens, x_norm)  # (B, base_count, N)

    # For each token, its lowest similarity to any base token.
    min_similarity = similarity.min(dim=1).values  # (B, N)
    #min_similarity = similarity.max(dim=1).values

    # Base tokens are already selected; mask them out so they cannot be
    # picked a second time (add_count <= N - base_count always holds).
    min_similarity = min_similarity.index_fill(1, base_indices, float('inf'))

    # 3. Pick the tokens with the smallest such similarity.
    _, min_indices = min_similarity.topk(add_count, largest=False)
    #_, min_indices = min_similarity.topk(add_count, largest=True)

    # 4. Concatenate the base indices and the additional picks.
    indices = torch.cat([base_indices.expand(B, -1), min_indices], dim=-1) #+ selection_start

    return indices


================================================
FILE: flux-ToCa/src/flux/modules/cache_functions/token_merge.py
================================================
import torch
def token_merge(cache_dic, tokens, current, fresh_indices, stale_indices):
    '''
    An abandoned branch in exploring if token merge helps. The answer is no, at least no for training-free strategy.

    Matches every stale token to its closest fresh token and records, in
    cache_dic['merged_stale_sequence'] / cache_dic['merged_stale_fresh_indices'],
    the stale positions whose best match scores above 0.995 together with the
    fresh positions they map to. Returns None.
    '''
    if current['layer'] % 1 != 0:
        return

    hidden = tokens.shape[-1]
    fresh_tokens = tokens.gather(1, fresh_indices.unsqueeze(-1).expand(-1, -1, hidden))
    stale_tokens = tokens.gather(1, stale_indices.unsqueeze(-1).expand(-1, -1, hidden))

    # Hard-wired switch kept from the original experiment.
    method = 'similarity'
    if method == 'similarity':
        descending = True
        fresh_tokens = torch.nn.functional.normalize(fresh_tokens, p=2, dim=-1)
        stale_tokens = torch.nn.functional.normalize(stale_tokens, p=2, dim=-1)
        pairwise = torch.matmul(stale_tokens, fresh_tokens.transpose(1, 2))
        best = torch.max(pairwise, dim=2)
    elif method == 'distance':
        descending = False
        pairwise = torch.cdist(stale_tokens, fresh_tokens, p=1)
        best = torch.min(pairwise, dim=2)
    stale_fresh_dist, stale_fresh_indices_allstale = best

    # Number of stale tokens kept: the smallest per-row count above the threshold.
    above_threshold = (stale_fresh_dist > 0.995).sum(dim=1)
    saved_topk_stale = int(above_threshold.min())

    order = stale_fresh_dist.sort(dim=1, descending=descending).indices
    kept = order[:, :saved_topk_stale]

    matched_fresh_local = stale_fresh_indices_allstale.gather(1, kept)
    cache_dic['merged_stale_fresh_indices'] = fresh_indices.gather(1, matched_fresh_local)
    cache_dic['merged_stale_sequence'] = stale_indices.gather(1, kept)


================================================
FILE: flux-ToCa/src/flux/modules/cache_functions/update_cache.py
================================================
import torch
def update_cache(fresh_indices, fresh_tokens, cache_dic, current, fresh_attn_map=None):
    '''
    Write freshly computed tokens back into the cache, in place.

    The target tensor is cache_dic['cache'][-1][stream][layer][module] for the
    stream / layer / module named in `current`; rows of `fresh_tokens` are
    scattered along dim 1 at the positions given by `fresh_indices`.
    `fresh_attn_map` is accepted but not used.
    '''
    stream, layer, module = current['stream'], current['layer'], current['module']
    cached = cache_dic['cache'][-1][stream][layer][module]

    # Broadcast each token index across the hidden dimension so scatter_
    # overwrites whole token vectors.
    hidden = fresh_tokens.shape[-1]
    scatter_index = fresh_indices.unsqueeze(-1).expand(-1, -1, hidden)
    cached.scatter_(dim=1, index=scatter_index, src=fresh_tokens)
    
    
================================================
FILE: flux-ToCa/src/flux/modules/conditioner.py
================================================
from torch import Tensor, nn
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5Tokenizer


class HFEmbedder(nn.Module):
    """
    Frozen Hugging Face text encoder wrapper.

    A `version` containing "openai" selects the CLIP tokenizer/text model and
    returns its "pooler_output"; any other version selects the T5
    tokenizer/encoder and returns its "last_hidden_state".
    """

    def __init__(self, version: str, max_length: int, **hf_kwargs):
        super().__init__()
        self.is_clip = "openai" in version
        self.max_length = max_length

        if self.is_clip:
            self.output_key = "pooler_output"
            tokenizer_cls, encoder_cls = CLIPTokenizer, CLIPTextModel
        else:
            self.output_key = "last_hidden_state"
            tokenizer_cls, encoder_cls = T5Tokenizer, T5EncoderModel

        self.tokenizer: CLIPTokenizer | T5Tokenizer = tokenizer_cls.from_pretrained(version, max_length=max_length)
        encoder = encoder_cls.from_pretrained(version, **hf_kwargs)
        # Inference only: eval mode, no gradients.
        self.hf_module: CLIPTextModel | T5EncoderModel = encoder.eval().requires_grad_(False)

    def forward(self, text: list[str]) -> Tensor:
        # Every prompt is truncated / padded to exactly max_length tokens.
        tokenized = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_length=False,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        )
        input_ids = tokenized["input_ids"].to(self.hf_module.device)

        encoded = self.hf_module(
            input_ids=input_ids,
            attention_mask=None,
            output_hidden_states=False,
        )
        return encoded[self.output_key]


================================================
FILE: flux-ToCa/src/flux/modules/image_embedders.py
================================================
import os

import cv2
import numpy as np
import torch
from einops import rearrange, repeat
from PIL import Image
from safetensors.torch import load_file as load_sft
from torch import nn
from transformers import AutoModelForDepthEstimation, AutoProcessor, SiglipImageProcessor, SiglipVisionModel

from flux.util import print_load_warning


class DepthImageEncoder:
    """
    Turn an image tensor into a 3-channel depth map using the
    Depth-Anything model named by `depth_model_name`.
    """

    depth_model_name = "LiheYoung/depth-anything-large-hf"

    def __init__(self, device):
        self.device = device
        model = AutoModelForDepthEstimation.from_pretrained(self.depth_model_name)
        self.depth_model = model.to(device)
        self.processor = AutoProcessor.from_pretrained(self.depth_model_name)

    def __call__(self, img: torch.Tensor) -> torch.Tensor:
        # Remember the input resolution so the prediction can be resized back.
        target_hw = img.shape[-2:]

        # Map the input, clamped to [-1, 1], onto the 0..255 byte range.
        clamped = torch.clamp(img, -1.0, 1.0)
        as_bytes = ((clamped + 1.0) * 127.5).byte()

        pixel_values = self.processor(as_bytes, return_tensors="pt")["pixel_values"]
        predicted = self.depth_model(pixel_values.to(self.device)).predicted_depth

        # Replicate the single depth channel three times and resize.
        depth_rgb = repeat(predicted, "b h w -> b 3 h w")
        resized = torch.nn.functional.interpolate(depth_rgb, target_hw, mode="bicubic", antialias=True)

        return resized / 127.5 - 1.0


class CannyImageEncoder:
    """
    Turn a single image tensor into a 3-channel Canny edge map.

    Parameters:
      - device: device the resulting edge map is moved to.
      - min_t / max_t: the two hysteresis thresholds passed to cv2.Canny.
    """

    def __init__(
        self,
        device,
        min_t: int = 50,
        max_t: int = 200,
    ):
        self.device = device
        self.min_t = min_t
        self.max_t = max_t

    def __call__(self, img: torch.Tensor) -> torch.Tensor:
        """
        Return a (1, 3, H, W) edge map in [-1, 1] for a (1, C, H, W) image in [-1, 1].

        The input may live on any device and may track gradients; it is
        detached and copied to the CPU before the NumPy/OpenCV step.
        """
        assert img.shape[0] == 1, "Only batch size 1 is supported"

        img = rearrange(img[0], "c h w -> h w c")
        img = torch.clamp(img, -1.0, 1.0)
        # Tensor.numpy() only works on CPU tensors that do not require grad,
        # so detach and move to the CPU first (both are no-ops when not needed).
        img_np = ((img + 1.0) * 127.5).detach().cpu().numpy().astype(np.uint8)

        # Apply Canny edge detection
        canny = cv2.Canny(img_np, self.min_t, self.max_t)

        # Convert back to torch tensor and reshape
        canny = torch.from_numpy(canny).float() / 127.5 - 1.0
        canny = rearrange(canny, "h w -> 1 1 h w")
        canny = repeat(canny, "b 1 ... -> b 3 ...")
        return canny.to(self.device)


class ReduxImageEncoder(nn.Module):
    """
    Encode a PIL image with SigLIP and project the features through the two
    Redux linear layers (redux_up -> SiLU -> redux_down).

    The Redux weights are loaded from the safetensors file at `redux_path`.
    """
    siglip_model_name = "google/siglip-so400m-patch14-384"

    def __init__(
        self,
        device,
        redux_dim: int = 1152,
        txt_in_features: int = 4096,
        # NOTE: this default is read from the FLUX_REDUX environment variable
        # once, when the function is defined, not on every call.
        redux_path: str | None = os.getenv("FLUX_REDUX"),
        dtype=torch.bfloat16,
    ) -> None:
        assert redux_path is not None, "Redux path must be provided"

        super().__init__()

        self.redux_dim = redux_dim
        # Accept either a torch.device or anything torch.device() can parse.
        self.device = device if isinstance(device, torch.device) else torch.device(device)
        self.dtype = dtype

        # torch.device used as a context manager; presumably so the modules
        # below are created on that device -- TODO confirm.
        with self.device:
            self.redux_up = nn.Linear(redux_dim, txt_in_features * 3, dtype=dtype)
            self.redux_down = nn.Linear(txt_in_features * 3, txt_in_features, dtype=dtype)

            # The checkpoint is loaded before `self.siglip` is attached, so at
            # this point only redux_up / redux_down are registered on the module.
            sd = load_sft(redux_path, device=str(device))
            missing, unexpected = self.load_state_dict(sd, strict=False, assign=True)
            print_load_warning(missing, unexpected)

            self.siglip = SiglipVisionModel.from_pretrained(self.siglip_model_name).to(dtype=dtype)
        # The image pre-processor is created outside the device context.
        self.normalize = SiglipImageProcessor.from_pretrained(self.siglip_model_name)

    # NOTE: __call__ is overridden directly (rather than forward).
    def __call__(self, x: Image.Image) -> torch.Tensor:
        # Pre-process a single image (resize + RGB conversion) into tensors.
        imgs = self.normalize.preprocess(images=[x], do_resize=True, return_tensors="pt", do_convert_rgb=True)

        # SigLIP features, taken from the last hidden state.
        _encoded_x = self.siglip(**imgs.to(device=self.device, dtype=self.dtype)).last_hidden_state

        # redux_up -> SiLU -> redux_down; the last dim becomes txt_in_features.
        projected_x = self.redux_down(nn.functional.silu(self.redux_up(_encoded_x)))

        return projected_x


================================================
FILE: flux-ToCa/src/flux/modules/layers.py
================================================
import math
from dataclasses import dataclass
from typing import Optional
import torch
from einops import rearrange
from torch import Tensor, nn

from flux.math import attention, rope

from flux.modules.cache_functions import force_init, cache_cutfresh, update_cache

class EmbedND(nn.Module):
    """
    N-dimensional positional embedding: applies `rope` to each axis of the
    position ids and concatenates the per-axis results.
    """

    def __init__(self, dim: int, theta: int, axes_dim: list[int]):
        super().__init__()
        self.dim = dim
        self.theta = theta
        self.axes_dim = axes_dim

    def forward(self, ids: Tensor) -> Tensor:
        # One rope() call per position axis (the last dim of `ids`).
        per_axis = []
        for axis in range(ids.shape[-1]):
            per_axis.append(rope(ids[..., axis], self.axes_dim[axis], self.theta))

        # Join the per-axis embeddings along dim -3, then insert a singleton
        # dim at position 1.
        combined = torch.cat(per_axis, dim=-3)
        return combined.unsqueeze(1)


def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
    """
    Build sinusoidal embeddings for a batch of timesteps.

    :param t: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :param time_factor: multiplier applied to `t` before embedding.
    :return: an (N, D) Tensor of positional embeddings.
    """
    scaled_t = time_factor * t
    half = dim // 2

    # Geometric frequency ladder from 1 down towards 1 / max_period.
    positions = torch.arange(start=0, end=half, dtype=torch.float32)
    freqs = torch.exp(-math.log(max_period) * positions / half)
    freqs = freqs.to(scaled_t.device)

    angles = scaled_t[:, None].float() * freqs[None]
    parts = [torch.cos(angles), torch.sin(angles)]
    embedding = torch.cat(parts, dim=-1)

    if dim % 2 != 0:
        # Odd output size: pad one zero column so the width is exactly `dim`.
        padding = torch.zeros_like(embedding[:, :1])
        embedding = torch.cat([embedding, padding], dim=-1)

    # Match the dtype/device of the (scaled) timesteps when they are floating point.
    return embedding.to(scaled_t) if torch.is_floating_point(scaled_t) else embedding


class MLPEmbedder(nn.Module):
    """Two-layer MLP: Linear(in_dim -> hidden_dim), SiLU, Linear(hidden_dim -> hidden_dim)."""

    def __init__(self, in_dim: int, hidden_dim: int):
        super().__init__()
        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
        self.silu = nn.SiLU()
        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)

    def forward(self, x: Tensor) -> Tensor:
        hidden = self.in_layer(x)
        activated = self.silu(hidden)
        return self.out_layer(activated)


class RMSNorm(torch.nn.Module):
    """Root-mean-square normalization over the last dim with a learnable per-channel scale."""

    def __init__(self, dim: int):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x: Tensor):
        input_dtype = x.dtype
        # Statistics are computed in float32, then cast back to the input dtype.
        x_fp32 = x.float()
        mean_square = x_fp32.pow(2).mean(dim=-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + 1e-6)
        normalized = (x_fp32 * inv_rms).to(dtype=input_dtype)
        return normalized * self.scale


class QKNorm(torch.nn.Module):
    """Apply separate RMSNorm layers to queries and keys, then cast both to match `v`."""

    def __init__(self, dim: int):
        super().__init__()
        self.query_norm = RMSNorm(dim)
        self.key_norm = RMSNorm(dim)

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
        normed_q = self.query_norm(q)
        normed_k = self.key_norm(k)
        # `v` is only used as the dtype/device reference for the outputs.
        return normed_q.to(v), normed_k.to(v)


class SelfAttention(nn.Module):
    """Multi-head self-attention with a fused QKV projection, QK normalization and an output projection."""

    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.norm = QKNorm(head_dim)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x: Tensor, pe: Tensor) -> Tensor:
        # Split the fused projection into per-head q, k, v of shape (B, H, L, D).
        q, k, v = rearrange(self.qkv(x), "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        q, k = self.norm(q, k, v)
        attended = attention(q, k, v, pe=pe)
        return self.proj(attended)


@dataclass
class ModulationOut:
    """One (shift, scale, gate) modulation triple, as built by ``Modulation.forward``."""
    # Added to the normalized activations after scaling: (1 + scale) * x + shift.
    shift: Tensor
    # Multiplicative term, applied as (1 + scale).
    scale: Tensor
    # Multiplies a sub-layer's output before it is added to the residual stream.
    gate: Tensor


class Modulation(nn.Module):
    """
    Project a conditioning vector into one (double=False) or two (double=True)
    shift/scale/gate triples.
    """

    def __init__(self, dim: int, double: bool):
        super().__init__()
        self.is_double = double
        self.multiplier = 6 if double else 3
        self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)

    def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
        projected = self.lin(nn.functional.silu(vec))
        # Insert a singleton dim at position 1, then split the feature dim
        # into `multiplier` equal chunks.
        chunks = projected[:, None, :].chunk(self.multiplier, dim=-1)

        first = ModulationOut(*chunks[:3])
        second = ModulationOut(*chunks[3:]) if self.is_double else None
        return first, second


class DoubleStreamBlock(nn.Module):
    """
    Transformer block with separate image and text streams that share one
    joint attention call, extended with token-cache handling.

    When no `cache_dic` is passed, the block runs the plain computation.
    Otherwise `current['type']` selects the behaviour:
      - 'full':       compute everything and store attention / MLP outputs in the cache
      - 'ToCa':       reuse the cached attention and recompute the MLP for selected tokens only
      - 'FORA':       reuse the cached MLP outputs for all tokens
      - 'aggressive': return the inputs unchanged
    """

    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False):
        super().__init__()

        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        # Image stream: modulation, pre-attention norm, attention, pre-MLP norm, MLP.
        self.img_mod = Modulation(hidden_size, double=True)
        self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)

        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
            nn.GELU(approximate="tanh"),
            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
        )

        # Text stream: same layout as the image stream, with its own weights.
        self.txt_mod = Modulation(hidden_size, double=True)
        self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)

        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
            nn.GELU(approximate="tanh"),
            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
        )

    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, **kwargs) -> tuple[Tensor, Tensor]:
        """
        Run the block and return the updated (img, txt) tensors.

        Optional keyword arguments:
          - cache_dic: cache state dict; when absent the plain path is used.
          - current:   dict describing the current step; this method reads
                       'type' and 'layer' and writes 'stream' and 'module'.

        Raises ValueError when `current['type']` is not one of the known modes.
        """
        
        cache_dic = kwargs.get('cache_dic', None)
        current = kwargs.get('current', None)        
        
        if cache_dic is None:
            # Plain path: no caching involved.
            img_mod1, img_mod2 = self.img_mod(vec)
            txt_mod1, txt_mod2 = self.txt_mod(vec)

            # prepare image for attention
            img_modulated = self.img_norm1(img)
            img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
            img_qkv = self.img_attn.qkv(img_modulated)
            img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
            img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

            # prepare txt for attention
            txt_modulated = self.txt_norm1(txt)
            txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
            txt_qkv = self.txt_attn.qkv(txt_modulated)
            txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
            txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)

            # run actual attention (text tokens first, then image tokens, along the sequence dim)
            q = torch.cat((txt_q, img_q), dim=2)
            k = torch.cat((txt_k, img_k), dim=2)
            v = torch.cat((txt_v, img_v), dim=2)

            attn = attention(q, k, v, pe=pe)
            # Split the joint result back into its text and image parts.
            txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]

            # calculate the img blocks
            img = img + img_mod1.gate * self.img_attn.proj(img_attn)
            img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)

            # calculate the txt blocks
            txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
            txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
        
        else:
            # Cached path: record which stream is active for the cache helpers.
            current['stream'] = 'double_stream'

            if current['type'] == 'full':    
                # Full computation; attention output and both MLP outputs are stored in the cache.
                img_mod1, img_mod2 = self.img_mod(vec)
                txt_mod1, txt_mod2 = self.txt_mod(vec)

                # prepare image for attention
                img_modulated = self.img_norm1(img)
                img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
                img_qkv = self.img_attn.qkv(img_modulated)
                img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
                
                # Per-token key/value norms (L2 over the head dim, averaged over heads),
                # taken before QK normalization and stored under the 'img_mlp' key.
                if cache_dic['cache_type'] == 'k-norm':
                    img_k_norm = img_k.norm(dim=-1, p=2).mean(dim=1)
                    cache_dic['k-norm'][-1][current['stream']][current['layer']]['img_mlp'] = img_k_norm
                elif cache_dic['cache_type'] == 'v-norm':
                    img_v_norm = img_v.norm(dim=-1, p=2).mean(dim=1)
                    cache_dic['v-norm'][-1][current['stream']][current['layer']]['img_mlp'] = img_v_norm
                
                img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

                # prepare txt for attention
                txt_modulated = self.txt_norm1(txt)
                txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
                txt_qkv = self.txt_attn.qkv(txt_modulated)
                txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)

                # Same per-token norms for the text stream, stored under 'txt_mlp'.
                if cache_dic['cache_type'] == 'k-norm':
                    txt_k_norm = txt_k.norm(dim=-1, p=2).mean(dim=1)
                    cache_dic['k-norm'][-1][current['stream']][current['layer']]['txt_mlp'] = txt_k_norm
                elif cache_dic['cache_type'] == 'v-norm':
                    txt_v_norm = txt_v.norm(dim=-1, p=2).mean(dim=1)
                    cache_dic['v-norm'][-1][current['stream']][current['layer']]['txt_mlp'] = txt_v_norm
                
                txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)

                # run actual attention
                q = torch.cat((txt_q, img_q), dim=2)
                k = torch.cat((txt_k, img_k), dim=2)
                v = torch.cat((txt_v, img_v), dim=2)

                # cache_dic/current are forwarded to attention(); presumably that is
                # where attn_map[...]['total'] (read below) gets filled -- TODO confirm.
                attn = attention(q, k, v, pe=pe, cache_dic=cache_dic, current=current)
                # Keep the joint attention output for later reuse by the 'ToCa' branch.
                cache_dic['cache'][-1]['double_stream'][current['layer']]['attn'] = attn

                txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
                # Remember the text sequence length.
                cache_dic['txt_shape'] = txt.shape[1]
                
                if cache_dic['cache_type'] == 'attention':
                    # Split the stored 'total' attention score into its text and image parts.
                    cache_dic['attn_map'][-1][current['stream']][current['layer']]['txt_mlp'] = cache_dic['attn_map'][-1][current['stream']][current['layer']]['total'][:, : txt.shape[1]]
                    cache_dic['attn_map'][-1][current['stream']][current['layer']]['img_mlp'] = cache_dic['attn_map'][-1][current['stream']][current['layer']]['total'][:, txt.shape[1] :]

                current['module'] = 'img_mlp'
                # force_init is called with the pre-attention-residual tokens;
                # presumably it resets per-token cache bookkeeping -- TODO confirm.
                force_init(cache_dic=cache_dic, current=current, tokens=img)
                # calculate the img blocks
                img = img + img_mod1.gate * self.img_attn.proj(img_attn)
                # The ungated MLP output is what gets cached; the gate is applied on use.
                cache_dic['cache'][-1]['double_stream'][current['layer']]['img_mlp'] = self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
                img = img + img_mod2.gate * cache_dic['cache'][-1]['double_stream'][current['layer']]['img_mlp']

                current['module'] = 'txt_mlp'
                force_init(cache_dic=cache_dic, current=current, tokens=txt)
                # calculate the txt blocks
                txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
                cache_dic['cache'][-1]['double_stream'][current['layer']]['txt_mlp'] = self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
                txt = txt + txt_mod2.gate * cache_dic['cache'][-1]['double_stream'][current['layer']]['txt_mlp']

            elif current['type'] == 'ToCa':
                # Partial computation: the cached attention output is reused and the
                # MLP is recomputed only for the tokens returned by cache_cutfresh.
                img_mod1, img_mod2 = self.img_mod(vec)
                txt_mod1, txt_mod2 = self.txt_mod(vec)

                attn = cache_dic['cache'][-1]['double_stream'][current['layer']]['attn']
                txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]

                current['module'] = 'img_mlp'
                # calculate the img blocks
                img = img + img_mod1.gate * self.img_attn.proj(img_attn)
                fresh_indices, fresh_tokens_img = cache_cutfresh(cache_dic=cache_dic, tokens=img, current=current)
                fresh_tokens_img = self.img_mlp((1 + img_mod2.scale) * self.img_norm2(fresh_tokens_img) + img_mod2.shift)
                # Write the recomputed tokens back into the cached MLP output.
                update_cache(fresh_indices=fresh_indices, fresh_tokens=fresh_tokens_img, cache_dic=cache_dic, current=current)
                # NOTE(review): the next statement is a bare expression with no effect.
                cache_dic['cache'][-1]['double_stream'][current['layer']]['img_mlp']
                img = img + img_mod2.gate * cache_dic['cache'][-1]['double_stream'][current['layer']]['img_mlp']

                current['module'] = 'txt_mlp'
                # calculate the txt blocks
                txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
                fresh_indices, fresh_tokens_txt = cache_cutfresh(cache_dic=cache_dic, tokens=txt, current=current)
                fresh_tokens_txt = self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(fresh_tokens_txt) + txt_mod2.shift)
                update_cache(fresh_indices=fresh_indices, fresh_tokens=fresh_tokens_txt, cache_dic=cache_dic, current=current)
                txt = txt + txt_mod2.gate * cache_dic['cache'][-1]['double_stream'][current['layer']]['txt_mlp']
            
            elif current['type'] == 'FORA':
                # Reuse the cached MLP outputs for all tokens.
                # NOTE(review): no attention term is added in this branch (img_mod1 /
                # txt_mod1 are unused) -- confirm that this is intended.
                img_mod1, img_mod2 = self.img_mod(vec)
                txt_mod1, txt_mod2 = self.txt_mod(vec)
                img = img + img_mod2.gate * cache_dic['cache'][-1]['double_stream'][current['layer']]['img_mlp']
                txt = txt + txt_mod2.gate * cache_dic['cache'][-1]['double_stream'][current['layer']]['txt_mlp']
            elif current['type'] == 'aggressive':
                # Skip the block entirely: img and txt are returned unchanged.
                current['module'] = 'skipped'
            else:
                raise ValueError("Unknown cache type.")
            
        return img, txt


class SingleStreamBlock(nn.Module):
    """
    A DiT block with parallel linear layers as described in
    https://arxiv.org/abs/2302.05442 and adapted modulation interface.

    When ``forward`` is given a ``cache_dic`` it dispatches on
    ``current['type']``:

      - ``'full'``: run the whole block and store the intermediate ``mlp``,
        ``attn`` and ``total`` tensors in the cache.
      - ``'ToCa'``: recompute only the tokens returned by ``cache_cutfresh``,
        write them back through ``update_cache`` and read the block output
        from the cache.
      - ``'FORA'``: reuse the cached ``total`` output without recomputation.
      - ``'aggressive'``: skip the block and return early.

    Any other value raises ``ValueError``.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qk_scale: float | None = None,
    ):
        super().__init__()
        self.hidden_dim = hidden_size
        self.num_heads = num_heads
        head_dim = hidden_size // num_heads
        # Falls back to 1/sqrt(head_dim) when qk_scale is None (or 0).
        self.scale = qk_scale or head_dim**-0.5

        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
        # qkv and mlp_in
        self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
        # proj and mlp_out
        self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)

        self.norm = QKNorm(head_dim)

        self.hidden_size = hidden_size
        self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)

        self.mlp_act = nn.GELU(approximate="tanh")
        self.modulation = Modulation(hidden_size, double=False)
        # mlp_in: stand-alone copy of the MLP rows of `linear1`; its parameters
        # are overwritten by `load_mlp_in_weights` before use in the ToCa branch.
        self.mlp_in = nn.Linear(hidden_size, self.mlp_hidden_dim)

    def load_mlp_in_weights(self, linear1_weight: torch.Tensor, linear1_bias: Optional[torch.Tensor] = None):
        """
        Split and load the weights of the original `linear1` layer, keeping only the MLP hidden layer part.

        The rows from index ``hidden_size * 3`` onward (i.e. everything after
        the q/k/v rows) are wrapped in new ``nn.Parameter`` objects and
        assigned to ``self.mlp_in``, after being moved to the device of
        ``self.linear1.weight``.

        Parameters:
          - linear1_weight: Tensor, with shape (hidden_size * 3 + mlp_hidden_dim, hidden_size)
          - linear1_bias: Tensor, with shape (hidden_size * 3 + mlp_hidden_dim,) or None.
            When None, ``self.mlp_in.bias`` is left untouched.

        """
        hidden_size = self.hidden_size
        # NOTE(review): `mlp_hidden_dim` is assigned but never used below.
        mlp_hidden_dim = self.mlp_hidden_dim
        device = self.linear1.weight.device  # target device

        self.mlp_in.weight = torch.nn.Parameter(linear1_weight[hidden_size * 3:, :].to(device))

        if linear1_bias is not None:

            self.mlp_in.bias = torch.nn.Parameter(linear1_bias[hidden_size * 3:].to(device))

    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, **kwargs) -> Tensor:
        """
        Apply the block to ``x`` and return ``x + mod.gate * output``.

        Parameters:
          - x: input tokens; presumably (batch, seq_len, hidden_size) given the
            "B L (K H D)" rearrange pattern used below — TODO confirm.
          - vec: conditioning vector fed to ``self.modulation``.
          - pe: positional encoding forwarded to ``attention``.
          - kwargs: optional ``cache_dic`` and ``current`` dictionaries. When
            ``cache_dic`` is absent (None) the block runs without caching.

        Side effects: when caching is active, ``current['stream']`` and
        ``current['module']`` are overwritten and entries under
        ``cache_dic['cache'][-1]['single_stream'][layer]`` are written.

        Raises:
          ValueError: if ``current['type']`` is not one of the handled modes.
        """

        cache_dic = kwargs.get('cache_dic', None)
        current = kwargs.get('current', None)

        # Only the first return value of the modulation module is used here.
        mod, _ = self.modulation(vec)

        if cache_dic is None:
            # Plain (uncached) path.
            x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
            qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)

            q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
            q, k = self.norm(q, k, v)

            # compute attention
            attn = attention(q, k, v, pe=pe, cache_dic=cache_dic, current=current)
            # compute activation in mlp stream, cat again and run second linear layer
            output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))

        else:
            current['stream'] = 'single_stream'

            if current['type'] == 'full':
                # Full computation; every intermediate is written to the cache.
                #if (current['layer'] == 0):
                #    print(current['step'])
                x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
                qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
                # The cached `mlp` is the pre-activation split of linear1.
                cache_dic['cache'][-1]['single_stream'][current['layer']]['mlp'] = mlp
                current['module'] = 'attn'
                q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)

                # Record per-token L2 norms of k or v (averaged over dim 1 of the
                # rearranged tensor, i.e. the H axis) before QK normalisation.
                if cache_dic['cache_type'] == 'k-norm':
                    cache_dic['k-norm'][-1][current['stream']][current['layer']]['total'] = k.norm(dim=-1, p=2).mean(dim=1)
                elif cache_dic['cache_type'] == 'v-norm':
                    cache_dic['v-norm'][-1][current['stream']][current['layer']]['total'] = v.norm(dim=-1, p=2).mean(dim=1)

                q, k = self.norm(q, k, v)

                # compute attention
                attn = attention(q, k, v, pe=pe, cache_dic=cache_dic, current=current)
                # force_init is called while current['module'] == 'attn'.
                force_init(cache_dic=cache_dic, current=current, tokens=attn)
                cache_dic['cache'][-1]['single_stream'][current['layer']]['attn'] = attn
                # compute activation in mlp stream, cat again and run second linear layer
                current['module'] = 'mlp'
                output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
                # force_init is called again while current['module'] == 'mlp'.
                force_init(cache_dic=cache_dic, current=current, tokens=output)
                current['module'] = 'total'
                cache_dic['cache'][-1]['single_stream'][current['layer']]['total'] = output

            elif current['type'] == 'ToCa':
                # NOTE(review): mlp_in parameters are re-created from linear1 on
                # every ToCa call; presumably this could be done once — confirm.
                self.load_mlp_in_weights(self.linear1.weight, self.linear1.bias)
                current['module'] = 'mlp'
                # cache_cutfresh returns the indices and values of the tokens to
                # recompute (selection logic lives in that helper).
                fresh_indices, fresh_tokens_mlp = cache_cutfresh(cache_dic=cache_dic, tokens=x, current=current)
                x_mod = (1 + mod.scale) * self.pre_norm(fresh_tokens_mlp) + mod.shift
                #cache_dic['cache'][-1]['single_stream'][current['layer']]['mlp']
                mlp_fresh = self.mlp_in(x_mod)
                #_, mlp_fresh1 = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
                update_cache(fresh_indices=fresh_indices, fresh_tokens=mlp_fresh, cache_dic=cache_dic, current=current)
                # compute attention
                # Attention is NOT recomputed: the cached attention rows at the
                # fresh indices are gathered along dim 1 and reused.
                fake_fresh_attn = torch.gather(input = cache_dic['cache'][-1]['single_stream'][current['layer']]['attn'], dim = 1, 
                                               index = fresh_indices.unsqueeze(-1).expand(-1, -1, cache_dic['cache'][-1]['single_stream'][current['layer']]['attn'].shape[-1]))

                current['module'] = 'total'
                fresh_tokens_output = self.linear2(torch.cat((fake_fresh_attn, self.mlp_act(mlp_fresh)), 2))
                update_cache(fresh_indices=fresh_indices, fresh_tokens=fresh_tokens_output, cache_dic=cache_dic, current=current)
                #attn = cache_dic['cache'][-1]['single_stream'][current['layer']]['attn']
                #mlp  = cache_dic['cache'][-1]['single_stream'][current['layer']]['mlp']
                # compute activation in mlp stream, cat again and run second linear layer
                #output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
                # Block output is read back from the cache after update_cache ran.
                output = cache_dic['cache'][-1]['single_stream'][current['layer']]['total']

            elif current['type'] == 'FORA':
                # Reuse the whole cached block output.
                output = cache_dic['cache'][-1]['single_stream'][current['layer']]['total']

            elif current['type'] == 'aggressive':
                current['module'] = 'skipped'
                # The block is skipped: x is returned unchanged, except at layer
                # 37 where the cached 'aggressive_output' is returned instead.
                # NOTE(review): 37 is hard-coded; presumably the index of the
                # last single-stream block — confirm against the model depth.
                if current['layer'] == 37:
                    x = cache_dic['cache'][-1]['aggressive_output']
                return x
            else:
                raise ValueError("Unknown cache type.")

            # NOTE(review): this stores the block *input* x at layer 37, not
            # x + mod.gate * output — confirm this is intended.
            if current['layer'] == 37:
                cache_dic['cache'][-1]['aggressive_output'] = x

        return x + mod.gate * output


class LastLayer(nn.Module):
    """Final projection: adaLN-style modulation of the normalised tokens
    followed by a linear map to ``patch_size * patch_size * out_channels``
    features per token."""

    def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
        super().__init__()
        patch_features = patch_size * patch_size * out_channels
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_features, bias=True)
        # Produces the concatenated (shift, scale) pair from the conditioning vector.
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True),
        )

    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
        """Modulate ``norm_final(x)`` with shift/scale derived from ``vec`` and project it."""
        modulation = self.adaLN_modulation(vec)
        shift, scale = modulation.chunk(2, dim=1)
        normed = self.norm_final(x)
        # A singleton axis is inserted at dim 1 so shift/scale broadcast over it.
        modulated = (1 + scale.unsqueeze(1)) * normed + shift.unsqueeze(1)
        return self.linear(modulated)


================================================
FILE: flux-ToCa/src/flux/modules/lora.py
================================================
import torch
from torch import nn


def replace_linear_with_lora(
    module: nn.Module,
    max_rank: int,
    scale: float = 1.0,
) -> None:
    """Recursively replace each ``nn.Linear`` child of ``module`` with a ``LinearLora``.

    The replacement layer is built with the same feature sizes, dtype and
    device as the layer it replaces and is handed the original ``weight`` and
    ``bias`` objects. Children that are not ``nn.Linear`` are descended into.
    ``module`` is modified in place.
    """
    for child_name, submodule in module.named_children():
        if not isinstance(submodule, nn.Linear):
            # Not a linear layer: patch whatever it contains instead.
            replace_linear_with_lora(module=submodule, max_rank=max_rank, scale=scale)
            continue

        lora_layer = LinearLora(
            in_features=submodule.in_features,
            out_features=submodule.out_features,
            bias=submodule.bias,
            rank=max_rank,
            scale=scale,
            dtype=submodule.weight.dtype,
            device=submodule.weight.device,
        )

        # Share the existing parameters instead of the freshly initialised ones.
        lora_layer.weight = submodule.weight
        lora_layer.bias = submodule.bias

        setattr(module, child_name, lora_layer)


class LinearLora(nn.Linear):
    """``nn.Linear`` with an additive low-rank (LoRA) branch.

    ``forward`` returns ``linear(input) + scale * lora_B(lora_A(input))``
    where ``lora_A`` maps ``in_features -> rank`` (no bias) and ``lora_B``
    maps ``rank -> out_features`` (bias controlled by ``lora_bias``).

    Parameters:
      - in_features / out_features: sizes of the base linear layer.
      - bias: whether the base layer has a bias. Either a real ``bool`` or,
        as ``replace_linear_with_lora`` does, the wrapped layer's bias
        tensor / ``None`` (a tensor means "has bias", ``None`` means "no bias").
      - rank: requested LoRA rank; clamped to ``min(in_features, out_features)``.
      - dtype / device: used for the base layer and both LoRA projections.
      - lora_bias: whether ``lora_B`` carries a bias.
      - scale: multiplier applied to the LoRA branch; must be a ``float``.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool,
        rank: int,
        dtype: torch.dtype,
        device: torch.device,
        lora_bias: bool = True,
        scale: float = 1.0,
        *args,
        **kwargs,
    ) -> None:
        # Previously `bias is not None` was used unconditionally, so an explicit
        # `bias=False` still created a bias. Honour real booleans and keep the
        # "tensor or None" convention for everything else.
        has_bias = bias if isinstance(bias, bool) else bias is not None
        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=has_bias,
            device=device,
            dtype=dtype,
            *args,
            **kwargs,
        )

        assert isinstance(scale, float), "scale must be a float"

        self.scale = scale
        # The rank can never exceed the smaller side of the weight matrix.
        self.rank = min(rank, self.out_features, self.in_features)
        self.lora_bias = lora_bias
        self.dtype = dtype
        self.device = device

        self.lora_A = nn.Linear(
            in_features=in_features,
            out_features=self.rank,
            bias=False,
            dtype=dtype,
            device=device,
        )
        self.lora_B = nn.Linear(
            in_features=self.rank,
            out_features=out_features,
            bias=self.lora_bias,
            dtype=dtype,
            device=device,
        )

    def set_scale(self, scale: float) -> None:
        """Replace the LoRA branch multiplier; ``scale`` must be a ``float``."""
        assert isinstance(scale, float), "scalar value must be a float"
        self.scale = scale

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Return the base linear output plus the scaled low-rank update."""
        base_out = super().forward(input)

        _lora_out_B = self.lora_B(self.lora_A(input))
        lora_update = _lora_out_B * self.scale

        return base_out + lora_update


================================================
FILE: flux-ToCa/src/flux/sampling.py
================================================
import math
from typing import Callable

import numpy as np
import torch
from einops import rearrange, repeat
from PIL import Image
from torch import Tensor

from .model import Flux
from .modules.autoencoder import AutoEncoder
from .modules.conditioner import HFEmbedder
from .modules.image_embedders import CannyImageEncoder, DepthImageEncoder, ReduxImageEncoder
from .modules.cache_functions import cache_init

def get_noise(
    num_samples: int,
    height: int,
    width: int,
    device: torch.device,
    dtype: torch.dtype,
    seed: int,
):
    """Draw seeded Gaussian noise of shape
    ``(num_samples, 16, 2 * ceil(height / 16), 2 * ceil(width / 16))``.

    A dedicated ``torch.Generator`` on ``device`` is seeded with ``seed`` so
    the global RNG state is not consumed.
    """
    # allow for packing
    latent_h = 2 * math.ceil(height / 16)
    latent_w = 2 * math.ceil(width / 16)

    rng = torch.Generator(device=device)
    rng.manual_seed(seed)

    shape = (num_samples, 16, latent_h, latent_w)
    return torch.randn(shape, device=device, dtype=dtype, generator=rng)


def prepare(t5: HFEmbedder, clip: HFEmbedder, img: Tensor, prompt: str | list[str]) -> dict[str, Tensor]:
    """Build the model inputs for a latent ``img`` and one or more prompts.

    ``img`` (4-D) is packed into 2x2 patches, position ids are generated for
    the resulting patch grid, and the prompt(s) are embedded with ``t5``
    (sequence embedding) and ``clip`` (vector embedding). When ``img`` has
    batch size 1 and a list of prompts is given, the batch size becomes
    ``len(prompt)`` and size-1 tensors are repeated to match.

    Returns a dict with keys ``img``, ``img_ids``, ``txt``, ``txt_ids`` and
    ``vec``; everything except ``img`` is moved to ``img.device``.
    """
    batch, _, lat_h, lat_w = img.shape
    if batch == 1 and not isinstance(prompt, str):
        batch = len(prompt)
    rows, cols = lat_h // 2, lat_w // 2

    packed = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
    if packed.shape[0] == 1 and batch > 1:
        packed = repeat(packed, "1 ... -> bs ...", bs=batch)

    # Channel 0 stays zero; channels 1 and 2 hold the row / column index.
    grid_ids = torch.zeros(rows, cols, 3)
    grid_ids[..., 1] += torch.arange(rows).unsqueeze(1)
    grid_ids[..., 2] += torch.arange(cols).unsqueeze(0)
    grid_ids = repeat(grid_ids, "h w c -> b (h w) c", b=batch)

    prompts = [prompt] if isinstance(prompt, str) else prompt

    txt = t5(prompts)
    if txt.shape[0] == 1 and batch > 1:
        txt = repeat(txt, "1 ... -> bs ...", bs=batch)
    txt_ids = torch.zeros(batch, txt.shape[1], 3)

    vec = clip(prompts)
    if vec.shape[0] == 1 and batch > 1:
        vec = repeat(vec, "1 ... -> bs ...", bs=batch)

    target = packed.device
    return {
        "img": packed,
        "img_ids": grid_ids.to(target),
        "txt": txt.to(target),
        "txt_ids": txt_ids.to(target),
        "vec": vec.to(target),
    }


def prepare_control(
    t5: HFEmbedder,
    clip: HFEmbedder,
    img: Tensor,
    prompt: str | list[str],
    ae: AutoEncoder,
    encoder: DepthImageEncoder | CannyImageEncoder,
    img_cond_path: str,
) -> dict[str, Tensor]:
    """Like ``prepare`` but with an extra ``img_cond`` entry.

    The image at ``img_cond_path`` is resized to ``(w * 8, h * 8)`` (with
    ``h``/``w`` taken from ``img``), scaled to ``[-1, 1]``, run through
    ``encoder`` and ``ae.encode``, cast to bfloat16 and packed into 2x2
    patches.
    """
    # load and encode the conditioning image
    batch, _, lat_h, lat_w = img.shape
    if batch == 1 and not isinstance(prompt, str):
        batch = len(prompt)

    target_size = (lat_w * 8, lat_h * 8)
    cond_image = Image.open(img_cond_path).convert("RGB")
    cond_image = cond_image.resize(target_size, Image.LANCZOS)

    cond = torch.from_numpy(np.array(cond_image)).float() / 127.5 - 1.0
    cond = rearrange(cond, "h w c -> 1 c h w")

    with torch.no_grad():
        cond = ae.encode(encoder(cond))

    cond = cond.to(torch.bfloat16)
    cond = rearrange(cond, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
    if cond.shape[0] == 1 and batch > 1:
        cond = repeat(cond, "1 ... -> bs ...", bs=batch)

    prepared = prepare(t5, clip, img, prompt)
    prepared["img_cond"] = cond
    return prepared


def prepare_fill(
    t5: HFEmbedder,
    clip: HFEmbedder,
    img: Tensor,
    prompt: str | list[str],
    ae: AutoEncoder,
    img_cond_path: str,
    mask_path: str,
) -> dict[str, Tensor]:
    """Like ``prepare`` but with an ``img_cond`` entry built from an image and a mask.

    The conditioning image is scaled to ``[-1, 1]``, multiplied by
    ``1 - mask`` and encoded with ``ae.encode``; the mask (scaled to
    ``[0, 1]``) is folded from 8x8 pixel blocks into channels. Both are
    packed into 2x2 patches and concatenated along the last dimension.
    """
    # load and encode the conditioning image and the mask
    batch, _, _, _ = img.shape
    if batch == 1 and not isinstance(prompt, str):
        batch = len(prompt)

    cond_pixels = np.array(Image.open(img_cond_path).convert("RGB"))
    cond = torch.from_numpy(cond_pixels).float() / 127.5 - 1.0
    cond = rearrange(cond, "h w c -> 1 c h w")

    mask_pixels = np.array(Image.open(mask_path).convert("L"))
    mask = torch.from_numpy(mask_pixels).float() / 255.0
    mask = rearrange(mask, "h w -> 1 1 h w")

    with torch.no_grad():
        cond = cond.to(img.device)
        mask = mask.to(img.device)
        # Blank out the masked region before encoding.
        cond = ae.encode(cond * (1 - mask))

        mask = mask[:, 0, :, :].to(torch.bfloat16)
        # Fold each 8x8 pixel block of the mask into 64 channels ...
        mask = rearrange(
            mask,
            "b (h ph) (w pw) -> b (ph pw) h w",
            ph=8,
            pw=8,
        )
        # ... then pack it into 2x2 patches with the same pattern as the latents.
        mask = rearrange(mask, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
        if mask.shape[0] == 1 and batch > 1:
            mask = repeat(mask, "1 ... -> bs ...", bs=batch)

    cond = cond.to(torch.bfloat16)
    cond = rearrange(cond, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
    if cond.shape[0] == 1 and batch > 1:
        cond = repeat(cond, "1 ... -> bs ...", bs=batch)

    cond_with_mask = torch.cat((cond, mask), dim=-1)

    prepared = prepare(t5, clip, img, prompt)
    prepared["img_cond"] = cond_with_mask.to(img.device)
    return prepared


def prepare_redux(
    t5: HFEmbedder,
    clip: HFEmbedder,
    img: Tensor,
    prompt: str | list[str],
    encoder: ReduxImageEncoder,
    img_cond_path: str,
) -> dict[str, Tensor]:
    """Build model inputs where the encoded conditioning image is appended to the text tokens.

    The image at ``img_cond_path`` is passed through ``encoder``, cast to
    bfloat16 and concatenated to the ``t5`` embedding along dim ``-2``. The
    latent packing, position ids and ``clip`` vector are produced as in
    ``prepare``. Returns the keys ``img``, ``img_ids``, ``txt``, ``txt_ids``
    and ``vec``.
    """
    batch, _, lat_h, lat_w = img.shape
    if batch == 1 and not isinstance(prompt, str):
        batch = len(prompt)
    rows, cols = lat_h // 2, lat_w // 2

    cond_image = Image.open(img_cond_path).convert("RGB")
    with torch.no_grad():
        cond = encoder(cond_image)

    cond = cond.to(torch.bfloat16)
    if cond.shape[0] == 1 and batch > 1:
        cond = repeat(cond, "1 ... -> bs ...", bs=batch)

    packed = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
    if packed.shape[0] == 1 and batch > 1:
        packed = repeat(packed, "1 ... -> bs ...", bs=batch)

    # Channel 0 stays zero; channels 1 and 2 hold the row / column index.
    grid_ids = torch.zeros(rows, cols, 3)
    grid_ids[..., 1] += torch.arange(rows).unsqueeze(1)
    grid_ids[..., 2] += torch.arange(cols).unsqueeze(0)
    grid_ids = repeat(grid_ids, "h w c -> b (h w) c", b=batch)

    prompts = [prompt] if isinstance(prompt, str) else prompt

    txt = t5(prompts)
    # Image tokens follow the text tokens, converted to txt's dtype/device.
    txt = torch.cat((txt, cond.to(txt)), dim=-2)
    if txt.shape[0] == 1 and batch > 1:
        txt = repeat(txt, "1 ... -> bs ...", bs=batch)
    txt_ids = torch.zeros(batch, txt.shape[1], 3)

    vec = clip(prompts)
    if vec.shape[0] == 1 and batch > 1:
        vec = repeat(vec, "1 ... -> bs ...", bs=batch)

    target = packed.device
    return {
        "img": packed,
        "img_ids": grid_ids.to(target),
        "txt": txt.to(target),
        "txt_ids": txt_ids.to(target),
        "vec": vec.to(target),
    }


def time_shift(mu: float, sigma: float, t: Tensor):
    """Return ``exp(mu) / (exp(mu) + (1 / t - 1) ** sigma)``, applied element-wise to ``t``."""
    exp_mu = math.exp(mu)
    denominator = exp_mu + (1 / t - 1) ** sigma
    return exp_mu / denominator


def get_lin_function(
    x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
) -> Callable[[float], float]:
    """Return the straight line through ``(x1, y1)`` and ``(x2, y2)`` as a callable."""
    slope = (y2 - y1) / (x2 - x1)
    intercept = y1 - slope * x1

    def line(x):
        return slope * x + intercept

    return line


def get_schedule(
    num_steps: int,
    image_seq_len: int,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
    shift: bool = True,
) -> list[float]:
    """Return ``num_steps + 1`` timesteps running from 1 down to 0.

    With ``shift=True`` the linear schedule is warped by ``time_shift`` using
    a ``mu`` obtained from ``get_lin_function(y1=base_shift, y2=max_shift)``
    evaluated at ``image_seq_len``; with ``shift=False`` the plain linear
    spacing is returned.
    """
    # extra step for zero
    schedule = torch.linspace(1, 0, num_steps + 1)

    if not shift:
        return schedule.tolist()

    # shifting the schedule to favor high timesteps for higher signal images;
    # mu is estimated by linear interpolation between two points
    mu_of_seq_len = get_lin_function(y1=base_shift, y2=max_shift)
    mu = mu_of_seq_len(image_seq_len)
    return time_shift(mu, 1.0, schedule).tolist()


def denoise(
    model: Flux,
    # model input
    img: Tensor,
    img_ids: Tensor,
    txt: Tensor,
    txt_ids: Tensor,
    vec: Tensor,
    # sampling parameters
    timesteps: list[float],
    guidance: float = 4.0,
    # extra img tokens
    img_cond: Tensor | None = None,
):
    # this is ignored for schnell
    guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)


    for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):


        t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
        pred = model(
            img=torch.cat((img, img_cond), dim=-1) if img_cond is not None else img,
            #img_ids=img_ids[1] if small else img_ids[0],
            img_ids=img_ids[0],
            txt=txt,
            txt_ids=txt_ids,
            y=vec,
            timesteps=t_vec,
            guidance=guidance_vec,
        )

        img = img + (t_prev - t_curr) * pred

    return img


def unpack(x: Tensor, height: int, width: int) -> Tensor:
    """Undo the 2x2 patch packing: ``(b, h*w, c*2*2)`` becomes ``(b, c, 2h, 2w)``
    with ``h = ceil(height / 16)`` and ``w = ceil(width / 16)``."""
    grid_h = math.ceil(height / 16)
    grid_w = math.ceil(width / 16)
    pattern = "b (h w) (c ph pw) -> b c (h ph) (w pw)"
    return rearrange(x, pattern, h=grid_h, w=grid_w, ph=2, pw=2)

####################################################################################################

from calflops import calculate_flops

def denoise_test_FLOPs(
    model: Flux,
    # model input
    img: Tensor,
    img_ids: Tensor,
    txt: Tensor,
    txt_ids: Tensor,
    vec: Tensor,
    # sampling parameters
    timesteps: list[float],
    guidance: float = 4.0,
):
    """Measure the FLOPs of one model call per timestep and print the total in TFLOPs.

    A cache is created with ``cache_init`` and ``current['step']`` is advanced
    after every measurement. ``calculate_flops`` is run once per step and its
    FLOPs string is converted with ``convert_flops``. ``img`` is never
    updated and is returned as received.
    """
    # init cache
    cache_dic, current = cache_init(timesteps)
    # this is ignored for schnell
    batch = img.shape[0]
    guidance_vec = torch.full((batch,), guidance, device=img.device, dtype=img.dtype)
    current['step'] = 0
    current['num_steps'] = len(timesteps) - 1

    total_flops = 0
    # One measurement per step; the final timestep only terminates the schedule.
    for t_curr in timesteps[:-1]:
        t_vec = torch.full((batch,), t_curr, dtype=img.dtype, device=img.device)
        step_inputs = {
            "img": img,
            "img_ids": img_ids,
            "txt": txt,
            "txt_ids": txt_ids,
            "y": vec,
            "timesteps": t_vec,
            "cache_dic": cache_dic,
            "current": current,
            "guidance": guidance_vec,
        }
        flops, _macs, _params = calculate_flops(model=model, kwargs=step_inputs, print_results=False)
        total_flops += convert_flops(flops)
        current['step'] += 1

    print(f"Total {total_flops * 10 **(-12)} TFLOPs." )
    return img

import re

def convert_flops(flops_str):
    """
    Convert a FLOPS string (e.g. '12.34 GFLOPS', '1.2 TFLOPS') to its numeric value.

    Supported unit prefixes are K, M, G, T and P as well as no prefix at all
    (plain 'FLOPS'); matching is case-insensitive and surrounding whitespace
    is ignored. Previously only G and T were accepted, so smaller results
    such as '850 MFLOPS' could not be converted.

    Parameters:
      - flops_str: string made of a decimal number, optional whitespace and a
        '<prefix>FLOPS' unit.

    Returns:
      The value in FLOPS as a float.

    Raises:
      ValueError: if the string does not start with a number followed by a
        recognised FLOPS unit.
    """
    # Integer multipliers keep the arithmetic identical to the former
    # `value * 10**9` / `value * 10**12` branches.
    multipliers = {
        "": 1,
        "K": 10**3,
        "M": 10**6,
        "G": 10**9,
        "T": 10**12,
        "P": 10**15,
    }

    # Match the number and the (optional) unit prefix with a regular expression.
    match = re.match(r"([\d.]+)\s*([KMGTP]?)FLOPS", flops_str.strip(), re.IGNORECASE)
    if not match:
        raise ValueError(f"无法解析 FLOPS 字符串: {flops_str}")

    # Extract the number and scale it by the prefix multiplier.
    value = float(match.group(1))
    prefix = match.group(2).upper()
    return value * multipliers[prefix]


================================================
FILE: flux-ToCa/src/flux/util.py
================================================
import os
from dataclasses import dataclass

import torch
from einops import rearrange
from huggingface_hub import hf_hub_download
from imwatermark import WatermarkEncoder
from PIL import ExifTags, Image
from safetensors.torch import load_file as load_sft

from flux.model import Flux, FluxLoraWrapper, FluxParams
from flux.modules.autoencoder import AutoEncoder, AutoEncoderParams
from flux.modules.conditioner import HFEmbedder


def save_image(
    nsfw_classifier,
    name: str,
    output_name: str,
    idx: int,
    x: torch.Tensor,
    add_sampling_metadata: bool,
    prompt: str,
    nsfw_threshold: float = 0.85,
) -> int:
    """Watermark the first image of ``x``, run the NSFW classifier and save it if it passes.

    The file name is ``output_name.format(idx=idx)``. The image is written
    with EXIF metadata (software, make, model name and, when
    ``add_sampling_metadata`` is set, the prompt) only if the classifier's
    "nsfw" score is below ``nsfw_threshold``.

    Returns ``idx + 1`` when the image was saved, otherwise ``idx`` unchanged.
    """
    fn = output_name.format(idx=idx)
    print(f"Saving {fn}")

    # bring into PIL format
    watermarked = embed_watermark(x.clamp(-1, 1).float())
    hwc = rearrange(watermarked[0], "c h w -> h w c")
    pixels = (127.5 * (hwc + 1.0)).cpu().byte().numpy()
    img = Image.fromarray(pixels)

    nsfw_score = [pred["score"] for pred in nsfw_classifier(img) if pred["label"] == "nsfw"][0]

    if not nsfw_score < nsfw_threshold:
        print("Your generated image may contain NSFW content.")
        return idx

    exif_data = Image.Exif()
    exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux"
    exif_data[ExifTags.Base.Make] = "Black Forest Labs"
    exif_data[ExifTags.Base.Model] = name
    if add_sampling_metadata:
        exif_data[ExifTags.Base.ImageDescription] = prompt
    img.save(fn, exif=exif_data, quality=95, subsampling=0)
    return idx + 1


@dataclass
class ModelSpec:
    """Description of one entry of ``configs``: model hyper-parameters plus
    where to find its checkpoints locally or on the Hugging Face Hub."""

    # Parameters used to construct the flow model in `load_flow_model`.
    params: FluxParams
    # Parameters used to construct the autoencoder in `load_ae`.
    ae_params: AutoEncoderParams
    # Local flow-model checkpoint path (filled from an environment variable in
    # `configs`); None means "download from the hub if allowed".
    ckpt_path: str | None
    # Optional LoRA checkpoint path; when set, `load_flow_model` builds a
    # FluxLoraWrapper and loads this state dict on top of the base weights.
    lora_path: str | None
    # Local autoencoder checkpoint path; None means "download if allowed".
    ae_path: str | None
    # Hub repository id passed to `hf_hub_download`.
    repo_id: str | None
    # File name of the flow-model weights inside `repo_id`.
    repo_flow: str | None
    # File name of the autoencoder weights inside `repo_id`.
    repo_ae: str | None


def _flux_params(in_channels: int, guidance_embed: bool) -> FluxParams:
    """Build a fresh FluxParams; only these two fields differ between variants."""
    return FluxParams(
        in_channels=in_channels,
        out_channels=64,
        vec_in_dim=768,
        context_in_dim=4096,
        hidden_size=3072,
        mlp_ratio=4.0,
        num_heads=24,
        depth=19,
        depth_single_blocks=38,
        axes_dim=[16, 56, 56],
        theta=10_000,
        qkv_bias=True,
        guidance_embed=guidance_embed,
    )


def _ae_params() -> AutoEncoderParams:
    """Build a fresh AutoEncoderParams; every variant uses the same values."""
    return AutoEncoderParams(
        resolution=256,
        in_channels=3,
        ch=128,
        out_ch=3,
        ch_mult=[1, 2, 4, 4],
        num_res_blocks=2,
        z_channels=16,
        scale_factor=0.3611,
        shift_factor=0.1159,
    )


def _model_spec(
    repo_id: str,
    repo_flow: str,
    ckpt_env: str,
    in_channels: int,
    guidance_embed: bool = True,
    lora_env: str | None = None,
) -> ModelSpec:
    """Assemble one ModelSpec; checkpoint paths are read from environment variables."""
    return ModelSpec(
        repo_id=repo_id,
        repo_flow=repo_flow,
        repo_ae="ae.safetensors",
        ckpt_path=os.getenv(ckpt_env),
        lora_path=None if lora_env is None else os.getenv(lora_env),
        params=_flux_params(in_channels=in_channels, guidance_embed=guidance_embed),
        ae_path=os.getenv("AE"),
        ae_params=_ae_params(),
    )


# Registry of the supported model variants, keyed by the name accepted by
# `load_flow_model` and `load_ae`.
configs = {
    "flux-dev": _model_spec(
        repo_id="black-forest-labs/FLUX.1-dev",
        repo_flow="flux1-dev.safetensors",
        ckpt_env="FLUX_DEV",
        in_channels=64,
    ),
    "flux-schnell": _model_spec(
        repo_id="black-forest-labs/FLUX.1-schnell",
        repo_flow="flux1-schnell.safetensors",
        ckpt_env="FLUX_SCHNELL",
        in_channels=64,
        guidance_embed=False,
    ),
    "flux-dev-canny": _model_spec(
        repo_id="black-forest-labs/FLUX.1-Canny-dev",
        repo_flow="flux1-canny-dev.safetensors",
        ckpt_env="FLUX_DEV_CANNY",
        in_channels=128,
    ),
    "flux-dev-canny-lora": _model_spec(
        repo_id="black-forest-labs/FLUX.1-dev",
        repo_flow="flux1-dev.safetensors",
        ckpt_env="FLUX_DEV",
        in_channels=128,
        lora_env="FLUX_DEV_CANNY_LORA",
    ),
    "flux-dev-depth": _model_spec(
        repo_id="black-forest-labs/FLUX.1-Depth-dev",
        repo_flow="flux1-depth-dev.safetensors",
        ckpt_env="FLUX_DEV_DEPTH",
        in_channels=128,
    ),
    "flux-dev-depth-lora": _model_spec(
        repo_id="black-forest-labs/FLUX.1-dev",
        repo_flow="flux1-dev.safetensors",
        ckpt_env="FLUX_DEV",
        in_channels=128,
        lora_env="FLUX_DEV_DEPTH_LORA",
    ),
    "flux-dev-fill": _model_spec(
        repo_id="black-forest-labs/FLUX.1-Fill-dev",
        repo_flow="flux1-fill-dev.safetensors",
        ckpt_env="FLUX_DEV_FILL",
        in_channels=384,
    ),
}


def print_load_warning(missing: list[str], unexpected: list[str]) -> None:
    """Print the missing and/or unexpected state-dict keys, one per line.

    When both lists are non-empty the two reports are separated by a dashed
    rule; when both are empty nothing is printed.
    """
    has_missing = len(missing) > 0
    has_unexpected = len(unexpected) > 0

    if has_missing:
        print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
    if has_missing and has_unexpected:
        print("\n" + "-" * 79 + "\n")
    if has_unexpected:
        print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))


def load_flow_model(
    name: str, device: str | torch.device = "cuda", hf_download: bool = True, verbose: bool = False
) -> Flux:
    """Instantiate the flow model registered under ``name`` and load its weights.

    The checkpoint path comes from ``configs[name]``; when it is unset and
    ``hf_download`` is true the file is fetched with ``hf_hub_download``.
    With a checkpoint available the model is created on the "meta" device and
    the state dict is assigned to it; otherwise it is created directly on
    ``device``. If the spec has a LoRA path, a ``FluxLoraWrapper`` is built
    and the LoRA state dict is loaded afterwards. ``verbose`` prints
    missing/unexpected keys.
    """
    # Loading Flux
    print("Init model")
    spec = configs[name]
    ckpt_path = spec.ckpt_path
    lora_path = spec.lora_path

    may_download = hf_download and spec.repo_id is not None and spec.repo_flow is not None
    if ckpt_path is None and may_download:
        ckpt_path = hf_hub_download(spec.repo_id, spec.repo_flow)

    init_device = "meta" if ckpt_path is not None else device
    with torch.device(init_device):
        if lora_path is None:
            model = Flux(spec.params).to(torch.bfloat16)
        else:
            model = FluxLoraWrapper(params=spec.params).to(torch.bfloat16)

    if ckpt_path is not None:
        print("Loading checkpoint")
        # load_sft doesn't support torch.device
        state = load_sft(ckpt_path, device=str(device))
        state = optionally_expand_state_dict(model, state)
        missing, unexpected = model.load_state_dict(state, strict=False, assign=True)
        if verbose:
            print_load_warning(missing, unexpected)

    if lora_path is not None:
        print("Loading LoRA")
        lora_state = load_sft(lora_path, device=str(device))
        # loading the lora params + overwriting scale values in the norms
        missing, unexpected = model.load_state_dict(lora_state, strict=False, assign=True)
        if verbose:
            print_load_warning(missing, unexpected)

    return model


def load_t5(
    device: str | torch.device = "cuda",
    max_length: int = 512,
    model_path: str = "/root/autodl-tmp/pretrained_models/google/t5-v1_1-xxl",
) -> HFEmbedder:
    """Create the T5 text embedder in bfloat16 and move it to ``device``.

    Args:
        device: Target device for the embedder.
        max_length: Maximum token length handed to ``HFEmbedder``.
        model_path: Location of the T5 weights, passed as the first argument
            to ``HFEmbedder``. Defaults to the path this repository was
            developed with; override it on machines with a different layout.

    Returns:
        The ``HFEmbedder`` instance after ``.to(device)``.
    """
    # max length 64, 128, 256 and 512 should work (if your sequence is short enough)
    return HFEmbedder(model_path, max_length=max_length, torch_dtype=torch.bfloat16).to(device)


def load_clip(
    device: str | torch.device = "cuda",
    model_path: str = "/root/autodl-tmp/pretrained_models/openai/clip-vit-large-patch14",
) -> HFEmbedder:
    """Create the CLIP text embedder in bfloat16 and move it to ``device``.

    Args:
        device: Target device for the embedder.
        model_path: Location of the CLIP weights, passed as the first argument
            to ``HFEmbedder``. Defaults to the path this repository was
            developed with; override it on machines with a different layout.

    Returns:
        The ``HFEmbedder`` instance (``max_length=77``) after ``.to(device)``.
    """
    return HFEmbedder(model_path, max_length=77, torch_dtype=torch.bfloat16).to(device)


def load_ae(name: str, device: str | torch.device = "cuda", hf_download: bool = True) -> AutoEncoder:
    """Build the autoencoder registered under ``name`` and load its weights.

    Args:
        name: Key into ``configs`` selecting the model configuration.
        device: Device the checkpoint tensors are loaded onto; also the
            construction device when no checkpoint path is available.
        hf_download: When the config has no local autoencoder path but names a
            repo and an autoencoder file, fetch it with ``hf_hub_download``.

    Returns:
        The constructed ``AutoEncoder``. Missing / unexpected keys are always
        reported through ``print_load_warning`` when a checkpoint is loaded.
    """
    cfg = configs[name]
    ckpt_path = cfg.ae_path

    # Fall back to the hub only when nothing local is configured.
    if ckpt_path is None and hf_download:
        if cfg.repo_id is not None and cfg.repo_ae is not None:
            ckpt_path = hf_hub_download(cfg.repo_id, cfg.repo_ae)

    # Loading the autoencoder
    print("Init AE")
    # With a checkpoint to load, construct on the meta device; the weights are
    # attached afterwards by load_state_dict(..., assign=True).
    build_device = device if ckpt_path is None else "meta"
    with torch.device(build_device):
        ae = AutoEncoder(cfg.ae_params)

    if ckpt_path is None:
        return ae

    weights = load_sft(ckpt_path, device=str(device))
    missing, unexpected = ae.load_state_dict(weights, strict=False, assign=True)
    print_load_warning(missing, unexpected)
    return ae


def optionally_expand_state_dict(model: torch.nn.Module, state_dict: dict) -> dict:
    """
    Zero-pad state-dict tensors whose shape differs from the model parameter.

    For every model parameter that also appears in ``state_dict`` with a
    different shape, a zero tensor of the parameter's shape is created on the
    loaded tensor's device and the loaded values are copied into its leading
    corner. ``state_dict`` is modified in place and also returned.
    """
    for key, target in model.named_parameters():
        if key not in state_dict:
            continue
        loaded = state_dict[key]
        if loaded.shape == target.shape:
            continue

        print(
            f"Expanding '{key}' with shape {loaded.shape} to model parameter with shape {target.shape}."
        )
        # expand with zeros:
        padded = torch.zeros_like(target, device=loaded.device)
        # Leading-corner region matching the loaded tensor's extent per dim.
        region = tuple(slice(0, extent) for extent in loaded.shape)
        padded[region] = loaded
        state_dict[key] = padded

    return state_dict


class WatermarkEmbedder:
    """Callable that stamps a fixed bit watermark onto image tensors."""

    def __init__(self, watermark):
        self.watermark = watermark
        self.num_bits = len(WATERMARK_BITS)
        self.encoder = WatermarkEncoder()
        self.encoder.set_watermark("bits", self.watermark)

    def __call__(self, image: torch.Tensor) -> torch.Tensor:
        """
        Adds a predefined watermark to the input image

        Args:
            image: ([N,] B, RGB, H, W) in range [-1, 1]

        Returns:
            same as input but watermarked
        """
        # [-1, 1] -> [0, 1]
        unit = 0.5 * image + 0.5
        # A 4-D input gets a temporary leading axis that is removed again below.
        added_axis = unit.ndim == 4
        if added_axis:
            unit = unit[None, ...]
        n = unit.shape[0]

        # torch (b, c, h, w) in [0, 1] -> numpy (b, h, w, c) [0, 255]
        # watermarking library expects input as cv2 BGR format
        scaled = (255 * unit).detach().cpu()
        bgr = rearrange(scaled, "n b c h w -> (n b) h w c").numpy()[:, :, :, ::-1]
        for k in range(bgr.shape[0]):
            bgr[k] = self.encoder.encode(bgr[k], "dwtDct")

        # Flip the channel order back and restore the (n, b, c, h, w) layout.
        rgb = rearrange(bgr[:, :, :, ::-1], "(n b) h w c -> n b c h w", n=n)
        marked = torch.from_numpy(rgb).to(unit.device)
        marked = torch.clamp(marked / 255, min=0.0, max=1.0)
        if added_axis:
            marked = marked[0]
        # [0, 1] -> [-1, 1]
        return 2 * marked - 1


# A fixed 48-bit message that was chosen at random
WATERMARK_MESSAGE = 0b001010101111111010000111100111001111010100101110
# format(x, "b") gives the bits of x as a str without the "0b" prefix; int turns each digit into 0/1
# NOTE(review): the literal above is written with two leading zero digits, which an
# int does not retain, so this list has 46 entries rather than 48 - confirm intended.
WATERMARK_BITS = [int(digit) for digit in format(WATERMARK_MESSAGE, "b")]
embed_watermark = WatermarkEmbedder(WATERMARK_BITS)


================================================
FILE: flux-ToCa/src/geneval_flux.py
================================================
import argparse
import json
import os

import torch
import numpy as np
from PIL import Image, ExifTags
from tqdm import tqdm, trange
from einops import rearrange
from torchvision.utils import make_grid
from torchvision.transforms import ToTensor

# --- Imports related to FLUX module ---
from flux.sampling import (
    denoise_test_FLOPs,
    get_noise,
    get_schedule,
    prepare,
    unpack,
)
from flux.ideas import denoise_cache
from flux.util import (
    embed_watermark,
    load_ae,
    load_clip,
    load_flow_model,
    load_t5,
)
from transformers import pipeline

# NSFW threshold (adjustable as needed)
NSFW_THRESHOLD = 0.85


def parse_args():
    """Parse the command line of the Geneval FLUX generation script.

    Returns:
        The ``argparse.Namespace`` with one positional ``metadata_file`` and
        the optional model, sampling and output settings listed below.
    """
    parser = argparse.ArgumentParser(description="Generate images using the FLUX model within the Geneval framework")

    # (flag, add_argument keyword options), registered in this order.
    argument_table = [
        # Required: input JSONL metadata file, each line must contain at least the "prompt" key
        ("metadata_file", {
            "type": str,
            "help": "JSONL file containing metadata for each prompt, each line is a JSON object",
        }),
        # FLUX model related parameters
        ("--model_name", {
            "type": str,
            "default": "flux-schnell",
            "choices": ["flux-dev", "flux-schnell"],
            "help": "FLUX model name",
        }),
        ("--n_samples", {
            "type": int,
            "default": 1,
            "help": "Number of images to generate per prompt",
        }),
        ("--steps", {
            "type": int,
            "default": None,
            "help": "Number of sampling steps (if not specified: 4 for flux-schnell, 50 for flux-dev)",
        }),
        ("--width", {
            "type": int,
            "default": 1360,
            "help": "Width of the generated image (pixels)",
        }),
        ("--height", {
            "type": int,
            "default": 768,
            "help": "Height of the generated image (pixels)",
        }),
        ("--guidance", {
            "type": float,
            "default": 3.5,
            "help": "Conditional guidance scale",
        }),
        ("--seed", {
            "type": int,
            "default": 42,
            "help": "Random seed",
        }),
        ("--batch_size", {
            "type": int,
            "default": 1,
            "help": "Number of samples per batch during image generation",
        }),
        # Output related parameters
        ("--output_dir", {
            "type": str,
            "default": "outputs",
            "help": "Output directory to save the generated results",
        }),
        ("--skip_grid", {
            "action": "store_true",
            "help": "Skip saving the overall grid image",
        }),
        # Other options
        ("--add_sampling_metadata", {
            "action": "store_true",
            "help": "Add the prompt text to the metadata of the generated images",
        }),
        ("--use_nsfw_filter", {
            "action": "store_true",
            "help": "Enable NSFW content filtering (requires downloading the relevant model)",
        }),
        ("--test_FLOPs", {
            "action": "store_true",
            "help": "Test inference FLOPs only (no images will be generated)",
        }),
    ]
    for flag, options in argument_table:
        parser.add_argument(flag, **options)

    return parser.parse_args()


def _list_sample_files(sample_path):
    """Return the sorted PNG sample filenames in *sample_path* (empty if it does not exist)."""
    if not os.path.exists(sample_path):
        return []
    return sorted(
        fname for fname in os.listdir(sample_path)
        if fname.endswith(".png") and fname != "grid.png"
    )


def _load_sample_tensors(sample_path, filenames):
    """Load the named PNGs as RGB tensors; unreadable files are reported and left out."""
    to_tensor = ToTensor()
    tensors = []
    for fname in filenames:
        full_path = os.path.join(sample_path, fname)
        try:
            # The context manager releases the file handle as soon as the image is converted.
            with Image.open(full_path) as handle:
                tensors.append(to_tensor(handle.convert("RGB")))
        except Exception as e:
            # Best effort: a broken sample must not abort the whole run.
            print(f"Failed to read existing image {full_path}: {e}")
    return tensors


def main(args):
    """Generate images for every prompt in a Geneval metadata file.

    Each prompt gets its own folder ``<output_dir>/<idx:05d>/`` containing
    ``metadata.jsonl``, a ``samples/`` directory with the generated PNGs and,
    unless ``--skip_grid`` is given, a ``grid.png`` built from all samples.
    Prompts that already have at least ``n_samples`` PNGs are skipped, so an
    interrupted run can be resumed.

    Args:
        args: Namespace produced by ``parse_args``. ``steps``, ``width`` and
            ``height`` are normalised in place (default step count, sizes
            rounded down to a multiple of 16).
    """
    # Read the metadata file, each line is a JSON object (must contain at least the "prompt" field)
    with open(args.metadata_file, "r", encoding="utf-8") as fp:
        metadatas = [json.loads(line) for line in fp if line.strip()]

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # If NSFW filtering is enabled, load the corresponding classifier (please modify the model path or name accordingly)
    if args.use_nsfw_filter:
        nsfw_classifier = pipeline(
            "image-classification",
            model="/path/to/your/nsfw_model",  # Please replace with the actual NSFW model path
            device=0 if torch.cuda.is_available() else -1
        )
    else:
        nsfw_classifier = None

    # If sampling steps are not specified, set default steps based on the model name
    if args.steps is None:
        args.steps = 4 if args.model_name == "flux-schnell" else 50

    # Ensure the image width and height are multiples of 16 (required by FLUX)
    args.width = 16 * (args.width // 16)
    args.height = 16 * (args.height // 16)

    # Load FLUX model components onto the device (T5, CLIP, Flow model, autoencoder)
    t5 = load_t5(device, max_length=256 if args.model_name == "flux-schnell" else 512)
    clip = load_clip(device)
    model = load_flow_model(args.model_name, device=device)
    ae = load_ae(args.model_name, device=device)

    to_tensor = ToTensor()

    # Generate results for each prompt:
    # Each prompt corresponds to a subfolder (e.g., outputs/00000/), inside which samples and (optionally) a grid image grid.png are saved,
    # along with the prompt's metadata saved in a metadata.jsonl file.
    for idx, metadata in enumerate(metadatas):
        prompt = metadata.get("prompt", "")
        print(f"Processing prompt {idx + 1}/{len(metadatas)}: '{prompt}'")

        # Define output directory and samples directory
        outpath = os.path.join(args.output_dir, f"{idx:05d}")
        sample_path = os.path.join(outpath, "samples")

        # Count the PNG files already present; decoding them is deferred until
        # we know this prompt actually needs more samples.
        existing_files = _list_sample_files(sample_path)
        sample_count = len(existing_files)

        # If the number of generated images is sufficient, skip generation
        if sample_count >= args.n_samples:
            print(f"Samples for prompt {idx + 1} already exist ({sample_count} images), skipping generation.")
            continue

        # Create output directory and samples subdirectory
        os.makedirs(outpath, exist_ok=True)
        os.makedirs(sample_path, exist_ok=True)
        # Save the current prompt's metadata to metadata.jsonl
        with open(os.path.join(outpath, "metadata.jsonl"), "w", encoding="utf-8") as fp:
            json.dump(metadata, fp)

        # Existing samples are only needed as tensors when a grid will be written.
        if args.skip_grid:
            all_samples = []
        else:
            all_samples = _load_sample_tensors(sample_path, existing_files)

        # Use the number of existing images as the starting index.
        # NOTE(review): an image rejected by the NSFW filter still consumes an
        # index, so on a resumed run (which counts files) the next index may
        # collide with an existing file name - confirm whether that matters.
        local_index = sample_count

        # The initial value of the progress bar is the number of existing samples;
        # the context manager closes the bar even if generation raises.
        with tqdm(total=args.n_samples, initial=sample_count, desc="Sampling") as pbar:
            # For the current prompt, only generate the missing images
            while local_index < args.n_samples:
                current_bs = min(args.batch_size, args.n_samples - local_index)
                # Set seed for the current batch (using the number of images already present in the prompt as offset)
                seed = args.seed + local_index
                # Generate random noise
                x = get_noise(current_bs, args.height, args.width, device=device, dtype=torch.bfloat16, seed=seed)
                prompt_list = [prompt] * current_bs
                # Prepare input (prompt encoding, initial image noise, etc.)
                inp = prepare(t5, clip, x, prompt=prompt_list)
                # Compute the denoising schedule from the second dimension of the prepared image input
                timesteps = get_schedule(args.steps, inp["img"].shape[1], shift=(args.model_name != "flux-schnell"))

                with torch.no_grad():
                    if args.test_FLOPs:
                        latent = denoise_test_FLOPs(model, **inp, timesteps=timesteps, guidance=args.guidance)
                    else:
                        latent = denoise_cache(model, **inp, timesteps=timesteps, guidance=args.guidance)
                    # Unpack latent to a shape suitable for the decoder input
                    latent = unpack(latent.float(), args.height, args.width)
                    # Decode to image with automatic mixed precision
                    with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
                        decoded = ae.decode(latent)

                # Post-processing: clamp, embed watermark, and rearrange to [B, H, W, C] format
                decoded = decoded.clamp(-1, 1)
                decoded = embed_watermark(decoded.float())
                images_tensor = rearrange(decoded, "b c h w -> b h w c")

                # Iterate over each generated image in the current batch
                for i in range(current_bs):
                    img_array = (127.5 * (images_tensor[i] + 1.0)).cpu().numpy().astype(np.uint8)
                    img = Image.fromarray(img_array)
                    # NSFW filtering (if enabled)
                    if nsfw_classifier is not None:
                        nsfw_result = nsfw_classifier(img)
                        nsfw_score = next((res["score"] for res in nsfw_result if res["label"] == "nsfw"), 0.0)
                    else:
                        nsfw_score = 0.0

                    if nsfw_score < NSFW_THRESHOLD:
                        sample_fname = os.path.join(sample_path, f"{local_index:05d}.png")
                        # Add sampling metadata (EXIF info); note: PNG format may not fully support EXIF
                        if args.add_sampling_metadata:
                            exif_data = Image.Exif()
                            exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux"
                            exif_data[ExifTags.Base.Make] = "Black Forest Labs"
                            exif_data[ExifTags.Base.Model] = args.model_name
                            exif_data[ExifTags.Base.ImageDescription] = prompt
                            img.save(sample_fname, exif=exif_data)
                        else:
                            img.save(sample_fname)
                        all_samples.append(to_tensor(img))
                    else:
                        print("The generated image may contain inappropriate content and has been skipped.")
                    local_index += 1
                    pbar.update(1)
                # end for current batch

        # If grid generation is not skipped and there is at least one sample, create and save a grid image (consistent with Geneval format)
        if not args.skip_grid and len(all_samples) > 0:
            grid_tensor = torch.stack(all_samples, 0)
            grid = make_grid(grid_tensor, nrow=args.batch_size)
            grid = 255.0 * rearrange(grid, "c h w -> h w c").cpu().numpy()
            grid_img = Image.fromarray(grid.astype(np.uint8))
            grid_img.save(os.path.join(outpath, "grid.png"))
    # end for each prompt

    print("Generation completed.")


if __name__ == "__main__":
    # Script entry point: parse the command line, then run generation.
    args = parse_args()
    main(args)

'''
python src/geneval_flux.py /root/geneval/prompts/evaluation_metadata.jsonl --model_name flux-dev --n_samples 4 --steps 50 --width 1024 --height 1024 --seed 42 --output_dir /root/autodl-tmp/samples/geneval_original --batch_size 1
'''


================================================
FILE: flux-ToCa/src/sample.py
================================================
import os
import re
import time
from dataclasses import dataclass
from glob import iglob

import torch
from einops import rearrange
from PIL import ExifTags, Image
from transformers import pipeline
from tqdm import tqdm

from flux.sampling import denoise, get_noise, get_schedule, prepare, unpack, denoise_test_FLOPs
from flux.ideas import denoise_cache
from flux.util import configs, embed_watermark, load_ae, load_clip, load_flow_model, load_t5

NSFW_THRESHOLD = 0.85  # NSFW score threshold


@dataclass
class SamplingOptions:
    prompts: list[str]          # List of prompts
    width: int                  # Image width
    height: int                 # Image height
    num_steps: int              # Number of sampling steps
    guidance: float             # Guidance value
    seed: int | None            # Random seed
    num_images_per_prompt: int  # Number of images generated per prompt
    batch_size: int             # Batch size (number of prompts per batch)
    model_name: str             # Model name
    output_dir: str             # Output directory
    add_sampling_metadata: bool # Whether to add metadata
    use_nsfw_filter: bool       # Whether to enable NSFW filter
    test_FLOPs: bool            # Whether in FLOPs testing mode (in which case no images are generated)


def main(opts: SamplingOptions):
    """Generate and save images for every prompt in ``opts.prompts``.

    Prompts are processed in batches of ``opts.batch_size``; every batch is
    sampled ``opts.num_images_per_prompt`` times with a different seed. Images
    that pass the optional NSFW check are written to
    ``<output_dir>/img_<index>.jpg`` with EXIF tags.

    Args:
        opts: Sampling settings. ``num_steps``, ``width`` and ``height`` are
            normalised in place (default step count, sizes rounded down to a
            multiple of 16).

    Raises:
        ValueError: If ``opts.model_name`` is not a key of ``configs``.
    """
    # Validate the model name first so a typo fails before anything is loaded.
    model_name = opts.model_name
    if model_name not in configs:
        available = ", ".join(configs.keys())
        raise ValueError(f"Unknown model name: {model_name}, available: {available}")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Optional NSFW classifier
    if opts.use_nsfw_filter:
        nsfw_classifier = pipeline(
            "image-classification",
            model="/root/autodl-tmp/pretrained_models/Falconsai/nsfw_image_detection",
            device=device
        )
    else:
        nsfw_classifier = None

    if opts.num_steps is None:
        opts.num_steps = 4 if model_name == "flux-schnell" else 50

    # Ensure width and height are multiples of 16
    opts.width = 16 * (opts.width // 16)
    opts.height = 16 * (opts.height // 16)

    # Set output directory and file-name template; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(opts.output_dir, exist_ok=True)
    output_name = os.path.join(opts.output_dir, "img_{idx}.jpg")
    idx = 0  # Running image index

    # Load T5 and CLIP text encoders
    t5 = load_t5(device, max_length=256 if model_name == "flux-schnell" else 512)
    clip = load_clip(device)

    # Load flow model and autoencoder
    model = load_flow_model(model_name, device=device)
    ae = load_ae(model_name, device=device)

    # Set random seed
    if opts.seed is not None:
        base_seed = opts.seed
    else:
        base_seed = torch.randint(0, 2**32, (1,)).item()

    prompts = opts.prompts
    total_images = len(prompts) * opts.num_images_per_prompt

    # The context manager closes the progress bar even if generation raises.
    with tqdm(total=total_images, desc="Generating images") as progress_bar:
        for prompt_start in range(0, len(prompts), opts.batch_size):
            batch_prompts = prompts[prompt_start:prompt_start + opts.batch_size]
            batch_size = len(batch_prompts)

            # For each prompt, generate the corresponding number of images
            for _ in range(opts.num_images_per_prompt):
                # One seed per sampled batch, offset by the running image index
                seed = base_seed + idx
                first_index = idx  # Index of the first image of this batch
                idx += batch_size

                noise = get_noise(
                    batch_size,
                    opts.height,
                    opts.width,
                    device=device,
                    dtype=torch.bfloat16,
                    seed=seed,
                )

                # batch_prompts is a list containing the prompts for the current batch
                inp = prepare(t5, clip, noise, prompt=batch_prompts)
                timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(model_name != "flux-schnell"))

                # Denoise
                with torch.no_grad():
                    if opts.test_FLOPs:
                        latent = denoise_test_FLOPs(model, **inp, timesteps=timesteps, guidance=opts.guidance)
                    else:
                        latent = denoise_cache(model, **inp, timesteps=timesteps, guidance=opts.guidance)

                    # Decode latent variables
                    latent = unpack(latent.float(), opts.height, opts.width)
                    with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
                        decoded = ae.decode(latent)

                # Convert to PIL format and save
                decoded = decoded.clamp(-1, 1)
                decoded = embed_watermark(decoded.float())
                decoded = rearrange(decoded, "b c h w -> b h w c")

                for i, prompt in enumerate(batch_prompts):
                    img = Image.fromarray((127.5 * (decoded[i] + 1.0)).cpu().byte().numpy())

                    # Optional NSFW filtering
                    if nsfw_classifier is not None:
                        nsfw_result = nsfw_classifier(img)
                        nsfw_score = next((res["score"] for res in nsfw_result if res["label"] == "nsfw"), 0.0)
                    else:
                        nsfw_score = 0.0  # If filter is not enabled, consider safe

                    if nsfw_score < NSFW_THRESHOLD:
                        exif_data = Image.Exif()
                        exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux"
                        exif_data[ExifTags.Base.Make] = "Black Forest Labs"
                        exif_data[ExifTags.Base.Model] = model_name
                        if opts.add_sampling_metadata:
                            exif_data[ExifTags.Base.ImageDescription] = prompt
                        # Save image
                        fn = output_name.format(idx=first_index + i)
                        img.save(fn, exif=exif_data, quality=95, subsampling=0)
                    else:
                        print("The generated image may contain inappropriate content and has been skipped.")

                    progress_bar.update(1)


def read_prompts(prompt_file: str):
    """Return the non-blank lines of ``prompt_file`` with surrounding whitespace removed."""
    with open(prompt_file, 'r', encoding='utf-8') as handle:
        stripped = (raw.strip() for raw in handle)
        return [text for text in stripped if text]


def app():
    """Command-line entry point: parse options, read the prompt file and run ``main``."""
    import argparse

    parser = argparse.ArgumentParser(description="Generate images using the flux model.")
    add = parser.add_argument
    add('--prompt_file', type=str, required=True, help='Path to the prompt text file.')
    add('--width', type=int, default=1360, help='Width of the generated image.')
    add('--height', type=int, default=768, help='Height of the generated image.')
    add('--num_steps', type=int, default=None, help='Number of sampling steps.')
    add('--guidance', type=float, default=3.5, help='Guidance value.')
    add('--seed', type=int, default=0, help='Random seed.')
    add('--num_images_per_prompt', type=int, default=1, help='Number of images generated per prompt.')
    add('--batch_size', type=int, default=1, help='Batch size (number of prompts per batch).')
    add('--model_name', type=str, default='flux-schnell', choices=['flux-dev', 'flux-schnell'], help='Model name.')
    add('--output_dir', type=str, default='/root/autodl-tmp/samples', help='Directory to save images.')
    add('--add_sampling_metadata', action='store_true', help='Whether to add prompts to image metadata.')
    add('--use_nsfw_filter', action='store_true', help='Enable NSFW filter.')
    add('--test_FLOPs', action='store_true', help='Test inference FLOPs.')

    # Every option except --prompt_file has the same name as a SamplingOptions
    # field, so the parsed values can be forwarded as keyword arguments.
    settings = dict(vars(parser.parse_args()))
    prompt_file = settings.pop("prompt_file")
    loaded_prompts = read_prompts(prompt_file)

    main(SamplingOptions(prompts=loaded_prompts, **settings))


if __name__ == '__main__':
    # Script entry point: delegate to the argparse-driven app().
    app()